diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9760d97 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "files.associations": { + "xstring": "cpp", + "xutility": "cpp" + } +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 58c4bcf..9bfd57c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -318,6 +318,12 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "cust" version = "0.3.2" @@ -795,7 +801,7 @@ checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] name = "karlsen-miner" -version = "0.0.1-GPU-0.1" +version = "0.0.3-GPU-0.1" dependencies = [ "async-trait", "blake2b_simd", @@ -844,6 +850,7 @@ dependencies = [ "log", "nvml-wrapper", "rand 0.8.4", + "tiny-keccak", ] [[package]] @@ -1723,6 +1730,15 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tokio" version = "1.29.1" diff --git a/README.md b/README.md index 8b867e9..57a13ad 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ OPTIONS: --opencl-platform Which OpenCL platform to use (limited to one per executable) --opencl-workload Ratio of nonces to GPU possible parrallel run in OpenCL [default: 512] --opencl-workload-absolute The values given by workload are not ratio, but absolute number of nonces in OpenCL [default: false] - -p, --port karlsend port [default: Mainnet = 16110, Testnet = 16211] + -p, --port karlsend port [default: Mainnet = 42110, Testnet = 42210] -s, --karlsend-address The IP of the karlsend instance [default: 127.0.0.1] -t, --threads Amount of CPU miner threads to launch [default: 0] --testnet Use testnet instead of mainnet [default: false] diff --git a/build_fishlibs.sh b/build_fishlibs.sh new file mode 100644 index 0000000..ada2a13 --- /dev/null +++ b/build_fishlibs.sh @@ -0,0 +1,14 @@ +nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_86 --gpu-code=sm_86 -o plugins/cuda/resources/kaspa-cuda-sm86.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_75 --gpu-code=sm_75 -o plugins/cuda/resources/kaspa-cuda-sm75.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_61 --gpu-code=sm_61 -o plugins/cuda/resources/kaspa-cuda-sm61.ptx -Xptxas -O3 -Xcompiler -O3 + +nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -ccbin=gcc-7 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_30 --gpu-code=sm_30 -o plugins/cuda/resources/kaspa-cuda-sm30.ptx + +nvcc plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu -ccbin=gcc-5 -std=c++11 -O3 --restrict --ptx --gpu-architecture=compute_20 --gpu-code=sm_20 -o plugins/cuda/resources/kaspa-cuda-sm20.ptx + +cargo build --release + + + 
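The new build_fishlibs.sh compiles the single CUDA source into one PTX module per target architecture (sm_86, sm_75, sm_61, plus sm_30/sm_20 legacy builds pinned to older gcc) before the usual `cargo build --release`. The sketch below illustrates why the per-SM split exists: at runtime the miner must load the PTX variant matching the device's compute capability. In this PR that selection happens on the Rust side (via the cust crate); the C++ helper and its `pick_ptx` name below are illustrative only, not code from this repo.

```cpp
// Hypothetical sketch: pick the prebuilt PTX matching the device's compute
// capability, mirroring the sm_86/75/61/30/20 outputs of build_fishlibs.sh.
#include <cuda_runtime.h>
#include <stdio.h>

static const char* pick_ptx(int device)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    int cc = prop.major * 10 + prop.minor; // e.g. an RTX 30xx reports 8.6 -> 86
    if (cc >= 86) return "kaspa-cuda-sm86.ptx";
    if (cc >= 75) return "kaspa-cuda-sm75.ptx";
    if (cc >= 61) return "kaspa-cuda-sm61.ptx";
    if (cc >= 30) return "kaspa-cuda-sm30.ptx";
    return "kaspa-cuda-sm20.ptx"; // Fermi-era fallback (built with gcc-5)
}

int main()
{
    printf("would load %s\n", pick_ptx(0));
    return 0;
}
```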
diff --git a/kaspa-cuda.ptx b/kaspa-cuda.ptx new file mode 100644 index 0000000..cacb098 --- /dev/null +++ b/kaspa-cuda.ptx @@ -0,0 +1,1092 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-31833905 +// Cuda compilation tools, release 11.8, V11.8.89 +// Based on NVVM 7.0.1 +// + +.version 7.8 +.target sm_86 +.address_size 64 + + // .globl heavy_hash +.global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; +.global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; +.global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; +.global .align 1 .b8 pi[24] = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1}; +.global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; +.global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; +.const .align 8 .b8 target[32]; +.const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 
45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; +.const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; +.global .align 8 .u64 light_cache; +.global .align 8 .u64 full_dataset; +.const .align 8 .b8 ctx[32]; + +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5 +) +{ + .local .align 8 .b8 __local_depot0[1912]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<5>; + .reg .b16 %rs<14>; + .reg .b32 %r<923>; + .reg .b64 %rd<54>; + + + mov.u64 %SPL, __local_depot0; + ld.param.u8 %rs1, [heavy_hash_param_3]; + ld.param.u64 %rd7, [heavy_hash_param_0]; + ld.param.u64 %rd8, [heavy_hash_param_1]; + ld.param.u64 %rd10, [heavy_hash_param_2]; + ld.param.u64 %rd11, [heavy_hash_param_4]; + ld.param.u64 %rd9, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1, %rd11; + add.u64 %rd2, %SPL, 0; + mov.u32 %r1, %ntid.x; + mov.u32 %r2, %ctaid.x; + mov.u32 %r3, %tid.x; + mad.lo.s32 %r4, %r2, %r1, %r3; + cvt.s64.s32 %rd3, %r4; + setp.lt.u64 %p1, %rd3, %rd10; + @%p1 bra $L__BB0_2; + + ret; + +$L__BB0_2: + cvt.u32.u64 %r5, %rd3; + setp.ne.s32 %p2, %r5, 0; + @%p2 bra $L__BB0_4; + + cvta.to.global.u64 %rd13, %rd9; + mov.u64 %rd14, 0; + st.global.u64 [%rd13], %rd14; + +$L__BB0_4: + setp.eq.s16 %p3, %rs1, 0; + @%p3 bra $L__BB0_6; + bra.uni $L__BB0_5; + +$L__BB0_6: + ld.global.u64 %rd34, [%rd1]; + xor.b64 %rd53, %rd34, %rd3; + bra.uni $L__BB0_7; + +$L__BB0_5: + shl.b64 %rd15, %rd3, 5; + add.s64 %rd16, %rd1, %rd15; + ld.global.v2.u64 {%rd17, %rd18}, [%rd16]; + mul.lo.s64 %rd21, %rd18, 5; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd21, 7; + shr.b64 %rhs, %rd21, 57; + add.u64 %rd22, %lhs, %rhs; + } + mul.lo.s64 %rd53, %rd22, 9; + shl.b64 %rd23, %rd18, 17; + ld.global.v2.u64 {%rd24, %rd25}, [%rd16+16]; + xor.b64 %rd28, %rd24, %rd17; + xor.b64 %rd29, %rd25, %rd18; + xor.b64 
%rd30, %rd18, %rd28; + xor.b64 %rd31, %rd17, %rd29; + st.global.v2.u64 [%rd16], {%rd31, %rd30}; + { + .reg .b32 %dummy; + mov.b64 {%r6,%dummy}, %rd29; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r7}, %rd29; + } + shf.r.wrap.b32 %r8, %r7, %r6, 19; + shf.r.wrap.b32 %r9, %r6, %r7, 19; + mov.b64 %rd32, {%r9, %r8}; + xor.b64 %rd33, %rd28, %rd23; + st.global.v2.u64 [%rd16+16], {%rd33, %rd32}; + +$L__BB0_7: + and.b64 %rd35, %rd53, %rd7; + or.b64 %rd36, %rd35, %rd8; + ld.const.u8 %r10, [hash_header]; + mov.u64 %rd37, 0; + ld.const.u8 %r11, [hash_header+1]; + ld.const.u8 %r12, [hash_header+2]; + ld.const.u8 %r13, [hash_header+3]; + ld.const.u8 %r14, [hash_header+4]; + ld.const.u8 %r15, [hash_header+5]; + ld.const.u8 %r16, [hash_header+6]; + ld.const.u8 %r17, [hash_header+7]; + ld.const.u8 %r18, [hash_header+8]; + ld.const.u8 %r19, [hash_header+9]; + ld.const.u8 %r20, [hash_header+10]; + ld.const.u8 %r21, [hash_header+11]; + ld.const.u8 %r22, [hash_header+12]; + ld.const.u8 %r23, [hash_header+13]; + ld.const.u8 %r24, [hash_header+14]; + ld.const.u8 %r25, [hash_header+15]; + ld.const.u8 %r26, [hash_header+16]; + ld.const.u8 %r27, [hash_header+17]; + ld.const.u8 %r28, [hash_header+18]; + ld.const.u8 %r29, [hash_header+19]; + ld.const.u8 %r30, [hash_header+20]; + ld.const.u8 %r31, [hash_header+21]; + ld.const.u8 %r32, [hash_header+22]; + ld.const.u8 %r33, [hash_header+23]; + ld.const.u8 %r34, [hash_header+24]; + ld.const.u8 %r35, [hash_header+25]; + ld.const.u8 %r36, [hash_header+26]; + ld.const.u8 %r37, [hash_header+27]; + ld.const.u8 %r38, [hash_header+28]; + ld.const.u8 %r39, [hash_header+29]; + ld.const.u8 %r40, [hash_header+30]; + ld.const.u8 %r41, [hash_header+31]; + ld.const.u8 %r42, [hash_header+32]; + ld.const.u8 %r43, [hash_header+33]; + ld.const.u8 %r44, [hash_header+34]; + ld.const.u8 %r45, [hash_header+35]; + ld.const.u8 %r46, [hash_header+36]; + ld.const.u8 %r47, [hash_header+37]; + ld.const.u8 %r48, [hash_header+38]; + ld.const.u8 %r49, [hash_header+39]; + ld.const.u8 %r50, [hash_header+40]; + ld.const.u8 %r51, [hash_header+41]; + ld.const.u8 %r52, [hash_header+42]; + ld.const.u8 %r53, [hash_header+43]; + ld.const.u8 %r54, [hash_header+44]; + ld.const.u8 %r55, [hash_header+45]; + ld.const.u8 %r56, [hash_header+46]; + ld.const.u8 %r57, [hash_header+47]; + ld.const.u8 %r58, [hash_header+48]; + ld.const.u8 %r59, [hash_header+49]; + ld.const.u8 %r60, [hash_header+50]; + ld.const.u8 %r61, [hash_header+51]; + ld.const.u8 %r62, [hash_header+52]; + ld.const.u8 %r63, [hash_header+53]; + ld.const.u8 %r64, [hash_header+54]; + ld.const.u8 %r65, [hash_header+55]; + ld.const.u8 %r66, [hash_header+56]; + ld.const.u8 %r67, [hash_header+57]; + ld.const.u8 %r68, [hash_header+58]; + ld.const.u8 %r69, [hash_header+59]; + ld.const.u8 %r70, [hash_header+60]; + ld.const.u8 %r71, [hash_header+61]; + ld.const.u8 %r72, [hash_header+62]; + ld.const.u8 %r73, [hash_header+63]; + ld.const.u8 %rd38, [hash_header+64]; + ld.const.u8 %rd39, [hash_header+65]; + bfi.b64 %rd40, %rd39, %rd38, 8, 8; + ld.const.u8 %rd41, [hash_header+66]; + ld.const.u8 %rd42, [hash_header+67]; + bfi.b64 %rd43, %rd42, %rd41, 8, 8; + bfi.b64 %rd44, %rd43, %rd40, 16, 16; + ld.const.u8 %rd45, [hash_header+68]; + ld.const.u8 %rd46, [hash_header+69]; + bfi.b64 %rd47, %rd46, %rd45, 8, 8; + ld.const.u8 %rd48, [hash_header+70]; + ld.const.u8 %rd49, [hash_header+71]; + bfi.b64 %rd50, %rd49, %rd48, 8, 8; + bfi.b64 %rd51, %rd50, %rd47, 16, 16; + bfi.b64 %rd52, %rd51, %rd44, 32, 32; + mov.u32 %r74, -1150833019; + mov.u32 %r75, 1779033703; + 
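+// reviewer note: 1779033703 = 0x6A09E667 and -1150833019 = 0xBB67AE85 are the
+// first two BLAKE3 IV words (shared with SHA-256); the stores below initialize
+// a BLAKE3 hasher state in this thread's local scratch space.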
st.local.v2.u32 [%rd2], {%r75, %r74}; + mov.u32 %r76, -1521486534; + mov.u32 %r77, 1013904242; + st.local.v2.u32 [%rd2+8], {%r77, %r76}; + mov.u32 %r78, -1694144372; + mov.u32 %r79, 1359893119; + st.local.v2.u32 [%rd2+16], {%r79, %r78}; + mov.u32 %r80, 1541459225; + mov.u32 %r81, 528734635; + st.local.v2.u32 [%rd2+24], {%r81, %r80}; + st.local.u64 [%rd2+64], %rd37; + mov.u32 %r82, 0; + st.local.v2.u32 [%rd2+88], {%r82, %r82}; + st.local.v2.u32 [%rd2+96], {%r82, %r82}; + st.local.v2.u32 [%rd2+104], {%r82, %r82}; + st.local.v2.u32 [%rd2+112], {%r82, %r82}; + st.local.v2.u32 [%rd2+120], {%r82, %r82}; + st.local.v2.u32 [%rd2+128], {%r82, %r82}; + mov.u16 %rs2, 0; + st.local.v2.u8 [%rd2+136], {%rs2, %rs2}; + st.local.u8 [%rd2+138], %rs2; + st.local.v2.u32 [%rd2+32], {%r75, %r74}; + st.local.v2.u32 [%rd2+40], {%r77, %r76}; + st.local.v2.u32 [%rd2+48], {%r79, %r78}; + st.local.v2.u32 [%rd2+56], {%r81, %r80}; + st.local.v2.u32 [%rd2+72], {%r82, %r82}; + st.local.v2.u32 [%rd2+80], {%r82, %r82}; + st.local.u8 [%rd2+144], %rs2; + ld.local.v4.u8 {%rs3, %rs4, %rs5, %rs6}, [%rd2+136]; + setp.eq.s16 %p4, %rs4, 0; + selp.u16 %rs10, 1, 0, %p4; + or.b16 %rs11, %rs5, %rs10; + prmt.b32 %r83, %r11, %r10, 30212; + prmt.b32 %r84, %r12, %r83, 28756; + prmt.b32 %r85, %r13, %r84, 1620; + prmt.b32 %r86, %r15, %r14, 30212; + prmt.b32 %r87, %r16, %r86, 28756; + prmt.b32 %r88, %r17, %r87, 1620; + prmt.b32 %r89, %r19, %r18, 30212; + prmt.b32 %r90, %r20, %r89, 28756; + prmt.b32 %r91, %r21, %r90, 1620; + prmt.b32 %r92, %r23, %r22, 30212; + prmt.b32 %r93, %r24, %r92, 28756; + prmt.b32 %r94, %r25, %r93, 1620; + prmt.b32 %r95, %r27, %r26, 30212; + prmt.b32 %r96, %r28, %r95, 28756; + prmt.b32 %r97, %r29, %r96, 1620; + prmt.b32 %r98, %r31, %r30, 30212; + prmt.b32 %r99, %r32, %r98, 28756; + prmt.b32 %r100, %r33, %r99, 1620; + prmt.b32 %r101, %r35, %r34, 30212; + prmt.b32 %r102, %r36, %r101, 28756; + prmt.b32 %r103, %r37, %r102, 1620; + prmt.b32 %r104, %r39, %r38, 30212; + prmt.b32 %r105, %r40, %r104, 28756; + prmt.b32 %r106, %r41, %r105, 1620; + prmt.b32 %r107, %r43, %r42, 30212; + prmt.b32 %r108, %r44, %r107, 28756; + prmt.b32 %r109, %r45, %r108, 1620; + prmt.b32 %r110, %r47, %r46, 30212; + prmt.b32 %r111, %r48, %r110, 28756; + prmt.b32 %r112, %r49, %r111, 1620; + prmt.b32 %r113, %r51, %r50, 30212; + prmt.b32 %r114, %r52, %r113, 28756; + prmt.b32 %r115, %r53, %r114, 1620; + prmt.b32 %r116, %r55, %r54, 30212; + prmt.b32 %r117, %r56, %r116, 28756; + prmt.b32 %r118, %r57, %r117, 1620; + prmt.b32 %r119, %r59, %r58, 30212; + prmt.b32 %r120, %r60, %r119, 28756; + prmt.b32 %r121, %r61, %r120, 1620; + prmt.b32 %r122, %r63, %r62, 30212; + prmt.b32 %r123, %r64, %r122, 28756; + prmt.b32 %r124, %r65, %r123, 1620; + prmt.b32 %r125, %r67, %r66, 30212; + prmt.b32 %r126, %r68, %r125, 28756; + prmt.b32 %r127, %r69, %r126, 1620; + prmt.b32 %r128, %r71, %r70, 30212; + prmt.b32 %r129, %r72, %r128, 28756; + prmt.b32 %r130, %r73, %r129, 1620; + cvt.u32.u16 %r131, %rs11; + and.b32 %r132, %r131, 255; + add.s32 %r133, %r85, -1156040474; + shf.l.wrap.b32 %r134, %r133, %r133, 16; + add.s32 %r135, %r134, 1779033703; + xor.b32 %r136, %r135, 1359893119; + shf.l.wrap.b32 %r137, %r136, %r136, 20; + add.s32 %r138, %r88, %r133; + add.s32 %r139, %r138, %r137; + xor.b32 %r140, %r139, %r134; + shf.l.wrap.b32 %r141, %r140, %r140, 24; + add.s32 %r142, %r141, %r135; + xor.b32 %r143, %r142, %r137; + shf.l.wrap.b32 %r144, %r143, %r143, 25; + add.s32 %r145, %r91, 1449989905; + shf.l.wrap.b32 %r146, %r145, %r145, 16; + add.s32 %r147, %r146, -1150833019; + xor.b32 %r148, 
%r147, -1694144372; + shf.l.wrap.b32 %r149, %r148, %r148, 20; + add.s32 %r150, %r94, %r145; + add.s32 %r151, %r150, %r149; + xor.b32 %r152, %r151, %r146; + shf.l.wrap.b32 %r153, %r152, %r152, 24; + add.s32 %r154, %r153, %r147; + xor.b32 %r155, %r154, %r149; + shf.l.wrap.b32 %r156, %r155, %r155, 25; + add.s32 %r157, %r97, 1542638877; + shr.u32 %r158, %r157, 16; + shl.b32 %r159, %r157, 16; + xor.b32 %r160, %r159, 4194304; + or.b32 %r161, %r160, %r158; + add.s32 %r162, %r161, 1013904242; + xor.b32 %r163, %r162, 528734635; + shf.l.wrap.b32 %r164, %r163, %r163, 20; + add.s32 %r165, %r100, %r157; + add.s32 %r166, %r165, %r164; + xor.b32 %r167, %r166, %r161; + shf.l.wrap.b32 %r168, %r167, %r167, 24; + add.s32 %r169, %r168, %r162; + xor.b32 %r170, %r169, %r164; + shf.l.wrap.b32 %r171, %r170, %r170, 25; + add.s32 %r172, %r103, 19972691; + xor.b32 %r173, %r172, %r132; + shr.u32 %r174, %r172, 16; + shl.b32 %r175, %r173, 16; + or.b32 %r176, %r175, %r174; + add.s32 %r177, %r176, -1521486534; + xor.b32 %r178, %r177, 1541459225; + shf.l.wrap.b32 %r179, %r178, %r178, 20; + add.s32 %r180, %r106, %r172; + add.s32 %r181, %r180, %r179; + xor.b32 %r182, %r181, %r176; + shf.l.wrap.b32 %r183, %r182, %r182, 24; + add.s32 %r184, %r183, %r177; + xor.b32 %r185, %r184, %r179; + shf.l.wrap.b32 %r186, %r185, %r185, 25; + add.s32 %r187, %r156, %r139; + add.s32 %r188, %r187, %r109; + xor.b32 %r189, %r183, %r188; + shf.l.wrap.b32 %r190, %r189, %r189, 16; + add.s32 %r191, %r190, %r169; + xor.b32 %r192, %r191, %r156; + shf.l.wrap.b32 %r193, %r192, %r192, 20; + add.s32 %r194, %r112, %r188; + add.s32 %r195, %r194, %r193; + xor.b32 %r196, %r195, %r190; + shf.l.wrap.b32 %r197, %r196, %r196, 24; + add.s32 %r198, %r197, %r191; + xor.b32 %r199, %r198, %r193; + shf.l.wrap.b32 %r200, %r199, %r199, 25; + add.s32 %r201, %r171, %r151; + add.s32 %r202, %r201, %r115; + xor.b32 %r203, %r202, %r141; + shf.l.wrap.b32 %r204, %r203, %r203, 16; + add.s32 %r205, %r204, %r184; + xor.b32 %r206, %r205, %r171; + shf.l.wrap.b32 %r207, %r206, %r206, 20; + add.s32 %r208, %r118, %r202; + add.s32 %r209, %r208, %r207; + xor.b32 %r210, %r209, %r204; + shf.l.wrap.b32 %r211, %r210, %r210, 24; + add.s32 %r212, %r211, %r205; + xor.b32 %r213, %r212, %r207; + shf.l.wrap.b32 %r214, %r213, %r213, 25; + add.s32 %r215, %r186, %r166; + add.s32 %r216, %r215, %r121; + xor.b32 %r217, %r216, %r153; + shf.l.wrap.b32 %r218, %r217, %r217, 16; + add.s32 %r219, %r218, %r142; + xor.b32 %r220, %r219, %r186; + shf.l.wrap.b32 %r221, %r220, %r220, 20; + add.s32 %r222, %r124, %r216; + add.s32 %r223, %r222, %r221; + xor.b32 %r224, %r223, %r218; + shf.l.wrap.b32 %r225, %r224, %r224, 24; + add.s32 %r226, %r225, %r219; + xor.b32 %r227, %r226, %r221; + shf.l.wrap.b32 %r228, %r227, %r227, 25; + add.s32 %r229, %r181, %r144; + add.s32 %r230, %r229, %r127; + xor.b32 %r231, %r230, %r168; + shf.l.wrap.b32 %r232, %r231, %r231, 16; + add.s32 %r233, %r232, %r154; + xor.b32 %r234, %r233, %r144; + shf.l.wrap.b32 %r235, %r234, %r234, 20; + add.s32 %r236, %r130, %r230; + add.s32 %r237, %r236, %r235; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 24; + add.s32 %r240, %r239, %r233; + xor.b32 %r241, %r240, %r235; + shf.l.wrap.b32 %r242, %r241, %r241, 25; + add.s32 %r243, %r195, %r91; + add.s32 %r244, %r243, %r242; + xor.b32 %r245, %r244, %r211; + shf.l.wrap.b32 %r246, %r245, %r245, 16; + add.s32 %r247, %r246, %r226; + xor.b32 %r248, %r247, %r242; + shf.l.wrap.b32 %r249, %r248, %r248, 20; + add.s32 %r250, %r244, %r103; + add.s32 %r251, %r250, %r249; + xor.b32 %r252, %r251, %r246; 
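+// reviewer note: each add/xor/shf.l.wrap cluster here is one unrolled BLAKE3
+// G function mixing two message words into the 4x4 state; a full round runs
+// G over the four columns, then the four diagonals.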
+ shf.l.wrap.b32 %r253, %r252, %r252, 24; + add.s32 %r254, %r253, %r247; + xor.b32 %r255, %r254, %r249; + shf.l.wrap.b32 %r256, %r255, %r255, 25; + add.s32 %r257, %r209, %r94; + add.s32 %r258, %r257, %r200; + xor.b32 %r259, %r225, %r258; + shf.l.wrap.b32 %r260, %r259, %r259, 16; + add.s32 %r261, %r240, %r260; + xor.b32 %r262, %r261, %r200; + shf.l.wrap.b32 %r263, %r262, %r262, 20; + add.s32 %r264, %r258, %r115; + add.s32 %r265, %r264, %r263; + xor.b32 %r266, %r265, %r260; + shf.l.wrap.b32 %r267, %r266, %r266, 24; + add.s32 %r268, %r267, %r261; + xor.b32 %r269, %r268, %r263; + shf.l.wrap.b32 %r270, %r269, %r269, 25; + add.s32 %r271, %r214, %r106; + add.s32 %r272, %r271, %r223; + xor.b32 %r273, %r239, %r272; + shf.l.wrap.b32 %r274, %r273, %r273, 16; + add.s32 %r275, %r274, %r198; + xor.b32 %r276, %r275, %r214; + shf.l.wrap.b32 %r277, %r276, %r276, 20; + add.s32 %r278, %r272, %r85; + add.s32 %r279, %r278, %r277; + xor.b32 %r280, %r279, %r274; + shf.l.wrap.b32 %r281, %r280, %r280, 24; + add.s32 %r282, %r281, %r275; + xor.b32 %r283, %r282, %r277; + shf.l.wrap.b32 %r284, %r283, %r283, 25; + add.s32 %r285, %r228, %r97; + add.s32 %r286, %r285, %r237; + xor.b32 %r287, %r286, %r197; + shf.l.wrap.b32 %r288, %r287, %r287, 16; + add.s32 %r289, %r288, %r212; + xor.b32 %r290, %r289, %r228; + shf.l.wrap.b32 %r291, %r290, %r290, 20; + add.s32 %r292, %r286, %r124; + add.s32 %r293, %r292, %r291; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 24; + add.s32 %r296, %r295, %r289; + xor.b32 %r297, %r296, %r291; + shf.l.wrap.b32 %r298, %r297, %r297, 25; + add.s32 %r299, %r251, %r88; + add.s32 %r300, %r299, %r270; + xor.b32 %r301, %r300, %r295; + shf.l.wrap.b32 %r302, %r301, %r301, 16; + add.s32 %r303, %r302, %r282; + xor.b32 %r304, %r303, %r270; + shf.l.wrap.b32 %r305, %r304, %r304, 20; + add.s32 %r306, %r300, %r118; + add.s32 %r307, %r306, %r305; + xor.b32 %r308, %r307, %r302; + shf.l.wrap.b32 %r309, %r308, %r308, 24; + add.s32 %r310, %r309, %r303; + xor.b32 %r311, %r310, %r305; + shf.l.wrap.b32 %r312, %r311, %r311, 25; + add.s32 %r313, %r265, %r121; + add.s32 %r314, %r313, %r284; + xor.b32 %r315, %r314, %r253; + shf.l.wrap.b32 %r316, %r315, %r315, 16; + add.s32 %r317, %r316, %r296; + xor.b32 %r318, %r317, %r284; + shf.l.wrap.b32 %r319, %r318, %r318, 20; + add.s32 %r320, %r314, %r100; + add.s32 %r321, %r320, %r319; + xor.b32 %r322, %r321, %r316; + shf.l.wrap.b32 %r323, %r322, %r322, 24; + add.s32 %r324, %r323, %r317; + xor.b32 %r325, %r324, %r319; + shf.l.wrap.b32 %r326, %r325, %r325, 25; + add.s32 %r327, %r279, %r112; + add.s32 %r328, %r327, %r298; + xor.b32 %r329, %r328, %r267; + shf.l.wrap.b32 %r330, %r329, %r329, 16; + add.s32 %r331, %r330, %r254; + xor.b32 %r332, %r331, %r298; + shf.l.wrap.b32 %r333, %r332, %r332, 20; + add.s32 %r334, %r328, %r127; + add.s32 %r335, %r334, %r333; + xor.b32 %r336, %r335, %r330; + shf.l.wrap.b32 %r337, %r336, %r336, 24; + add.s32 %r338, %r337, %r331; + xor.b32 %r339, %r338, %r333; + shf.l.wrap.b32 %r340, %r339, %r339, 25; + add.s32 %r341, %r293, %r130; + add.s32 %r342, %r341, %r256; + xor.b32 %r343, %r342, %r281; + shf.l.wrap.b32 %r344, %r343, %r343, 16; + add.s32 %r345, %r344, %r268; + xor.b32 %r346, %r345, %r256; + shf.l.wrap.b32 %r347, %r346, %r346, 20; + add.s32 %r348, %r342, %r109; + add.s32 %r349, %r348, %r347; + xor.b32 %r350, %r349, %r344; + shf.l.wrap.b32 %r351, %r350, %r350, 24; + add.s32 %r352, %r351, %r345; + xor.b32 %r353, %r352, %r347; + shf.l.wrap.b32 %r354, %r353, %r353, 25; + add.s32 %r355, %r307, %r94; + add.s32 %r356, %r355, %r354; + 
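+// reviewer note: the shf.l.wrap amounts 16/20/24/25 are left-rotates, i.e. the
+// BLAKE3 right-rotations by 16/12/8/7 after the compiler normalized them.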
xor.b32 %r357, %r356, %r323; + shf.l.wrap.b32 %r358, %r357, %r357, 16; + add.s32 %r359, %r358, %r338; + xor.b32 %r360, %r359, %r354; + shf.l.wrap.b32 %r361, %r360, %r360, 20; + add.s32 %r362, %r356, %r97; + add.s32 %r363, %r362, %r361; + xor.b32 %r364, %r363, %r358; + shf.l.wrap.b32 %r365, %r364, %r364, 24; + add.s32 %r366, %r365, %r359; + xor.b32 %r367, %r366, %r361; + shf.l.wrap.b32 %r368, %r367, %r367, 25; + add.s32 %r369, %r321, %r115; + add.s32 %r370, %r369, %r312; + xor.b32 %r371, %r370, %r337; + shf.l.wrap.b32 %r372, %r371, %r371, 16; + add.s32 %r373, %r372, %r352; + xor.b32 %r374, %r373, %r312; + shf.l.wrap.b32 %r375, %r374, %r374, 20; + add.s32 %r376, %r370, %r121; + add.s32 %r377, %r376, %r375; + xor.b32 %r378, %r377, %r372; + shf.l.wrap.b32 %r379, %r378, %r378, 24; + add.s32 %r380, %r379, %r373; + xor.b32 %r381, %r380, %r375; + shf.l.wrap.b32 %r382, %r381, %r381, 25; + add.s32 %r383, %r335, %r124; + add.s32 %r384, %r383, %r326; + xor.b32 %r385, %r384, %r351; + shf.l.wrap.b32 %r386, %r385, %r385, 16; + add.s32 %r387, %r386, %r310; + xor.b32 %r388, %r387, %r326; + shf.l.wrap.b32 %r389, %r388, %r388, 20; + add.s32 %r390, %r384, %r91; + add.s32 %r391, %r390, %r389; + xor.b32 %r392, %r391, %r386; + shf.l.wrap.b32 %r393, %r392, %r392, 24; + add.s32 %r394, %r393, %r387; + xor.b32 %r395, %r394, %r389; + shf.l.wrap.b32 %r396, %r395, %r395, 25; + add.s32 %r397, %r349, %r106; + add.s32 %r398, %r397, %r340; + xor.b32 %r399, %r398, %r309; + shf.l.wrap.b32 %r400, %r399, %r399, 16; + add.s32 %r401, %r400, %r324; + xor.b32 %r402, %r401, %r340; + shf.l.wrap.b32 %r403, %r402, %r402, 20; + add.s32 %r404, %r398, %r127; + add.s32 %r405, %r404, %r403; + xor.b32 %r406, %r405, %r400; + shf.l.wrap.b32 %r407, %r406, %r406, 24; + add.s32 %r408, %r407, %r401; + xor.b32 %r409, %r408, %r403; + shf.l.wrap.b32 %r410, %r409, %r409, 25; + add.s32 %r411, %r363, %r103; + add.s32 %r412, %r411, %r382; + xor.b32 %r413, %r412, %r407; + shf.l.wrap.b32 %r414, %r413, %r413, 16; + add.s32 %r415, %r414, %r394; + xor.b32 %r416, %r415, %r382; + shf.l.wrap.b32 %r417, %r416, %r416, 20; + add.s32 %r418, %r412, %r100; + add.s32 %r419, %r418, %r417; + xor.b32 %r420, %r419, %r414; + shf.l.wrap.b32 %r421, %r420, %r420, 24; + add.s32 %r422, %r421, %r415; + xor.b32 %r423, %r422, %r417; + shf.l.wrap.b32 %r424, %r423, %r423, 25; + add.s32 %r425, %r377, %r112; + add.s32 %r426, %r425, %r396; + xor.b32 %r427, %r426, %r365; + shf.l.wrap.b32 %r428, %r427, %r427, 16; + add.s32 %r429, %r428, %r408; + xor.b32 %r430, %r429, %r396; + shf.l.wrap.b32 %r431, %r430, %r430, 20; + add.s32 %r432, %r426, %r85; + add.s32 %r433, %r432, %r431; + xor.b32 %r434, %r433, %r428; + shf.l.wrap.b32 %r435, %r434, %r434, 24; + add.s32 %r436, %r435, %r429; + xor.b32 %r437, %r436, %r431; + shf.l.wrap.b32 %r438, %r437, %r437, 25; + add.s32 %r439, %r391, %r118; + add.s32 %r440, %r439, %r410; + xor.b32 %r441, %r440, %r379; + shf.l.wrap.b32 %r442, %r441, %r441, 16; + add.s32 %r443, %r442, %r366; + xor.b32 %r444, %r443, %r410; + shf.l.wrap.b32 %r445, %r444, %r444, 20; + add.s32 %r446, %r440, %r130; + add.s32 %r447, %r446, %r445; + xor.b32 %r448, %r447, %r442; + shf.l.wrap.b32 %r449, %r448, %r448, 24; + add.s32 %r450, %r449, %r443; + xor.b32 %r451, %r450, %r445; + shf.l.wrap.b32 %r452, %r451, %r451, 25; + add.s32 %r453, %r405, %r109; + add.s32 %r454, %r453, %r368; + xor.b32 %r455, %r454, %r393; + shf.l.wrap.b32 %r456, %r455, %r455, 16; + add.s32 %r457, %r456, %r380; + xor.b32 %r458, %r457, %r368; + shf.l.wrap.b32 %r459, %r458, %r458, 20; + add.s32 %r460, %r454, %r88; + 
add.s32 %r461, %r460, %r459; + xor.b32 %r462, %r461, %r456; + shf.l.wrap.b32 %r463, %r462, %r462, 24; + add.s32 %r464, %r463, %r457; + xor.b32 %r465, %r464, %r459; + shf.l.wrap.b32 %r466, %r465, %r465, 25; + add.s32 %r467, %r419, %r115; + add.s32 %r468, %r467, %r466; + xor.b32 %r469, %r468, %r435; + shf.l.wrap.b32 %r470, %r469, %r469, 16; + add.s32 %r471, %r470, %r450; + xor.b32 %r472, %r471, %r466; + shf.l.wrap.b32 %r473, %r472, %r472, 20; + add.s32 %r474, %r468, %r106; + add.s32 %r475, %r474, %r473; + xor.b32 %r476, %r475, %r470; + shf.l.wrap.b32 %r477, %r476, %r476, 24; + add.s32 %r478, %r477, %r471; + xor.b32 %r479, %r478, %r473; + shf.l.wrap.b32 %r480, %r479, %r479, 25; + add.s32 %r481, %r433, %r121; + add.s32 %r482, %r481, %r424; + xor.b32 %r483, %r482, %r449; + shf.l.wrap.b32 %r484, %r483, %r483, 16; + add.s32 %r485, %r484, %r464; + xor.b32 %r486, %r485, %r424; + shf.l.wrap.b32 %r487, %r486, %r486, 20; + add.s32 %r488, %r482, %r112; + add.s32 %r489, %r488, %r487; + xor.b32 %r490, %r489, %r484; + shf.l.wrap.b32 %r491, %r490, %r490, 24; + add.s32 %r492, %r491, %r485; + xor.b32 %r493, %r492, %r487; + shf.l.wrap.b32 %r494, %r493, %r493, 25; + add.s32 %r495, %r447, %r127; + add.s32 %r496, %r495, %r438; + xor.b32 %r497, %r496, %r463; + shf.l.wrap.b32 %r498, %r497, %r497, 16; + add.s32 %r499, %r498, %r422; + xor.b32 %r500, %r499, %r438; + shf.l.wrap.b32 %r501, %r500, %r500, 20; + add.s32 %r502, %r496, %r94; + add.s32 %r503, %r502, %r501; + xor.b32 %r504, %r503, %r498; + shf.l.wrap.b32 %r505, %r504, %r504, 24; + add.s32 %r506, %r505, %r499; + xor.b32 %r507, %r506, %r501; + shf.l.wrap.b32 %r508, %r507, %r507, 25; + add.s32 %r509, %r461, %r124; + add.s32 %r510, %r509, %r452; + xor.b32 %r511, %r510, %r421; + shf.l.wrap.b32 %r512, %r511, %r511, 16; + add.s32 %r513, %r512, %r436; + xor.b32 %r514, %r513, %r452; + shf.l.wrap.b32 %r515, %r514, %r514, 20; + add.s32 %r516, %r510, %r130; + add.s32 %r517, %r516, %r515; + xor.b32 %r518, %r517, %r512; + shf.l.wrap.b32 %r519, %r518, %r518, 24; + add.s32 %r520, %r519, %r513; + xor.b32 %r521, %r520, %r515; + shf.l.wrap.b32 %r522, %r521, %r521, 25; + add.s32 %r523, %r475, %r97; + add.s32 %r524, %r523, %r494; + xor.b32 %r525, %r524, %r519; + shf.l.wrap.b32 %r526, %r525, %r525, 16; + add.s32 %r527, %r526, %r506; + xor.b32 %r528, %r527, %r494; + shf.l.wrap.b32 %r529, %r528, %r528, 20; + add.s32 %r530, %r524, %r85; + add.s32 %r531, %r530, %r529; + xor.b32 %r532, %r531, %r526; + shf.l.wrap.b32 %r533, %r532, %r532, 24; + add.s32 %r534, %r533, %r527; + xor.b32 %r535, %r534, %r529; + shf.l.wrap.b32 %r536, %r535, %r535, 25; + add.s32 %r537, %r489, %r118; + add.s32 %r538, %r537, %r508; + xor.b32 %r539, %r538, %r477; + shf.l.wrap.b32 %r540, %r539, %r539, 16; + add.s32 %r541, %r540, %r520; + xor.b32 %r542, %r541, %r508; + shf.l.wrap.b32 %r543, %r542, %r542, 20; + add.s32 %r544, %r538, %r91; + add.s32 %r545, %r544, %r543; + xor.b32 %r546, %r545, %r540; + shf.l.wrap.b32 %r547, %r546, %r546, 24; + add.s32 %r548, %r547, %r541; + xor.b32 %r549, %r548, %r543; + shf.l.wrap.b32 %r550, %r549, %r549, 25; + add.s32 %r551, %r503, %r100; + add.s32 %r552, %r551, %r522; + xor.b32 %r553, %r552, %r491; + shf.l.wrap.b32 %r554, %r553, %r553, 16; + add.s32 %r555, %r554, %r478; + xor.b32 %r556, %r555, %r522; + shf.l.wrap.b32 %r557, %r556, %r556, 20; + add.s32 %r558, %r552, %r109; + add.s32 %r559, %r558, %r557; + xor.b32 %r560, %r559, %r554; + shf.l.wrap.b32 %r561, %r560, %r560, 24; + add.s32 %r562, %r561, %r555; + xor.b32 %r563, %r562, %r557; + shf.l.wrap.b32 %r564, %r563, %r563, 25; + 
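+// reviewer note: %r85..%r130 hold the 16 message words; the changing operand
+// order from round to round is the MSG_SCHEDULE permutation (declared at the
+// top of this file) baked in by full unrolling.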
add.s32 %r565, %r517, %r88; + add.s32 %r566, %r565, %r480; + xor.b32 %r567, %r566, %r505; + shf.l.wrap.b32 %r568, %r567, %r567, 16; + add.s32 %r569, %r568, %r492; + xor.b32 %r570, %r569, %r480; + shf.l.wrap.b32 %r571, %r570, %r570, 20; + add.s32 %r572, %r566, %r103; + add.s32 %r573, %r572, %r571; + xor.b32 %r574, %r573, %r568; + shf.l.wrap.b32 %r575, %r574, %r574, 24; + add.s32 %r576, %r575, %r569; + xor.b32 %r577, %r576, %r571; + shf.l.wrap.b32 %r578, %r577, %r577, 25; + add.s32 %r579, %r531, %r121; + add.s32 %r580, %r579, %r578; + xor.b32 %r581, %r580, %r547; + shf.l.wrap.b32 %r582, %r581, %r581, 16; + add.s32 %r583, %r582, %r562; + xor.b32 %r584, %r583, %r578; + shf.l.wrap.b32 %r585, %r584, %r584, 20; + add.s32 %r586, %r580, %r124; + add.s32 %r587, %r586, %r585; + xor.b32 %r588, %r587, %r582; + shf.l.wrap.b32 %r589, %r588, %r588, 24; + add.s32 %r590, %r589, %r583; + xor.b32 %r591, %r590, %r585; + shf.l.wrap.b32 %r592, %r591, %r591, 25; + add.s32 %r593, %r545, %r112; + add.s32 %r594, %r593, %r536; + xor.b32 %r595, %r594, %r561; + shf.l.wrap.b32 %r596, %r595, %r595, 16; + add.s32 %r597, %r596, %r576; + xor.b32 %r598, %r597, %r536; + shf.l.wrap.b32 %r599, %r598, %r598, 20; + add.s32 %r600, %r594, %r118; + add.s32 %r601, %r600, %r599; + xor.b32 %r602, %r601, %r596; + shf.l.wrap.b32 %r603, %r602, %r602, 24; + add.s32 %r604, %r603, %r597; + xor.b32 %r605, %r604, %r599; + shf.l.wrap.b32 %r606, %r605, %r605, 25; + add.s32 %r607, %r559, %r130; + add.s32 %r608, %r607, %r550; + xor.b32 %r609, %r608, %r575; + shf.l.wrap.b32 %r610, %r609, %r609, 16; + add.s32 %r611, %r610, %r534; + xor.b32 %r612, %r611, %r550; + shf.l.wrap.b32 %r613, %r612, %r612, 20; + add.s32 %r614, %r608, %r115; + add.s32 %r615, %r614, %r613; + xor.b32 %r616, %r615, %r610; + shf.l.wrap.b32 %r617, %r616, %r616, 24; + add.s32 %r618, %r617, %r611; + xor.b32 %r619, %r618, %r613; + shf.l.wrap.b32 %r620, %r619, %r619, 25; + add.s32 %r621, %r573, %r127; + add.s32 %r622, %r621, %r564; + xor.b32 %r623, %r622, %r533; + shf.l.wrap.b32 %r624, %r623, %r623, 16; + add.s32 %r625, %r624, %r548; + xor.b32 %r626, %r625, %r564; + shf.l.wrap.b32 %r627, %r626, %r626, 20; + add.s32 %r628, %r622, %r109; + add.s32 %r629, %r628, %r627; + xor.b32 %r630, %r629, %r624; + shf.l.wrap.b32 %r631, %r630, %r630, 24; + add.s32 %r632, %r631, %r625; + xor.b32 %r633, %r632, %r627; + shf.l.wrap.b32 %r634, %r633, %r633, 25; + add.s32 %r635, %r587, %r106; + add.s32 %r636, %r635, %r606; + xor.b32 %r637, %r636, %r631; + shf.l.wrap.b32 %r638, %r637, %r637, 16; + add.s32 %r639, %r638, %r618; + xor.b32 %r640, %r639, %r606; + shf.l.wrap.b32 %r641, %r640, %r640, 20; + add.s32 %r642, %r636, %r91; + add.s32 %r643, %r642, %r641; + xor.b32 %r644, %r643, %r638; + shf.l.wrap.b32 %r645, %r644, %r644, 24; + add.s32 %r646, %r645, %r639; + xor.b32 %r647, %r646, %r641; + shf.l.wrap.b32 %r648, %r647, %r647, 25; + add.s32 %r649, %r601, %r100; + add.s32 %r650, %r649, %r620; + xor.b32 %r651, %r650, %r589; + shf.l.wrap.b32 %r652, %r651, %r651, 16; + add.s32 %r653, %r652, %r632; + xor.b32 %r654, %r653, %r620; + shf.l.wrap.b32 %r655, %r654, %r654, 20; + add.s32 %r656, %r650, %r94; + add.s32 %r657, %r656, %r655; + xor.b32 %r658, %r657, %r652; + shf.l.wrap.b32 %r659, %r658, %r658, 24; + add.s32 %r660, %r659, %r653; + xor.b32 %r661, %r660, %r655; + shf.l.wrap.b32 %r662, %r661, %r661, 25; + add.s32 %r663, %r615, %r85; + add.s32 %r664, %r663, %r634; + xor.b32 %r665, %r664, %r603; + shf.l.wrap.b32 %r666, %r665, %r665, 16; + add.s32 %r667, %r666, %r590; + xor.b32 %r668, %r667, %r634; + shf.l.wrap.b32 
%r669, %r668, %r668, 20; + add.s32 %r670, %r664, %r88; + add.s32 %r671, %r670, %r669; + xor.b32 %r672, %r671, %r666; + shf.l.wrap.b32 %r673, %r672, %r672, 24; + add.s32 %r674, %r673, %r667; + xor.b32 %r675, %r674, %r669; + shf.l.wrap.b32 %r676, %r675, %r675, 25; + add.s32 %r677, %r629, %r103; + add.s32 %r678, %r677, %r592; + xor.b32 %r679, %r678, %r617; + shf.l.wrap.b32 %r680, %r679, %r679, 16; + add.s32 %r681, %r680, %r604; + xor.b32 %r682, %r681, %r592; + shf.l.wrap.b32 %r683, %r682, %r682, 20; + add.s32 %r684, %r678, %r97; + add.s32 %r685, %r684, %r683; + xor.b32 %r686, %r685, %r680; + shf.l.wrap.b32 %r687, %r686, %r686, 24; + add.s32 %r688, %r687, %r681; + xor.b32 %r689, %r688, %r683; + shf.l.wrap.b32 %r690, %r689, %r689, 25; + add.s32 %r691, %r643, %r112; + add.s32 %r692, %r691, %r690; + xor.b32 %r693, %r692, %r659; + shf.l.wrap.b32 %r694, %r693, %r693, 16; + add.s32 %r695, %r694, %r674; + xor.b32 %r696, %r695, %r690; + shf.l.wrap.b32 %r697, %r696, %r696, 20; + add.s32 %r698, %r692, %r127; + add.s32 %r699, %r698, %r697; + xor.b32 %r700, %r699, %r694; + shf.l.wrap.b32 %r701, %r700, %r700, 24; + add.s32 %r702, %r701, %r695; + xor.b32 %r703, %r702, %r697; + shf.l.wrap.b32 %r704, %r703, %r703, 25; + add.s32 %r705, %r657, %r118; + add.s32 %r706, %r705, %r648; + xor.b32 %r707, %r706, %r673; + shf.l.wrap.b32 %r708, %r707, %r707, 16; + add.s32 %r709, %r708, %r688; + xor.b32 %r710, %r709, %r648; + shf.l.wrap.b32 %r711, %r710, %r710, 20; + add.s32 %r712, %r706, %r100; + add.s32 %r713, %r712, %r711; + xor.b32 %r714, %r713, %r708; + shf.l.wrap.b32 %r715, %r714, %r714, 24; + add.s32 %r716, %r715, %r709; + xor.b32 %r717, %r716, %r711; + shf.l.wrap.b32 %r718, %r717, %r717, 25; + add.s32 %r719, %r671, %r109; + add.s32 %r720, %r719, %r662; + xor.b32 %r721, %r720, %r687; + shf.l.wrap.b32 %r722, %r721, %r721, 16; + add.s32 %r723, %r722, %r646; + xor.b32 %r724, %r723, %r662; + shf.l.wrap.b32 %r725, %r724, %r724, 20; + add.s32 %r726, %r720, %r121; + add.s32 %r727, %r726, %r725; + xor.b32 %r728, %r727, %r722; + shf.l.wrap.b32 %r729, %r728, %r728, 24; + add.s32 %r730, %r729, %r723; + xor.b32 %r731, %r730, %r725; + shf.l.wrap.b32 %r732, %r731, %r731, 25; + add.s32 %r733, %r685, %r130; + add.s32 %r734, %r733, %r676; + xor.b32 %r735, %r734, %r645; + shf.l.wrap.b32 %r736, %r735, %r735, 16; + add.s32 %r737, %r736, %r660; + xor.b32 %r738, %r737, %r676; + shf.l.wrap.b32 %r739, %r738, %r738, 20; + add.s32 %r740, %r734, %r88; + add.s32 %r741, %r740, %r739; + xor.b32 %r742, %r741, %r736; + shf.l.wrap.b32 %r743, %r742, %r742, 24; + add.s32 %r744, %r743, %r737; + xor.b32 %r745, %r744, %r739; + shf.l.wrap.b32 %r746, %r745, %r745, 25; + add.s32 %r747, %r699, %r124; + add.s32 %r748, %r747, %r718; + xor.b32 %r749, %r748, %r743; + shf.l.wrap.b32 %r750, %r749, %r749, 16; + add.s32 %r751, %r750, %r730; + xor.b32 %r752, %r751, %r718; + shf.l.wrap.b32 %r753, %r752, %r752, 20; + add.s32 %r754, %r748, %r94; + add.s32 %r755, %r754, %r753; + xor.b32 %r756, %r755, %r750; + shf.l.wrap.b32 %r757, %r756, %r756, 24; + add.s32 %r758, %r757, %r751; + xor.b32 %r759, %r758, %r753; + shf.l.wrap.b32 %r760, %r759, %r759, 25; + add.s32 %r761, %r713, %r85; + add.s32 %r762, %r761, %r732; + xor.b32 %r763, %r762, %r701; + shf.l.wrap.b32 %r764, %r763, %r763, 16; + add.s32 %r765, %r764, %r744; + xor.b32 %r766, %r765, %r732; + shf.l.wrap.b32 %r767, %r766, %r766, 20; + add.s32 %r768, %r762, %r115; + add.s32 %r769, %r768, %r767; + xor.b32 %r770, %r769, %r764; + shf.l.wrap.b32 %r771, %r770, %r770, 24; + add.s32 %r772, %r771, %r765; + xor.b32 %r773, 
%r772, %r767; + shf.l.wrap.b32 %r774, %r773, %r773, 25; + add.s32 %r775, %r727, %r91; + add.s32 %r776, %r775, %r746; + xor.b32 %r777, %r776, %r715; + shf.l.wrap.b32 %r778, %r777, %r777, 16; + add.s32 %r779, %r778, %r702; + xor.b32 %r780, %r779, %r746; + shf.l.wrap.b32 %r781, %r780, %r780, 20; + add.s32 %r782, %r776, %r103; + add.s32 %r783, %r782, %r781; + xor.b32 %r784, %r783, %r778; + shf.l.wrap.b32 %r785, %r784, %r784, 24; + add.s32 %r786, %r785, %r779; + xor.b32 %r787, %r786, %r781; + shf.l.wrap.b32 %r788, %r787, %r787, 25; + add.s32 %r789, %r741, %r97; + add.s32 %r790, %r789, %r704; + xor.b32 %r791, %r790, %r729; + shf.l.wrap.b32 %r792, %r791, %r791, 16; + add.s32 %r793, %r792, %r716; + xor.b32 %r794, %r793, %r704; + shf.l.wrap.b32 %r795, %r794, %r794, 20; + add.s32 %r796, %r790, %r106; + add.s32 %r797, %r796, %r795; + xor.b32 %r798, %r797, %r792; + shf.l.wrap.b32 %r799, %r798, %r798, 24; + add.s32 %r800, %r799, %r793; + xor.b32 %r801, %r800, %r795; + shf.l.wrap.b32 %r802, %r801, %r801, 25; + add.s32 %r803, %r755, %r118; + add.s32 %r804, %r803, %r802; + xor.b32 %r805, %r804, %r771; + shf.l.wrap.b32 %r806, %r805, %r805, 16; + add.s32 %r807, %r806, %r786; + xor.b32 %r808, %r807, %r802; + shf.l.wrap.b32 %r809, %r808, %r808, 20; + add.s32 %r810, %r804, %r130; + add.s32 %r811, %r810, %r809; + xor.b32 %r812, %r811, %r806; + shf.l.wrap.b32 %r813, %r812, %r812, 24; + add.s32 %r814, %r813, %r807; + xor.b32 %r815, %r814, %r809; + shf.l.wrap.b32 %r816, %r815, %r815, 25; + add.s32 %r817, %r769, %r100; + add.s32 %r818, %r817, %r760; + xor.b32 %r819, %r818, %r785; + shf.l.wrap.b32 %r820, %r819, %r819, 16; + add.s32 %r821, %r820, %r800; + xor.b32 %r822, %r821, %r760; + shf.l.wrap.b32 %r823, %r822, %r822, 20; + add.s32 %r824, %r818, %r85; + add.s32 %r825, %r824, %r823; + xor.b32 %r826, %r825, %r820; + shf.l.wrap.b32 %r827, %r826, %r826, 24; + add.s32 %r828, %r827, %r821; + xor.b32 %r829, %r828, %r823; + shf.l.wrap.b32 %r830, %r829, %r829, 25; + add.s32 %r831, %r783, %r88; + add.s32 %r832, %r831, %r774; + xor.b32 %r833, %r832, %r799; + shf.l.wrap.b32 %r834, %r833, %r833, 16; + add.s32 %r835, %r834, %r758; + xor.b32 %r836, %r835, %r774; + shf.l.wrap.b32 %r837, %r836, %r836, 20; + add.s32 %r838, %r832, %r112; + add.s32 %r839, %r838, %r837; + xor.b32 %r840, %r839, %r834; + shf.l.wrap.b32 %r841, %r840, %r840, 24; + add.s32 %r842, %r841, %r835; + xor.b32 %r843, %r842, %r837; + shf.l.wrap.b32 %r844, %r843, %r843, 25; + add.s32 %r845, %r797, %r109; + add.s32 %r846, %r845, %r788; + xor.b32 %r847, %r846, %r757; + shf.l.wrap.b32 %r848, %r847, %r847, 16; + add.s32 %r849, %r848, %r772; + xor.b32 %r850, %r849, %r788; + shf.l.wrap.b32 %r851, %r850, %r850, 20; + add.s32 %r852, %r846, %r103; + add.s32 %r853, %r852, %r851; + xor.b32 %r854, %r853, %r848; + shf.l.wrap.b32 %r855, %r854, %r854, 24; + add.s32 %r856, %r855, %r849; + xor.b32 %r857, %r856, %r851; + shf.l.wrap.b32 %r858, %r857, %r857, 25; + add.s32 %r859, %r811, %r127; + add.s32 %r860, %r859, %r830; + xor.b32 %r861, %r860, %r855; + shf.l.wrap.b32 %r862, %r861, %r861, 16; + add.s32 %r863, %r862, %r842; + xor.b32 %r864, %r863, %r830; + shf.l.wrap.b32 %r865, %r864, %r864, 20; + add.s32 %r866, %r860, %r115; + add.s32 %r867, %r866, %r865; + xor.b32 %r868, %r867, %r862; + shf.l.wrap.b32 %r869, %r868, %r868, 24; + add.s32 %r870, %r869, %r863; + xor.b32 %r871, %r870, %r865; + shf.l.wrap.b32 %r872, %r871, %r871, 25; + add.s32 %r873, %r825, %r91; + add.s32 %r874, %r873, %r844; + xor.b32 %r875, %r874, %r813; + shf.l.wrap.b32 %r876, %r875, %r875, 16; + add.s32 %r877, 
%r876, %r856;
+ xor.b32 %r878, %r877, %r844;
+ shf.l.wrap.b32 %r879, %r878, %r878, 20;
+ add.s32 %r880, %r874, %r121;
+ add.s32 %r881, %r880, %r879;
+ xor.b32 %r882, %r881, %r876;
+ shf.l.wrap.b32 %r883, %r882, %r882, 24;
+ add.s32 %r884, %r883, %r877;
+ xor.b32 %r885, %r884, %r879;
+ shf.l.wrap.b32 %r886, %r885, %r885, 25;
+ add.s32 %r887, %r839, %r94;
+ add.s32 %r888, %r887, %r858;
+ xor.b32 %r889, %r888, %r827;
+ shf.l.wrap.b32 %r890, %r889, %r889, 16;
+ add.s32 %r891, %r890, %r814;
+ xor.b32 %r892, %r891, %r858;
+ shf.l.wrap.b32 %r893, %r892, %r892, 20;
+ add.s32 %r894, %r888, %r97;
+ add.s32 %r895, %r894, %r893;
+ xor.b32 %r896, %r895, %r890;
+ shf.l.wrap.b32 %r897, %r896, %r896, 24;
+ add.s32 %r898, %r897, %r891;
+ xor.b32 %r899, %r898, %r893;
+ shf.l.wrap.b32 %r900, %r899, %r899, 25;
+ add.s32 %r901, %r853, %r106;
+ add.s32 %r902, %r901, %r816;
+ xor.b32 %r903, %r902, %r841;
+ shf.l.wrap.b32 %r904, %r903, %r903, 16;
+ add.s32 %r905, %r904, %r828;
+ xor.b32 %r906, %r905, %r816;
+ shf.l.wrap.b32 %r907, %r906, %r906, 20;
+ add.s32 %r908, %r902, %r124;
+ add.s32 %r909, %r908, %r907;
+ xor.b32 %r910, %r909, %r904;
+ shf.l.wrap.b32 %r911, %r910, %r910, 24;
+ add.s32 %r912, %r911, %r905;
+ xor.b32 %r913, %r912, %r907;
+ shf.l.wrap.b32 %r914, %r913, %r913, 25;
+ xor.b32 %r915, %r912, %r881;
+ xor.b32 %r916, %r898, %r867;
+ st.local.v2.u32 [%rd2+32], {%r916, %r915};
+ xor.b32 %r917, %r870, %r895;
+ xor.b32 %r918, %r909, %r884;
+ st.local.v2.u32 [%rd2+40], {%r917, %r918};
+ xor.b32 %r919, %r872, %r897;
+ xor.b32 %r920, %r914, %r883;
+ st.local.v2.u32 [%rd2+48], {%r920, %r919};
+ xor.b32 %r921, %r911, %r886;
+ xor.b32 %r922, %r900, %r869;
+ st.local.v2.u32 [%rd2+56], {%r921, %r922};
+ st.local.u64 [%rd2+72], %rd52;
+ st.local.u64 [%rd2+80], %rd36;
+ add.s16 %rs12, %rs4, 1;
+ add.s16 %rs13, %rs3, 16;
+ st.local.v2.u8 [%rd2+136], {%rs13, %rs12};
+ trap;
+
+}
+
diff --git a/plugins/cuda/Cargo.toml b/plugins/cuda/Cargo.toml
index 59a2e9d..bfc43c8 100644
--- a/plugins/cuda/Cargo.toml
+++ b/plugins/cuda/Cargo.toml
@@ -13,6 +13,7 @@ rand = "0.8"
 clap = { version = "3.0", features = ["color", "derive"]}
 env_logger = "0.9"
 nvml-wrapper = { git = "https://github.com/benrod3k/nvml-wrapper", branch = "495.29.05", optional = true }
+tiny-keccak = { version = "2.0.2", features = ["keccak"] }
 
 [lib]
 crate-type = ["cdylib", "rlib"]
diff --git a/plugins/cuda/kaspa-cuda-native/src/cuda_helper.h b/plugins/cuda/kaspa-cuda-native/src/cuda_helper.h
new file mode 100644
index 0000000..7a48709
--- /dev/null
+++ b/plugins/cuda/kaspa-cuda-native/src/cuda_helper.h
@@ -0,0 +1,989 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <stdint.h>
+
+#define DEV_INLINE __device__ __forceinline__
+
+#ifdef __INTELLISENSE__
+/* reduce vstudio warnings (__byteperm, blockIdx...) */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#define __launch_bounds__(max_tpb, min_blocks)
+#define asm("a" : "=l"(result) : "l"(a))
+#define __CUDA_ARCH__ 520 // highlight shuffle code by default.
+
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t atomicExch(uint32_t* x, uint32_t y);
+uint32_t atomicAdd(uint32_t* x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+#endif
+
+#include <stdint.h>
+
+#ifndef MAX_GPUS
+#define MAX_GPUS 32
+#endif
+
+extern "C" int device_map[MAX_GPUS];
+extern "C" long device_sm[MAX_GPUS];
+extern cudaStream_t gpustream[MAX_GPUS];
+
+// common functions
+extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
+extern void cuda_check_cpu_setTarget(const void* ptarget);
+extern void cuda_check_cpu_setTarget_mod(const void* ptarget, const void* ptarget2);
+extern uint32_t cuda_check_hash(
+    int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash);
+extern uint32_t cuda_check_hash_suppl(
+    int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* d_inputHash, uint32_t foundnonce);
+extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+
+#ifndef SPH_C32
+#define SPH_C32(x) ((x##U))
+// #define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+
+#ifndef SPH_C64
+#define SPH_C64(x) ((x##ULL))
+// #define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+
+#ifndef SPH_T32
+#define SPH_T32(x) (x)
+// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+#endif
+#ifndef SPH_T64
+#define SPH_T64(x) (x)
+// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
+#endif
+
+#define ROTL32c(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+
+#if __CUDA_ARCH__ < 320
+// Kepler (Compute 3.0)
+#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#else
+// Kepler (Compute 3.5) and newer
+DEV_INLINE uint32_t ROTL32(const uint32_t x, const uint32_t n)
+{
+    return (__funnelshift_l((x), (x), (n)));
+}
+#endif
+#if __CUDA_ARCH__ < 320
+// Kepler (Compute 3.0)
+#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+DEV_INLINE uint32_t ROTR32(const uint32_t x, const uint32_t n)
+{
+    return (__funnelshift_r((x), (x), (n)));
+}
+#endif
+
+DEV_INLINE uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
+    uint64_t result;
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(LO), "r"(HI));
+    return result;
+}
+
+// endian swap for 32-bit types
+#ifdef __CUDA_ARCH__
+DEV_INLINE uint32_t cuda_swab32(const uint32_t x)
+{
+    /* device */
+    return __byte_perm(x, x, 0x0123);
+}
+#else
+/* host */
+#define cuda_swab32(x) \
+    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | \
+        (((x) >> 24) & 0x000000ffu))
+#endif
+
+#ifdef __CUDA_ARCH__
+DEV_INLINE uint64_t cuda_swab64(const uint64_t x)
+{
+    uint64_t result;
+    uint2 t;
+    asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(t.x), "=r"(t.y) : "l"(x));
+    t.x = __byte_perm(t.x, 0, 0x0123);
+    t.y = __byte_perm(t.y, 0, 0x0123);
+    asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(t.y), "r"(t.x));
+    return result;
+}
+#else
+/* host */
+#define cuda_swab64(x) \
+    ((uint64_t)((((uint64_t)(x)&0xff00000000000000ULL) >> 56) | \
+                (((uint64_t)(x)&0x00ff000000000000ULL) >> 40) | \
+                (((uint64_t)(x)&0x0000ff0000000000ULL) >> 24) | \
+                (((uint64_t)(x)&0x000000ff00000000ULL) >> 8) | \
+                (((uint64_t)(x)&0x00000000ff000000ULL) << 8) | \
+                (((uint64_t)(x)&0x0000000000ff0000ULL) << 24) | \
+                (((uint64_t)(x)&0x000000000000ff00ULL) << 40) | \
+                (((uint64_t)(x)&0x00000000000000ffULL) << 56)))
+#endif
+
+
+#ifdef _WIN64
+#define USE_XOR_ASM_OPTS
0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +DEV_INLINE uint64_t xor1(const uint64_t a, const uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} +#else +#define xor1(a, b) (a ^ b) +#endif + +/* +#if USE_XOR_ASM_OPTS +// device asm for whirpool +DEV_INLINE +uint64_t xor3(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + //output : input registers + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif +*/ + +#if USE_XOR_ASM_OPTS +// device asm for whirpool +DEV_INLINE uint64_t xor8(const uint64_t a, const uint64_t b, const uint64_t c, + const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g), "l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} +#else +#define xor8(a, b, c, d, e, f, g, h) ((a ^ b) ^ (c ^ d) ^ (e ^ f) ^ (g ^ h)) +#endif + +// device asm for x17 +DEV_INLINE uint64_t xandx(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 n;\n\t" + "xor.b64 %0, %2, %3;\n\t" + "and.b64 n, %0, %1;\n\t" + "xor.b64 %0, n, %3;" + "}\n" + : "=l"(result) + : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +DEV_INLINE uint64_t andor(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) + : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for x17 +DEV_INLINE uint64_t shr_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +// device asm for ? 
+DEV_INLINE uint64_t shl_t64(uint64_t x, uint32_t n) +{ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +} + +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 2 +#endif + +// 64-bit ROTATE RIGHT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +DEV_INLINE uint64_t ROTR64(const uint64_t value, const int offset) +{ + uint2 result; + if (offset < 32) + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2loint(__longlong_as_double(value))), + "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2hiint(__longlong_as_double(value))), + "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2hiint(__longlong_as_double(value))), + "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), + "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +DEV_INLINE uint64_t ROTR64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shr.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shl.b64 %0, %1, roff;\n\t" + "add.u64 %0, %0, lhs;\n\t" + "}\n" + : "=l"(result) + : "l"(x), "r"(offset)); + return result; +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 +DEV_INLINE uint64_t ROTL64(const uint64_t value, const int offset) +{ + uint2 result; + if (offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2loint(__longlong_as_double(value))), + "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2hiint(__longlong_as_double(value))), + "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2hiint(__longlong_as_double(value))), + "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), + "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 +DEV_INLINE uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t result; + asm("{\n\t" + ".reg .b64 lhs;\n\t" + ".reg .u32 roff;\n\t" + "shl.b64 lhs, %1, %2;\n\t" + "sub.u32 roff, 64, %2;\n\t" + "shr.b64 %0, %1, roff;\n\t" + "add.u64 %0, lhs, %0;\n\t" + "}\n" + : "=l"(result) + : "l"(x), "r"(offset)); + return result; +} +#elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 +__device__ uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : 
"=l"(res) + : "l"(x), "r"(offset)); + return res; +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +DEV_INLINE uint64_t SWAPDWORDS(uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; " : "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; " : "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + +/* lyra2 - int2 operators */ + +DEV_INLINE void LOHI(uint32_t& lo, uint32_t& hi, uint64_t x) +{ + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(lo), "=r"(hi) : "l"(x)); +} + +DEV_INLINE uint64_t devectorize(uint2 x) +{ + uint64_t result; + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(result) : "r"(x.x), "r"(x.y)); + return result; +} + + +DEV_INLINE uint2 vectorize(const uint64_t x) +{ + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); + return result; +} +DEV_INLINE void devectorize2(uint4 inn, uint2& x, uint2& y) +{ + x.x = inn.x; + x.y = inn.y; + y.x = inn.z; + y.y = inn.w; +} + + +DEV_INLINE uint4 vectorize2(uint2 x, uint2 y) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = y.x; + result.w = y.y; + + return result; +} + +DEV_INLINE uint4 vectorize2(uint2 x) +{ + uint4 result; + result.x = x.x; + result.y = x.y; + result.z = x.x; + result.w = x.y; + return result; +} + + +DEV_INLINE uint4 vectorize4(uint64_t x, uint64_t y) +{ + uint4 result; + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.x), "=r"(result.y) : "l"(x)); + asm("mov.b64 {%0,%1},%2; \n\t" : "=r"(result.z), "=r"(result.w) : "l"(y)); + return result; +} +DEV_INLINE void devectorize4(uint4 inn, uint64_t& x, uint64_t& y) +{ + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(x) : "r"(inn.x), "r"(inn.y)); + asm("mov.b64 %0,{%1,%2}; \n\t" : "=l"(y) : "r"(inn.z), "r"(inn.w)); +} + + +static DEV_INLINE uint2 vectorizelow(uint32_t v) +{ + uint2 result; + result.x = v; + result.y = 0; + return result; +} +static DEV_INLINE uint2 vectorizehigh(uint32_t v) +{ + uint2 result; + result.x = 0; + result.y = v; + return result; +} + +static DEV_INLINE uint2 operator^(uint2 a, uint32_t b) +{ + return make_uint2(a.x ^ b, a.y); +} +static DEV_INLINE uint2 operator^(uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} +static DEV_INLINE uint2 operator&(uint2 a, uint2 b) +{ + return make_uint2(a.x & b.x, a.y & b.y); +} +static DEV_INLINE uint2 operator|(uint2 a, uint2 b) +{ + return make_uint2(a.x | b.x, a.y | b.y); +} +static DEV_INLINE uint2 operator~(uint2 a) +{ + return make_uint2(~a.x, ~a.y); +} +static DEV_INLINE void operator^=(uint2& a, uint2 b) +{ + a = a ^ b; +} +static DEV_INLINE uint2 operator+(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +static DEV_INLINE uint2 operator+(uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static DEV_INLINE uint2 operator-(uint2 a, uint32_t b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +} + + +static DEV_INLINE uint2 operator-(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; 
\n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + + +static DEV_INLINE uint4 operator^(uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} +static DEV_INLINE uint4 operator&(uint4 a, uint4 b) +{ + return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); +} +static DEV_INLINE uint4 operator|(uint4 a, uint4 b) +{ + return make_uint4(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); +} +static DEV_INLINE uint4 operator~(uint4 a) +{ + return make_uint4(~a.x, ~a.y, ~a.z, ~a.w); +} +static DEV_INLINE void operator^=(uint4& a, uint4 b) +{ + a = a ^ b; +} +static DEV_INLINE uint4 operator^(uint4 a, uint2 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.x, a.w ^ b.y); +} + + +static DEV_INLINE void operator+=(uint2& a, uint2 b) +{ + a = a + b; +} + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static DEV_INLINE uint2 operator*(uint2 a, uint2 b) +{ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 method +#if __CUDA_ARCH__ >= 350 +DEV_INLINE uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; + if (offset < 32) + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +DEV_INLINE uint2 ROR2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y >> (n)) | (v.x << (32 - n))); + result.x = ((v.x >> (n)) | (v.y << (32 - n))); + } + else + { + result.y = ((v.x >> (n - 32)) | (v.y << (64 - n))); + result.x = ((v.y >> (n - 32)) | (v.x << (64 - n))); + } + return result; +} +#endif + + +DEV_INLINE uint32_t ROL8(const uint32_t x) +{ + return __byte_perm(x, x, 0x2103); +} +DEV_INLINE uint32_t ROL16(const uint32_t x) +{ + return __byte_perm(x, x, 0x1032); +} +DEV_INLINE uint32_t ROL24(const uint32_t x) +{ + return __byte_perm(x, x, 0x0321); +} + +DEV_INLINE uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x4321); + + return result; +} + +DEV_INLINE uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + + return result; +} + +DEV_INLINE uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + + return result; +} + +DEV_INLINE uint2 ROL8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x2107); + + return result; +} + +DEV_INLINE uint2 ROL16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +DEV_INLINE uint2 ROL24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x0765); + + return result; +} + + +#if __CUDA_ARCH__ >= 350 +__inline__ 
__device__ uint2 ROL2(const uint2 a, const int offset) +{ + uint2 result; + if (offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } + return result; +} +#else +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (n <= 32) + { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else + { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +#endif + +DEV_INLINE uint64_t ROTR16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; " + : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) + : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; " + : "=l"(x) + : "h"(temp.y), "h"(temp.z), "h"(temp.w), "h"(temp.x)); + return x; +#else + return ROTR64(x, 16); +#endif +} +DEV_INLINE uint64_t ROTL16(uint64_t x) +{ +#if __CUDA_ARCH__ > 500 + short4 temp; + asm("mov.b64 { %0, %1, %2, %3 }, %4; " + : "=h"(temp.x), "=h"(temp.y), "=h"(temp.z), "=h"(temp.w) + : "l"(x)); + asm("mov.b64 %0, {%1, %2, %3 , %4}; " + : "=l"(x) + : "h"(temp.w), "h"(temp.x), "h"(temp.y), "h"(temp.z)); + return x; +#else + return ROTL64(x, 16); +#endif +} + +static __forceinline__ __device__ uint2 SHL2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset < 32) + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(offset)); + } + else + { + asm("{\n\t" + "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" + "shl.b32 %0,%2,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) + { + a.y = (a.y << offset) | (a.x >> (32 - offset)); + a.x = (a.x << offset); + } + else + { + a.y = (a.x << (offset - 32)); + a.x = 0; + } + return a; +#endif +} +static __forceinline__ __device__ uint2 SHR2(uint2 a, int offset) +{ +#if __CUDA_ARCH__ > 300 + uint2 result; + if (offset < 32) + { + asm("{\n\t" + "shf.r.clamp.b32 %0,%2,%3,%4; \n\t" + "shr.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(a.y), "r"(offset)); + } + else + { + asm("{\n\t" + "shf.l.clamp.b32 %0,%2,%3,%4; \n\t" + "shl.b32 %1,%3,%4; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) + : "r"(a.y), "r"(a.x), "r"(offset)); + } + return result; +#else + if (offset <= 32) + { + a.x = (a.x >> offset) | (a.y << (32 - offset)); + a.y = (a.y >> offset); + } + else + { + a.x = (a.y >> (offset - 32)); + a.y = 0; + } + return a; +#endif +} + +static DEV_INLINE uint64_t devectorizeswap(uint2 v) +{ + return MAKE_ULONGLONG(cuda_swab32(v.y), cuda_swab32(v.x)); +} +static DEV_INLINE uint2 vectorizeswap(uint64_t v) +{ + uint2 result; + LOHI(result.y, result.x, v); + result.x = cuda_swab32(result.x); + result.y = cuda_swab32(result.y); + return result; +} + + +DEV_INLINE uint32_t devectorize16(ushort2 x) +{ + uint32_t result; + asm("mov.b32 %0,{%1,%2}; \n\t" : "=r"(result) : "h"(x.x), "h"(x.y)); + return result; +} + + +DEV_INLINE ushort2 vectorize16(uint32_t x) +{ + ushort2 result; + asm("mov.b32 {%0,%1},%2; \n\t" : "=h"(result.x), 
"=h"(result.y) : "r"(x)); + return result; +} + + +static DEV_INLINE uint4 mul4(uint4 a) +{ + uint4 result; + asm("{\n\t" + "mul.lo.u32 %0,%4,%5; \n\t" + "mul.hi.u32 %1,%4,%5; \n\t" + "mul.lo.u32 %2,%6,%7; \n\t" + "mul.hi.u32 %3,%6,%7; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w)); + return result; +} +static DEV_INLINE uint4 add4(uint4 a, uint4 b) +{ + uint4 result; + asm("{\n\t" + "add.cc.u32 %0,%4,%8; \n\t" + "addc.u32 %1,%5,%9; \n\t" + "add.cc.u32 %2,%6,%10; \n\t" + "addc.u32 %3,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; +} + +static DEV_INLINE uint4 madd4(uint4 a, uint4 b) +{ + uint4 result; + asm("{\n\t" + "mad.lo.cc.u32 %0,%4,%5,%8; \n\t" + "madc.hi.u32 %1,%4,%5,%9; \n\t" + "mad.lo.cc.u32 %2,%6,%7,%10; \n\t" + "madc.hi.u32 %3,%6,%7,%11; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y), "=r"(result.z), "=r"(result.w) + : "r"(a.x), "r"(a.y), "r"(a.z), "r"(a.w), "r"(b.x), "r"(b.y), "r"(b.z), "r"(b.w)); + return result; +} + +static DEV_INLINE ulonglong2 madd4long(ulonglong2 a, ulonglong2 b) +{ + ulonglong2 result; + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %2;\n\t" + "mov.b64 {a2,a3}, %3;\n\t" + "mov.b64 {b0,b1}, %4;\n\t" + "mov.b64 {b2,b3}, %5;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "=l"(result.x), "=l"(result.y) + : "l"(a.x), "l"(a.y), "l"(b.x), "l"(b.y)); + return result; +} +static DEV_INLINE void madd4long2(ulonglong2& a, ulonglong2 b) +{ + asm("{\n\t" + ".reg .u32 a0,a1,a2,a3,b0,b1,b2,b3;\n\t" + "mov.b64 {a0,a1}, %0;\n\t" + "mov.b64 {a2,a3}, %1;\n\t" + "mov.b64 {b0,b1}, %2;\n\t" + "mov.b64 {b2,b3}, %3;\n\t" + "mad.lo.cc.u32 b0,a0,a1,b0; \n\t" + "madc.hi.u32 b1,a0,a1,b1; \n\t" + "mad.lo.cc.u32 b2,a2,a3,b2; \n\t" + "madc.hi.u32 b3,a2,a3,b3; \n\t" + "mov.b64 %0, {b0,b1};\n\t" + "mov.b64 %1, {b2,b3};\n\t" + "}\n\t" + : "+l"(a.x), "+l"(a.y) + : "l"(b.x), "l"(b.y)); +} + +DEV_INLINE uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; + asm("{ .reg .u32 t1;\n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;\n\t" + "}" + : "=r"(result) + : "r"(a), "r"(b), "r"(c)); + return result; +} + +DEV_INLINE uint32_t shr_t32(uint32_t x, uint32_t n) +{ + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +DEV_INLINE uint32_t shl_t32(uint32_t x, uint32_t n) +{ + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +} + +// device asm 32 for pluck +DEV_INLINE uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t result; + asm("{ .reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" + : "=r"(result) + : "r"(a), "r"(b), "r"(c)); + return result; +} + +DEV_INLINE uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; +} + +DEV_INLINE uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) +{ + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} \ No newline at end of 
file
diff --git a/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.cuh b/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.cuh
new file mode 100644
index 0000000..e2227c0
--- /dev/null
+++ b/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.cuh
@@ -0,0 +1,329 @@
+#include "fishhash_cuda_kernel.h"
+#include "keccak.cuh"
+
+#define FNV_PRIME 0x01000193
+
+// Tunables, originally static ints (left commented out below), kept as
+// #defines here.
+//static int full_dataset_item_parents = 512;
+#define full_dataset_item_parents 512
+//static int num_dataset_accesses = 32;
+#define num_dataset_accesses 32
+//static int light_cache_rounds = 3;
+#define light_cache_rounds 3
+
+const int light_cache_num_items = 1179641;
+//#define light_cache_num_items 1179641
+const int full_dataset_num_items = 37748717;
+//#define full_dataset_num_items 37748717
+
+#define DEV_INLINE __device__ __forceinline__
+
+#define copy(dst, src, count)            \
+    for (int i = 0; i != count; ++i)     \
+    {                                    \
+        (dst)[i] = (src)[i];             \
+    }
+
+
+static DEV_INLINE uint32_t fnv1(uint32_t u, uint32_t v) noexcept {
+    return (u * FNV_PRIME) ^ v;
+}
+
+DEV_INLINE hash512 fnv1(const hash512& u, const hash512& v) noexcept {
+    hash512 r;
+    for (size_t i = 0; i < sizeof(r) / sizeof(r.word32s[0]); ++i)
+        r.word32s[i] = fnv1(u.word32s[i], v.word32s[i]);
+    return r;
+}
+
+typedef struct item_state
+    {
+    const hash512* const cache;
+    const int64_t num_cache_items;
+    const uint32_t seed;
+
+    hash512 mix;
+
+    DEV_INLINE item_state(const fishhash_context& ctx, int64_t index) noexcept
+      : cache{ctx.light_cache},
+        num_cache_items{ctx.light_cache_num_items},
+        seed{static_cast<uint32_t>(index)} {
+        //printf("item_state debug 1 %p - %d", &cache, num_cache_items);
+        mix = cache[index % num_cache_items];
+        //printf("item_state debug 2");
+        mix.word32s[0] ^= seed;
+        //keccak(mix.word64s, 512, mix.bytes, 64);
+        //printf("item_state debug 3");
+        SHA3_512(mix.uint2s);
+    }
+
+    DEV_INLINE void update(uint32_t round) noexcept {
+        static constexpr size_t num_words = sizeof(mix) / sizeof(uint32_t);
+        const uint32_t t = fnv1(seed ^ round, mix.word32s[round % num_words]);
+        const int64_t parent_index = t % num_cache_items;
+        mix = fnv1(mix, cache[parent_index]);
+    }
+
+    DEV_INLINE hash512 final() noexcept {
+        //keccak(mix.word64s, 512, mix.bytes, 64);
+        SHA3_512(mix.uint2s);
+        return mix;
+    }
+    };
+
+
+
+DEV_INLINE hash1024 calculate_dataset_item_1024(const fishhash_context& ctx, uint32_t index) noexcept {
+    //printf("heavy_hash Thread %d, Block %d\n", threadIdx.x, blockIdx.x);
+    //printf("calculate_dataset_item_1024 debug 1");
+    item_state item0{ctx, int64_t(index) * 2};
+    //printf("calculate_dataset_item_1024 debug 2");
+    item_state item1{ctx, int64_t(index) * 2 + 1};
+
+    //printf("calculate_dataset_item_1024 debug 3");
+    for (uint32_t j = 0; j < full_dataset_item_parents; ++j) {
+        item0.update(j);
+        item1.update(j);
+    }
+
+    hash512 it0 = item0.final();
+    hash512 it1 = item1.final();
+
+    return hash1024{{it0, it1}};
+}
+
+DEV_INLINE hash1024 lookup(const fishhash_context& ctx, uint32_t index) {
+    if (ctx.full_dataset != NULL) {
+        //printf("lookup debug 1");
+        hash1024 * item = &ctx.full_dataset[index];
+
+        // Lazy generation: an item whose first word is still zero has not
+        // been computed into the dataset yet.
+        if (item->word64s[0] == 0) {
+            *item = calculate_dataset_item_1024(ctx, index);
+        }
+
+        return *item;
+    } else {
+        //printf("lookup debug 2");
+        return calculate_dataset_item_1024(ctx, index);
+    }
+}
+
+DEV_INLINE hash256 fishhash_kernel( const fishhash_context& ctx, const hash512& seed) noexcept {
+    //printf("fishhash_kernel debug 1");
+    const uint32_t index_limit = static_cast<uint32_t>(ctx.full_dataset_num_items);
+    //printf("fishhash_kernel debug 1.1");
+    //const uint32_t seed_init = seed.word32s[0];
+    //printf("fishhash_kernel debug 2");
+    hash1024 mix{seed, seed};
+    //printf("fishhash_kernel debug 3");
+    //printf("The index_limit is : %d \n", index_limit);
+    for (uint32_t i = 0; i < num_dataset_accesses; ++i) {
+
+        //printf("fishhash_kernel debug 4, %d", index_limit);
+        //printf("fishhash_kernel debug 4.1, %032x", mix.word32s[0]);
+        // Calculate new fetching indexes
+        const uint32_t p0 = mix.word32s[0] % index_limit;
+        //printf("fishhash_kernel debug 4.2, %032x", mix.word32s[4]);
+        const uint32_t p1 = mix.word32s[4] % index_limit;
+        //printf("fishhash_kernel debug 4.3, %032x", mix.word32s[8]);
+        const uint32_t p2 = mix.word32s[8] % index_limit;
+
+        //printf("fishhash_kernel debug 5");
+        hash1024 fetch0 = lookup(ctx, p0);
+        hash1024 fetch1 = lookup(ctx, p1);
+        hash1024 fetch2 = lookup(ctx, p2);
+
+        //printf("fishhash_kernel debug 6");
+        // Modify fetch1 and fetch2
+        for (size_t j = 0; j < 32; ++j) {
+            fetch1.word32s[j] = fnv1(mix.word32s[j], fetch1.word32s[j]);
+            fetch2.word32s[j] = mix.word32s[j] ^ fetch2.word32s[j];
+        }
+
+        //printf("fishhash_kernel debug 7");
+        // Final computation of new mix
+        for (size_t j = 0; j < 16; ++j)
+            mix.word64s[j] = fetch0.word64s[j] * fetch1.word64s[j] + fetch2.word64s[j];
+    }
+
+    //printf("fishhash_kernel debug 8");
+    // Collapse the result into 32 bytes
+    hash256 mix_hash;
+    static constexpr size_t num_words = sizeof(mix) / sizeof(uint32_t);
+    //printf("fishhash_kernel debug 9");
+    for (size_t i = 0; i < num_words; i += 4) {
+        const uint32_t h1 = fnv1(mix.word32s[i], mix.word32s[i + 1]);
+        const uint32_t h2 = fnv1(h1, mix.word32s[i + 2]);
+        const uint32_t h3 = fnv1(h2, mix.word32s[i + 3]);
+        mix_hash.word32s[i / 4] = h3;
+    }
+
+    //printf("fishhash_kernel debug 10");
+    return mix_hash;
+    }
+
+DEV_INLINE void printHash(const char* msg, const uint8_t* hash, int size) {
+    // print the message via "%s" so a stray '%' in it cannot be
+    // misinterpreted as a format specifier
+    printf("%s", msg);
+    for(int i = 0; i < size; i++) {
+        //printf("%02x", output[i]);
+        printf("%02x", hash[i]);
+    }
+    printf("\n");
+    }
+
+//DEV_INLINE void hashFish(uint8_t * output, const fishhash_context * ctx, const uint8_t * header, uint64_t header_size) noexcept {
+DEV_INLINE void hashFish(
+                const fishhash_context * ctx,
+                uint8_t* out,
+                const uint8_t* in) {
+    hash512 seed;
+    // the seed is the whole zero-padded 64-byte input block
+    copy(seed.bytes, in, 64);
+    //printf("hashFish debug 1");
+    const hash256 mix_hash = fishhash_kernel(*ctx, seed);
+    // and the result is the full 32-byte collapsed mix
+    copy(out, mix_hash.bytes, 32);
+    }
+
+
+
+DEV_INLINE hash512 bitwise_xor(const hash512& x, const hash512& y) noexcept {
+    hash512 z;
+    for (size_t i = 0; i < sizeof(z) / sizeof(z.word64s[0]); ++i)
+        z.word64s[i] = x.word64s[i] ^ y.word64s[i];
+    return z;
+    }
+
+
+/*
+void build_light_cache( hash512 cache[], int num_items, const hash256& seed) noexcept {
+    hash512 item;
+    //keccak(item.word64s, 512, seed.bytes, sizeof(seed));
+    copy(item.uint2s, seed.uint2s, sizeof(seed.uint2s));
+    SHA3_512(item.uint2s);
+    cache[0] = item;
+
+    for (int i = 1; i < num_items; ++i) {
+        //keccak(item.word64s, 512, item.bytes, sizeof(item));
+        SHA3_512(item.uint2s);
+        cache[i] = item;
+    }
+
+    for (int q = 0; q < light_cache_rounds; ++q) {
+        for (int i = 0; i < num_items; ++i) {
+            const uint32_t index_limit = static_cast<uint32_t>(num_items);
+
+            // First index: 4 first bytes of the item as little-endian integer.
+            const uint32_t t = cache[i].word32s[0];
+            const uint32_t v = t % index_limit;
+
+            // Second index.
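+            // The second index is the previous item with wrap-around: for
+            // i == 0 it selects num_items - 1, the last cache entry. Each
+            // round therefore XORs a pseudo-randomly chosen partner (picked
+            // by the item's first 4 bytes, above) with the left neighbour
+            // and re-hashes the pair with Keccak-512; light_cache_rounds
+            // fixes this at 3 passes, the same light-cache construction
+            // that ethash uses.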
+            const uint32_t w = static_cast<uint32_t>(num_items + (i - 1)) % index_limit;
+            const hash512 x = bitwise_xor(cache[v], cache[w]);
+            //keccak(cache[i].word64s, 512, x.bytes, sizeof(x));
+            copy(cache[i].uint2s, x.uint2s, sizeof(x.uint2s));
+            SHA3_512(cache[i].uint2s);
+        }
+    }
+    }
+
+    void build_dataset_segment(fishhash_context * ctx, uint32_t start, uint32_t end) {
+        for (uint32_t i = start; i < end; i++) {
+            ctx->full_dataset[i] = calculate_dataset_item_1024(*ctx, i);
+        }
+    }
+
+    void prebuild_dataset(fishhash_context * ctx, uint32_t numThreads) noexcept {
+        // If the context is not initialized as full context, return to avoid segmentation faults
+        if (ctx->full_dataset == NULL) return;
+
+        if (numThreads > 1) {
+            uint32_t batch_size = ctx->full_dataset_num_items / numThreads;
+
+            // Launch worker threads
+            std::vector< std::thread > threads(numThreads);
+            for(unsigned i = 0; i < numThreads; ++i) {
+                int start = i * batch_size;
+                int end = i == (numThreads-1) ? ctx->full_dataset_num_items : (i+1) * batch_size;
+
+                threads[i] = std::thread(build_dataset_segment, ctx, start, end);
+            }
+
+            // Join the workers to wait for completion
+            for(unsigned i = 0; i < numThreads; ++i) {
+                threads[i].join();
+            }
+        } else {
+            build_dataset_segment(ctx, 0, ctx->full_dataset_num_items);
+        }
+
+    }
+*/
+
+
+
+// ==========================================================================
+
+
+/*
+
+__global__ void ethash_calculate_dag_item(uint32_t start)
+{
+    uint32_t const node_index = start + blockIdx.x * blockDim.x + threadIdx.x;
+    if (((node_index >> 1) & (~1)) >= d_dag_size)
+        return;
+    union {
+        hash128_t dag_node;
+        uint2 dag_node_mem[25];
+    };
+    copy(dag_node.uint4s, d_light[node_index % d_light_size].uint4s, 4);
+    dag_node.words[0] ^= node_index;
+    SHA3_512(dag_node_mem);
+
+    const int thread_id = threadIdx.x & 3;
+
+    for (uint32_t i = 0; i != ETHASH_DATASET_PARENTS; ++i)
+    {
+        uint32_t parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % d_light_size;
+        for (uint32_t t = 0; t < 4; t++)
+        {
+            uint32_t shuffle_index = SHFL(parent_index, t, 4);
+
+            uint4 p4 = d_light[shuffle_index].uint4s[thread_id];
+            for (int w = 0; w < 4; w++)
+            {
+                uint4 s4 = make_uint4(SHFL(p4.x, w, 4), SHFL(p4.y, w, 4), SHFL(p4.z, w, 4), SHFL(p4.w, w, 4));
+                if (t == thread_id)
+                {
+                    dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], s4);
+                }
+            }
+        }
+    }
+    SHA3_512(dag_node_mem);
+    hash64_t* dag_nodes = (hash64_t*)d_dag;
+    copy(dag_nodes[node_index].uint4s, dag_node.uint4s, 4);
+}
+
+void ethash_generate_dag(
+    uint64_t dag_size, uint32_t gridSize, uint32_t blockSize, cudaStream_t stream)
+{
+    const uint32_t work = (uint32_t)(dag_size / sizeof(hash64_t));
+    const uint32_t run = gridSize * blockSize;
+
+    uint32_t base;
+    for (base = 0; base <= work - run; base += run)
+    {
+        ethash_calculate_dag_item<<<gridSize, blockSize, 0, stream>>>(base);
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
+    if (base < work)
+    {
+        uint32_t lastGrid = work - base;
+        lastGrid = (lastGrid + blockSize - 1) / blockSize;
+        ethash_calculate_dag_item<<<lastGrid, blockSize, 0, stream>>>(base);
+        CUDA_SAFE_CALL(cudaDeviceSynchronize());
+    }
+    CUDA_SAFE_CALL(cudaGetLastError());
+}
+
+*/
\ No newline at end of file
diff --git a/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.h b/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.h
new file mode 100644
index 0000000..9dbb03b
--- /dev/null
+++ b/plugins/cuda/kaspa-cuda-native/src/fishhash_cuda_kernel.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include
+
+typedef union {
+    uint64_t word64s[4];
+    uint32_t word32s[8];
+    uint8_t bytes[32];
+    char str[32];
+    uint2 uint2s[4];
+} hash256;
+
+typedef union {
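+    // The same 64 bytes under several views: word64s for the 64-bit
+    // XOR/multiply mixing, word32s for the FNV1 updates, bytes/str for raw
+    // byte access, and uint2s as the lo/hi 32-bit register pairs that the
+    // SHA3_512 helper in keccak.cuh operates on.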
uint64_t word64s[8]; + uint32_t word32s[16]; + uint8_t bytes[64]; + char str[64]; + uint2 uint2s[8]; +} hash512; + +typedef union { + //union hash512 hash512s[2]; + hash512 hash512s[2]; + uint64_t word64s[16]; + uint32_t word32s[32]; + uint8_t bytes[128]; + char str[128]; + uint2 uint2s[16]; +} hash1024; + +typedef struct { + const int light_cache_num_items; + //hash512* const light_cache; + hash512* light_cache; + const int full_dataset_num_items; + hash1024* full_dataset; +} fishhash_context; + + +#define CUDA_SAFE_CALL(call) \ + do \ + { \ + cudaError_t err = call; \ + if (cudaSuccess != err) \ + { \ + std::stringstream ss; \ + ss << "CUDA error in func " << __FUNCTION__ << " at line " << __LINE__ << ' ' \ + << cudaGetErrorString(err); \ + throw cuda_runtime_error(ss.str()); \ + } \ + } while (0) \ No newline at end of file diff --git a/plugins/cuda/kaspa-cuda-native/src/fnv.cuh b/plugins/cuda/kaspa-cuda-native/src/fnv.cuh new file mode 100644 index 0000000..6526521 --- /dev/null +++ b/plugins/cuda/kaspa-cuda-native/src/fnv.cuh @@ -0,0 +1,18 @@ +#define FNV_PRIME 0x01000193 + +#define fnv(x, y) ((x)*FNV_PRIME ^ (y)) + +DEV_INLINE uint4 fnv4(uint4 a, uint4 b) +{ + uint4 c; + c.x = a.x * FNV_PRIME ^ b.x; + c.y = a.y * FNV_PRIME ^ b.y; + c.z = a.z * FNV_PRIME ^ b.z; + c.w = a.w * FNV_PRIME ^ b.w; + return c; +} + +DEV_INLINE uint32_t fnv_reduce(uint4 v) +{ + return fnv(fnv(fnv(v.x, v.y), v.z), v.w); +} \ No newline at end of file diff --git a/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu b/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu index fa703ab..bf0b634 100644 --- a/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu +++ b/plugins/cuda/kaspa-cuda-native/src/kaspa-cuda.cu @@ -2,6 +2,7 @@ #include #include "keccak-tiny.c" #include "xoshiro256starstar.c" +#include "fishhash_cuda_kernel.cuh" @@ -29,6 +30,33 @@ __constant__ uint256_t target; __constant__ static const uint8_t powP[Plen] = { 0x3d, 0xd8, 0xf6, 0xa1, 0x0d, 0xff, 0x3c, 0x11, 0x3c, 0x7e, 0x02, 0xb7, 0x55, 0x88, 0xbf, 0x29, 0xd2, 0x44, 0xfb, 0x0e, 0x72, 0x2e, 0x5f, 0x1e, 0xa0, 0x69, 0x98, 0xf5, 0xa3, 0xa4, 0xa5, 0x1b, 0x65, 0x2d, 0x5e, 0x87, 0xca, 0xaf, 0x2f, 0x7b, 0x46, 0xe2, 0xdc, 0x29, 0xd6, 0x61, 0xef, 0x4a, 0x10, 0x5b, 0x41, 0xad, 0x1e, 0x98, 0x3a, 0x18, 0x9c, 0xc2, 0x9b, 0x78, 0x0c, 0xf6, 0x6b, 0x77, 0x40, 0x31, 0x66, 0x88, 0x33, 0xf1, 0xeb, 0xf8, 0xf0, 0x5f, 0x28, 0x43, 0x3c, 0x1c, 0x65, 0x2e, 0x0a, 0x4a, 0xf1, 0x40, 0x05, 0x07, 0x96, 0x0f, 0x52, 0x91, 0x29, 0x5b, 0x87, 0x67, 0xe3, 0x44, 0x15, 0x37, 0xb1, 0x25, 0xa4, 0xf1, 0x70, 0xec, 0x89, 0xda, 0xe9, 0x82, 0x8f, 0x5d, 0xc8, 0xe6, 0x23, 0xb2, 0xb4, 0x85, 0x1f, 0x60, 0x1a, 0xb2, 0x46, 0x6a, 0xa3, 0x64, 0x90, 0x54, 0x85, 0x34, 0x1a, 0x85, 0x2f, 0x7a, 0x1c, 0xdd, 0x06, 0x0f, 0x42, 0xb1, 0x3b, 0x56, 0x1d, 0x02, 0xa2, 0xc1, 0xe4, 0x68, 0x16, 0x45, 0xe4, 0xe5, 0x1d, 0xba, 0x8d, 0x5f, 0x09, 0x05, 0x41, 0x57, 0x02, 0xd1, 0x4a, 0xcf, 0xce, 0x9b, 0x84, 0x4e, 0xca, 0x89, 0xdb, 0x2e, 0x74, 0xa8, 0x27, 0x94, 0xb0, 0x48, 0x72, 0x52, 0x8b, 0xe7, 0x9c, 0xce, 0xfc, 0xb1, 0xbc, 0xa5, 0xaf, 0x82, 0xcf, 0x29, 0x11, 0x5d, 0x83, 0x43, 0x82, 0x6f, 0x78, 0x7c, 0xb9, 0x02 }; __constant__ static const uint8_t heavyP[Plen] = { 0x09, 0x85, 0x24, 0xb2, 0x52, 0x4c, 0xd7, 0x3a, 0x16, 0x42, 0x9f, 0x2f, 0x0e, 0x9b, 0x62, 0x79, 0xee, 0xf8, 0xc7, 0x16, 0x48, 0xff, 0x14, 0x7a, 0x98, 0x64, 0x05, 0x80, 0x4c, 0x5f, 0xa7, 0x11, 0xda, 0xce, 0xee, 0x44, 0xdf, 0xe0, 0x20, 0xe7, 0x69, 0x40, 0xf3, 0x14, 0x2e, 0xd8, 0xc7, 0x72, 0xba, 0x35, 0x89, 0x93, 0x2a, 0xff, 0x00, 0xc1, 0x62, 0xc4, 0x0f, 0x25, 0x40, 0x90, 
0x21, 0x5e, 0x48, 0x6a, 0xcf, 0x0d, 0xa6, 0xf9, 0x39, 0x80, 0x0c, 0x3d, 0x2a, 0x79, 0x9f, 0xaa, 0xbc, 0xa0, 0x26, 0xa2, 0xa9, 0xd0, 0x5d, 0xc0, 0x31, 0xf4, 0x3f, 0x8c, 0xc1, 0x54, 0xc3, 0x4c, 0x1f, 0xd3, 0x3d, 0xcc, 0x69, 0xa7, 0x01, 0x7d, 0x6b, 0x6c, 0xe4, 0x93, 0x24, 0x56, 0xd3, 0x5b, 0xc6, 0x2e, 0x44, 0xb0, 0xcd, 0x99, 0x3a, 0x4b, 0xf7, 0x4e, 0xb0, 0xf2, 0x34, 0x54, 0x83, 0x86, 0x4c, 0x77, 0x16, 0x94, 0xbc, 0x36, 0xb0, 0x61, 0xe9, 0x07, 0x07, 0xcc, 0x65, 0x77, 0xb1, 0x1d, 0x8f, 0x7e, 0x39, 0x6d, 0xc4, 0xba, 0x80, 0xdb, 0x8f, 0xea, 0x58, 0xca, 0x34, 0x7b, 0xd3, 0xf2, 0x92, 0xb9, 0x57, 0xb9, 0x81, 0x84, 0x04, 0xc5, 0x76, 0xc7, 0x2e, 0xc2, 0x12, 0x51, 0x67, 0x9f, 0xc3, 0x47, 0x0a, 0x0c, 0x29, 0xb5, 0x9d, 0x39, 0xbb, 0x92, 0x15, 0xc6, 0x9f, 0x2f, 0x31, 0xe0, 0x9a, 0x54, 0x35, 0xda, 0xb9, 0x10, 0x7d, 0x32, 0x19, 0x16 }; + //__device__ hash512* const light_cache = reinterpret_cast(light_cache_num_items); + //__device__ hash1024* full_dataset = reinterpret_cast(full_dataset_num_items); + + //__shared__ hash512 light_cache[10]; + //__shared__ hash1024 full_dataset[10]; + + __shared__ hash512* light_cache; + __shared__ hash1024* full_dataset; + + /* + __device__ fishhash_context ctx = { + light_cache_num_items, + light_cache, + full_dataset_num_items, + full_dataset}; +*/ +/* +__shared__ fishhash_context ctx { + light_cache_num_items, + light_cache, + full_dataset_num_items, + full_dataset +}; +*/ + +__shared__ uint8_t cache_test[10]; + __device__ __inline__ void amul4bit(uint32_t packed_vec1[32], uint32_t packed_vec2[32], uint32_t *ret) { // We assume each 32 bits have four values: A0 B0 C0 D0 unsigned int res = 0; @@ -51,12 +79,33 @@ __device__ __inline__ void amul4bit(uint32_t packed_vec1[32], uint32_t packed_ve *ret = res; } - extern "C" { + __global__ void heavy_hash( + const uint64_t nonce_mask, + const uint64_t nonce_fixed, + const uint64_t nonces_len, + uint8_t random_type, + void* states, + uint64_t *final_nonce, + hash1024* dataset, + hash512* cache + ) { - __global__ void heavy_hash(const uint64_t nonce_mask, const uint64_t nonce_fixed, const uint64_t nonces_len, uint8_t random_type, void* states, uint64_t *final_nonce) { // assuming header_len is 72 + + //printf("heavy_hash Thread %d, Block %d\n", dataset[10]); + //printHash("The cache[42] is : ", cache[42].bytes, 128); + //printHash("The dataset[10] is : ", dataset[10].bytes, 128); + + if (threadIdx.x == 0 && blockIdx.x == 0) { + printf("heavy_hash Thread %d, Block %d\n", threadIdx.x, blockIdx.x); + printHash("The dataset[10] is : ", dataset[10].bytes, 128); + printHash("The dataset[42] is : ", dataset[42].bytes, 128); + printHash("The dataset[12345] is : ", dataset[12345].bytes, 128); + } + + int nonceId = threadIdx.x + blockIdx.x*blockDim.x; if (nonceId < nonces_len) { if (nonceId == 0) *final_nonce = 0; @@ -70,6 +119,7 @@ extern "C" { nonce = xoshiro256_next(((ulonglong4 *)states) + nonceId); break; } + //printf("heavy_hash debug 1"); nonce = (nonce & nonce_mask) | nonce_fixed; // header uint8_t input[80]; @@ -80,6 +130,33 @@ extern "C" { memcpy(input + HASH_HEADER_SIZE, (uint8_t *)(&nonce), 8); hashB3(powP, hash_.hash, input); +/* + if (sizeof(ctx.full_dataset) < 1) { + printf("cuda malloc ========================================\n"); + cudaMalloc(&ctx.light_cache, 10*sizeof(hash512)); + cudaMalloc(&ctx.full_dataset, 10*sizeof(hash1024)); + //ctx.full_dataset = dataset; + //ctx.light_cache = cache; + } + */ + + //printf("heavy_hash debug 2 %lld >>>>>>>>>>>>>>>>>>> \n", sizeof(cache)); + + fishhash_context ctx { + 
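+        // Per-thread context over the cache/dataset buffers passed in as
+        // kernel arguments (presumably allocated once by the host): with
+        // the constants above that is 1179641 * 64 B (~72 MiB) of light
+        // cache and 37748717 * 128 B (~4.5 GiB) of full dataset, so the
+        // struct itself carries only two sizes and two pointers.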
light_cache_num_items, + cache, + full_dataset_num_items, + dataset + }; + + //printf("heavy_hash debug 2.1 %llu >>>>>>>>>>>>>>>>>>> \n", sizeof(ctx.light_cache)); + + memset(input, 0, 80); + memcpy(input, hash_.hash, 32); + hashFish(&ctx, hash_.hash, input); + + //printf("heavy_hash debug 3"); +/* //assert((rowId != 0) || (hashId != 0) ); uchar4 packed_hash[QUARTER_MATRIX_SIZE] = {0}; #pragma unroll @@ -108,12 +185,17 @@ extern "C" { hash_.hash[rowId] = lop_temp; #endif } + +*/ + + memset(input, 0, 80); memcpy(input, hash_.hash, 32); - hash(heavyP, hash_.hash, input); + hashB3(heavyP, hash_.hash, input); if (LT_U256(hash_, target)){ atomicCAS((unsigned long long int*) final_nonce, 0, (unsigned long long int) nonce); } + //printf("heavy_hash debug 4"); } } diff --git a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c b/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c index fd05241..1c46dbb 100644 --- a/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c +++ b/plugins/cuda/kaspa-cuda-native/src/keccak-tiny.c @@ -106,6 +106,22 @@ __device__ __forceinline__ static void hash( } +/** The sponge-based hash construction. **/ +__device__ __forceinline__ static void hashK( + uint8_t* out, + const uint8_t* in) { + + uint8_t a[Plen] = {0}; + + #pragma unroll + for (int i=0; i<10; i++) ((uint64_t *)a)[i] = ((uint64_t *)in)[i]; + // Apply P + P(a); + // Squeeze output. + #pragma unroll + for (int i=0; i<4; i++) ((uint64_t *)out)[i] = ((uint64_t *)a)[i]; +} + /** The sponge-based hash construction. **/ __device__ __forceinline__ static void hashB3( const uint8_t initP[Plen], diff --git a/plugins/cuda/kaspa-cuda-native/src/keccak.cuh b/plugins/cuda/kaspa-cuda-native/src/keccak.cuh new file mode 100644 index 0000000..a710b8f --- /dev/null +++ b/plugins/cuda/kaspa-cuda-native/src/keccak.cuh @@ -0,0 +1,260 @@ +#include "cuda_helper.h" + +__device__ __constant__ uint2 const keccak_round_constants[24] = { + { 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 }, + { 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 }, + { 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 }, + { 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 }, + { 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 }, + { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 } +}; + +DEV_INLINE uint2 xor5( + const uint2 a, const uint2 b, const uint2 c, const uint2 d, const uint2 e) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// xor5\n\t" + "lop3.b32 %0, %2, %3, %4, 0x96;\n\t" + "lop3.b32 %0, %0, %5, %6, 0x96;\n\t" + "lop3.b32 %1, %7, %8, %9, 0x96;\n\t" + "lop3.b32 %1, %1, %10, %11, 0x96;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x),"r"(d.x),"r"(e.x), + "r"(a.y), "r"(b.y), "r"(c.y),"r"(d.y),"r"(e.y)); + return result; +#else + return a ^ b ^ c ^ d ^ e; +#endif +} + +DEV_INLINE uint2 xor3(const uint2 a, const uint2 b, const uint2 c) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// xor3\n\t" + "lop3.b32 %0, %2, %3, %4, 0x96;\n\t" + "lop3.b32 %1, %5, %6, %7, 0x96;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x), "r"(a.y), "r"(b.y), "r"(c.y)); + return result; +#else + return a ^ b ^ c; 
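+    // lop3.b32 evaluates an arbitrary three-input boolean function; the
+    // immediate is its truth table over inputs weighted A=0xF0, B=0xCC,
+    // C=0xAA. Hence 0x96 = 0xF0 ^ 0xCC ^ 0xAA is a three-way XOR (xor5
+    // above simply chains two of them), and chi below uses
+    // 0xD2 = 0xF0 ^ (~0xCC & 0xAA) to fuse Keccak's a ^ (~b & c) into a
+    // single instruction per 32-bit half.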
+#endif +} + +DEV_INLINE uint2 chi(const uint2 a, const uint2 b, const uint2 c) +{ +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint2 result; + asm volatile ( + "// chi\n\t" + "lop3.b32 %0, %2, %3, %4, 0xD2;\n\t" + "lop3.b32 %1, %5, %6, %7, 0xD2;" + : "=r"(result.x), "=r"(result.y) + : "r"(a.x), "r"(b.x), "r"(c.x), // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + "r"(a.y), "r"(b.y), "r"(c.y)); // 0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + return result; +#else + return a ^ (~b) & c; +#endif +} + +#if (__CUDA_ARCH__ >= 320) +#define LDG(x) __ldg(&(x)) +#else +#define LDG(x) (x) +#endif + +DEV_INLINE void SHA3_512(uint2* s) +{ + uint2 t[5], u, v; + + for (uint32_t i = 8; i < 25; i++) + { + s[i] = make_uint2(0, 0); + } + s[8].x = 1; + s[8].y = 0x80000000; + + for (int i = 0; i < 23; i++) + { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; + s[5] ^= u; + s[10] ^= u; + s[15] ^= u; + s[20] ^= u; + + u = t[0] ^ ROL2(t[2], 1); + s[1] ^= u; + s[6] ^= u; + s[11] ^= u; + s[16] ^= u; + s[21] ^= u; + + u = t[1] ^ ROL2(t[3], 1); + s[2] ^= u; + s[7] ^= u; + s[12] ^= u; + s[17] ^= u; + s[22] ^= u; + + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; + s[8] ^= u; + s[13] ^= u; + s[18] ^= u; + s[23] ^= u; + + u = t[3] ^ ROL2(t[0], 1); + s[4] ^= u; + s[9] ^= u; + s[14] ^= u; + s[19] ^= u; + s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(u, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + u = s[0]; + v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + + u = s[5]; + v = s[6]; + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + s[8] = chi(s[8], s[9], u); + s[9] = chi(s[9], u, v); + + u = s[10]; + v = s[11]; + s[10] = chi(s[10], s[11], s[12]); + s[11] = chi(s[11], s[12], s[13]); + s[12] = chi(s[12], s[13], s[14]); + s[13] = chi(s[13], s[14], u); + s[14] = chi(s[14], u, v); + + u = s[15]; + v = s[16]; + s[15] = chi(s[15], s[16], s[17]); + s[16] = chi(s[16], s[17], s[18]); + s[17] = chi(s[17], s[18], s[19]); + s[18] = chi(s[18], s[19], u); + s[19] = chi(s[19], u, v); + + u = s[20]; + v = s[21]; + s[20] = chi(s[20], s[21], s[22]); + s[21] = chi(s[21], s[22], s[23]); + s[22] = chi(s[22], s[23], s[24]); + s[23] = chi(s[23], s[24], u); + s[24] = chi(s[24], u, v); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= LDG(keccak_round_constants[i]); + } + + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = xor5(s[0], s[5], s[10], s[15], s[20]); + t[1] = xor5(s[1], s[6], s[11], s[16], s[21]); + t[2] = xor5(s[2], s[7], s[12], s[17], s[22]); + t[3] = xor5(s[3], s[8], s[13], s[18], s[23]); + t[4] = xor5(s[4], s[9], s[14], s[19], s[24]); + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + + u = t[4] ^ ROL2(t[1], 1); + s[0] ^= u; + s[10] ^= u; + + u = t[0] ^ ROL2(t[2], 1); + s[6] ^= u; + s[16] ^= u; + + u = t[1] ^ ROL2(t[3], 1); + s[12] ^= u; + s[22] ^= u; + + u = t[2] ^ ROL2(t[4], 1); + s[3] ^= u; + s[18] ^= u; + + u = t[3] ^ ROL2(t[0], 1); + s[9] ^= u; + s[24] ^= u; + + /* rho pi: b[..] = rotl(a[..], ..) */ + u = s[1]; + + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[2] = ROL2(s[12], 43); + s[4] = ROL2(s[24], 14); + s[8] = ROL2(s[16], 45); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[7] = ROL2(s[10], 3); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + + u = s[0]; + v = s[1]; + s[0] = chi(s[0], s[1], s[2]); + s[1] = chi(s[1], s[2], s[3]); + s[2] = chi(s[2], s[3], s[4]); + s[3] = chi(s[3], s[4], u); + s[4] = chi(s[4], u, v); + s[5] = chi(s[5], s[6], s[7]); + s[6] = chi(s[6], s[7], s[8]); + s[7] = chi(s[7], s[8], s[9]); + + /* iota: a[0,0] ^= round constant */ + s[0] ^= LDG(keccak_round_constants[23]); +} \ No newline at end of file diff --git a/plugins/cuda/resources/kaspa-cuda-sm61.ptx b/plugins/cuda/resources/kaspa-cuda-sm61.ptx index 0ad05d0..d9710c6 100644 --- a/plugins/cuda/resources/kaspa-cuda-sm61.ptx +++ b/plugins/cuda/resources/kaspa-cuda-sm61.ptx @@ -10,7 +10,12 @@ .target sm_61 .address_size 64 - // .globl heavy_hash +.extern .func (.param .b32 func_retval0) vprintf +( + .param .b64 vprintf_param_0, + .param .b64 vprintf_param_1 +) +; .global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; .global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; .global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; @@ -18,7105 +23,41898 @@ .global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; .global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; .global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 
93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; .const .align 8 .b8 target[32]; .const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; .const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; +.global .align 1 .b8 $str[5] = {37, 48, 50, 120, 0}; +.global .align 1 .b8 $str$1[2] = {10, 0}; +.global .align 1 .b8 $str$2[32] = {104, 101, 97, 118, 121, 95, 104, 97, 115, 104, 32, 84, 104, 114, 101, 97, 100, 32, 37, 100, 44, 32, 66, 108, 111, 99, 107, 32, 37, 100, 10, 0}; +.global .align 1 .b8 $str$3[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 48, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$4[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 
101, 116, 91, 52, 50, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$5[25] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 50, 51, 52, 53, 93, 32, 105, 115, 32, 58, 32, 0}; -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 +.func (.param .b64 func_retval0) _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 ) { - .local .align 8 .b8 __local_depot0[1912]; + .local .align 16 .b8 __local_depot0[224]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6250>; - .reg .b64 %rd<466>; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<174>; mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd439, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 %rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd170, 
[_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd164, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd154, %rd73; + cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd152, %SPL, 16; + add.u64 %rd148, %SP, 96; + cvta.to.local.u64 %rd4, %rd148; + setp.lt.u64 %p1, %rd170, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd161, %SPL, 0; + setp.ne.s64 %p16, %rd170, 1024; + mov.u64 %rd158, 0; + mov.u64 %rd150, %rd158; + @%p16 bra $L__BB0_16; + + mov.u64 %rd170, 0; + st.local.u64 [%rd161], %rd69; + mov.u64 %rd150, 1; + mov.u64 %rd158, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd150, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd162, %rd150; + +$L__BB0_18: + ld.local.u64 %rd165, [%rd161]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd166, 16; + ld.local.u8 %r1084, [%rd2+16]; + ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 %r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, %r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd142, %rd164, 32; + cvt.u32.u64 %r3940, %rd142; + cvt.u32.u64 %r3939, %rd164; + setp.eq.s64 %p18, %rd166, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd165]; + ld.u8 %r1109, [%rd165+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd165+2]; + prmt.b32 
%r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd165+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd165+4]; + ld.u8 %r1116, [%rd165+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd165+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd165+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd165+8]; + ld.u8 %r1123, [%rd165+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd165+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 %r1127, [%rd165+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd165+12]; + ld.u8 %r1130, [%rd165+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd165+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd165+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd165+16]; + ld.u8 %r1137, [%rd165+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd165+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd165+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd165+20]; + ld.u8 %r1144, [%rd165+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd165+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd165+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd165+24]; + ld.u8 %r1151, [%rd165+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd165+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd165+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd165+28]; + ld.u8 %r1158, [%rd165+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd165+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd165+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd165+32]; + ld.u8 %r1165, [%rd165+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd165+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd165+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd165+36]; + ld.u8 %r1172, [%rd165+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd165+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 %r1176, [%rd165+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd165+40]; + ld.u8 %r1179, [%rd165+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd165+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd165+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd165+44]; + ld.u8 %r1186, [%rd165+45]; + prmt.b32 %r1187, %r1186, %r1185, 30212; + ld.u8 %r1188, [%rd165+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd165+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd165+48]; + ld.u8 %r1193, [%rd165+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd165+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 %r1197, [%rd165+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd165+52]; + ld.u8 %r1200, [%rd165+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd165+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd165+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd165+56]; + ld.u8 %r1207, [%rd165+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd165+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd165+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd165+60]; + ld.u8 %r1214, [%rd165+61]; + prmt.b32 %r1215, %r1214, 
%r1213, 30212; + ld.u8 %r1216, [%rd165+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd165+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, %r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, %r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, %r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + 
shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, %r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 %r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 %r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, 
%r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, %r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, %r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 
%r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, %r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 %r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, %r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + 
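// Descriptive note (added): the repeating add/xor/shf.l.wrap sequence here is BLAKE3's G mixing function; left-rotates by 16, 20, 24, 25 are right-rotates by 16, 12, 8, 7 on 32-bit words, unrolled by the compiler. +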
shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, %r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, %r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, 
%r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; + add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 %r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, %r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, %r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 
%r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, %r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; + add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 %r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + 
shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, %r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, %r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, %r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; + add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, 
%r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; + add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd165, %rd165, 64; + add.s64 %rd166, %rd166, -1; + setp.ne.s64 %p19, %rd166, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd154], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd154+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd154+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd154+3], %r2012; + st.local.u8 [%rd154+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd154+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd154+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd154+7], %r2015; + st.local.u8 [%rd154+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd154+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd154+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd154+11], %r2018; + st.local.u8 [%rd154+12], %r3945; + shr.u32 %r2019, %r3945, 8; + st.local.u8 [%rd154+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd154+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd154+15], %r2021; + st.local.u8 [%rd154+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd154+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd154+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 [%rd154+19], %r2024; + st.local.u8 [%rd154+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd154+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd154+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd154+23], %r2027; + st.local.u8 [%rd154+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd154+25], %r2028; + shr.u32 %r2029, %r3942, 16; + st.local.u8 [%rd154+26], %r2029; + shr.u32 %r2030, %r3942, 24; + st.local.u8 [%rd154+27], %r2030; + st.local.u8 [%rd154+28], %r3941; + shr.u32 %r2031, %r3941, 8; + st.local.u8 [%rd154+29], %r2031; + shr.u32 %r2032, %r3941, 16; + st.local.u8 [%rd154+30], %r2032; + shr.u32 %r2033, %r3941, 24; + st.local.u8 [%rd154+31], %r2033; + add.s64 %rd164, %rd164, 1; + add.s64 %rd161, %rd161, 8; + add.s64 %rd154, %rd154, 32; + add.s64 %rd162, %rd162, -1; + setp.ne.s64 %p20, %rd162, 0; + @%p20 bra $L__BB0_18; + +$L__BB0_21: + ld.param.u64 %rd138, 
[_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + setp.ge.u64 %p21, %rd158, %rd138; + @%p21 bra $L__BB0_30; + + ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd134, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + add.s64 %rd126, %rd150, %rd134; + ld.local.u8 %r2034, [%rd2]; + ld.local.u8 %r2035, [%rd2+1]; + prmt.b32 %r2036, %r2035, %r2034, 30212; + ld.local.u8 %r2037, [%rd2+2]; + ld.local.u8 %r2038, [%rd2+3]; + prmt.b32 %r2039, %r2038, %r2037, 30212; + prmt.b32 %r3964, %r2039, %r2036, 4180; + ld.local.u8 %r2040, [%rd2+4]; + ld.local.u8 %r2041, [%rd2+5]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd2+6]; + ld.local.u8 %r2044, [%rd2+7]; + prmt.b32 %r2045, %r2044, %r2043, 30212; + prmt.b32 %r3963, %r2045, %r2042, 4180; + ld.local.u8 %r2046, [%rd2+8]; + ld.local.u8 %r2047, [%rd2+9]; + prmt.b32 %r2048, %r2047, %r2046, 30212; + ld.local.u8 %r2049, [%rd2+10]; + ld.local.u8 %r2050, [%rd2+11]; + prmt.b32 %r2051, %r2050, %r2049, 30212; + prmt.b32 %r3962, %r2051, %r2048, 4180; + ld.local.u8 %r2052, [%rd2+12]; + ld.local.u8 %r2053, [%rd2+13]; + prmt.b32 %r2054, %r2053, %r2052, 30212; + ld.local.u8 %r2055, [%rd2+14]; + ld.local.u8 %r2056, [%rd2+15]; + prmt.b32 %r2057, %r2056, %r2055, 30212; + prmt.b32 %r3961, %r2057, %r2054, 4180; + ld.local.u8 %r2058, [%rd2+16]; + ld.local.u8 %r2059, [%rd2+17]; + prmt.b32 %r2060, %r2059, %r2058, 30212; + ld.local.u8 %r2061, [%rd2+18]; + ld.local.u8 %r2062, [%rd2+19]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + prmt.b32 %r3960, %r2063, %r2060, 4180; + ld.local.u8 %r2064, [%rd2+20]; + ld.local.u8 %r2065, [%rd2+21]; + prmt.b32 %r2066, %r2065, %r2064, 30212; + ld.local.u8 %r2067, [%rd2+22]; + ld.local.u8 %r2068, [%rd2+23]; + prmt.b32 %r2069, %r2068, %r2067, 30212; + prmt.b32 %r3959, %r2069, %r2066, 4180; + ld.local.u8 %r2070, [%rd2+24]; + ld.local.u8 %r2071, [%rd2+25]; + prmt.b32 %r2072, %r2071, %r2070, 30212; + ld.local.u8 %r2073, [%rd2+26]; + ld.local.u8 %r2074, [%rd2+27]; + prmt.b32 %r2075, %r2074, %r2073, 30212; + prmt.b32 %r3958, %r2075, %r2072, 4180; + ld.local.u8 %r2076, [%rd2+28]; + ld.local.u8 %r2077, [%rd2+29]; + prmt.b32 %r2078, %r2077, %r2076, 30212; + ld.local.u8 %r2079, [%rd2+30]; + ld.local.u8 %r2080, [%rd2+31]; + prmt.b32 %r2081, %r2080, %r2079, 30212; + prmt.b32 %r3957, %r2081, %r2078, 4180; + add.u64 %rd53, %SPL, 16; + mov.u32 %r2082, 0; + st.local.v2.u32 [%rd53], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+8], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+16], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+24], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+32], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+40], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+48], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+56], {%r2082, %r2082}; + mov.u16 %rs199, 0; + st.local.v2.u8 [%rd53+64], {%rs199, %rs199}; + st.local.u8 [%rd53+66], %rs75; + add.s64 %rd169, %rd139, %rd158; + cvt.u32.u64 %r36, %rd126; + shr.u64 %rd128, %rd126, 32; + cvt.u32.u64 %r37, %rd128; + setp.lt.u64 %p22, %rd170, 65; + @%p22 bra $L__BB0_25; + + add.s64 %rd56, %rd53, 64; + mov.u16 %rs198, 0; + +$L__BB0_24: + and.b16 %rs83, %rs198, 255; + setp.eq.s16 %p23, %rs83, 0; + selp.u16 %rs84, 1, 0, %p23; + or.b16 %rs85, %rs84, %rs75; + ld.u8 %r2083, [%rd169]; + ld.u8 %r2084, [%rd169+1]; + prmt.b32 %r2085, %r2084, %r2083, 30212; + ld.u8 %r2086, [%rd169+2]; + prmt.b32 %r2087, %r2086, %r2085, 28756; + 
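// Descriptive note (added): the ld.u8/prmt.b32 loads below pack the 64-byte input block into sixteen little-endian u32 message words (prmt selector immediates 30212, 28756, 1620 merge four bytes per word). +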
ld.u8 %r2088, [%rd169+3]; + prmt.b32 %r2089, %r2088, %r2087, 1620; + ld.u8 %r2090, [%rd169+4]; + ld.u8 %r2091, [%rd169+5]; + prmt.b32 %r2092, %r2091, %r2090, 30212; + ld.u8 %r2093, [%rd169+6]; + prmt.b32 %r2094, %r2093, %r2092, 28756; + ld.u8 %r2095, [%rd169+7]; + prmt.b32 %r2096, %r2095, %r2094, 1620; + ld.u8 %r2097, [%rd169+8]; + ld.u8 %r2098, [%rd169+9]; + prmt.b32 %r2099, %r2098, %r2097, 30212; + ld.u8 %r2100, [%rd169+10]; + prmt.b32 %r2101, %r2100, %r2099, 28756; + ld.u8 %r2102, [%rd169+11]; + prmt.b32 %r2103, %r2102, %r2101, 1620; + ld.u8 %r2104, [%rd169+12]; + ld.u8 %r2105, [%rd169+13]; + prmt.b32 %r2106, %r2105, %r2104, 30212; + ld.u8 %r2107, [%rd169+14]; + prmt.b32 %r2108, %r2107, %r2106, 28756; + ld.u8 %r2109, [%rd169+15]; + prmt.b32 %r2110, %r2109, %r2108, 1620; + ld.u8 %r2111, [%rd169+16]; + ld.u8 %r2112, [%rd169+17]; + prmt.b32 %r2113, %r2112, %r2111, 30212; + ld.u8 %r2114, [%rd169+18]; + prmt.b32 %r2115, %r2114, %r2113, 28756; + ld.u8 %r2116, [%rd169+19]; + prmt.b32 %r2117, %r2116, %r2115, 1620; + ld.u8 %r2118, [%rd169+20]; + ld.u8 %r2119, [%rd169+21]; + prmt.b32 %r2120, %r2119, %r2118, 30212; + ld.u8 %r2121, [%rd169+22]; + prmt.b32 %r2122, %r2121, %r2120, 28756; + ld.u8 %r2123, [%rd169+23]; + prmt.b32 %r2124, %r2123, %r2122, 1620; + ld.u8 %r2125, [%rd169+24]; + ld.u8 %r2126, [%rd169+25]; + prmt.b32 %r2127, %r2126, %r2125, 30212; + ld.u8 %r2128, [%rd169+26]; + prmt.b32 %r2129, %r2128, %r2127, 28756; + ld.u8 %r2130, [%rd169+27]; + prmt.b32 %r2131, %r2130, %r2129, 1620; + ld.u8 %r2132, [%rd169+28]; + ld.u8 %r2133, [%rd169+29]; + prmt.b32 %r2134, %r2133, %r2132, 30212; + ld.u8 %r2135, [%rd169+30]; + prmt.b32 %r2136, %r2135, %r2134, 28756; + ld.u8 %r2137, [%rd169+31]; + prmt.b32 %r2138, %r2137, %r2136, 1620; + ld.u8 %r2139, [%rd169+32]; + ld.u8 %r2140, [%rd169+33]; + prmt.b32 %r2141, %r2140, %r2139, 30212; + ld.u8 %r2142, [%rd169+34]; + prmt.b32 %r2143, %r2142, %r2141, 28756; + ld.u8 %r2144, [%rd169+35]; + prmt.b32 %r2145, %r2144, %r2143, 1620; + ld.u8 %r2146, [%rd169+36]; + ld.u8 %r2147, [%rd169+37]; + prmt.b32 %r2148, %r2147, %r2146, 30212; + ld.u8 %r2149, [%rd169+38]; + prmt.b32 %r2150, %r2149, %r2148, 28756; + ld.u8 %r2151, [%rd169+39]; + prmt.b32 %r2152, %r2151, %r2150, 1620; + ld.u8 %r2153, [%rd169+40]; + ld.u8 %r2154, [%rd169+41]; + prmt.b32 %r2155, %r2154, %r2153, 30212; + ld.u8 %r2156, [%rd169+42]; + prmt.b32 %r2157, %r2156, %r2155, 28756; + ld.u8 %r2158, [%rd169+43]; + prmt.b32 %r2159, %r2158, %r2157, 1620; + ld.u8 %r2160, [%rd169+44]; + ld.u8 %r2161, [%rd169+45]; + prmt.b32 %r2162, %r2161, %r2160, 30212; + ld.u8 %r2163, [%rd169+46]; + prmt.b32 %r2164, %r2163, %r2162, 28756; + ld.u8 %r2165, [%rd169+47]; + prmt.b32 %r2166, %r2165, %r2164, 1620; + ld.u8 %r2167, [%rd169+48]; + ld.u8 %r2168, [%rd169+49]; + prmt.b32 %r2169, %r2168, %r2167, 30212; + ld.u8 %r2170, [%rd169+50]; + prmt.b32 %r2171, %r2170, %r2169, 28756; + ld.u8 %r2172, [%rd169+51]; + prmt.b32 %r2173, %r2172, %r2171, 1620; + ld.u8 %r2174, [%rd169+52]; + ld.u8 %r2175, [%rd169+53]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.u8 %r2177, [%rd169+54]; + prmt.b32 %r2178, %r2177, %r2176, 28756; + ld.u8 %r2179, [%rd169+55]; + prmt.b32 %r2180, %r2179, %r2178, 1620; + ld.u8 %r2181, [%rd169+56]; + ld.u8 %r2182, [%rd169+57]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.u8 %r2184, [%rd169+58]; + prmt.b32 %r2185, %r2184, %r2183, 28756; + ld.u8 %r2186, [%rd169+59]; + prmt.b32 %r2187, %r2186, %r2185, 1620; + ld.u8 %r2188, [%rd169+60]; + ld.u8 %r2189, [%rd169+61]; + prmt.b32 %r2190, %r2189, %r2188, 30212; + ld.u8 %r2191, 
[%rd169+62]; + prmt.b32 %r2192, %r2191, %r2190, 28756; + ld.u8 %r2193, [%rd169+63]; + prmt.b32 %r2194, %r2193, %r2192, 1620; + cvt.u32.u16 %r2195, %rs85; + add.s32 %r2196, %r3964, %r2089; + add.s32 %r2197, %r2196, %r3960; + xor.b32 %r2198, %r2197, %r36; + shf.l.wrap.b32 %r2199, %r2198, %r2198, 16; + add.s32 %r2200, %r2199, 1779033703; + xor.b32 %r2201, %r2200, %r3960; + shf.l.wrap.b32 %r2202, %r2201, %r2201, 20; + add.s32 %r2203, %r2197, %r2096; + add.s32 %r2204, %r2203, %r2202; + xor.b32 %r2205, %r2204, %r2199; + shf.l.wrap.b32 %r2206, %r2205, %r2205, 24; + add.s32 %r2207, %r2206, %r2200; + xor.b32 %r2208, %r2207, %r2202; + shf.l.wrap.b32 %r2209, %r2208, %r2208, 25; + add.s32 %r2210, %r3963, %r2103; + add.s32 %r2211, %r2210, %r3959; + xor.b32 %r2212, %r2211, %r37; + shf.l.wrap.b32 %r2213, %r2212, %r2212, 16; + add.s32 %r2214, %r2213, -1150833019; + xor.b32 %r2215, %r2214, %r3959; + shf.l.wrap.b32 %r2216, %r2215, %r2215, 20; + add.s32 %r2217, %r2211, %r2110; + add.s32 %r2218, %r2217, %r2216; + xor.b32 %r2219, %r2218, %r2213; + shf.l.wrap.b32 %r2220, %r2219, %r2219, 24; + add.s32 %r2221, %r2220, %r2214; + xor.b32 %r2222, %r2221, %r2216; + shf.l.wrap.b32 %r2223, %r2222, %r2222, 25; + add.s32 %r2224, %r3962, %r2117; + add.s32 %r2225, %r2224, %r3958; + shr.u32 %r2226, %r2225, 16; + shl.b32 %r2227, %r2225, 16; + xor.b32 %r2228, %r2227, 4194304; + or.b32 %r2229, %r2228, %r2226; + add.s32 %r2230, %r2229, 1013904242; + xor.b32 %r2231, %r2230, %r3958; + shf.l.wrap.b32 %r2232, %r2231, %r2231, 20; + add.s32 %r2233, %r2225, %r2124; + add.s32 %r2234, %r2233, %r2232; + xor.b32 %r2235, %r2234, %r2229; + shf.l.wrap.b32 %r2236, %r2235, %r2235, 24; + add.s32 %r2237, %r2236, %r2230; + xor.b32 %r2238, %r2237, %r2232; + shf.l.wrap.b32 %r2239, %r2238, %r2238, 25; + add.s32 %r2240, %r3961, %r2131; + add.s32 %r2241, %r2240, %r3957; + xor.b32 %r2242, %r2241, %r2195; + shr.u32 %r2243, %r2241, 16; + shl.b32 %r2244, %r2242, 16; + or.b32 %r2245, %r2244, %r2243; + add.s32 %r2246, %r2245, -1521486534; + xor.b32 %r2247, %r2246, %r3957; + shf.l.wrap.b32 %r2248, %r2247, %r2247, 20; + add.s32 %r2249, %r2241, %r2138; + add.s32 %r2250, %r2249, %r2248; + xor.b32 %r2251, %r2250, %r2245; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 24; + add.s32 %r2253, %r2252, %r2246; + xor.b32 %r2254, %r2253, %r2248; + shf.l.wrap.b32 %r2255, %r2254, %r2254, 25; + add.s32 %r2256, %r2204, %r2145; + add.s32 %r2257, %r2256, %r2223; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 16; + add.s32 %r2260, %r2259, %r2237; + xor.b32 %r2261, %r2260, %r2223; + shf.l.wrap.b32 %r2262, %r2261, %r2261, 20; + add.s32 %r2263, %r2257, %r2152; + add.s32 %r2264, %r2263, %r2262; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 24; + add.s32 %r2267, %r2266, %r2260; + xor.b32 %r2268, %r2267, %r2262; + shf.l.wrap.b32 %r2269, %r2268, %r2268, 25; + add.s32 %r2270, %r2218, %r2159; + add.s32 %r2271, %r2270, %r2239; + xor.b32 %r2272, %r2271, %r2206; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 16; + add.s32 %r2274, %r2273, %r2253; + xor.b32 %r2275, %r2274, %r2239; + shf.l.wrap.b32 %r2276, %r2275, %r2275, 20; + add.s32 %r2277, %r2271, %r2166; + add.s32 %r2278, %r2277, %r2276; + xor.b32 %r2279, %r2278, %r2273; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 24; + add.s32 %r2281, %r2280, %r2274; + xor.b32 %r2282, %r2281, %r2276; + shf.l.wrap.b32 %r2283, %r2282, %r2282, 25; + add.s32 %r2284, %r2234, %r2173; + add.s32 %r2285, %r2284, %r2255; + xor.b32 %r2286, %r2285, %r2220; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 16; + add.s32 %r2288, 
%r2287, %r2207; + xor.b32 %r2289, %r2288, %r2255; + shf.l.wrap.b32 %r2290, %r2289, %r2289, 20; + add.s32 %r2291, %r2285, %r2180; + add.s32 %r2292, %r2291, %r2290; + xor.b32 %r2293, %r2292, %r2287; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 24; + add.s32 %r2295, %r2294, %r2288; + xor.b32 %r2296, %r2295, %r2290; + shf.l.wrap.b32 %r2297, %r2296, %r2296, 25; + add.s32 %r2298, %r2250, %r2187; + add.s32 %r2299, %r2298, %r2209; + xor.b32 %r2300, %r2299, %r2236; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 16; + add.s32 %r2302, %r2301, %r2221; + xor.b32 %r2303, %r2302, %r2209; + shf.l.wrap.b32 %r2304, %r2303, %r2303, 20; + add.s32 %r2305, %r2299, %r2194; + add.s32 %r2306, %r2305, %r2304; + xor.b32 %r2307, %r2306, %r2301; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 24; + add.s32 %r2309, %r2308, %r2302; + xor.b32 %r2310, %r2309, %r2304; + shf.l.wrap.b32 %r2311, %r2310, %r2310, 25; + add.s32 %r2312, %r2264, %r2103; + add.s32 %r2313, %r2312, %r2311; + xor.b32 %r2314, %r2313, %r2280; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 16; + add.s32 %r2316, %r2315, %r2295; + xor.b32 %r2317, %r2316, %r2311; + shf.l.wrap.b32 %r2318, %r2317, %r2317, 20; + add.s32 %r2319, %r2313, %r2131; + add.s32 %r2320, %r2319, %r2318; + xor.b32 %r2321, %r2320, %r2315; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 24; + add.s32 %r2323, %r2322, %r2316; + xor.b32 %r2324, %r2323, %r2318; + shf.l.wrap.b32 %r2325, %r2324, %r2324, 25; + add.s32 %r2326, %r2278, %r2110; + add.s32 %r2327, %r2326, %r2269; + xor.b32 %r2328, %r2327, %r2294; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 16; + add.s32 %r2330, %r2329, %r2309; + xor.b32 %r2331, %r2330, %r2269; + shf.l.wrap.b32 %r2332, %r2331, %r2331, 20; + add.s32 %r2333, %r2327, %r2159; + add.s32 %r2334, %r2333, %r2332; + xor.b32 %r2335, %r2334, %r2329; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 24; + add.s32 %r2337, %r2336, %r2330; + xor.b32 %r2338, %r2337, %r2332; + shf.l.wrap.b32 %r2339, %r2338, %r2338, 25; + add.s32 %r2340, %r2292, %r2138; + add.s32 %r2341, %r2340, %r2283; + xor.b32 %r2342, %r2341, %r2308; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 16; + add.s32 %r2344, %r2343, %r2267; + xor.b32 %r2345, %r2344, %r2283; + shf.l.wrap.b32 %r2346, %r2345, %r2345, 20; + add.s32 %r2347, %r2341, %r2089; + add.s32 %r2348, %r2347, %r2346; + xor.b32 %r2349, %r2348, %r2343; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 24; + add.s32 %r2351, %r2350, %r2344; + xor.b32 %r2352, %r2351, %r2346; + shf.l.wrap.b32 %r2353, %r2352, %r2352, 25; + add.s32 %r2354, %r2306, %r2117; + add.s32 %r2355, %r2354, %r2297; + xor.b32 %r2356, %r2355, %r2266; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 16; + add.s32 %r2358, %r2357, %r2281; + xor.b32 %r2359, %r2358, %r2297; + shf.l.wrap.b32 %r2360, %r2359, %r2359, 20; + add.s32 %r2361, %r2355, %r2180; + add.s32 %r2362, %r2361, %r2360; + xor.b32 %r2363, %r2362, %r2357; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 24; + add.s32 %r2365, %r2364, %r2358; + xor.b32 %r2366, %r2365, %r2360; + shf.l.wrap.b32 %r2367, %r2366, %r2366, 25; + add.s32 %r2368, %r2320, %r2096; + add.s32 %r2369, %r2368, %r2339; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 16; + add.s32 %r2372, %r2371, %r2351; + xor.b32 %r2373, %r2372, %r2339; + shf.l.wrap.b32 %r2374, %r2373, %r2373, 20; + add.s32 %r2375, %r2369, %r2166; + add.s32 %r2376, %r2375, %r2374; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 24; + add.s32 %r2379, %r2378, %r2372; + xor.b32 %r2380, %r2379, %r2374; + shf.l.wrap.b32 %r2381, %r2380, %r2380, 25; + add.s32 %r2382, %r2334, %r2173; + add.s32 %r2383, %r2382, %r2353; + xor.b32 
%r2384, %r2383, %r2322; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 16; + add.s32 %r2386, %r2385, %r2365; + xor.b32 %r2387, %r2386, %r2353; + shf.l.wrap.b32 %r2388, %r2387, %r2387, 20; + add.s32 %r2389, %r2383, %r2124; + add.s32 %r2390, %r2389, %r2388; + xor.b32 %r2391, %r2390, %r2385; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 24; + add.s32 %r2393, %r2392, %r2386; + xor.b32 %r2394, %r2393, %r2388; + shf.l.wrap.b32 %r2395, %r2394, %r2394, 25; + add.s32 %r2396, %r2348, %r2152; + add.s32 %r2397, %r2396, %r2367; + xor.b32 %r2398, %r2397, %r2336; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 16; + add.s32 %r2400, %r2399, %r2323; + xor.b32 %r2401, %r2400, %r2367; + shf.l.wrap.b32 %r2402, %r2401, %r2401, 20; + add.s32 %r2403, %r2397, %r2187; + add.s32 %r2404, %r2403, %r2402; + xor.b32 %r2405, %r2404, %r2399; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 24; + add.s32 %r2407, %r2406, %r2400; + xor.b32 %r2408, %r2407, %r2402; + shf.l.wrap.b32 %r2409, %r2408, %r2408, 25; + add.s32 %r2410, %r2362, %r2194; + add.s32 %r2411, %r2410, %r2325; + xor.b32 %r2412, %r2411, %r2350; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 16; + add.s32 %r2414, %r2413, %r2337; + xor.b32 %r2415, %r2414, %r2325; + shf.l.wrap.b32 %r2416, %r2415, %r2415, 20; + add.s32 %r2417, %r2411, %r2145; + add.s32 %r2418, %r2417, %r2416; + xor.b32 %r2419, %r2418, %r2413; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 24; + add.s32 %r2421, %r2420, %r2414; + xor.b32 %r2422, %r2421, %r2416; + shf.l.wrap.b32 %r2423, %r2422, %r2422, 25; + add.s32 %r2424, %r2376, %r2110; + add.s32 %r2425, %r2424, %r2423; + xor.b32 %r2426, %r2425, %r2392; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 16; + add.s32 %r2428, %r2427, %r2407; + xor.b32 %r2429, %r2428, %r2423; + shf.l.wrap.b32 %r2430, %r2429, %r2429, 20; + add.s32 %r2431, %r2425, %r2117; + add.s32 %r2432, %r2431, %r2430; + xor.b32 %r2433, %r2432, %r2427; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 24; + add.s32 %r2435, %r2434, %r2428; + xor.b32 %r2436, %r2435, %r2430; + shf.l.wrap.b32 %r2437, %r2436, %r2436, 25; + add.s32 %r2438, %r2390, %r2159; + add.s32 %r2439, %r2438, %r2381; + xor.b32 %r2440, %r2439, %r2406; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 16; + add.s32 %r2442, %r2441, %r2421; + xor.b32 %r2443, %r2442, %r2381; + shf.l.wrap.b32 %r2444, %r2443, %r2443, 20; + add.s32 %r2445, %r2439, %r2173; + add.s32 %r2446, %r2445, %r2444; + xor.b32 %r2447, %r2446, %r2441; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 24; + add.s32 %r2449, %r2448, %r2442; + xor.b32 %r2450, %r2449, %r2444; + shf.l.wrap.b32 %r2451, %r2450, %r2450, 25; + add.s32 %r2452, %r2404, %r2180; + add.s32 %r2453, %r2452, %r2395; + xor.b32 %r2454, %r2453, %r2420; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 16; + add.s32 %r2456, %r2455, %r2379; + xor.b32 %r2457, %r2456, %r2395; + shf.l.wrap.b32 %r2458, %r2457, %r2457, 20; + add.s32 %r2459, %r2453, %r2103; + add.s32 %r2460, %r2459, %r2458; + xor.b32 %r2461, %r2460, %r2455; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 24; + add.s32 %r2463, %r2462, %r2456; + xor.b32 %r2464, %r2463, %r2458; + shf.l.wrap.b32 %r2465, %r2464, %r2464, 25; + add.s32 %r2466, %r2418, %r2138; + add.s32 %r2467, %r2466, %r2409; + xor.b32 %r2468, %r2467, %r2378; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 16; + add.s32 %r2470, %r2469, %r2393; + xor.b32 %r2471, %r2470, %r2409; + shf.l.wrap.b32 %r2472, %r2471, %r2471, 20; + add.s32 %r2473, %r2467, %r2187; + add.s32 %r2474, %r2473, %r2472; + xor.b32 %r2475, %r2474, %r2469; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 24; + add.s32 %r2477, %r2476, %r2470; + xor.b32 %r2478, %r2477, %r2472; + shf.l.wrap.b32 %r2479, %r2478, 
%r2478, 25; + add.s32 %r2480, %r2432, %r2131; + add.s32 %r2481, %r2480, %r2451; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 16; + add.s32 %r2484, %r2483, %r2463; + xor.b32 %r2485, %r2484, %r2451; + shf.l.wrap.b32 %r2486, %r2485, %r2485, 20; + add.s32 %r2487, %r2481, %r2124; + add.s32 %r2488, %r2487, %r2486; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 24; + add.s32 %r2491, %r2490, %r2484; + xor.b32 %r2492, %r2491, %r2486; + shf.l.wrap.b32 %r2493, %r2492, %r2492, 25; + add.s32 %r2494, %r2446, %r2152; + add.s32 %r2495, %r2494, %r2465; + xor.b32 %r2496, %r2495, %r2434; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 16; + add.s32 %r2498, %r2497, %r2477; + xor.b32 %r2499, %r2498, %r2465; + shf.l.wrap.b32 %r2500, %r2499, %r2499, 20; + add.s32 %r2501, %r2495, %r2089; + add.s32 %r2502, %r2501, %r2500; + xor.b32 %r2503, %r2502, %r2497; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 24; + add.s32 %r2505, %r2504, %r2498; + xor.b32 %r2506, %r2505, %r2500; + shf.l.wrap.b32 %r2507, %r2506, %r2506, 25; + add.s32 %r2508, %r2460, %r2166; + add.s32 %r2509, %r2508, %r2479; + xor.b32 %r2510, %r2509, %r2448; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 16; + add.s32 %r2512, %r2511, %r2435; + xor.b32 %r2513, %r2512, %r2479; + shf.l.wrap.b32 %r2514, %r2513, %r2513, 20; + add.s32 %r2515, %r2509, %r2194; + add.s32 %r2516, %r2515, %r2514; + xor.b32 %r2517, %r2516, %r2511; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 24; + add.s32 %r2519, %r2518, %r2512; + xor.b32 %r2520, %r2519, %r2514; + shf.l.wrap.b32 %r2521, %r2520, %r2520, 25; + add.s32 %r2522, %r2474, %r2145; + add.s32 %r2523, %r2522, %r2437; + xor.b32 %r2524, %r2523, %r2462; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 16; + add.s32 %r2526, %r2525, %r2449; + xor.b32 %r2527, %r2526, %r2437; + shf.l.wrap.b32 %r2528, %r2527, %r2527, 20; + add.s32 %r2529, %r2523, %r2096; + add.s32 %r2530, %r2529, %r2528; + xor.b32 %r2531, %r2530, %r2525; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 24; + add.s32 %r2533, %r2532, %r2526; + xor.b32 %r2534, %r2533, %r2528; + shf.l.wrap.b32 %r2535, %r2534, %r2534, 25; + add.s32 %r2536, %r2488, %r2159; + add.s32 %r2537, %r2536, %r2535; + xor.b32 %r2538, %r2537, %r2504; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 16; + add.s32 %r2540, %r2539, %r2519; + xor.b32 %r2541, %r2540, %r2535; + shf.l.wrap.b32 %r2542, %r2541, %r2541, 20; + add.s32 %r2543, %r2537, %r2138; + add.s32 %r2544, %r2543, %r2542; + xor.b32 %r2545, %r2544, %r2539; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 24; + add.s32 %r2547, %r2546, %r2540; + xor.b32 %r2548, %r2547, %r2542; + shf.l.wrap.b32 %r2549, %r2548, %r2548, 25; + add.s32 %r2550, %r2502, %r2173; + add.s32 %r2551, %r2550, %r2493; + xor.b32 %r2552, %r2551, %r2518; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 16; + add.s32 %r2554, %r2553, %r2533; + xor.b32 %r2555, %r2554, %r2493; + shf.l.wrap.b32 %r2556, %r2555, %r2555, 20; + add.s32 %r2557, %r2551, %r2152; + add.s32 %r2558, %r2557, %r2556; + xor.b32 %r2559, %r2558, %r2553; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 24; + add.s32 %r2561, %r2560, %r2554; + xor.b32 %r2562, %r2561, %r2556; + shf.l.wrap.b32 %r2563, %r2562, %r2562, 25; + add.s32 %r2564, %r2516, %r2187; + add.s32 %r2565, %r2564, %r2507; + xor.b32 %r2566, %r2565, %r2532; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 16; + add.s32 %r2568, %r2567, %r2491; + xor.b32 %r2569, %r2568, %r2507; + shf.l.wrap.b32 %r2570, %r2569, %r2569, 20; + add.s32 %r2571, %r2565, %r2110; + add.s32 %r2572, %r2571, %r2570; + xor.b32 %r2573, %r2572, %r2567; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 24; + add.s32 
%r2575, %r2574, %r2568; + xor.b32 %r2576, %r2575, %r2570; + shf.l.wrap.b32 %r2577, %r2576, %r2576, 25; + add.s32 %r2578, %r2530, %r2180; + add.s32 %r2579, %r2578, %r2521; + xor.b32 %r2580, %r2579, %r2490; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 16; + add.s32 %r2582, %r2581, %r2505; + xor.b32 %r2583, %r2582, %r2521; + shf.l.wrap.b32 %r2584, %r2583, %r2583, 20; + add.s32 %r2585, %r2579, %r2194; + add.s32 %r2586, %r2585, %r2584; + xor.b32 %r2587, %r2586, %r2581; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 24; + add.s32 %r2589, %r2588, %r2582; + xor.b32 %r2590, %r2589, %r2584; + shf.l.wrap.b32 %r2591, %r2590, %r2590, 25; + add.s32 %r2592, %r2544, %r2117; + add.s32 %r2593, %r2592, %r2563; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 16; + add.s32 %r2596, %r2595, %r2575; + xor.b32 %r2597, %r2596, %r2563; + shf.l.wrap.b32 %r2598, %r2597, %r2597, 20; + add.s32 %r2599, %r2593, %r2089; + add.s32 %r2600, %r2599, %r2598; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 24; + add.s32 %r2603, %r2602, %r2596; + xor.b32 %r2604, %r2603, %r2598; + shf.l.wrap.b32 %r2605, %r2604, %r2604, 25; + add.s32 %r2606, %r2558, %r2166; + add.s32 %r2607, %r2606, %r2577; + xor.b32 %r2608, %r2607, %r2546; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 16; + add.s32 %r2610, %r2609, %r2589; + xor.b32 %r2611, %r2610, %r2577; + shf.l.wrap.b32 %r2612, %r2611, %r2611, 20; + add.s32 %r2613, %r2607, %r2103; + add.s32 %r2614, %r2613, %r2612; + xor.b32 %r2615, %r2614, %r2609; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 24; + add.s32 %r2617, %r2616, %r2610; + xor.b32 %r2618, %r2617, %r2612; + shf.l.wrap.b32 %r2619, %r2618, %r2618, 25; + add.s32 %r2620, %r2572, %r2124; + add.s32 %r2621, %r2620, %r2591; + xor.b32 %r2622, %r2621, %r2560; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 16; + add.s32 %r2624, %r2623, %r2547; + xor.b32 %r2625, %r2624, %r2591; + shf.l.wrap.b32 %r2626, %r2625, %r2625, 20; + add.s32 %r2627, %r2621, %r2145; + add.s32 %r2628, %r2627, %r2626; + xor.b32 %r2629, %r2628, %r2623; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 24; + add.s32 %r2631, %r2630, %r2624; + xor.b32 %r2632, %r2631, %r2626; + shf.l.wrap.b32 %r2633, %r2632, %r2632, 25; + add.s32 %r2634, %r2586, %r2096; + add.s32 %r2635, %r2634, %r2549; + xor.b32 %r2636, %r2635, %r2574; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 16; + add.s32 %r2638, %r2637, %r2561; + xor.b32 %r2639, %r2638, %r2549; + shf.l.wrap.b32 %r2640, %r2639, %r2639, 20; + add.s32 %r2641, %r2635, %r2131; + add.s32 %r2642, %r2641, %r2640; + xor.b32 %r2643, %r2642, %r2637; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 24; + add.s32 %r2645, %r2644, %r2638; + xor.b32 %r2646, %r2645, %r2640; + shf.l.wrap.b32 %r2647, %r2646, %r2646, 25; + add.s32 %r2648, %r2600, %r2173; + add.s32 %r2649, %r2648, %r2647; + xor.b32 %r2650, %r2649, %r2616; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 16; + add.s32 %r2652, %r2651, %r2631; + xor.b32 %r2653, %r2652, %r2647; + shf.l.wrap.b32 %r2654, %r2653, %r2653, 20; + add.s32 %r2655, %r2649, %r2180; + add.s32 %r2656, %r2655, %r2654; + xor.b32 %r2657, %r2656, %r2651; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 24; + add.s32 %r2659, %r2658, %r2652; + xor.b32 %r2660, %r2659, %r2654; + shf.l.wrap.b32 %r2661, %r2660, %r2660, 25; + add.s32 %r2662, %r2614, %r2152; + add.s32 %r2663, %r2662, %r2605; + xor.b32 %r2664, %r2663, %r2630; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 16; + add.s32 %r2666, %r2665, %r2645; + xor.b32 %r2667, %r2666, %r2605; + shf.l.wrap.b32 %r2668, %r2667, %r2667, 20; + add.s32 %r2669, %r2663, %r2166; + add.s32 %r2670, %r2669, %r2668; + 
xor.b32 %r2671, %r2670, %r2665; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 24; + add.s32 %r2673, %r2672, %r2666; + xor.b32 %r2674, %r2673, %r2668; + shf.l.wrap.b32 %r2675, %r2674, %r2674, 25; + add.s32 %r2676, %r2628, %r2194; + add.s32 %r2677, %r2676, %r2619; + xor.b32 %r2678, %r2677, %r2644; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 16; + add.s32 %r2680, %r2679, %r2603; + xor.b32 %r2681, %r2680, %r2619; + shf.l.wrap.b32 %r2682, %r2681, %r2681, 20; + add.s32 %r2683, %r2677, %r2159; + add.s32 %r2684, %r2683, %r2682; + xor.b32 %r2685, %r2684, %r2679; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 24; + add.s32 %r2687, %r2686, %r2680; + xor.b32 %r2688, %r2687, %r2682; + shf.l.wrap.b32 %r2689, %r2688, %r2688, 25; + add.s32 %r2690, %r2642, %r2187; + add.s32 %r2691, %r2690, %r2633; + xor.b32 %r2692, %r2691, %r2602; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 16; + add.s32 %r2694, %r2693, %r2617; + xor.b32 %r2695, %r2694, %r2633; + shf.l.wrap.b32 %r2696, %r2695, %r2695, 20; + add.s32 %r2697, %r2691, %r2145; + add.s32 %r2698, %r2697, %r2696; + xor.b32 %r2699, %r2698, %r2693; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 24; + add.s32 %r2701, %r2700, %r2694; + xor.b32 %r2702, %r2701, %r2696; + shf.l.wrap.b32 %r2703, %r2702, %r2702, 25; + add.s32 %r2704, %r2656, %r2138; + add.s32 %r2705, %r2704, %r2675; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 16; + add.s32 %r2708, %r2707, %r2687; + xor.b32 %r2709, %r2708, %r2675; + shf.l.wrap.b32 %r2710, %r2709, %r2709, 20; + add.s32 %r2711, %r2705, %r2103; + add.s32 %r2712, %r2711, %r2710; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 24; + add.s32 %r2715, %r2714, %r2708; + xor.b32 %r2716, %r2715, %r2710; + shf.l.wrap.b32 %r2717, %r2716, %r2716, 25; + add.s32 %r2718, %r2670, %r2124; + add.s32 %r2719, %r2718, %r2689; + xor.b32 %r2720, %r2719, %r2658; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 16; + add.s32 %r2722, %r2721, %r2701; + xor.b32 %r2723, %r2722, %r2689; + shf.l.wrap.b32 %r2724, %r2723, %r2723, 20; + add.s32 %r2725, %r2719, %r2110; + add.s32 %r2726, %r2725, %r2724; + xor.b32 %r2727, %r2726, %r2721; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 24; + add.s32 %r2729, %r2728, %r2722; + xor.b32 %r2730, %r2729, %r2724; + shf.l.wrap.b32 %r2731, %r2730, %r2730, 25; + add.s32 %r2732, %r2684, %r2089; + add.s32 %r2733, %r2732, %r2703; + xor.b32 %r2734, %r2733, %r2672; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 16; + add.s32 %r2736, %r2735, %r2659; + xor.b32 %r2737, %r2736, %r2703; + shf.l.wrap.b32 %r2738, %r2737, %r2737, 20; + add.s32 %r2739, %r2733, %r2096; + add.s32 %r2740, %r2739, %r2738; + xor.b32 %r2741, %r2740, %r2735; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 24; + add.s32 %r2743, %r2742, %r2736; + xor.b32 %r2744, %r2743, %r2738; + shf.l.wrap.b32 %r2745, %r2744, %r2744, 25; + add.s32 %r2746, %r2698, %r2131; + add.s32 %r2747, %r2746, %r2661; + xor.b32 %r2748, %r2747, %r2686; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 16; + add.s32 %r2750, %r2749, %r2673; + xor.b32 %r2751, %r2750, %r2661; + shf.l.wrap.b32 %r2752, %r2751, %r2751, 20; + add.s32 %r2753, %r2747, %r2117; + add.s32 %r2754, %r2753, %r2752; + xor.b32 %r2755, %r2754, %r2749; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 24; + add.s32 %r2757, %r2756, %r2750; + xor.b32 %r2758, %r2757, %r2752; + shf.l.wrap.b32 %r2759, %r2758, %r2758, 25; + add.s32 %r2760, %r2712, %r2152; + add.s32 %r2761, %r2760, %r2759; + xor.b32 %r2762, %r2761, %r2728; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 16; + add.s32 %r2764, %r2763, %r2743; + xor.b32 %r2765, %r2764, %r2759; + shf.l.wrap.b32 %r2766, 
%r2765, %r2765, 20;
+ // [... elided: remainder of the fully unrolled BLAKE3 rounds for this 64-byte
+ // block -- long runs of add.s32 / xor.b32 / shf.l.wrap.b32 in which the
+ // left-rotate amounts 16, 20, 24, 25 are BLAKE3's right-rotates 16, 12, 8, 7;
+ // the block ends with eight feed-forward xor.b32 into %r3957-%r3964 (the new
+ // chaining value), the blocks-compressed byte bump "add.s16 %rs198, %rs198, 1"
+ // with its st.local.u8 store, the pointer/length updates on %rd169/%rd170,
+ // and the loop back-edge "@%p24 bra $L__BB0_24" ...]
+
+$L__BB0_25:
+ // [... elided: tail-block handling -- min.u64 of the remaining length and 64,
+ // zero-fill of the 64-byte local block buffer (%rs200-%rs232), the $L__BB0_27
+ // byte-copy loop, and ld.local.v4.u16 reloads of the buffered block ...]
+
+$L__BB0_29:
+ // [... elided: reload of the output-pointer kernel parameter of
+ // blake3_compress_subtree_wide and the block-length update
+ // "add.s16 %rs163, %rs156, %rs162" with its st.local.u8 store ...]
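For orientation: the instruction runs elided above are BLAKE3's quarter-round and round functions, fully unrolled by the compiler. A minimal CUDA sketch assuming the standard BLAKE3 definitions (not code taken from this patch); rotr32 by 16/12/8/7 is what nvcc emits as shf.l.wrap.b32 by 16/20/24/25:

#include <cstdint>

// Right-rotate; appears in the PTX as shf.l.wrap.b32 by (32 - n).
__device__ __forceinline__ uint32_t rotr32(uint32_t x, uint32_t n) {
    return (x >> n) | (x << (32u - n));
}

// BLAKE3 quarter-round: one of these per add/xor/rotate group above.
__device__ __forceinline__ void g(uint32_t s[16], int a, int b, int c, int d,
                                  uint32_t mx, uint32_t my) {
    s[a] = s[a] + s[b] + mx;  s[d] = rotr32(s[d] ^ s[a], 16);
    s[c] = s[c] + s[d];       s[b] = rotr32(s[b] ^ s[c], 12);
    s[a] = s[a] + s[b] + my;  s[d] = rotr32(s[d] ^ s[a], 8);
    s[c] = s[c] + s[d];       s[b] = rotr32(s[b] ^ s[c], 7);
}

// One full round: four column then four diagonal quarter-rounds. The PTX
// unrolls seven of these per block, re-permuting the message words m[]
// between rounds per BLAKE3's fixed schedule.
__device__ void round_fn(uint32_t s[16], const uint32_t m[16]) {
    g(s, 0, 4,  8, 12, m[0],  m[1]);
    g(s, 1, 5,  9, 13, m[2],  m[3]);
    g(s, 2, 6, 10, 14, m[4],  m[5]);
    g(s, 3, 7, 11, 15, m[6],  m[7]);
    g(s, 0, 5, 10, 15, m[8],  m[9]);
    g(s, 1, 6, 11, 12, m[10], m[11]);
    g(s, 2, 7,  8, 13, m[12], m[13]);
    g(s, 3, 4,  9, 14, m[14], m[15]);
}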
+ // [... elided: chunk-flag assembly -- "selp.u16 %rs164, 1, 0, %p27" sets flag
+ // bit 1 (CHUNK_START) when %rs157 is zero and "or.b16 %rs166, %rs165, 2" adds
+ // bit 2 (CHUNK_END); the output slot is %rd133 = %rd136 + 32 * %rd150; then
+ // shr.u16 byte splits plus cvt.u32.u16 / prmt.b32 chains pack the 64-byte
+ // block into sixteen little-endian message words %r2991-%r3114, and the first
+ // round's column quarter-rounds run against the IV constants 1779033703,
+ // -1150833019, 1013904242, -1521486534, xor-ing in the counter words
+ // (%r36/%r37), the block length (%r3116) and the flags (%r3118) ...]
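The cvt/prmt selector constants 30212, 28756 and 1620 recur throughout this file; they are nvcc's byte-permute lowering of ordinary little-endian word assembly. A minimal equivalent sketch (assumed, not from the patch):

#include <cstdint>

// Little-endian word assembly; nvcc lowers four byte loads plus this OR
// chain into the ld.u8 / prmt.b32 (selectors 30212, 28756, 1620) groups
// that fill the message-word registers above.
__device__ __forceinline__ uint32_t load32_le(const uint8_t *p) {
    return (uint32_t)p[0]
         | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16)
         | ((uint32_t)p[3] << 24);
}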
+ // [... elided: rounds 2 through 7 of the same compression, fully unrolled
+ // (each round is eight add/xor/shf.l.wrap quarter-round groups with the
+ // message-word registers re-permuted between rounds), followed by the
+ // feed-forward xor.b32 of the two state halves into %r3907-%r3914 and
+ // byte-wise st.local.u8 stores of that 32-byte chaining value to
+ // [%rd133]..[%rd133+31] ...]
+ add.s64 %rd150, %rd150, 1;
+ bra.uni $L__BB0_30;
+
+$L__BB0_1:
+ // [... elided: subtree split -- %rd76 = len - 1, "shr.u64 %rd77, %rd76, 10"
+ // counts full 1024-byte chunks, "or.b64 %rd78, %rd77, 1" feeds a branchless
+ // setp/selp floor-log2 ladder, and "shl.b64 %rd95, %rd94, %r72" (with
+ // %rd94 = 1024) yields the left-subtree byte length %rd95; pointers, lengths
+ // and the chunk counter are advanced, and
+ // blake3_compress_subtree_wide is invoked recursively on the left half
+ // (callseq 0) and then on the right half (callseq 1) ...]
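Assuming the $L__BB0_1 arithmetic mirrors the reference BLAKE3 left-subtree rule (the largest power-of-two number of full chunks, keeping the right side non-empty), it is equivalent to this sketch:

#include <cstdint>

// Bytes handed to the left recursive call of blake3_compress_subtree_wide.
// The setp/selp ladder above is a branchless floor-log2 of (full_chunks | 1).
__device__ uint64_t left_subtree_len(uint64_t input_len) {
    uint64_t full_chunks = (input_len - 1) >> 10;   // shr.u64 %rd77, %rd76, 10
    int log2_floor = 63 - __clzll((long long)(full_chunks | 1));
    return 1024ull << log2_floor;                   // shl.b64 %rd95, %rd94, %r72
}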
+ // [... elided: parameter marshalling for callseq 1 (the right-half recursive
+ // call) and collection of its return value %rd6 ...]
+ setp.eq.s64 %p8, %rd5, 1;
+ @%p8 bra $L__BB0_12;
+ bra.uni $L__BB0_2;
+
+$L__BB0_12:
+ // [... elided: single-CV case -- the $L__BB0_13 loop copies 64 bytes into the
+ // caller's buffer and sets the return count %rd150 = 2 ...]
+
+$L__BB0_2:
+ // [... elided: %rd7 = %rd6 + %rd5 total CVs; when two or more were produced,
+ // the $L__BB0_4 loop builds a local array of pointers to consecutive 64-byte
+ // CV pairs for the parent pass ...]
+
$L__BB0_5:
- // [... elided (removed lines): the old inline hashing path -- ld.global.u64 /
- // xor.b64 producing %rd439, and.b64/or.b64 masking with %rd78/%rd79,
- // byte-unpacking of hash_header from constant memory, initialization of the
- // local BLAKE3 state with the IV words (mov.u32 1779033703, -1150833019,
- // 1013904242, -1521486534, 1359893119, -1694144372, 528734635, 1541459225),
- // and its own cvt.u32.u16 / prmt.b32 word packing ...]
+ setp.eq.s64 %p11, %rd150, 0;
+ @%p11 bra $L__BB0_8;
+
+ or.b16 %rs76, %rs75, 4;
+ cvt.u32.u16 %r1, %rs76;
+ mov.u64 %rd153, %rd150;
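The signed decimals in those removed mov.u32 lines are the eight BLAKE3 IV words (shared with SHA-256). As a reference sketch (the array name is hypothetical, not from this repository):

#include <cstdint>

// BLAKE3 IV words as they appear, sign-extended, in the removed mov.u32 lines:
//   1779033703 = 0x6A09E667   -1150833019 = 0xBB67AE85
//   1013904242 = 0x3C6EF372   -1521486534 = 0xA54FF53A
//   1359893119 = 0x510E527F   -1694144372 = 0x9B05688C
//    528734635 = 0x1F83D9AB    1541459225 = 0x5BE0CD19
__constant__ uint32_t BLAKE3_IV[8] = {
    0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
    0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u,
};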
+
+$L__BB0_7:
+ ld.local.u64 %rd108, [%rd152];
+ // [... elided: sixteen ld.u8 + prmt.b32 groups packing the 64-byte child-CV
+ // pair at %rd108 into message words %r80-%r185 (the interleaved "-" lines
+ // show the old cvt.u32.u16 sources these byte loads replace), then
+ // ld.local.u8 + prmt.b32 4180 pairs assembling the eight key words from
+ // [%rd2], and the first round's column quarter-rounds against the IV
+ // constants, with the parent block length 64 folded in as
+ // "xor.b32 %r258, %r257, 4194304" (64 << 16 inside a fused rotate) and the
+ // flags word %r1 xor-ed into the fourth column ...]
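The immediates 1, 2 and 4 or-ed into the flag byte across these hunks line up with BLAKE3's domain-separation flags. A sketch (ROOT is inferred from the spec; it does not appear in this excerpt):

#include <cstdint>

// BLAKE3 flag bits matching the or.b16 immediates in this diff: bit 1
// (CHUNK_START) is selected when no block of the chunk has been compressed
// yet, bit 2 (CHUNK_END) is or-ed onto the last block, and bit 4 (PARENT)
// is set by "or.b16 %rs76, %rs75, 4" just before the $L__BB0_7 loop.
enum blake3_flags : uint32_t {
    CHUNK_START = 1u << 0,
    CHUNK_END   = 1u << 1,
    PARENT      = 1u << 2,
    ROOT        = 1u << 3,
};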
+ // [... elided: the remaining unrolled quarter-rounds of the parent
+ // compression -- the same add.s32 / xor.b32 / shf.l.wrap.b32 pattern,
+ // continuing below ...]
+ shf.l.wrap.b32 %r572, %r571, %r571,
20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, %r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + 
shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 %r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + 
xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, %r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + 
add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, %r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 
%r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 %r1032, %r1027, %r996; + xor.b32 %r1033, %r985, %r1010; + xor.b32 %r1034, %r1024, %r999; + xor.b32 %r1035, %r1013, %r982; + st.local.u8 [%rd154], %r1028; + shr.u32 %r1036, %r1028, 8; + st.local.u8 [%rd154+1], %r1036; + shr.u32 %r1037, %r1028, 16; + st.local.u8 [%rd154+2], %r1037; + shr.u32 %r1038, %r1028, 24; + st.local.u8 [%rd154+3], %r1038; + st.local.u8 [%rd154+4], %r1029; + shr.u32 %r1039, %r1029, 8; + st.local.u8 [%rd154+5], %r1039; + shr.u32 %r1040, %r1029, 16; + st.local.u8 [%rd154+6], %r1040; + shr.u32 %r1041, %r1029, 24; + st.local.u8 [%rd154+7], %r1041; + st.local.u8 [%rd154+8], %r1030; + shr.u32 %r1042, %r1030, 8; + st.local.u8 [%rd154+9], %r1042; + shr.u32 %r1043, %r1030, 16; + st.local.u8 [%rd154+10], %r1043; + shr.u32 %r1044, %r1030, 24; + st.local.u8 [%rd154+11], %r1044; + st.local.u8 [%rd154+12], %r1031; + shr.u32 %r1045, %r1031, 8; + st.local.u8 [%rd154+13], %r1045; + shr.u32 %r1046, %r1031, 16; + st.local.u8 [%rd154+14], %r1046; + shr.u32 %r1047, %r1031, 24; + st.local.u8 [%rd154+15], %r1047; + st.local.u8 [%rd154+16], %r1032; + shr.u32 %r1048, %r1032, 8; + st.local.u8 [%rd154+17], %r1048; + shr.u32 %r1049, %r1032, 16; + st.local.u8 [%rd154+18], %r1049; + shr.u32 %r1050, %r1032, 24; + st.local.u8 [%rd154+19], %r1050; + st.local.u8 [%rd154+20], %r1033; + shr.u32 %r1051, %r1033, 8; + st.local.u8 [%rd154+21], %r1051; + shr.u32 %r1052, %r1033, 16; + st.local.u8 [%rd154+22], %r1052; + shr.u32 %r1053, %r1033, 24; + st.local.u8 [%rd154+23], %r1053; + st.local.u8 [%rd154+24], %r1034; + shr.u32 %r1054, %r1034, 8; + st.local.u8 [%rd154+25], %r1054; + shr.u32 %r1055, %r1034, 16; + st.local.u8 [%rd154+26], %r1055; + shr.u32 %r1056, %r1034, 24; + st.local.u8 [%rd154+27], %r1056; + st.local.u8 [%rd154+28], %r1035; + shr.u32 %r1057, %r1035, 8; + st.local.u8 [%rd154+29], %r1057; + shr.u32 %r1058, %r1035, 16; + st.local.u8 [%rd154+30], %r1058; + shr.u32 %r1059, %r1035, 24; + st.local.u8 [%rd154+31], %r1059; + add.s64 %rd152, %rd152, 8; + add.s64 %rd154, %rd154, 32; + add.s64 %rd153, 
%rd153, -1; + setp.ne.s64 %p12, %rd153, 0; + @%p12 bra $L__BB0_7; + +$L__BB0_8: + setp.le.u64 %p13, %rd7, %rd151; + @%p13 bra $L__BB0_30; + + add.u64 %rd143, %SPL, 96; + ld.param.u64 %rd141, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd140, %rd141; + shl.b64 %rd110, %rd150, 6; + shl.b64 %rd111, %rd150, 5; + add.s64 %rd27, %rd140, %rd111; + add.s64 %rd28, %rd143, %rd110; + mov.u64 %rd155, 0; + +$L__BB0_10: + add.s64 %rd112, %rd28, %rd155; + ld.local.u8 %rs77, [%rd112]; + add.s64 %rd113, %rd27, %rd155; + st.local.u8 [%rd113], %rs77; + add.s64 %rd155, %rd155, 1; + setp.lt.u64 %p14, %rd155, 32; + @%p14 bra $L__BB0_10; + + add.s64 %rd150, %rd150, 1; + +$L__BB0_30: + st.param.b64 [func_retval0+0], %rd150; + ret; + +} +.func _Z20blake3_hasher_updateP13blake3_hasherPKvy( + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0, + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1 +) +{ + .local .align 16 .b8 __local_depot1[144]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<54>; + .reg .b16 %rs<393>; + .reg .b32 %r<11690>; + .reg .b64 %rd<272>; + + + mov.u64 %SPL, __local_depot1; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + ld.param.u64 %rd253, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd260, %rd253; + cvta.to.local.u64 %rd2, %rd98; + add.s64 %rd3, %rd2, 136; + ld.local.v2.u8 {%rs102, %rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + mov.u64 %rd261, 80; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 80; + setp.eq.s16 %p2, %rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd243, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd236, 0; + +$L__BB1_4: + add.s64 %rd111, %rd260, %rd236; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd236; + st.local.u8 [%rd112], %rs107; + add.s64 %rd236, %rd236, 1; + setp.lt.u64 %p4, %rd236, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd243, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd260, %rd260, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd237, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; + ld.local.u8 %r158, [%rd3+-58]; prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, %rs47; + ld.local.u8 %r160, [%rd3+-57]; prmt.b32 
%r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - cvt.u32.u16 %r178, %rs69; - and.b32 %r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 %r265, %r264, 
%r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, %r191; - shf.l.wrap.b32 %r282, %r281, %r281, 20; - add.s32 %r283, %r177, %r277; - add.s32 %r284, %r283, %r282; - xor.b32 %r285, %r284, %r279; - shf.l.wrap.b32 %r286, %r285, %r285, 24; - add.s32 %r287, %r286, %r280; - xor.b32 %r288, %r287, %r282; - shf.l.wrap.b32 %r289, %r288, %r288, 25; - add.s32 %r290, %r242, %r81; - add.s32 %r291, %r290, %r289; - xor.b32 %r292, %r291, %r258; - shf.l.wrap.b32 %r293, %r292, %r292, 16; - add.s32 %r294, %r293, %r273; - xor.b32 %r295, %r294, %r289; - shf.l.wrap.b32 %r296, %r295, %r295, 20; - add.s32 %r297, %r291, %r105; - add.s32 %r298, %r297, %r296; - xor.b32 %r299, %r298, %r293; - shf.l.wrap.b32 %r300, %r299, %r299, 24; - add.s32 %r301, %r300, %r294; - xor.b32 %r302, %r301, %r296; - shf.l.wrap.b32 %r303, %r302, %r302, 25; - add.s32 %r304, %r256, %r89; - add.s32 %r305, %r304, %r247; - xor.b32 %r306, %r272, %r305; - shf.l.wrap.b32 %r307, %r306, %r306, 16; - add.s32 %r308, %r287, %r307; - xor.b32 %r309, %r308, %r247; - shf.l.wrap.b32 %r310, %r309, %r309, 20; - add.s32 %r311, %r305, %r137; - add.s32 %r312, %r311, %r310; - xor.b32 %r313, %r312, %r307; - shf.l.wrap.b32 %r314, %r313, %r313, 24; - add.s32 %r315, %r314, %r308; - xor.b32 %r316, %r315, %r310; - shf.l.wrap.b32 %r317, %r316, %r316, 25; - add.s32 %r318, %r261, %r113; - add.s32 %r319, %r318, %r270; - xor.b32 %r320, %r286, %r319; - shf.l.wrap.b32 %r321, %r320, %r320, 16; - add.s32 %r322, %r321, %r245; - xor.b32 %r323, %r322, %r261; + ld.local.u8 %r162, [%rd3+-56]; + ld.local.u8 %r163, [%rd3+-55]; + prmt.b32 %r164, %r163, %r162, 30212; + ld.local.u8 %r165, [%rd3+-54]; + prmt.b32 %r166, %r165, %r164, 28756; + ld.local.u8 %r167, [%rd3+-53]; + prmt.b32 %r168, %r167, %r166, 1620; + ld.local.u8 %r169, [%rd3+-52]; + ld.local.u8 %r170, [%rd3+-51]; + prmt.b32 %r171, %r170, %r169, 30212; + ld.local.u8 %r172, [%rd3+-50]; + prmt.b32 %r173, %r172, %r171, 28756; + ld.local.u8 %r174, [%rd3+-49]; + prmt.b32 %r175, %r174, %r173, 1620; + ld.local.u8 %r176, [%rd3+-48]; + ld.local.u8 %r177, [%rd3+-47]; + prmt.b32 %r178, %r177, %r176, 30212; + ld.local.u8 %r179, [%rd3+-46]; + prmt.b32 %r180, %r179, %r178, 28756; + ld.local.u8 %r181, [%rd3+-45]; + prmt.b32 %r182, %r181, %r180, 1620; + ld.local.u8 %r183, [%rd3+-44]; + ld.local.u8 %r184, [%rd3+-43]; + prmt.b32 %r185, %r184, %r183, 30212; + ld.local.u8 %r186, [%rd3+-42]; + prmt.b32 %r187, %r186, %r185, 28756; + ld.local.u8 %r188, [%rd3+-41]; + prmt.b32 %r189, %r188, %r187, 1620; + ld.local.u8 %r190, [%rd3+-40]; + ld.local.u8 %r191, [%rd3+-39]; + prmt.b32 %r192, %r191, %r190, 30212; + ld.local.u8 %r193, [%rd3+-38]; + prmt.b32 %r194, %r193, %r192, 28756; + ld.local.u8 %r195, [%rd3+-37]; + prmt.b32 %r196, %r195, %r194, 1620; + ld.local.u8 %r197, [%rd3+-36]; + ld.local.u8 %r198, [%rd3+-35]; + prmt.b32 %r199, %r198, %r197, 30212; + ld.local.u8 %r200, [%rd3+-34]; + prmt.b32 %r201, %r200, %r199, 28756; + ld.local.u8 %r202, [%rd3+-33]; + prmt.b32 %r203, %r202, %r201, 1620; + ld.local.u8 %r204, [%rd3+-32]; + ld.local.u8 %r205, [%rd3+-31]; + prmt.b32 %r206, 
%r205, %r204, 30212; + ld.local.u8 %r207, [%rd3+-30]; + prmt.b32 %r208, %r207, %r206, 28756; + ld.local.u8 %r209, [%rd3+-29]; + prmt.b32 %r210, %r209, %r208, 1620; + ld.local.u8 %r211, [%rd3+-28]; + ld.local.u8 %r212, [%rd3+-27]; + prmt.b32 %r213, %r212, %r211, 30212; + ld.local.u8 %r214, [%rd3+-26]; + prmt.b32 %r215, %r214, %r213, 28756; + ld.local.u8 %r216, [%rd3+-25]; + prmt.b32 %r217, %r216, %r215, 1620; + ld.local.u8 %r218, [%rd3+-24]; + ld.local.u8 %r219, [%rd3+-23]; + prmt.b32 %r220, %r219, %r218, 30212; + ld.local.u8 %r221, [%rd3+-22]; + prmt.b32 %r222, %r221, %r220, 28756; + ld.local.u8 %r223, [%rd3+-21]; + prmt.b32 %r224, %r223, %r222, 1620; + ld.local.u8 %r225, [%rd3+-20]; + ld.local.u8 %r226, [%rd3+-19]; + prmt.b32 %r227, %r226, %r225, 30212; + ld.local.u8 %r228, [%rd3+-18]; + prmt.b32 %r229, %r228, %r227, 28756; + ld.local.u8 %r230, [%rd3+-17]; + prmt.b32 %r231, %r230, %r229, 1620; + ld.local.u8 %r232, [%rd3+-16]; + ld.local.u8 %r233, [%rd3+-15]; + prmt.b32 %r234, %r233, %r232, 30212; + ld.local.u8 %r235, [%rd3+-14]; + prmt.b32 %r236, %r235, %r234, 28756; + ld.local.u8 %r237, [%rd3+-13]; + prmt.b32 %r238, %r237, %r236, 1620; + ld.local.u8 %r239, [%rd3+-12]; + ld.local.u8 %r240, [%rd3+-11]; + prmt.b32 %r241, %r240, %r239, 30212; + ld.local.u8 %r242, [%rd3+-10]; + prmt.b32 %r243, %r242, %r241, 28756; + ld.local.u8 %r244, [%rd3+-9]; + prmt.b32 %r245, %r244, %r243, 1620; + ld.local.u8 %r246, [%rd3+-8]; + ld.local.u8 %r247, [%rd3+-7]; + prmt.b32 %r248, %r247, %r246, 30212; + ld.local.u8 %r249, [%rd3+-6]; + prmt.b32 %r250, %r249, %r248, 28756; + ld.local.u8 %r251, [%rd3+-5]; + prmt.b32 %r252, %r251, %r250, 1620; + ld.local.u8 %r253, [%rd3+-4]; + ld.local.u8 %r254, [%rd3+-3]; + prmt.b32 %r255, %r254, %r253, 30212; + ld.local.u8 %r256, [%rd3+-2]; + prmt.b32 %r257, %r256, %r255, 28756; + ld.local.u8 %r258, [%rd3+-1]; + prmt.b32 %r259, %r258, %r257, 1620; + ld.local.u64 %rd115, [%rd3+-72]; + cvt.u32.u64 %r260, %rd115; + shr.u64 %rd116, %rd115, 32; + cvt.u32.u64 %r261, %rd116; + cvt.u32.u16 %r262, %rs113; + and.b32 %r263, %r262, 255; + ld.local.u32 %r264, [%rd3+-104]; + add.s32 %r265, %r264, %r154; + ld.local.u32 %r266, [%rd3+-88]; + add.s32 %r267, %r265, %r266; + xor.b32 %r268, %r267, %r260; + shf.l.wrap.b32 %r269, %r268, %r268, 16; + add.s32 %r270, %r269, 1779033703; + xor.b32 %r271, %r270, %r266; + shf.l.wrap.b32 %r272, %r271, %r271, 20; + add.s32 %r273, %r267, %r161; + add.s32 %r274, %r273, %r272; + xor.b32 %r275, %r274, %r269; + shf.l.wrap.b32 %r276, %r275, %r275, 24; + add.s32 %r277, %r276, %r270; + xor.b32 %r278, %r277, %r272; + shf.l.wrap.b32 %r279, %r278, %r278, 25; + ld.local.u32 %r280, [%rd3+-100]; + add.s32 %r281, %r280, %r168; + ld.local.u32 %r282, [%rd3+-84]; + add.s32 %r283, %r281, %r282; + xor.b32 %r284, %r283, %r261; + shf.l.wrap.b32 %r285, %r284, %r284, 16; + add.s32 %r286, %r285, -1150833019; + xor.b32 %r287, %r286, %r282; + shf.l.wrap.b32 %r288, %r287, %r287, 20; + add.s32 %r289, %r283, %r175; + add.s32 %r290, %r289, %r288; + xor.b32 %r291, %r290, %r285; + shf.l.wrap.b32 %r292, %r291, %r291, 24; + add.s32 %r293, %r292, %r286; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 25; + ld.local.u32 %r296, [%rd3+-96]; + add.s32 %r297, %r296, %r182; + ld.local.u32 %r298, [%rd3+-80]; + add.s32 %r299, %r297, %r298; + shr.u32 %r300, %r299, 16; + shl.b32 %r301, %r299, 16; + xor.b32 %r302, %r301, 4194304; + or.b32 %r303, %r302, %r300; + add.s32 %r304, %r303, 1013904242; + xor.b32 %r305, %r304, %r298; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, 
%r299, %r189; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + ld.local.u32 %r314, [%rd3+-92]; + add.s32 %r315, %r314, %r196; + ld.local.u32 %r316, [%rd3+-76]; + add.s32 %r317, %r315, %r316; + xor.b32 %r318, %r317, %r263; + shr.u32 %r319, %r317, 16; + shl.b32 %r320, %r318, 16; + or.b32 %r321, %r320, %r319; + add.s32 %r322, %r321, -1521486534; + xor.b32 %r323, %r322, %r316; shf.l.wrap.b32 %r324, %r323, %r323, 20; - add.s32 %r325, %r319, %r69; + add.s32 %r325, %r317, %r203; add.s32 %r326, %r325, %r324; xor.b32 %r327, %r326, %r321; shf.l.wrap.b32 %r328, %r327, %r327, 24; add.s32 %r329, %r328, %r322; xor.b32 %r330, %r329, %r324; shf.l.wrap.b32 %r331, %r330, %r330, 25; - add.s32 %r332, %r275, %r93; - add.s32 %r333, %r332, %r284; - xor.b32 %r334, %r333, %r244; + add.s32 %r332, %r274, %r210; + add.s32 %r333, %r332, %r295; + xor.b32 %r334, %r333, %r328; shf.l.wrap.b32 %r335, %r334, %r334, 16; - add.s32 %r336, %r335, %r259; - xor.b32 %r337, %r336, %r275; + add.s32 %r336, %r335, %r311; + xor.b32 %r337, %r336, %r295; shf.l.wrap.b32 %r338, %r337, %r337, 20; - add.s32 %r339, %r333, %r161; + add.s32 %r339, %r333, %r217; add.s32 %r340, %r339, %r338; xor.b32 %r341, %r340, %r335; shf.l.wrap.b32 %r342, %r341, %r341, 24; add.s32 %r343, %r342, %r336; xor.b32 %r344, %r343, %r338; shf.l.wrap.b32 %r345, %r344, %r344, 25; - add.s32 %r346, %r298, %r77; - add.s32 %r347, %r346, %r317; - xor.b32 %r348, %r347, %r342; + add.s32 %r346, %r290, %r224; + add.s32 %r347, %r346, %r313; + xor.b32 %r348, %r347, %r276; shf.l.wrap.b32 %r349, %r348, %r348, 16; add.s32 %r350, %r349, %r329; - xor.b32 %r351, %r350, %r317; + xor.b32 %r351, %r350, %r313; shf.l.wrap.b32 %r352, %r351, %r351, 20; - add.s32 %r353, %r347, %r145; + add.s32 %r353, %r347, %r231; add.s32 %r354, %r353, %r352; xor.b32 %r355, %r354, %r349; shf.l.wrap.b32 %r356, %r355, %r355, 24; add.s32 %r357, %r356, %r350; xor.b32 %r358, %r357, %r352; shf.l.wrap.b32 %r359, %r358, %r358, 25; - add.s32 %r360, %r312, %r153; + add.s32 %r360, %r308, %r238; add.s32 %r361, %r360, %r331; - xor.b32 %r362, %r361, %r300; + xor.b32 %r362, %r361, %r292; shf.l.wrap.b32 %r363, %r362, %r362, 16; - add.s32 %r364, %r363, %r343; + add.s32 %r364, %r363, %r277; xor.b32 %r365, %r364, %r331; shf.l.wrap.b32 %r366, %r365, %r365, 20; - add.s32 %r367, %r361, %r101; + add.s32 %r367, %r361, %r245; add.s32 %r368, %r367, %r366; xor.b32 %r369, %r368, %r363; shf.l.wrap.b32 %r370, %r369, %r369, 24; add.s32 %r371, %r370, %r364; xor.b32 %r372, %r371, %r366; shf.l.wrap.b32 %r373, %r372, %r372, 25; - add.s32 %r374, %r326, %r129; - add.s32 %r375, %r374, %r345; - xor.b32 %r376, %r375, %r314; + add.s32 %r374, %r326, %r252; + add.s32 %r375, %r374, %r279; + xor.b32 %r376, %r375, %r310; shf.l.wrap.b32 %r377, %r376, %r376, 16; - add.s32 %r378, %r377, %r301; - xor.b32 %r379, %r378, %r345; + add.s32 %r378, %r377, %r293; + xor.b32 %r379, %r378, %r279; shf.l.wrap.b32 %r380, %r379, %r379, 20; - add.s32 %r381, %r375, %r169; + add.s32 %r381, %r375, %r259; add.s32 %r382, %r381, %r380; xor.b32 %r383, %r382, %r377; shf.l.wrap.b32 %r384, %r383, %r383, 24; add.s32 %r385, %r384, %r378; xor.b32 %r386, %r385, %r380; shf.l.wrap.b32 %r387, %r386, %r386, 25; - add.s32 %r388, %r340, %r177; - add.s32 %r389, %r388, %r303; - xor.b32 %r390, %r389, %r328; + add.s32 %r388, %r340, %r168; + add.s32 %r389, %r388, %r387; + xor.b32 %r390, %r389, %r356; shf.l.wrap.b32 %r391, %r390, %r390, 16; - 
add.s32 %r392, %r391, %r315; - xor.b32 %r393, %r392, %r303; + add.s32 %r392, %r391, %r371; + xor.b32 %r393, %r392, %r387; shf.l.wrap.b32 %r394, %r393, %r393, 20; - add.s32 %r395, %r389, %r121; + add.s32 %r395, %r389, %r196; add.s32 %r396, %r395, %r394; xor.b32 %r397, %r396, %r391; shf.l.wrap.b32 %r398, %r397, %r397, 24; add.s32 %r399, %r398, %r392; xor.b32 %r400, %r399, %r394; shf.l.wrap.b32 %r401, %r400, %r400, 25; - add.s32 %r402, %r354, %r89; - add.s32 %r403, %r402, %r401; + add.s32 %r402, %r354, %r175; + add.s32 %r403, %r402, %r345; xor.b32 %r404, %r403, %r370; shf.l.wrap.b32 %r405, %r404, %r404, 16; add.s32 %r406, %r405, %r385; - xor.b32 %r407, %r406, %r401; + xor.b32 %r407, %r406, %r345; shf.l.wrap.b32 %r408, %r407, %r407, 20; - add.s32 %r409, %r403, %r93; + add.s32 %r409, %r403, %r224; add.s32 %r410, %r409, %r408; xor.b32 %r411, %r410, %r405; shf.l.wrap.b32 %r412, %r411, %r411, 24; add.s32 %r413, %r412, %r406; xor.b32 %r414, %r413, %r408; shf.l.wrap.b32 %r415, %r414, %r414, 25; - add.s32 %r416, %r368, %r137; + add.s32 %r416, %r368, %r203; add.s32 %r417, %r416, %r359; xor.b32 %r418, %r417, %r384; shf.l.wrap.b32 %r419, %r418, %r418, 16; - add.s32 %r420, %r419, %r399; + add.s32 %r420, %r419, %r343; xor.b32 %r421, %r420, %r359; shf.l.wrap.b32 %r422, %r421, %r421, 20; - add.s32 %r423, %r417, %r153; + add.s32 %r423, %r417, %r154; add.s32 %r424, %r423, %r422; xor.b32 %r425, %r424, %r419; shf.l.wrap.b32 %r426, %r425, %r425, 24; add.s32 %r427, %r426, %r420; xor.b32 %r428, %r427, %r422; shf.l.wrap.b32 %r429, %r428, %r428, 25; - add.s32 %r430, %r382, %r161; + add.s32 %r430, %r382, %r182; add.s32 %r431, %r430, %r373; - xor.b32 %r432, %r431, %r398; + xor.b32 %r432, %r431, %r342; shf.l.wrap.b32 %r433, %r432, %r432, 16; add.s32 %r434, %r433, %r357; xor.b32 %r435, %r434, %r373; shf.l.wrap.b32 %r436, %r435, %r435, 20; - add.s32 %r437, %r431, %r81; + add.s32 %r437, %r431, %r245; add.s32 %r438, %r437, %r436; xor.b32 %r439, %r438, %r433; shf.l.wrap.b32 %r440, %r439, %r439, 24; add.s32 %r441, %r440, %r434; xor.b32 %r442, %r441, %r436; shf.l.wrap.b32 %r443, %r442, %r442, 25; - add.s32 %r444, %r396, %r113; - add.s32 %r445, %r444, %r387; - xor.b32 %r446, %r445, %r356; + add.s32 %r444, %r396, %r161; + add.s32 %r445, %r444, %r415; + xor.b32 %r446, %r445, %r440; shf.l.wrap.b32 %r447, %r446, %r446, 16; - add.s32 %r448, %r447, %r371; - xor.b32 %r449, %r448, %r387; + add.s32 %r448, %r447, %r427; + xor.b32 %r449, %r448, %r415; shf.l.wrap.b32 %r450, %r449, %r449, 20; - add.s32 %r451, %r445, %r169; + add.s32 %r451, %r445, %r231; add.s32 %r452, %r451, %r450; xor.b32 %r453, %r452, %r447; shf.l.wrap.b32 %r454, %r453, %r453, 24; add.s32 %r455, %r454, %r448; xor.b32 %r456, %r455, %r450; shf.l.wrap.b32 %r457, %r456, %r456, 25; - add.s32 %r458, %r410, %r105; + add.s32 %r458, %r410, %r238; add.s32 %r459, %r458, %r429; - xor.b32 %r460, %r459, %r454; + xor.b32 %r460, %r459, %r398; shf.l.wrap.b32 %r461, %r460, %r460, 16; add.s32 %r462, %r461, %r441; xor.b32 %r463, %r462, %r429; shf.l.wrap.b32 %r464, %r463, %r463, 20; - add.s32 %r465, %r459, %r101; + add.s32 %r465, %r459, %r189; add.s32 %r466, %r465, %r464; xor.b32 %r467, %r466, %r461; shf.l.wrap.b32 %r468, %r467, %r467, 24; add.s32 %r469, %r468, %r462; xor.b32 %r470, %r469, %r464; shf.l.wrap.b32 %r471, %r470, %r470, 25; - add.s32 %r472, %r424, %r129; + add.s32 %r472, %r424, %r217; add.s32 %r473, %r472, %r443; xor.b32 %r474, %r473, %r412; shf.l.wrap.b32 %r475, %r474, %r474, 16; - add.s32 %r476, %r475, %r455; + add.s32 %r476, %r475, %r399; xor.b32 %r477, %r476, %r443; 
shf.l.wrap.b32 %r478, %r477, %r477, 20; - add.s32 %r479, %r473, %r69; + add.s32 %r479, %r473, %r252; add.s32 %r480, %r479, %r478; xor.b32 %r481, %r480, %r475; shf.l.wrap.b32 %r482, %r481, %r481, 24; add.s32 %r483, %r482, %r476; xor.b32 %r484, %r483, %r478; shf.l.wrap.b32 %r485, %r484, %r484, 25; - add.s32 %r486, %r438, %r145; - add.s32 %r487, %r486, %r457; + add.s32 %r486, %r438, %r259; + add.s32 %r487, %r486, %r401; xor.b32 %r488, %r487, %r426; shf.l.wrap.b32 %r489, %r488, %r488, 16; add.s32 %r490, %r489, %r413; - xor.b32 %r491, %r490, %r457; + xor.b32 %r491, %r490, %r401; shf.l.wrap.b32 %r492, %r491, %r491, 20; - add.s32 %r493, %r487, %r177; + add.s32 %r493, %r487, %r210; add.s32 %r494, %r493, %r492; xor.b32 %r495, %r494, %r489; shf.l.wrap.b32 %r496, %r495, %r495, 24; add.s32 %r497, %r496, %r490; xor.b32 %r498, %r497, %r492; shf.l.wrap.b32 %r499, %r498, %r498, 25; - add.s32 %r500, %r452, %r121; - add.s32 %r501, %r500, %r415; - xor.b32 %r502, %r501, %r440; + add.s32 %r500, %r452, %r175; + add.s32 %r501, %r500, %r499; + xor.b32 %r502, %r501, %r468; shf.l.wrap.b32 %r503, %r502, %r502, 16; - add.s32 %r504, %r503, %r427; - xor.b32 %r505, %r504, %r415; + add.s32 %r504, %r503, %r483; + xor.b32 %r505, %r504, %r499; shf.l.wrap.b32 %r506, %r505, %r505, 20; - add.s32 %r507, %r501, %r77; + add.s32 %r507, %r501, %r182; add.s32 %r508, %r507, %r506; xor.b32 %r509, %r508, %r503; shf.l.wrap.b32 %r510, %r509, %r509, 24; add.s32 %r511, %r510, %r504; xor.b32 %r512, %r511, %r506; shf.l.wrap.b32 %r513, %r512, %r512, 25; - add.s32 %r514, %r466, %r137; - add.s32 %r515, %r514, %r513; + add.s32 %r514, %r466, %r224; + add.s32 %r515, %r514, %r457; xor.b32 %r516, %r515, %r482; shf.l.wrap.b32 %r517, %r516, %r516, 16; add.s32 %r518, %r517, %r497; - xor.b32 %r519, %r518, %r513; + xor.b32 %r519, %r518, %r457; shf.l.wrap.b32 %r520, %r519, %r519, 20; - add.s32 %r521, %r515, %r113; + add.s32 %r521, %r515, %r238; add.s32 %r522, %r521, %r520; xor.b32 %r523, %r522, %r517; shf.l.wrap.b32 %r524, %r523, %r523, 24; add.s32 %r525, %r524, %r518; xor.b32 %r526, %r525, %r520; shf.l.wrap.b32 %r527, %r526, %r526, 25; - add.s32 %r528, %r480, %r153; + add.s32 %r528, %r480, %r245; add.s32 %r529, %r528, %r471; xor.b32 %r530, %r529, %r496; shf.l.wrap.b32 %r531, %r530, %r530, 16; - add.s32 %r532, %r531, %r511; + add.s32 %r532, %r531, %r455; xor.b32 %r533, %r532, %r471; shf.l.wrap.b32 %r534, %r533, %r533, 20; - add.s32 %r535, %r529, %r129; + add.s32 %r535, %r529, %r168; add.s32 %r536, %r535, %r534; xor.b32 %r537, %r536, %r531; shf.l.wrap.b32 %r538, %r537, %r537, 24; add.s32 %r539, %r538, %r532; xor.b32 %r540, %r539, %r534; shf.l.wrap.b32 %r541, %r540, %r540, 25; - add.s32 %r542, %r494, %r169; + add.s32 %r542, %r494, %r203; add.s32 %r543, %r542, %r485; - xor.b32 %r544, %r543, %r510; + xor.b32 %r544, %r543, %r454; shf.l.wrap.b32 %r545, %r544, %r544, 16; add.s32 %r546, %r545, %r469; xor.b32 %r547, %r546, %r485; shf.l.wrap.b32 %r548, %r547, %r547, 20; - add.s32 %r549, %r543, %r89; + add.s32 %r549, %r543, %r252; add.s32 %r550, %r549, %r548; xor.b32 %r551, %r550, %r545; shf.l.wrap.b32 %r552, %r551, %r551, 24; add.s32 %r553, %r552, %r546; xor.b32 %r554, %r553, %r548; shf.l.wrap.b32 %r555, %r554, %r554, 25; - add.s32 %r556, %r508, %r161; - add.s32 %r557, %r556, %r499; - xor.b32 %r558, %r557, %r468; + add.s32 %r556, %r508, %r196; + add.s32 %r557, %r556, %r527; + xor.b32 %r558, %r557, %r552; shf.l.wrap.b32 %r559, %r558, %r558, 16; - add.s32 %r560, %r559, %r483; - xor.b32 %r561, %r560, %r499; + add.s32 %r560, %r559, %r539; + xor.b32 %r561, %r560, %r527; 
shf.l.wrap.b32 %r562, %r561, %r561, 20; - add.s32 %r563, %r557, %r177; + add.s32 %r563, %r557, %r189; add.s32 %r564, %r563, %r562; xor.b32 %r565, %r564, %r559; shf.l.wrap.b32 %r566, %r565, %r565, 24; add.s32 %r567, %r566, %r560; xor.b32 %r568, %r567, %r562; shf.l.wrap.b32 %r569, %r568, %r568, 25; - add.s32 %r570, %r522, %r93; + add.s32 %r570, %r522, %r217; add.s32 %r571, %r570, %r541; - xor.b32 %r572, %r571, %r566; + xor.b32 %r572, %r571, %r510; shf.l.wrap.b32 %r573, %r572, %r572, 16; add.s32 %r574, %r573, %r553; xor.b32 %r575, %r574, %r541; shf.l.wrap.b32 %r576, %r575, %r575, 20; - add.s32 %r577, %r571, %r69; + add.s32 %r577, %r571, %r154; add.s32 %r578, %r577, %r576; xor.b32 %r579, %r578, %r573; shf.l.wrap.b32 %r580, %r579, %r579, 24; add.s32 %r581, %r580, %r574; xor.b32 %r582, %r581, %r576; shf.l.wrap.b32 %r583, %r582, %r582, 25; - add.s32 %r584, %r536, %r145; + add.s32 %r584, %r536, %r231; add.s32 %r585, %r584, %r555; xor.b32 %r586, %r585, %r524; shf.l.wrap.b32 %r587, %r586, %r586, 16; - add.s32 %r588, %r587, %r567; + add.s32 %r588, %r587, %r511; xor.b32 %r589, %r588, %r555; shf.l.wrap.b32 %r590, %r589, %r589, 20; - add.s32 %r591, %r585, %r81; + add.s32 %r591, %r585, %r259; add.s32 %r592, %r591, %r590; xor.b32 %r593, %r592, %r587; shf.l.wrap.b32 %r594, %r593, %r593, 24; add.s32 %r595, %r594, %r588; xor.b32 %r596, %r595, %r590; shf.l.wrap.b32 %r597, %r596, %r596, 25; - add.s32 %r598, %r550, %r101; - add.s32 %r599, %r598, %r569; + add.s32 %r598, %r550, %r210; + add.s32 %r599, %r598, %r513; xor.b32 %r600, %r599, %r538; shf.l.wrap.b32 %r601, %r600, %r600, 16; add.s32 %r602, %r601, %r525; - xor.b32 %r603, %r602, %r569; + xor.b32 %r603, %r602, %r513; shf.l.wrap.b32 %r604, %r603, %r603, 20; - add.s32 %r605, %r599, %r121; + add.s32 %r605, %r599, %r161; add.s32 %r606, %r605, %r604; xor.b32 %r607, %r606, %r601; shf.l.wrap.b32 %r608, %r607, %r607, 24; add.s32 %r609, %r608, %r602; xor.b32 %r610, %r609, %r604; shf.l.wrap.b32 %r611, %r610, %r610, 25; - add.s32 %r612, %r564, %r77; - add.s32 %r613, %r612, %r527; - xor.b32 %r614, %r613, %r552; + add.s32 %r612, %r564, %r224; + add.s32 %r613, %r612, %r611; + xor.b32 %r614, %r613, %r580; shf.l.wrap.b32 %r615, %r614, %r614, 16; - add.s32 %r616, %r615, %r539; - xor.b32 %r617, %r616, %r527; + add.s32 %r616, %r615, %r595; + xor.b32 %r617, %r616, %r611; shf.l.wrap.b32 %r618, %r617, %r617, 20; - add.s32 %r619, %r613, %r105; + add.s32 %r619, %r613, %r203; add.s32 %r620, %r619, %r618; xor.b32 %r621, %r620, %r615; shf.l.wrap.b32 %r622, %r621, %r621, 24; add.s32 %r623, %r622, %r616; xor.b32 %r624, %r623, %r618; shf.l.wrap.b32 %r625, %r624, %r624, 25; - add.s32 %r626, %r578, %r153; - add.s32 %r627, %r626, %r625; + add.s32 %r626, %r578, %r238; + add.s32 %r627, %r626, %r569; xor.b32 %r628, %r627, %r594; shf.l.wrap.b32 %r629, %r628, %r628, 16; add.s32 %r630, %r629, %r609; - xor.b32 %r631, %r630, %r625; + xor.b32 %r631, %r630, %r569; shf.l.wrap.b32 %r632, %r631, %r631, 20; - add.s32 %r633, %r627, %r161; + add.s32 %r633, %r627, %r217; add.s32 %r634, %r633, %r632; xor.b32 %r635, %r634, %r629; shf.l.wrap.b32 %r636, %r635, %r635, 24; add.s32 %r637, %r636, %r630; xor.b32 %r638, %r637, %r632; shf.l.wrap.b32 %r639, %r638, %r638, 25; - add.s32 %r640, %r592, %r129; + add.s32 %r640, %r592, %r252; add.s32 %r641, %r640, %r583; xor.b32 %r642, %r641, %r608; shf.l.wrap.b32 %r643, %r642, %r642, 16; - add.s32 %r644, %r643, %r623; + add.s32 %r644, %r643, %r567; xor.b32 %r645, %r644, %r583; shf.l.wrap.b32 %r646, %r645, %r645, 20; - add.s32 %r647, %r641, %r145; + add.s32 %r647, %r641, 
[generated PTX diff hunk, elided: compiler-renumbered output for the BLAKE3 compression rounds of the heavy_hash kernel. The `-`/`+` pairs swap the message-word operand registers (e.g. %r177 → %r245, %r137 → %r259) and the chaining-value registers throughout the repeated add.s32 / xor.b32 / shf.l.wrap.b32 round sequences (wrap amounts 16, 20, 24, 25); the new code then stores the eight 32-bit chaining words to local memory, adds block-loop labels $L__BB1_8–$L__BB1_11 with byte-wise message loads (ld.local.u8 + prmt.b32), runs further rounds, and packs the final hash bytes into 64-bit words via bfi.b64 / shl.b64 / or.b64. Machine-generated by the NVVM compiler.]
- shr.u64 %rd12, %rd143, 48; - shr.u64 %rd13, %rd155, 32; - shr.u64 %rd14, %rd155, 40; - shr.u64 %rd15, %rd155, 48; - shr.u64 %rd16, %rd167, 32; - shr.u64 %rd17, %rd167, 40; - shr.u64 %rd18, %rd167, 48; - shr.u32 %r5937, %r9, 12; - bfe.u32 %r5938, %r9, 4, 4; - and.b32 %r5939, %r9, 15; - bfi.b32 %r5940, %r5939, %r5938, 8, 4; - shl.b32 %r5941, %r9, 4; - and.b32 %r5942, %r5941, 983040; - or.b32 %r5943, %r5940, %r5942; - shl.b32 %r5944, %r9, 16; - and.b32 %r5945, %r5944, 251658240; - or.b32 %r5870, %r5943, %r5945; - bfe.u32 %r5946, %r9, 20, 4; - bfe.u32 %r5947, %r9, 16, 4; - shr.u32 %r2239, %r9, 24; - bfi.b32 %r5948, %r5947, %r5946, 8, 4; - and.b32 %r5949, %r5937, 983040; - or.b32 %r5950, %r5948, %r5949; - and.b32 %r5951, %r9, 251658240; - or.b32 %r5874, %r5950, %r5951; - cvt.u16.u64 %rs83, %rd10; - and.b16 %rs84, %rs83, 240; - shr.u16 %rs85, %rs84, 4; - cvt.u16.u64 %rs86, %rd11; - and.b16 %rs87, %rs86, 240; - shr.u16 %rs88, %rs87, 4; - cvt.u32.u16 %r5952, %rs85; - cvt.u32.u64 %r5953, %rd10; - and.b32 %r5954, %r5953, 15; - prmt.b32 %r5955, %r5954, %r5952, 30212; - cvt.u32.u16 %r5956, %rs88; - prmt.b32 %r5957, %r5956, %r5955, 28756; - cvt.u32.u64 %r5958, %rd11; - shl.b32 %r5959, %r5958, 24; - and.b32 %r5960, %r5959, 251658240; - or.b32 %r5878, %r5957, %r5960; - cvt.u16.u64 %rs89, %rd12; - and.b16 %rs90, %rs89, 240; - shr.u16 %rs91, %rs90, 4; - cvt.u32.u16 %r5961, %rs91; - cvt.u32.u64 %r5962, %rd12; - and.b32 %r5963, %r5962, 15; - prmt.b32 %r5964, %r5963, %r5961, 30212; - shl.b32 %r5965, %r10, 12; - and.b32 %r5966, %r5965, 983040; - or.b32 %r5967, %r5964, %r5966; - shl.b32 %r5968, %r10, 24; - and.b32 %r5969, %r5968, 251658240; - or.b32 %r5882, %r5967, %r5969; - shr.u32 %r5970, %r11, 12; - bfe.u32 %r5971, %r11, 4, 4; - and.b32 %r5972, %r11, 15; - bfi.b32 %r5973, %r5972, %r5971, 8, 4; - shl.b32 %r5974, %r11, 4; - and.b32 %r5975, %r5974, 983040; - or.b32 %r5976, %r5973, %r5975; - shl.b32 %r5977, %r11, 16; - and.b32 %r5978, %r5977, 251658240; - or.b32 %r5886, %r5976, %r5978; - bfe.u32 %r5979, %r11, 20, 4; - bfe.u32 %r5980, %r11, 16, 4; - shr.u32 %r3295, %r11, 24; - bfi.b32 %r5981, %r5980, %r5979, 8, 4; - and.b32 %r5982, %r5970, 983040; - or.b32 %r5983, %r5981, %r5982; - and.b32 %r5984, %r11, 251658240; - or.b32 %r5890, %r5983, %r5984; - cvt.u16.u64 %rs92, %rd13; - and.b16 %rs93, %rs92, 240; - shr.u16 %rs94, %rs93, 4; - cvt.u16.u64 %rs95, %rd14; - and.b16 %rs96, %rs95, 240; - shr.u16 %rs97, %rs96, 4; - cvt.u32.u16 %r5985, %rs94; - cvt.u32.u64 %r5986, %rd13; - and.b32 %r5987, %r5986, 15; - prmt.b32 %r5988, %r5987, %r5985, 30212; - cvt.u32.u16 %r5989, %rs97; - prmt.b32 %r5990, %r5989, %r5988, 28756; - cvt.u32.u64 %r5991, %rd14; - shl.b32 %r5992, %r5991, 24; - and.b32 %r5993, %r5992, 251658240; - or.b32 %r5894, %r5990, %r5993; - cvt.u16.u64 %rs98, %rd15; - and.b16 %rs99, %rs98, 240; - shr.u16 %rs100, %rs99, 4; - cvt.u32.u16 %r5994, %rs100; - cvt.u32.u64 %r5995, %rd15; - and.b32 %r5996, %r5995, 15; - prmt.b32 %r5997, %r5996, %r5994, 30212; - shl.b32 %r5998, %r12, 12; - and.b32 %r5999, %r5998, 983040; - or.b32 %r6000, %r5997, %r5999; - shl.b32 %r6001, %r12, 24; - and.b32 %r6002, %r6001, 251658240; - or.b32 %r5898, %r6000, %r6002; - shr.u32 %r6003, %r13, 12; - bfe.u32 %r6004, %r13, 4, 4; - and.b32 %r6005, %r13, 15; - bfi.b32 %r6006, %r6005, %r6004, 8, 4; - shl.b32 %r6007, %r13, 4; - and.b32 %r6008, %r6007, 983040; - or.b32 %r6009, %r6006, %r6008; - shl.b32 %r6010, %r13, 16; - and.b32 %r6011, %r6010, 251658240; - or.b32 %r5902, %r6009, %r6011; - bfe.u32 %r6012, %r13, 20, 4; - bfe.u32 %r6013, %r13, 16, 4; - 
shr.u32 %r4351, %r13, 24; - bfi.b32 %r6014, %r6013, %r6012, 8, 4; - and.b32 %r6015, %r6003, 983040; - or.b32 %r6016, %r6014, %r6015; - and.b32 %r6017, %r13, 251658240; - or.b32 %r5906, %r6016, %r6017; - cvt.u16.u64 %rs101, %rd16; - and.b16 %rs102, %rs101, 240; - shr.u16 %rs103, %rs102, 4; - cvt.u16.u64 %rs104, %rd17; - and.b16 %rs105, %rs104, 240; - shr.u16 %rs106, %rs105, 4; - cvt.u32.u16 %r6018, %rs103; - cvt.u32.u64 %r6019, %rd16; - and.b32 %r6020, %r6019, 15; - prmt.b32 %r6021, %r6020, %r6018, 30212; - cvt.u32.u16 %r6022, %rs106; - prmt.b32 %r6023, %r6022, %r6021, 28756; - cvt.u32.u64 %r6024, %rd17; - shl.b32 %r6025, %r6024, 24; - and.b32 %r6026, %r6025, 251658240; - or.b32 %r5910, %r6023, %r6026; - cvt.u16.u64 %rs107, %rd18; - and.b16 %rs108, %rs107, 240; - shr.u16 %rs109, %rs108, 4; - cvt.u32.u16 %r6027, %rs109; - cvt.u32.u64 %r6028, %rd18; - and.b32 %r6029, %r6028, 15; - prmt.b32 %r6030, %r6029, %r6027, 30212; - shl.b32 %r6031, %r14, 12; - and.b32 %r6032, %r6031, 983040; - or.b32 %r6033, %r6030, %r6032; - shl.b32 %r6034, %r14, 24; - and.b32 %r6035, %r6034, 251658240; - or.b32 %r5914, %r6033, %r6035; - cvt.u16.u64 %rs110, %rd9; - and.b16 %rs111, %rs110, 240; - shr.u16 %rs112, %rs111, 4; - shr.u64 %rd201, %rd9, 8; - cvt.u32.u64 %r6036, %rd201; - cvt.u32.u64 %r6037, %rd9; - shr.u32 %r6038, %r6037, 12; - cvt.u32.u16 %r6039, %rs112; - and.b32 %r6040, %r6037, 15; + add.s32 %r1690, %r1642, %r1123; + add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, %r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, 
%r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, %r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, %r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, %r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 %r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; 
+ add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 %r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; + add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 %r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; + add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, 
%r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; + add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, %r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + xor.b32 %r11653, %r1969, %r1938; + st.local.u32 [%rd3+-88], %r11653; + xor.b32 %r11652, %r1927, %r1952; + st.local.u32 [%rd3+-84], %r11652; + xor.b32 %r11651, %r1941, %r1966; + st.local.u32 [%rd3+-80], %r11651; + xor.b32 %r11650, %r1955, %r1924; + st.local.u32 [%rd3+-76], %r11650; + add.s16 %rs352, %rs352, 1; + st.local.u8 [%rd3+1], %rs352; + add.s64 %rd260, %rd260, 64; + add.s64 %rd243, %rd243, -64; + setp.gt.u64 %p10, %rd243, 64; + @%p10 bra $L__BB1_11; + +$L__BB1_12: + cvt.u64.u16 %rd120, %rs351; + and.b64 %rd24, %rd120, 255; + mov.u64 %rd121, 64; + sub.s64 %rd122, %rd121, %rd24; + min.u64 %rd25, %rd122, %rd243; + setp.eq.s64 %p11, %rd25, 0; + @%p11 bra $L__BB1_15; + + add.s64 %rd124, %rd2, %rd24; + add.s64 %rd26, %rd124, 72; + mov.u64 %rd244, 0; + +$L__BB1_14: + add.s64 %rd125, %rd260, %rd244; + ld.local.u8 %rs119, [%rd125]; + add.s64 %rd126, %rd26, %rd244; + st.local.u8 [%rd126], %rs119; + add.s64 %rd244, %rd244, 1; + setp.lt.u64 %p12, %rd244, %rd25; + @%p12 bra $L__BB1_14; + +$L__BB1_15: + cvt.u16.u64 %rs120, %rd25; + ld.local.u8 %rs121, [%rd3]; + add.s16 %rs13, %rs121, %rs120; + st.local.u8 [%rd3], %rs13; + mov.u64 %rd127, 80; + sub.s64 %rd29, %rd127, %rd6; + setp.eq.s64 %p13, %rd29, 0; + @%p13 bra $L__BB1_68; + + ld.local.u8 %rs122, [%rd3+1]; + setp.eq.s16 %p14, %rs122, 0; + selp.u16 %rs123, 1, 0, %p14; + ld.local.u8 %rs124, [%rd3+2]; + or.b16 %rs125, %rs124, %rs123; + or.b16 %rs126, %rs125, 2; + ld.local.u8 %r1970, [%rd3+-64]; + ld.local.u8 %r1971, [%rd3+-63]; + prmt.b32 %r1972, %r1971, %r1970, 30212; + ld.local.u8 %r1973, [%rd3+-62]; + prmt.b32 %r1974, %r1973, %r1972, 28756; + ld.local.u8 %r1975, [%rd3+-61]; + prmt.b32 %r1976, %r1975, %r1974, 1620; + ld.local.u8 %r1977, [%rd3+-60]; + ld.local.u8 %r1978, [%rd3+-59]; + prmt.b32 %r1979, %r1978, %r1977, 30212; + ld.local.u8 %r1980, [%rd3+-58]; + prmt.b32 %r1981, %r1980, %r1979, 28756; + ld.local.u8 %r1982, [%rd3+-57]; + prmt.b32 %r1983, %r1982, %r1981, 1620; + ld.local.u8 %r1984, [%rd3+-56]; + ld.local.u8 %r1985, [%rd3+-55]; + prmt.b32 %r1986, %r1985, %r1984, 30212; + ld.local.u8 %r1987, [%rd3+-54]; + prmt.b32 %r1988, %r1987, %r1986, 28756; + 
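// Annotation (not part of the generated PTX): the eight xor.b32/st.local.u32
// pairs above are the BLAKE3 feed-forward, h[i] = v[i] ^ v[i+8], storing the
// new 8-word chaining value back into the chunk state. The loop tail then
// increments the blocks-compressed counter at [%rd3+1], advances the input
// pointer by 64 bytes, and repeats $L__BB1_11 while more than one 64-byte
// block remains; $L__BB1_12..$L__BB1_15 copy the final partial block into the
// local block buffer. The flag arithmetic above (setp.eq/selp followed by
// or.b16 ..., 2) matches BLAKE3's block flags: CHUNK_START (1) when no blocks
// have been compressed yet, OR'd with CHUNK_END (2) for the chunk's last block.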
ld.local.u8 %r1989, [%rd3+-53]; + prmt.b32 %r1990, %r1989, %r1988, 1620; + ld.local.u8 %r1991, [%rd3+-52]; + ld.local.u8 %r1992, [%rd3+-51]; + prmt.b32 %r1993, %r1992, %r1991, 30212; + ld.local.u8 %r1994, [%rd3+-50]; + prmt.b32 %r1995, %r1994, %r1993, 28756; + ld.local.u8 %r1996, [%rd3+-49]; + prmt.b32 %r1997, %r1996, %r1995, 1620; + ld.local.u8 %r1998, [%rd3+-48]; + ld.local.u8 %r1999, [%rd3+-47]; + prmt.b32 %r2000, %r1999, %r1998, 30212; + ld.local.u8 %r2001, [%rd3+-46]; + prmt.b32 %r2002, %r2001, %r2000, 28756; + ld.local.u8 %r2003, [%rd3+-45]; + prmt.b32 %r2004, %r2003, %r2002, 1620; + ld.local.u8 %r2005, [%rd3+-44]; + ld.local.u8 %r2006, [%rd3+-43]; + prmt.b32 %r2007, %r2006, %r2005, 30212; + ld.local.u8 %r2008, [%rd3+-42]; + prmt.b32 %r2009, %r2008, %r2007, 28756; + ld.local.u8 %r2010, [%rd3+-41]; + prmt.b32 %r2011, %r2010, %r2009, 1620; + ld.local.u8 %r2012, [%rd3+-40]; + ld.local.u8 %r2013, [%rd3+-39]; + prmt.b32 %r2014, %r2013, %r2012, 30212; + ld.local.u8 %r2015, [%rd3+-38]; + prmt.b32 %r2016, %r2015, %r2014, 28756; + ld.local.u8 %r2017, [%rd3+-37]; + prmt.b32 %r2018, %r2017, %r2016, 1620; + ld.local.u8 %r2019, [%rd3+-36]; + ld.local.u8 %r2020, [%rd3+-35]; + prmt.b32 %r2021, %r2020, %r2019, 30212; + ld.local.u8 %r2022, [%rd3+-34]; + prmt.b32 %r2023, %r2022, %r2021, 28756; + ld.local.u8 %r2024, [%rd3+-33]; + prmt.b32 %r2025, %r2024, %r2023, 1620; + ld.local.u8 %r2026, [%rd3+-32]; + ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, [%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + 
cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, %r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, %r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 %r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; + xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 
30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, %r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 %r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 %r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + 
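// Annotation (not part of the generated PTX): the 32-bit constants mixed in
// above -- 1779033703, -1150833019, 1013904242, -1521486534 -- are the first
// four BLAKE3 IV words (0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A),
// loaded into rows 8..11 of the 16-word compression state; rows 12..15 take
// the 64-bit chunk counter (split via cvt.u32.u64 / shr.u64), the block
// length, and the block flags. The prmt.b32 byte-permute sequences assemble
// little-endian 32-bit words from individually loaded bytes.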
xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 %r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, %r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, 
%r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, %r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + 
add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, %r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, %r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, 
%r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 %r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, %r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, %r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, %r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 
%r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, %r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; + add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; 
+ add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, %r2765, %r1997; + add.s32 %r2772, %r2771, %r2770; + xor.b32 %r2773, %r2772, %r2767; + shf.l.wrap.b32 %r2774, %r2773, %r2773, 24; + add.s32 %r2775, %r2774, %r2768; + xor.b32 %r2776, %r2775, %r2770; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 25; + add.s32 %r2778, %r2730, %r1976; + add.s32 %r2779, %r2778, %r2749; + xor.b32 %r2780, %r2718, %r2779; + shf.l.wrap.b32 %r2781, %r2780, %r2780, 16; + add.s32 %r2782, %r2781, %r2761; + xor.b32 %r2783, %r2782, %r2749; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 20; + add.s32 %r2785, %r2779, %r2046; + add.s32 %r2786, %r2785, %r2784; + xor.b32 %r2787, %r2786, %r2781; + shf.l.wrap.b32 %r2788, %r2787, %r2787, 24; + add.s32 %r2789, %r2788, %r2782; + xor.b32 %r2790, %r2789, %r2784; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 25; + add.s32 %r2792, %r2744, %r1990; + add.s32 %r2793, %r2792, %r2763; + xor.b32 %r2794, %r2793, %r2732; + shf.l.wrap.b32 %r2795, %r2794, %r2794, 16; + add.s32 %r2796, %r2795, %r2719; + xor.b32 %r2797, %r2796, %r2763; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 20; + add.s32 %r2799, %r2793, %r2018; + add.s32 %r2800, %r2799, %r2798; + xor.b32 %r2801, %r2800, %r2795; + shf.l.wrap.b32 %r2802, %r2801, %r2801, 24; + add.s32 %r2803, %r2802, %r2796; + xor.b32 %r2804, %r2803, %r2798; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 25; + add.s32 %r2806, %r2758, %r2004; + add.s32 %r2807, %r2806, %r2721; + xor.b32 %r2808, %r2807, %r2746; + shf.l.wrap.b32 %r2809, %r2808, %r2808, 16; + add.s32 %r2810, %r2809, %r2733; + xor.b32 %r2811, %r2810, %r2721; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 20; + add.s32 %r2813, %r2807, %r2025; + add.s32 %r2814, %r2813, %r2812; + xor.b32 %r2815, %r2814, %r2809; + shf.l.wrap.b32 %r2816, %r2815, %r2815, 24; + add.s32 %r2817, %r2816, %r2810; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 25; + add.s32 %r2820, %r2772, %r2053; + add.s32 %r2821, %r2820, %r2819; + xor.b32 %r2822, %r2821, %r2788; + shf.l.wrap.b32 %r2823, %r2822, %r2822, 16; + add.s32 %r2824, %r2823, %r2803; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 20; + add.s32 %r2827, %r2821, %r2081; + add.s32 %r2828, %r2827, %r2826; + xor.b32 %r2829, %r2828, %r2823; + shf.l.wrap.b32 %r2830, %r2829, %r2829, 24; + add.s32 %r2831, %r2830, %r2824; + xor.b32 %r2832, %r2831, %r2826; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 25; + add.s32 %r2834, %r2786, %r2011; + add.s32 %r2835, %r2834, %r2777; + xor.b32 %r2836, %r2835, %r2802; + shf.l.wrap.b32 %r2837, %r2836, %r2836, 16; + add.s32 %r2838, %r2837, %r2817; + xor.b32 %r2839, %r2838, %r2777; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 20; + add.s32 %r2841, %r2835, %r1976; + add.s32 %r2842, %r2841, %r2840; + xor.b32 %r2843, %r2842, %r2837; + shf.l.wrap.b32 %r2844, %r2843, %r2843, 24; + add.s32 %r2845, %r2844, %r2838; + xor.b32 %r2846, %r2845, %r2840; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 25; + add.s32 %r2848, %r2800, %r1983; + add.s32 %r2849, %r2848, 
%r2791; + xor.b32 %r2850, %r2816, %r2849; + shf.l.wrap.b32 %r2851, %r2850, %r2850, 16; + add.s32 %r2852, %r2851, %r2775; + xor.b32 %r2853, %r2852, %r2791; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 20; + add.s32 %r2855, %r2849, %r2039; + add.s32 %r2856, %r2855, %r2854; + xor.b32 %r2857, %r2856, %r2851; + shf.l.wrap.b32 %r2858, %r2857, %r2857, 24; + add.s32 %r2859, %r2858, %r2852; + xor.b32 %r2860, %r2859, %r2854; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 25; + add.s32 %r2862, %r2805, %r2032; + add.s32 %r2863, %r2862, %r2814; + xor.b32 %r2864, %r2863, %r2774; + shf.l.wrap.b32 %r2865, %r2864, %r2864, 16; + add.s32 %r2866, %r2865, %r2789; + xor.b32 %r2867, %r2866, %r2805; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 20; + add.s32 %r2869, %r2863, %r2018; + add.s32 %r2870, %r2869, %r2868; + xor.b32 %r2871, %r2870, %r2865; + shf.l.wrap.b32 %r2872, %r2871, %r2871, 24; + add.s32 %r2873, %r2872, %r2866; + xor.b32 %r2874, %r2873, %r2868; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 25; + add.s32 %r2876, %r2847, %r2074; + add.s32 %r2877, %r2876, %r2828; + xor.b32 %r2878, %r2877, %r2872; + shf.l.wrap.b32 %r2879, %r2878, %r2878, 16; + add.s32 %r2880, %r2879, %r2859; + xor.b32 %r2881, %r2880, %r2847; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 20; + add.s32 %r2883, %r2877, %r2046; + add.s32 %r2884, %r2883, %r2882; + xor.b32 %r2885, %r2884, %r2879; + shf.l.wrap.b32 %r2886, %r2885, %r2885, 24; + add.s32 %r2887, %r2886, %r2880; + xor.b32 %r2888, %r2887, %r2882; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 25; + add.s32 %r2890, %r2842, %r1990; + add.s32 %r2891, %r2890, %r2861; + xor.b32 %r2892, %r2830, %r2891; + shf.l.wrap.b32 %r2893, %r2892, %r2892, 16; + add.s32 %r2894, %r2893, %r2873; + xor.b32 %r2895, %r2894, %r2861; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 20; + add.s32 %r2897, %r2891, %r2060; + add.s32 %r2898, %r2897, %r2896; + xor.b32 %r2899, %r2898, %r2893; + shf.l.wrap.b32 %r2900, %r2899, %r2899, 24; + add.s32 %r2901, %r2900, %r2894; + xor.b32 %r2902, %r2901, %r2896; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 25; + add.s32 %r2904, %r2856, %r1997; + add.s32 %r2905, %r2904, %r2875; + xor.b32 %r2906, %r2905, %r2844; + shf.l.wrap.b32 %r2907, %r2906, %r2906, 16; + add.s32 %r2908, %r2907, %r2831; + xor.b32 %r2909, %r2908, %r2875; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 20; + add.s32 %r2911, %r2905, %r2004; + add.s32 %r2912, %r2911, %r2910; + xor.b32 %r2913, %r2912, %r2907; + shf.l.wrap.b32 %r2914, %r2913, %r2913, 24; + add.s32 %r2915, %r2914, %r2908; + xor.b32 %r2916, %r2915, %r2910; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 25; + add.s32 %r2918, %r2870, %r2025; + add.s32 %r2919, %r2918, %r2833; + xor.b32 %r2920, %r2919, %r2858; + shf.l.wrap.b32 %r2921, %r2920, %r2920, 16; + add.s32 %r2922, %r2921, %r2845; + xor.b32 %r2923, %r2922, %r2833; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 20; + add.s32 %r2925, %r2919, %r2067; + add.s32 %r2926, %r2925, %r2924; + xor.b32 %r2927, %r2926, %r2921; + shf.l.wrap.b32 %r2928, %r2927, %r2927, 24; + add.s32 %r2929, %r2928, %r2922; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 25; + xor.b32 %r27, %r2915, %r2884; + xor.b32 %r28, %r2929, %r2898; + xor.b32 %r29, %r2887, %r2912; + xor.b32 %r30, %r2926, %r2901; + xor.b32 %r31, %r2931, %r2900; + xor.b32 %r32, %r2889, %r2914; + xor.b32 %r33, %r2928, %r2903; + xor.b32 %r34, %r2917, %r2886; + popc.b64 %r2932, %rd128; + cvt.u64.u32 %rd30, %r2932; + ld.local.u8 %rs127, [%rd3+8]; + cvt.u64.u16 %rd130, %rs127; + setp.ge.u64 %p15, %rd30, %rd130; + mul.wide.u16 %r11659, %rs127, 32; + @%p15 bra $L__BB1_19; + +$L__BB1_18: + add.s32 
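// Annotation (not part of the generated PTX), a hedged reading of the control
// flow: the final eight xor.b32 into %r27..%r34 are the feed-forward for the
// chunk's last block, i.e. its output chaining value. The popc.b64 of the
// chunk counter then appears to implement BLAKE3's incremental merkle tree:
// the CV stack is merged down (loop $L__BB1_18) while its height, read from
// [%rd3+8], exceeds the population count of the number of chunks hashed so
// far; otherwise the merge loop is skipped via the branch to $L__BB1_19.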
%r2933, %r11659, -64; + cvt.s64.s32 %rd131, %r2933; + add.s64 %rd132, %rd2, %rd131; + ld.local.u8 %r2934, [%rd3+2]; + ld.local.u8 %r2935, [%rd132+145]; + ld.local.u8 %r2936, [%rd132+146]; + prmt.b32 %r2937, %r2936, %r2935, 30212; + ld.local.u8 %r2938, [%rd132+147]; + prmt.b32 %r2939, %r2938, %r2937, 28756; + ld.local.u8 %r2940, [%rd132+148]; + prmt.b32 %r2941, %r2940, %r2939, 1620; + ld.local.u8 %r2942, [%rd132+149]; + ld.local.u8 %r2943, [%rd132+150]; + prmt.b32 %r2944, %r2943, %r2942, 30212; + ld.local.u8 %r2945, [%rd132+151]; + prmt.b32 %r2946, %r2945, %r2944, 28756; + ld.local.u8 %r2947, [%rd132+152]; + prmt.b32 %r2948, %r2947, %r2946, 1620; + ld.local.u8 %r2949, [%rd132+153]; + ld.local.u8 %r2950, [%rd132+154]; + prmt.b32 %r2951, %r2950, %r2949, 30212; + ld.local.u8 %r2952, [%rd132+155]; + prmt.b32 %r2953, %r2952, %r2951, 28756; + ld.local.u8 %r2954, [%rd132+156]; + prmt.b32 %r2955, %r2954, %r2953, 1620; + ld.local.u8 %r2956, [%rd132+157]; + ld.local.u8 %r2957, [%rd132+158]; + prmt.b32 %r2958, %r2957, %r2956, 30212; + ld.local.u8 %r2959, [%rd132+159]; + prmt.b32 %r2960, %r2959, %r2958, 28756; + ld.local.u8 %r2961, [%rd132+160]; + prmt.b32 %r2962, %r2961, %r2960, 1620; + ld.local.u8 %r2963, [%rd132+161]; + ld.local.u8 %r2964, [%rd132+162]; + prmt.b32 %r2965, %r2964, %r2963, 30212; + ld.local.u8 %r2966, [%rd132+163]; + prmt.b32 %r2967, %r2966, %r2965, 28756; + ld.local.u8 %r2968, [%rd132+164]; + prmt.b32 %r2969, %r2968, %r2967, 1620; + ld.local.u8 %r2970, [%rd132+165]; + ld.local.u8 %r2971, [%rd132+166]; + prmt.b32 %r2972, %r2971, %r2970, 30212; + ld.local.u8 %r2973, [%rd132+167]; + prmt.b32 %r2974, %r2973, %r2972, 28756; + ld.local.u8 %r2975, [%rd132+168]; + prmt.b32 %r2976, %r2975, %r2974, 1620; + ld.local.u8 %r2977, [%rd132+169]; + ld.local.u8 %r2978, [%rd132+170]; + prmt.b32 %r2979, %r2978, %r2977, 30212; + ld.local.u8 %r2980, [%rd132+171]; + prmt.b32 %r2981, %r2980, %r2979, 28756; + ld.local.u8 %r2982, [%rd132+172]; + prmt.b32 %r2983, %r2982, %r2981, 1620; + ld.local.u8 %r2984, [%rd132+173]; + ld.local.u8 %r2985, [%rd132+174]; + prmt.b32 %r2986, %r2985, %r2984, 30212; + ld.local.u8 %r2987, [%rd132+175]; + prmt.b32 %r2988, %r2987, %r2986, 28756; + ld.local.u8 %r2989, [%rd132+176]; + prmt.b32 %r2990, %r2989, %r2988, 1620; + ld.local.u8 %r2991, [%rd132+177]; + ld.local.u8 %r2992, [%rd132+178]; + prmt.b32 %r2993, %r2992, %r2991, 30212; + ld.local.u8 %r2994, [%rd132+179]; + prmt.b32 %r2995, %r2994, %r2993, 28756; + ld.local.u8 %r2996, [%rd132+180]; + prmt.b32 %r2997, %r2996, %r2995, 1620; + ld.local.u8 %r2998, [%rd132+181]; + ld.local.u8 %r2999, [%rd132+182]; + prmt.b32 %r3000, %r2999, %r2998, 30212; + ld.local.u8 %r3001, [%rd132+183]; + prmt.b32 %r3002, %r3001, %r3000, 28756; + ld.local.u8 %r3003, [%rd132+184]; + prmt.b32 %r3004, %r3003, %r3002, 1620; + ld.local.u8 %r3005, [%rd132+185]; + ld.local.u8 %r3006, [%rd132+186]; + prmt.b32 %r3007, %r3006, %r3005, 30212; + ld.local.u8 %r3008, [%rd132+187]; + prmt.b32 %r3009, %r3008, %r3007, 28756; + ld.local.u8 %r3010, [%rd132+188]; + prmt.b32 %r3011, %r3010, %r3009, 1620; + ld.local.u8 %r3012, [%rd132+189]; + ld.local.u8 %r3013, [%rd132+190]; + prmt.b32 %r3014, %r3013, %r3012, 30212; + ld.local.u8 %r3015, [%rd132+191]; + prmt.b32 %r3016, %r3015, %r3014, 28756; + ld.local.u8 %r3017, [%rd132+192]; + prmt.b32 %r3018, %r3017, %r3016, 1620; + ld.local.u8 %r3019, [%rd132+193]; + ld.local.u8 %r3020, [%rd132+194]; + prmt.b32 %r3021, %r3020, %r3019, 30212; + ld.local.u8 %r3022, [%rd132+195]; + prmt.b32 %r3023, %r3022, %r3021, 28756; + ld.local.u8 
%r3024, [%rd132+196]; + prmt.b32 %r3025, %r3024, %r3023, 1620; + ld.local.u8 %r3026, [%rd132+197]; + ld.local.u8 %r3027, [%rd132+198]; + prmt.b32 %r3028, %r3027, %r3026, 30212; + ld.local.u8 %r3029, [%rd132+199]; + prmt.b32 %r3030, %r3029, %r3028, 28756; + ld.local.u8 %r3031, [%rd132+200]; + prmt.b32 %r3032, %r3031, %r3030, 1620; + ld.local.u8 %r3033, [%rd132+201]; + ld.local.u8 %r3034, [%rd132+202]; + prmt.b32 %r3035, %r3034, %r3033, 30212; + ld.local.u8 %r3036, [%rd132+203]; + prmt.b32 %r3037, %r3036, %r3035, 28756; + ld.local.u8 %r3038, [%rd132+204]; + prmt.b32 %r3039, %r3038, %r3037, 1620; + ld.local.u8 %r3040, [%rd132+205]; + ld.local.u8 %r3041, [%rd132+206]; + prmt.b32 %r3042, %r3041, %r3040, 30212; + ld.local.u8 %r3043, [%rd132+207]; + prmt.b32 %r3044, %r3043, %r3042, 28756; + ld.local.u8 %r3045, [%rd132+208]; + prmt.b32 %r3046, %r3045, %r3044, 1620; + or.b32 %r3047, %r2934, 4; + ld.local.u8 %r3048, [%rd3+-120]; + ld.local.u8 %r3049, [%rd3+-119]; + prmt.b32 %r3050, %r3049, %r3048, 30212; + ld.local.u8 %r3051, [%rd3+-118]; + ld.local.u8 %r3052, [%rd3+-117]; + prmt.b32 %r3053, %r3052, %r3051, 30212; + prmt.b32 %r3054, %r3053, %r3050, 4180; + ld.local.u8 %r3055, [%rd3+-136]; + ld.local.u8 %r3056, [%rd3+-135]; + prmt.b32 %r3057, %r3056, %r3055, 30212; + ld.local.u8 %r3058, [%rd3+-134]; + ld.local.u8 %r3059, [%rd3+-133]; + prmt.b32 %r3060, %r3059, %r3058, 30212; + prmt.b32 %r3061, %r3060, %r3057, 4180; + add.s32 %r3062, %r3054, %r3061; + add.s32 %r3063, %r3062, %r2941; + shf.l.wrap.b32 %r3064, %r3063, %r3063, 16; + add.s32 %r3065, %r3064, 1779033703; + xor.b32 %r3066, %r3065, %r3054; + shf.l.wrap.b32 %r3067, %r3066, %r3066, 20; + add.s32 %r3068, %r2948, %r3063; + add.s32 %r3069, %r3068, %r3067; + xor.b32 %r3070, %r3069, %r3064; + shf.l.wrap.b32 %r3071, %r3070, %r3070, 24; + add.s32 %r3072, %r3071, %r3065; + xor.b32 %r3073, %r3072, %r3067; + shf.l.wrap.b32 %r3074, %r3073, %r3073, 25; + ld.local.u8 %r3075, [%rd3+-116]; + ld.local.u8 %r3076, [%rd3+-115]; + prmt.b32 %r3077, %r3076, %r3075, 30212; + ld.local.u8 %r3078, [%rd3+-114]; + ld.local.u8 %r3079, [%rd3+-113]; + prmt.b32 %r3080, %r3079, %r3078, 30212; + prmt.b32 %r3081, %r3080, %r3077, 4180; + ld.local.u8 %r3082, [%rd3+-132]; + ld.local.u8 %r3083, [%rd3+-131]; + prmt.b32 %r3084, %r3083, %r3082, 30212; + ld.local.u8 %r3085, [%rd3+-130]; + ld.local.u8 %r3086, [%rd3+-129]; + prmt.b32 %r3087, %r3086, %r3085, 30212; + prmt.b32 %r3088, %r3087, %r3084, 4180; + add.s32 %r3089, %r3081, %r3088; + add.s32 %r3090, %r3089, %r2955; + shf.l.wrap.b32 %r3091, %r3090, %r3090, 16; + add.s32 %r3092, %r3091, -1150833019; + xor.b32 %r3093, %r3092, %r3081; + shf.l.wrap.b32 %r3094, %r3093, %r3093, 20; + add.s32 %r3095, %r2962, %r3090; + add.s32 %r3096, %r3095, %r3094; + xor.b32 %r3097, %r3096, %r3091; + shf.l.wrap.b32 %r3098, %r3097, %r3097, 24; + add.s32 %r3099, %r3098, %r3092; + xor.b32 %r3100, %r3099, %r3094; + shf.l.wrap.b32 %r3101, %r3100, %r3100, 25; + ld.local.u8 %r3102, [%rd3+-112]; + ld.local.u8 %r3103, [%rd3+-111]; + prmt.b32 %r3104, %r3103, %r3102, 30212; + ld.local.u8 %r3105, [%rd3+-110]; + ld.local.u8 %r3106, [%rd3+-109]; + prmt.b32 %r3107, %r3106, %r3105, 30212; + prmt.b32 %r3108, %r3107, %r3104, 4180; + ld.local.u8 %r3109, [%rd3+-128]; + ld.local.u8 %r3110, [%rd3+-127]; + prmt.b32 %r3111, %r3110, %r3109, 30212; + ld.local.u8 %r3112, [%rd3+-126]; + ld.local.u8 %r3113, [%rd3+-125]; + prmt.b32 %r3114, %r3113, %r3112, 30212; + prmt.b32 %r3115, %r3114, %r3111, 4180; + add.s32 %r3116, %r3108, %r3115; + add.s32 %r3117, %r3116, %r2969; + shr.u32 %r3118, 
%r3117, 16; + shl.b32 %r3119, %r3117, 16; + xor.b32 %r3120, %r3119, 4194304; + or.b32 %r3121, %r3120, %r3118; + add.s32 %r3122, %r3121, 1013904242; + xor.b32 %r3123, %r3122, %r3108; + shf.l.wrap.b32 %r3124, %r3123, %r3123, 20; + add.s32 %r3125, %r2976, %r3117; + add.s32 %r3126, %r3125, %r3124; + xor.b32 %r3127, %r3126, %r3121; + shf.l.wrap.b32 %r3128, %r3127, %r3127, 24; + add.s32 %r3129, %r3128, %r3122; + xor.b32 %r3130, %r3129, %r3124; + shf.l.wrap.b32 %r3131, %r3130, %r3130, 25; + ld.local.u8 %r3132, [%rd3+-108]; + ld.local.u8 %r3133, [%rd3+-107]; + prmt.b32 %r3134, %r3133, %r3132, 30212; + ld.local.u8 %r3135, [%rd3+-106]; + ld.local.u8 %r3136, [%rd3+-105]; + prmt.b32 %r3137, %r3136, %r3135, 30212; + prmt.b32 %r3138, %r3137, %r3134, 4180; + ld.local.u8 %r3139, [%rd3+-124]; + ld.local.u8 %r3140, [%rd3+-123]; + prmt.b32 %r3141, %r3140, %r3139, 30212; + ld.local.u8 %r3142, [%rd3+-122]; + ld.local.u8 %r3143, [%rd3+-121]; + prmt.b32 %r3144, %r3143, %r3142, 30212; + prmt.b32 %r3145, %r3144, %r3141, 4180; + add.s32 %r3146, %r3138, %r3145; + add.s32 %r3147, %r3146, %r2983; + xor.b32 %r3148, %r3147, %r3047; + shr.u32 %r3149, %r3147, 16; + shl.b32 %r3150, %r3148, 16; + or.b32 %r3151, %r3150, %r3149; + add.s32 %r3152, %r3151, -1521486534; + xor.b32 %r3153, %r3152, %r3138; + shf.l.wrap.b32 %r3154, %r3153, %r3153, 20; + add.s32 %r3155, %r2990, %r3147; + add.s32 %r3156, %r3155, %r3154; + xor.b32 %r3157, %r3156, %r3151; + shf.l.wrap.b32 %r3158, %r3157, %r3157, 24; + add.s32 %r3159, %r3158, %r3152; + xor.b32 %r3160, %r3159, %r3154; + shf.l.wrap.b32 %r3161, %r3160, %r3160, 25; + add.s32 %r3162, %r3101, %r3069; + add.s32 %r3163, %r3162, %r2997; + xor.b32 %r3164, %r3158, %r3163; + shf.l.wrap.b32 %r3165, %r3164, %r3164, 16; + add.s32 %r3166, %r3165, %r3129; + xor.b32 %r3167, %r3166, %r3101; + shf.l.wrap.b32 %r3168, %r3167, %r3167, 20; + add.s32 %r3169, %r3004, %r3163; + add.s32 %r3170, %r3169, %r3168; + xor.b32 %r3171, %r3170, %r3165; + shf.l.wrap.b32 %r3172, %r3171, %r3171, 24; + add.s32 %r3173, %r3172, %r3166; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 25; + add.s32 %r3176, %r3131, %r3096; + add.s32 %r3177, %r3176, %r3011; + xor.b32 %r3178, %r3177, %r3071; + shf.l.wrap.b32 %r3179, %r3178, %r3178, 16; + add.s32 %r3180, %r3179, %r3159; + xor.b32 %r3181, %r3180, %r3131; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 20; + add.s32 %r3183, %r3018, %r3177; + add.s32 %r3184, %r3183, %r3182; + xor.b32 %r3185, %r3184, %r3179; + shf.l.wrap.b32 %r3186, %r3185, %r3185, 24; + add.s32 %r3187, %r3186, %r3180; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 25; + add.s32 %r3190, %r3161, %r3126; + add.s32 %r3191, %r3190, %r3025; + xor.b32 %r3192, %r3191, %r3098; + shf.l.wrap.b32 %r3193, %r3192, %r3192, 16; + add.s32 %r3194, %r3193, %r3072; + xor.b32 %r3195, %r3194, %r3161; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 20; + add.s32 %r3197, %r3032, %r3191; + add.s32 %r3198, %r3197, %r3196; + xor.b32 %r3199, %r3198, %r3193; + shf.l.wrap.b32 %r3200, %r3199, %r3199, 24; + add.s32 %r3201, %r3200, %r3194; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 25; + add.s32 %r3204, %r3156, %r3074; + add.s32 %r3205, %r3204, %r3039; + xor.b32 %r3206, %r3205, %r3128; + shf.l.wrap.b32 %r3207, %r3206, %r3206, 16; + add.s32 %r3208, %r3207, %r3099; + xor.b32 %r3209, %r3208, %r3074; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 20; + add.s32 %r3211, %r3046, %r3205; + add.s32 %r3212, %r3211, %r3210; + xor.b32 %r3213, %r3212, %r3207; + shf.l.wrap.b32 %r3214, %r3213, %r3213, 
24; + add.s32 %r3215, %r3214, %r3208; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 25; + add.s32 %r3218, %r3170, %r2955; + add.s32 %r3219, %r3218, %r3217; + xor.b32 %r3220, %r3219, %r3186; + shf.l.wrap.b32 %r3221, %r3220, %r3220, 16; + add.s32 %r3222, %r3221, %r3201; + xor.b32 %r3223, %r3222, %r3217; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 20; + add.s32 %r3225, %r3219, %r2983; + add.s32 %r3226, %r3225, %r3224; + xor.b32 %r3227, %r3226, %r3221; + shf.l.wrap.b32 %r3228, %r3227, %r3227, 24; + add.s32 %r3229, %r3228, %r3222; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 25; + add.s32 %r3232, %r3184, %r2962; + add.s32 %r3233, %r3232, %r3175; + xor.b32 %r3234, %r3200, %r3233; + shf.l.wrap.b32 %r3235, %r3234, %r3234, 16; + add.s32 %r3236, %r3215, %r3235; + xor.b32 %r3237, %r3236, %r3175; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 20; + add.s32 %r3239, %r3233, %r3011; + add.s32 %r3240, %r3239, %r3238; + xor.b32 %r3241, %r3240, %r3235; + shf.l.wrap.b32 %r3242, %r3241, %r3241, 24; + add.s32 %r3243, %r3242, %r3236; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 25; + add.s32 %r3246, %r3189, %r2990; + add.s32 %r3247, %r3246, %r3198; + xor.b32 %r3248, %r3214, %r3247; + shf.l.wrap.b32 %r3249, %r3248, %r3248, 16; + add.s32 %r3250, %r3249, %r3173; + xor.b32 %r3251, %r3250, %r3189; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 20; + add.s32 %r3253, %r3247, %r2941; + add.s32 %r3254, %r3253, %r3252; + xor.b32 %r3255, %r3254, %r3249; + shf.l.wrap.b32 %r3256, %r3255, %r3255, 24; + add.s32 %r3257, %r3256, %r3250; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 25; + add.s32 %r3260, %r3203, %r2969; + add.s32 %r3261, %r3260, %r3212; + xor.b32 %r3262, %r3261, %r3172; + shf.l.wrap.b32 %r3263, %r3262, %r3262, 16; + add.s32 %r3264, %r3263, %r3187; + xor.b32 %r3265, %r3264, %r3203; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 20; + add.s32 %r3267, %r3261, %r3032; + add.s32 %r3268, %r3267, %r3266; + xor.b32 %r3269, %r3268, %r3263; + shf.l.wrap.b32 %r3270, %r3269, %r3269, 24; + add.s32 %r3271, %r3270, %r3264; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 25; + add.s32 %r3274, %r3245, %r2948; + add.s32 %r3275, %r3274, %r3226; + xor.b32 %r3276, %r3275, %r3270; + shf.l.wrap.b32 %r3277, %r3276, %r3276, 16; + add.s32 %r3278, %r3277, %r3257; + xor.b32 %r3279, %r3278, %r3245; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 20; + add.s32 %r3281, %r3275, %r3018; + add.s32 %r3282, %r3281, %r3280; + xor.b32 %r3283, %r3282, %r3277; + shf.l.wrap.b32 %r3284, %r3283, %r3283, 24; + add.s32 %r3285, %r3284, %r3278; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 25; + add.s32 %r3288, %r3240, %r3025; + add.s32 %r3289, %r3288, %r3259; + xor.b32 %r3290, %r3228, %r3289; + shf.l.wrap.b32 %r3291, %r3290, %r3290, 16; + add.s32 %r3292, %r3291, %r3271; + xor.b32 %r3293, %r3292, %r3259; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 20; + add.s32 %r3295, %r3289, %r2976; + add.s32 %r3296, %r3295, %r3294; + xor.b32 %r3297, %r3296, %r3291; + shf.l.wrap.b32 %r3298, %r3297, %r3297, 24; + add.s32 %r3299, %r3298, %r3292; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 25; + add.s32 %r3302, %r3254, %r3004; + add.s32 %r3303, %r3302, %r3273; + xor.b32 %r3304, %r3303, %r3242; + shf.l.wrap.b32 %r3305, %r3304, %r3304, 16; + add.s32 %r3306, %r3305, %r3229; + xor.b32 %r3307, %r3306, %r3273; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 20; + add.s32 %r3309, %r3303, %r3039; + add.s32 %r3310, 
%r3309, %r3308; + xor.b32 %r3311, %r3310, %r3305; + shf.l.wrap.b32 %r3312, %r3311, %r3311, 24; + add.s32 %r3313, %r3312, %r3306; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 25; + add.s32 %r3316, %r3268, %r3046; + add.s32 %r3317, %r3316, %r3231; + xor.b32 %r3318, %r3317, %r3256; + shf.l.wrap.b32 %r3319, %r3318, %r3318, 16; + add.s32 %r3320, %r3319, %r3243; + xor.b32 %r3321, %r3320, %r3231; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 20; + add.s32 %r3323, %r3317, %r2997; + add.s32 %r3324, %r3323, %r3322; + xor.b32 %r3325, %r3324, %r3319; + shf.l.wrap.b32 %r3326, %r3325, %r3325, 24; + add.s32 %r3327, %r3326, %r3320; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 25; + add.s32 %r3330, %r3282, %r2962; + add.s32 %r3331, %r3330, %r3329; + xor.b32 %r3332, %r3331, %r3298; + shf.l.wrap.b32 %r3333, %r3332, %r3332, 16; + add.s32 %r3334, %r3333, %r3313; + xor.b32 %r3335, %r3334, %r3329; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 20; + add.s32 %r3337, %r3331, %r2969; + add.s32 %r3338, %r3337, %r3336; + xor.b32 %r3339, %r3338, %r3333; + shf.l.wrap.b32 %r3340, %r3339, %r3339, 24; + add.s32 %r3341, %r3340, %r3334; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 25; + add.s32 %r3344, %r3296, %r3011; + add.s32 %r3345, %r3344, %r3287; + xor.b32 %r3346, %r3345, %r3312; + shf.l.wrap.b32 %r3347, %r3346, %r3346, 16; + add.s32 %r3348, %r3347, %r3327; + xor.b32 %r3349, %r3348, %r3287; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 20; + add.s32 %r3351, %r3345, %r3025; + add.s32 %r3352, %r3351, %r3350; + xor.b32 %r3353, %r3352, %r3347; + shf.l.wrap.b32 %r3354, %r3353, %r3353, 24; + add.s32 %r3355, %r3354, %r3348; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 25; + add.s32 %r3358, %r3310, %r3032; + add.s32 %r3359, %r3358, %r3301; + xor.b32 %r3360, %r3326, %r3359; + shf.l.wrap.b32 %r3361, %r3360, %r3360, 16; + add.s32 %r3362, %r3361, %r3285; + xor.b32 %r3363, %r3362, %r3301; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 20; + add.s32 %r3365, %r3359, %r2955; + add.s32 %r3366, %r3365, %r3364; + xor.b32 %r3367, %r3366, %r3361; + shf.l.wrap.b32 %r3368, %r3367, %r3367, 24; + add.s32 %r3369, %r3368, %r3362; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 25; + add.s32 %r3372, %r3315, %r2990; + add.s32 %r3373, %r3372, %r3324; + xor.b32 %r3374, %r3373, %r3284; + shf.l.wrap.b32 %r3375, %r3374, %r3374, 16; + add.s32 %r3376, %r3375, %r3299; + xor.b32 %r3377, %r3376, %r3315; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 20; + add.s32 %r3379, %r3373, %r3039; + add.s32 %r3380, %r3379, %r3378; + xor.b32 %r3381, %r3380, %r3375; + shf.l.wrap.b32 %r3382, %r3381, %r3381, 24; + add.s32 %r3383, %r3382, %r3376; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 25; + add.s32 %r3386, %r3357, %r2983; + add.s32 %r3387, %r3386, %r3338; + xor.b32 %r3388, %r3387, %r3382; + shf.l.wrap.b32 %r3389, %r3388, %r3388, 16; + add.s32 %r3390, %r3389, %r3369; + xor.b32 %r3391, %r3390, %r3357; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 20; + add.s32 %r3393, %r3387, %r2976; + add.s32 %r3394, %r3393, %r3392; + xor.b32 %r3395, %r3394, %r3389; + shf.l.wrap.b32 %r3396, %r3395, %r3395, 24; + add.s32 %r3397, %r3396, %r3390; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 25; + add.s32 %r3400, %r3352, %r3004; + add.s32 %r3401, %r3400, %r3371; + xor.b32 %r3402, %r3340, %r3401; + shf.l.wrap.b32 %r3403, %r3402, %r3402, 16; + add.s32 %r3404, %r3403, %r3383; + xor.b32 %r3405, %r3404, %r3371; + 
shf.l.wrap.b32 %r3406, %r3405, %r3405, 20; + add.s32 %r3407, %r3401, %r2941; + add.s32 %r3408, %r3407, %r3406; + xor.b32 %r3409, %r3408, %r3403; + shf.l.wrap.b32 %r3410, %r3409, %r3409, 24; + add.s32 %r3411, %r3410, %r3404; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 25; + add.s32 %r3414, %r3366, %r3018; + add.s32 %r3415, %r3414, %r3385; + xor.b32 %r3416, %r3415, %r3354; + shf.l.wrap.b32 %r3417, %r3416, %r3416, 16; + add.s32 %r3418, %r3417, %r3341; + xor.b32 %r3419, %r3418, %r3385; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 20; + add.s32 %r3421, %r3415, %r3046; + add.s32 %r3422, %r3421, %r3420; + xor.b32 %r3423, %r3422, %r3417; + shf.l.wrap.b32 %r3424, %r3423, %r3423, 24; + add.s32 %r3425, %r3424, %r3418; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 25; + add.s32 %r3428, %r3380, %r2997; + add.s32 %r3429, %r3428, %r3343; + xor.b32 %r3430, %r3429, %r3368; + shf.l.wrap.b32 %r3431, %r3430, %r3430, 16; + add.s32 %r3432, %r3431, %r3355; + xor.b32 %r3433, %r3432, %r3343; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 20; + add.s32 %r3435, %r3429, %r2948; + add.s32 %r3436, %r3435, %r3434; + xor.b32 %r3437, %r3436, %r3431; + shf.l.wrap.b32 %r3438, %r3437, %r3437, 24; + add.s32 %r3439, %r3438, %r3432; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 25; + add.s32 %r3442, %r3394, %r3011; + add.s32 %r3443, %r3442, %r3441; + xor.b32 %r3444, %r3443, %r3410; + shf.l.wrap.b32 %r3445, %r3444, %r3444, 16; + add.s32 %r3446, %r3445, %r3425; + xor.b32 %r3447, %r3446, %r3441; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 20; + add.s32 %r3449, %r3443, %r2990; + add.s32 %r3450, %r3449, %r3448; + xor.b32 %r3451, %r3450, %r3445; + shf.l.wrap.b32 %r3452, %r3451, %r3451, 24; + add.s32 %r3453, %r3452, %r3446; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 25; + add.s32 %r3456, %r3408, %r3025; + add.s32 %r3457, %r3456, %r3399; + xor.b32 %r3458, %r3457, %r3424; + shf.l.wrap.b32 %r3459, %r3458, %r3458, 16; + add.s32 %r3460, %r3459, %r3439; + xor.b32 %r3461, %r3460, %r3399; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 20; + add.s32 %r3463, %r3457, %r3004; + add.s32 %r3464, %r3463, %r3462; + xor.b32 %r3465, %r3464, %r3459; + shf.l.wrap.b32 %r3466, %r3465, %r3465, 24; + add.s32 %r3467, %r3466, %r3460; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 25; + add.s32 %r3470, %r3422, %r3039; + add.s32 %r3471, %r3470, %r3413; + xor.b32 %r3472, %r3438, %r3471; + shf.l.wrap.b32 %r3473, %r3472, %r3472, 16; + add.s32 %r3474, %r3473, %r3397; + xor.b32 %r3475, %r3474, %r3413; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 20; + add.s32 %r3477, %r3471, %r2962; + add.s32 %r3478, %r3477, %r3476; + xor.b32 %r3479, %r3478, %r3473; + shf.l.wrap.b32 %r3480, %r3479, %r3479, 24; + add.s32 %r3481, %r3480, %r3474; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 25; + add.s32 %r3484, %r3427, %r3032; + add.s32 %r3485, %r3484, %r3436; + xor.b32 %r3486, %r3485, %r3396; + shf.l.wrap.b32 %r3487, %r3486, %r3486, 16; + add.s32 %r3488, %r3487, %r3411; + xor.b32 %r3489, %r3488, %r3427; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 20; + add.s32 %r3491, %r3485, %r3046; + add.s32 %r3492, %r3491, %r3490; + xor.b32 %r3493, %r3492, %r3487; + shf.l.wrap.b32 %r3494, %r3493, %r3493, 24; + add.s32 %r3495, %r3494, %r3488; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 25; + add.s32 %r3498, %r3469, %r2969; + add.s32 %r3499, %r3498, %r3450; + xor.b32 %r3500, %r3499, %r3494; + shf.l.wrap.b32 %r3501, 
%r3500, %r3500, 16; + add.s32 %r3502, %r3501, %r3481; + xor.b32 %r3503, %r3502, %r3469; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 20; + add.s32 %r3505, %r3499, %r2941; + add.s32 %r3506, %r3505, %r3504; + xor.b32 %r3507, %r3506, %r3501; + shf.l.wrap.b32 %r3508, %r3507, %r3507, 24; + add.s32 %r3509, %r3508, %r3502; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 25; + add.s32 %r3512, %r3464, %r3018; + add.s32 %r3513, %r3512, %r3483; + xor.b32 %r3514, %r3452, %r3513; + shf.l.wrap.b32 %r3515, %r3514, %r3514, 16; + add.s32 %r3516, %r3515, %r3495; + xor.b32 %r3517, %r3516, %r3483; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 20; + add.s32 %r3519, %r3513, %r2955; + add.s32 %r3520, %r3519, %r3518; + xor.b32 %r3521, %r3520, %r3515; + shf.l.wrap.b32 %r3522, %r3521, %r3521, 24; + add.s32 %r3523, %r3522, %r3516; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 25; + add.s32 %r3526, %r3478, %r2976; + add.s32 %r3527, %r3526, %r3497; + xor.b32 %r3528, %r3527, %r3466; + shf.l.wrap.b32 %r3529, %r3528, %r3528, 16; + add.s32 %r3530, %r3529, %r3453; + xor.b32 %r3531, %r3530, %r3497; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 20; + add.s32 %r3533, %r3527, %r2997; + add.s32 %r3534, %r3533, %r3532; + xor.b32 %r3535, %r3534, %r3529; + shf.l.wrap.b32 %r3536, %r3535, %r3535, 24; + add.s32 %r3537, %r3536, %r3530; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 25; + add.s32 %r3540, %r3492, %r2948; + add.s32 %r3541, %r3540, %r3455; + xor.b32 %r3542, %r3541, %r3480; + shf.l.wrap.b32 %r3543, %r3542, %r3542, 16; + add.s32 %r3544, %r3543, %r3467; + xor.b32 %r3545, %r3544, %r3455; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 20; + add.s32 %r3547, %r3541, %r2983; + add.s32 %r3548, %r3547, %r3546; + xor.b32 %r3549, %r3548, %r3543; + shf.l.wrap.b32 %r3550, %r3549, %r3549, 24; + add.s32 %r3551, %r3550, %r3544; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 25; + add.s32 %r3554, %r3506, %r3025; + add.s32 %r3555, %r3554, %r3553; + xor.b32 %r3556, %r3555, %r3522; + shf.l.wrap.b32 %r3557, %r3556, %r3556, 16; + add.s32 %r3558, %r3557, %r3537; + xor.b32 %r3559, %r3558, %r3553; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 20; + add.s32 %r3561, %r3555, %r3032; + add.s32 %r3562, %r3561, %r3560; + xor.b32 %r3563, %r3562, %r3557; + shf.l.wrap.b32 %r3564, %r3563, %r3563, 24; + add.s32 %r3565, %r3564, %r3558; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 25; + add.s32 %r3568, %r3520, %r3004; + add.s32 %r3569, %r3568, %r3511; + xor.b32 %r3570, %r3569, %r3536; + shf.l.wrap.b32 %r3571, %r3570, %r3570, 16; + add.s32 %r3572, %r3571, %r3551; + xor.b32 %r3573, %r3572, %r3511; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 20; + add.s32 %r3575, %r3569, %r3018; + add.s32 %r3576, %r3575, %r3574; + xor.b32 %r3577, %r3576, %r3571; + shf.l.wrap.b32 %r3578, %r3577, %r3577, 24; + add.s32 %r3579, %r3578, %r3572; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 25; + add.s32 %r3582, %r3534, %r3046; + add.s32 %r3583, %r3582, %r3525; + xor.b32 %r3584, %r3550, %r3583; + shf.l.wrap.b32 %r3585, %r3584, %r3584, 16; + add.s32 %r3586, %r3585, %r3509; + xor.b32 %r3587, %r3586, %r3525; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 20; + add.s32 %r3589, %r3583, %r3011; + add.s32 %r3590, %r3589, %r3588; + xor.b32 %r3591, %r3590, %r3585; + shf.l.wrap.b32 %r3592, %r3591, %r3591, 24; + add.s32 %r3593, %r3592, %r3586; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 25; + add.s32 %r3596, %r3539, %r3039; + 
add.s32 %r3597, %r3596, %r3548; + xor.b32 %r3598, %r3597, %r3508; + shf.l.wrap.b32 %r3599, %r3598, %r3598, 16; + add.s32 %r3600, %r3599, %r3523; + xor.b32 %r3601, %r3600, %r3539; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 20; + add.s32 %r3603, %r3597, %r2997; + add.s32 %r3604, %r3603, %r3602; + xor.b32 %r3605, %r3604, %r3599; + shf.l.wrap.b32 %r3606, %r3605, %r3605, 24; + add.s32 %r3607, %r3606, %r3600; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 25; + add.s32 %r3610, %r3581, %r2990; + add.s32 %r3611, %r3610, %r3562; + xor.b32 %r3612, %r3611, %r3606; + shf.l.wrap.b32 %r3613, %r3612, %r3612, 16; + add.s32 %r3614, %r3613, %r3593; + xor.b32 %r3615, %r3614, %r3581; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 20; + add.s32 %r3617, %r3611, %r2955; + add.s32 %r3618, %r3617, %r3616; + xor.b32 %r3619, %r3618, %r3613; + shf.l.wrap.b32 %r3620, %r3619, %r3619, 24; + add.s32 %r3621, %r3620, %r3614; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 25; + add.s32 %r3624, %r3576, %r2976; + add.s32 %r3625, %r3624, %r3595; + xor.b32 %r3626, %r3564, %r3625; + shf.l.wrap.b32 %r3627, %r3626, %r3626, 16; + add.s32 %r3628, %r3627, %r3607; + xor.b32 %r3629, %r3628, %r3595; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 20; + add.s32 %r3631, %r3625, %r2962; + add.s32 %r3632, %r3631, %r3630; + xor.b32 %r3633, %r3632, %r3627; + shf.l.wrap.b32 %r3634, %r3633, %r3633, 24; + add.s32 %r3635, %r3634, %r3628; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 25; + add.s32 %r3638, %r3590, %r2941; + add.s32 %r3639, %r3638, %r3609; + xor.b32 %r3640, %r3639, %r3578; + shf.l.wrap.b32 %r3641, %r3640, %r3640, 16; + add.s32 %r3642, %r3641, %r3565; + xor.b32 %r3643, %r3642, %r3609; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 20; + add.s32 %r3645, %r3639, %r2948; + add.s32 %r3646, %r3645, %r3644; + xor.b32 %r3647, %r3646, %r3641; + shf.l.wrap.b32 %r3648, %r3647, %r3647, 24; + add.s32 %r3649, %r3648, %r3642; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 25; + add.s32 %r3652, %r3604, %r2983; + add.s32 %r3653, %r3652, %r3567; + xor.b32 %r3654, %r3653, %r3592; + shf.l.wrap.b32 %r3655, %r3654, %r3654, 16; + add.s32 %r3656, %r3655, %r3579; + xor.b32 %r3657, %r3656, %r3567; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 20; + add.s32 %r3659, %r3653, %r2969; + add.s32 %r3660, %r3659, %r3658; + xor.b32 %r3661, %r3660, %r3655; + shf.l.wrap.b32 %r3662, %r3661, %r3661, 24; + add.s32 %r3663, %r3662, %r3656; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 25; + add.s32 %r3666, %r3618, %r3004; + add.s32 %r3667, %r3666, %r3665; + xor.b32 %r3668, %r3667, %r3634; + shf.l.wrap.b32 %r3669, %r3668, %r3668, 16; + add.s32 %r3670, %r3669, %r3649; + xor.b32 %r3671, %r3670, %r3665; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 20; + add.s32 %r3673, %r3667, %r3039; + add.s32 %r3674, %r3673, %r3672; + xor.b32 %r3675, %r3674, %r3669; + shf.l.wrap.b32 %r3676, %r3675, %r3675, 24; + add.s32 %r3677, %r3676, %r3670; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 25; + add.s32 %r3680, %r3632, %r3018; + add.s32 %r3681, %r3680, %r3623; + xor.b32 %r3682, %r3681, %r3648; + shf.l.wrap.b32 %r3683, %r3682, %r3682, 16; + add.s32 %r3684, %r3683, %r3663; + xor.b32 %r3685, %r3684, %r3623; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 20; + add.s32 %r3687, %r3681, %r2976; + add.s32 %r3688, %r3687, %r3686; + xor.b32 %r3689, %r3688, %r3683; + shf.l.wrap.b32 %r3690, %r3689, %r3689, 24; + add.s32 %r3691, %r3690, %r3684; + xor.b32 %r3692, %r3691, 
%r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 25; + add.s32 %r3694, %r3646, %r2997; + add.s32 %r3695, %r3694, %r3637; + xor.b32 %r3696, %r3662, %r3695; + shf.l.wrap.b32 %r3697, %r3696, %r3696, 16; + add.s32 %r3698, %r3697, %r3621; + xor.b32 %r3699, %r3698, %r3637; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 20; + add.s32 %r3701, %r3695, %r3025; + add.s32 %r3702, %r3701, %r3700; + xor.b32 %r3703, %r3702, %r3697; + shf.l.wrap.b32 %r3704, %r3703, %r3703, 24; + add.s32 %r3705, %r3704, %r3698; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 25; + add.s32 %r3708, %r3651, %r3046; + add.s32 %r3709, %r3708, %r3660; + xor.b32 %r3710, %r3709, %r3620; + shf.l.wrap.b32 %r3711, %r3710, %r3710, 16; + add.s32 %r3712, %r3711, %r3635; + xor.b32 %r3713, %r3712, %r3651; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 20; + add.s32 %r3715, %r3709, %r2948; + add.s32 %r3716, %r3715, %r3714; + xor.b32 %r3717, %r3716, %r3711; + shf.l.wrap.b32 %r3718, %r3717, %r3717, 24; + add.s32 %r3719, %r3718, %r3712; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 25; + add.s32 %r3722, %r3693, %r3032; + add.s32 %r3723, %r3722, %r3674; + xor.b32 %r3724, %r3723, %r3718; + shf.l.wrap.b32 %r3725, %r3724, %r3724, 16; + add.s32 %r3726, %r3725, %r3705; + xor.b32 %r3727, %r3726, %r3693; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 20; + add.s32 %r3729, %r3723, %r2962; + add.s32 %r3730, %r3729, %r3728; + xor.b32 %r3731, %r3730, %r3725; + shf.l.wrap.b32 %r3732, %r3731, %r3731, 24; + add.s32 %r3733, %r3732, %r3726; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 25; + add.s32 %r3736, %r3688, %r2941; + add.s32 %r3737, %r3736, %r3707; + xor.b32 %r3738, %r3676, %r3737; + shf.l.wrap.b32 %r3739, %r3738, %r3738, 16; + add.s32 %r3740, %r3739, %r3719; + xor.b32 %r3741, %r3740, %r3707; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 20; + add.s32 %r3743, %r3737, %r3011; + add.s32 %r3744, %r3743, %r3742; + xor.b32 %r3745, %r3744, %r3739; + shf.l.wrap.b32 %r3746, %r3745, %r3745, 24; + add.s32 %r3747, %r3746, %r3740; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 25; + add.s32 %r3750, %r3702, %r2955; + add.s32 %r3751, %r3750, %r3721; + xor.b32 %r3752, %r3751, %r3690; + shf.l.wrap.b32 %r3753, %r3752, %r3752, 16; + add.s32 %r3754, %r3753, %r3677; + xor.b32 %r3755, %r3754, %r3721; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 20; + add.s32 %r3757, %r3751, %r2983; + add.s32 %r3758, %r3757, %r3756; + xor.b32 %r3759, %r3758, %r3753; + shf.l.wrap.b32 %r3760, %r3759, %r3759, 24; + add.s32 %r3761, %r3760, %r3754; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 25; + add.s32 %r3764, %r3716, %r2969; + add.s32 %r3765, %r3764, %r3679; + xor.b32 %r3766, %r3765, %r3704; + shf.l.wrap.b32 %r3767, %r3766, %r3766, 16; + add.s32 %r3768, %r3767, %r3691; + xor.b32 %r3769, %r3768, %r3679; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 20; + add.s32 %r3771, %r3765, %r2990; + add.s32 %r3772, %r3771, %r3770; + xor.b32 %r3773, %r3772, %r3767; + shf.l.wrap.b32 %r3774, %r3773, %r3773, 24; + add.s32 %r3775, %r3774, %r3768; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 25; + add.s32 %r3778, %r3730, %r3018; + add.s32 %r3779, %r3778, %r3777; + xor.b32 %r3780, %r3779, %r3746; + shf.l.wrap.b32 %r3781, %r3780, %r3780, 16; + add.s32 %r3782, %r3781, %r3761; + xor.b32 %r3783, %r3782, %r3777; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 20; + add.s32 %r3785, %r3779, %r3046; + add.s32 %r3786, %r3785, %r3784; + xor.b32 %r3787, %r3786, %r3781; + shf.l.wrap.b32 
%r3788, %r3787, %r3787, 24; + add.s32 %r3789, %r3788, %r3782; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 25; + add.s32 %r3792, %r3744, %r2976; + add.s32 %r3793, %r3792, %r3735; + xor.b32 %r3794, %r3793, %r3760; + shf.l.wrap.b32 %r3795, %r3794, %r3794, 16; + add.s32 %r3796, %r3795, %r3775; + xor.b32 %r3797, %r3796, %r3735; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 20; + add.s32 %r3799, %r3793, %r2941; + add.s32 %r3800, %r3799, %r3798; + xor.b32 %r3801, %r3800, %r3795; + shf.l.wrap.b32 %r3802, %r3801, %r3801, 24; + add.s32 %r3803, %r3802, %r3796; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 25; + add.s32 %r3806, %r3758, %r2948; + add.s32 %r3807, %r3806, %r3749; + xor.b32 %r3808, %r3774, %r3807; + shf.l.wrap.b32 %r3809, %r3808, %r3808, 16; + add.s32 %r3810, %r3809, %r3733; + xor.b32 %r3811, %r3810, %r3749; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 20; + add.s32 %r3813, %r3807, %r3004; + add.s32 %r3814, %r3813, %r3812; + xor.b32 %r3815, %r3814, %r3809; + shf.l.wrap.b32 %r3816, %r3815, %r3815, 24; + add.s32 %r3817, %r3816, %r3810; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 25; + add.s32 %r3820, %r3763, %r2997; + add.s32 %r3821, %r3820, %r3772; + xor.b32 %r3822, %r3821, %r3732; + shf.l.wrap.b32 %r3823, %r3822, %r3822, 16; + add.s32 %r3824, %r3823, %r3747; + xor.b32 %r3825, %r3824, %r3763; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 20; + add.s32 %r3827, %r3821, %r2983; + add.s32 %r3828, %r3827, %r3826; + xor.b32 %r3829, %r3828, %r3823; + shf.l.wrap.b32 %r3830, %r3829, %r3829, 24; + add.s32 %r3831, %r3830, %r3824; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 25; + add.s32 %r3834, %r3805, %r3039; + add.s32 %r3835, %r3834, %r3786; + xor.b32 %r3836, %r3835, %r3830; + shf.l.wrap.b32 %r3837, %r3836, %r3836, 16; + add.s32 %r3838, %r3837, %r3817; + xor.b32 %r3839, %r3838, %r3805; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 20; + add.s32 %r3841, %r3835, %r3011; + add.s32 %r3842, %r3841, %r3840; + xor.b32 %r3843, %r3842, %r3837; + shf.l.wrap.b32 %r3844, %r3843, %r3843, 24; + add.s32 %r3845, %r3844, %r3838; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 25; + add.s32 %r3848, %r3800, %r2955; + add.s32 %r3849, %r3848, %r3819; + xor.b32 %r3850, %r3788, %r3849; + shf.l.wrap.b32 %r3851, %r3850, %r3850, 16; + add.s32 %r3852, %r3851, %r3831; + xor.b32 %r3853, %r3852, %r3819; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 20; + add.s32 %r3855, %r3849, %r3025; + add.s32 %r3856, %r3855, %r3854; + xor.b32 %r3857, %r3856, %r3851; + shf.l.wrap.b32 %r3858, %r3857, %r3857, 24; + add.s32 %r3859, %r3858, %r3852; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 25; + add.s32 %r3862, %r3814, %r2962; + add.s32 %r3863, %r3862, %r3833; + xor.b32 %r3864, %r3863, %r3802; + shf.l.wrap.b32 %r3865, %r3864, %r3864, 16; + add.s32 %r3866, %r3865, %r3789; + xor.b32 %r3867, %r3866, %r3833; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 20; + add.s32 %r3869, %r3863, %r2969; + add.s32 %r3870, %r3869, %r3868; + xor.b32 %r3871, %r3870, %r3865; + shf.l.wrap.b32 %r3872, %r3871, %r3871, 24; + add.s32 %r3873, %r3872, %r3866; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 25; + add.s32 %r3876, %r3828, %r2990; + add.s32 %r3877, %r3876, %r3791; + xor.b32 %r3878, %r3877, %r3816; + shf.l.wrap.b32 %r3879, %r3878, %r3878, 16; + add.s32 %r3880, %r3879, %r3803; + xor.b32 %r3881, %r3880, %r3791; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 20; + add.s32 %r3883, %r3877, %r3032; 
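+ // Annotation comment (not NVVM output): this unrolled loop is the BLAKE3 compression function inside blake3_hasher_update -- each G step adds a message word, xors, and rotates via shf.l.wrap by 16/20/24/25 (i.e. rotr 16/12/8/7), and the 32-bit add constants 1779033703, -1150833019, 1013904242, -1521486534 are BLAKE3 IV[0..3].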
+ add.s32 %r3884, %r3883, %r3882; + xor.b32 %r3885, %r3884, %r3879; + shf.l.wrap.b32 %r3886, %r3885, %r3885, 24; + add.s32 %r3887, %r3886, %r3880; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 25; + xor.b32 %r3890, %r3873, %r3842; + xor.b32 %r3891, %r3887, %r3856; + xor.b32 %r3892, %r3845, %r3870; + xor.b32 %r3893, %r3884, %r3859; + xor.b32 %r3894, %r3889, %r3858; + xor.b32 %r3895, %r3847, %r3872; + xor.b32 %r3896, %r3886, %r3861; + xor.b32 %r3897, %r3875, %r3844; + st.local.u8 [%rd132+145], %r3890; + shr.u32 %r3898, %r3890, 8; + st.local.u8 [%rd132+146], %r3898; + shr.u32 %r3899, %r3890, 16; + st.local.u8 [%rd132+147], %r3899; + shr.u32 %r3900, %r3890, 24; + st.local.u8 [%rd132+148], %r3900; + st.local.u8 [%rd132+149], %r3891; + shr.u32 %r3901, %r3891, 8; + st.local.u8 [%rd132+150], %r3901; + shr.u32 %r3902, %r3891, 16; + st.local.u8 [%rd132+151], %r3902; + shr.u32 %r3903, %r3891, 24; + st.local.u8 [%rd132+152], %r3903; + st.local.u8 [%rd132+153], %r3892; + shr.u32 %r3904, %r3892, 8; + st.local.u8 [%rd132+154], %r3904; + shr.u32 %r3905, %r3892, 16; + st.local.u8 [%rd132+155], %r3905; + shr.u32 %r3906, %r3892, 24; + st.local.u8 [%rd132+156], %r3906; + st.local.u8 [%rd132+157], %r3893; + shr.u32 %r3907, %r3893, 8; + st.local.u8 [%rd132+158], %r3907; + shr.u32 %r3908, %r3893, 16; + st.local.u8 [%rd132+159], %r3908; + shr.u32 %r3909, %r3893, 24; + st.local.u8 [%rd132+160], %r3909; + st.local.u8 [%rd132+161], %r3894; + shr.u32 %r3910, %r3894, 8; + st.local.u8 [%rd132+162], %r3910; + shr.u32 %r3911, %r3894, 16; + st.local.u8 [%rd132+163], %r3911; + shr.u32 %r3912, %r3894, 24; + st.local.u8 [%rd132+164], %r3912; + st.local.u8 [%rd132+165], %r3895; + shr.u32 %r3913, %r3895, 8; + st.local.u8 [%rd132+166], %r3913; + shr.u32 %r3914, %r3895, 16; + st.local.u8 [%rd132+167], %r3914; + shr.u32 %r3915, %r3895, 24; + st.local.u8 [%rd132+168], %r3915; + st.local.u8 [%rd132+169], %r3896; + shr.u32 %r3916, %r3896, 8; + st.local.u8 [%rd132+170], %r3916; + shr.u32 %r3917, %r3896, 16; + st.local.u8 [%rd132+171], %r3917; + shr.u32 %r3918, %r3896, 24; + st.local.u8 [%rd132+172], %r3918; + st.local.u8 [%rd132+173], %r3897; + shr.u32 %r3919, %r3897, 8; + st.local.u8 [%rd132+174], %r3919; + shr.u32 %r3920, %r3897, 16; + st.local.u8 [%rd132+175], %r3920; + shr.u32 %r3921, %r3897, 24; + st.local.u8 [%rd132+176], %r3921; + ld.local.u8 %rs128, [%rd3+8]; + add.s16 %rs129, %rs128, -1; + st.local.u8 [%rd3+8], %rs129; + cvt.u64.u16 %rd133, %rs129; + and.b64 %rd134, %rd133, 255; + setp.lt.u64 %p16, %rd30, %rd134; + and.b16 %rs130, %rs129, 255; + mul.wide.u16 %r11659, %rs130, 32; + @%p16 bra $L__BB1_18; + +$L__BB1_19: + ld.param.u64 %rd222, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvt.s64.s32 %rd136, %r11659; + add.s64 %rd137, %rd2, %rd136; + mov.u64 %rd245, 0; + st.local.u8 [%rd137+145], %r27; + shr.u32 %r3922, %r27, 8; + st.local.u8 [%rd137+146], %r3922; + shr.u32 %r3923, %r27, 16; + st.local.u8 [%rd137+147], %r3923; + shr.u32 %r3924, %r27, 24; + st.local.u8 [%rd137+148], %r3924; + st.local.u8 [%rd137+149], %r28; + shr.u32 %r3925, %r28, 8; + st.local.u8 [%rd137+150], %r3925; + shr.u32 %r3926, %r28, 16; + st.local.u8 [%rd137+151], %r3926; + shr.u32 %r3927, %r28, 24; + st.local.u8 [%rd137+152], %r3927; + st.local.u8 [%rd137+153], %r29; + shr.u32 %r3928, %r29, 8; + st.local.u8 [%rd137+154], %r3928; + shr.u32 %r3929, %r29, 16; + st.local.u8 [%rd137+155], %r3929; + shr.u32 %r3930, %r29, 24; + st.local.u8 [%rd137+156], %r3930; + st.local.u8 [%rd137+157], %r30; + shr.u32 %r3931, %r30, 
8; + st.local.u8 [%rd137+158], %r3931; + shr.u32 %r3932, %r30, 16; + st.local.u8 [%rd137+159], %r3932; + shr.u32 %r3933, %r30, 24; + st.local.u8 [%rd137+160], %r3933; + st.local.u8 [%rd137+161], %r31; + shr.u32 %r3934, %r31, 8; + st.local.u8 [%rd137+162], %r3934; + shr.u32 %r3935, %r31, 16; + st.local.u8 [%rd137+163], %r3935; + shr.u32 %r3936, %r31, 24; + st.local.u8 [%rd137+164], %r3936; + st.local.u8 [%rd137+165], %r32; + shr.u32 %r3937, %r32, 8; + st.local.u8 [%rd137+166], %r3937; + shr.u32 %r3938, %r32, 16; + st.local.u8 [%rd137+167], %r3938; + shr.u32 %r3939, %r32, 24; + st.local.u8 [%rd137+168], %r3939; + st.local.u8 [%rd137+169], %r33; + shr.u32 %r3940, %r33, 8; + st.local.u8 [%rd137+170], %r3940; + shr.u32 %r3941, %r33, 16; + st.local.u8 [%rd137+171], %r3941; + shr.u32 %r3942, %r33, 24; + st.local.u8 [%rd137+172], %r3942; + st.local.u8 [%rd137+173], %r34; + shr.u32 %r3943, %r34, 8; + st.local.u8 [%rd137+174], %r3943; + shr.u32 %r3944, %r34, 16; + st.local.u8 [%rd137+175], %r3944; + shr.u32 %r3945, %r34, 24; + st.local.u8 [%rd137+176], %r3945; + ld.local.u8 %rs131, [%rd3+8]; + add.s16 %rs132, %rs131, 1; + st.local.u8 [%rd3+8], %rs132; + ld.local.u64 %rd138, [%rd3+-72]; + add.s64 %rd32, %rd138, 1; + add.s64 %rd253, %rd222, %rd6; + +$L__BB1_20: + add.s64 %rd139, %rd2, %rd245; + ld.local.u8 %rs133, [%rd139]; + st.local.u8 [%rd139+32], %rs133; + add.s64 %rd245, %rd245, 1; + setp.lt.u64 %p17, %rd245, 32; + @%p17 bra $L__BB1_20; + + mov.u64 %rd246, 0; + st.local.u64 [%rd3+-72], %rd32; + mov.u16 %rs134, 0; + st.local.u8 [%rd3+1], %rs134; + +$L__BB1_22: + add.s64 %rd141, %rd2, %rd246; + st.local.u8 [%rd141+72], %rs134; + add.s64 %rd246, %rd246, 1; + setp.lt.u64 %p18, %rd246, 64; + @%p18 bra $L__BB1_22; + + ld.param.u64 %rd235, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd234, %rd235; + add.s64 %rd260, %rd234, %rd6; + mov.u64 %rd224, 80; + sub.s64 %rd261, %rd224, %rd6; + mov.u16 %rs136, 0; + st.local.u8 [%rd3], %rs136; + +$L__BB1_24: + setp.lt.u64 %p19, %rd261, 1025; + @%p19 bra $L__BB1_48; + + ld.local.u64 %rd250, [%rd3+-72]; + add.u64 %rd142, %SP, 0; + add.u64 %rd42, %SPL, 0; + +$L__BB1_26: + or.b64 %rd143, %rd261, 1; + mov.u64 %rd144, 1; + setp.gt.u64 %p20, %rd143, 4294967295; + shr.u64 %rd145, %rd261, 32; + selp.b64 %rd146, %rd145, %rd143, %p20; + selp.b32 %r3946, 32, 0, %p20; + and.b64 %rd147, %rd146, 4294901760; + setp.ne.s64 %p21, %rd147, 0; + shr.u64 %rd148, %rd146, 16; + or.b32 %r3947, %r3946, 16; + selp.b64 %rd149, %rd148, %rd146, %p21; + selp.b32 %r3948, %r3947, %r3946, %p21; + and.b64 %rd150, %rd149, 65280; + setp.ne.s64 %p22, %rd150, 0; + shr.u64 %rd151, %rd149, 8; + or.b32 %r3949, %r3948, 8; + selp.b64 %rd152, %rd151, %rd149, %p22; + selp.b32 %r3950, %r3949, %r3948, %p22; + and.b64 %rd153, %rd152, 240; + setp.ne.s64 %p23, %rd153, 0; + shr.u64 %rd154, %rd152, 4; + or.b32 %r3951, %r3950, 4; + selp.b64 %rd155, %rd154, %rd152, %p23; + selp.b32 %r3952, %r3951, %r3950, %p23; + and.b64 %rd156, %rd155, 12; + setp.ne.s64 %p24, %rd156, 0; + shr.u64 %rd157, %rd155, 2; + add.s32 %r3953, %r3952, 2; + selp.b64 %rd158, %rd157, %rd155, %p24; + selp.b32 %r3954, %r3953, %r3952, %p24; + bfe.u64 %rd159, %rd158, 1, 1; + cvt.u32.u64 %r3955, %rd159; + add.s32 %r3956, %r3954, %r3955; + shl.b64 %rd254, %rd144, %r3956; + shl.b64 %rd48, %rd250, 10; + +$L__BB1_27: + mov.u64 %rd49, %rd254; + add.s64 %rd160, %rd49, -1; + and.b64 %rd161, %rd160, %rd48; + setp.ne.s64 %p25, %rd161, 0; + shr.u64 %rd254, %rd49, 1; + @%p25 bra $L__BB1_27; + + ld.local.u8 %rs14, [%rd3+2]; + 
setp.lt.u64 %p26, %rd49, 1025; + @%p26 bra $L__BB1_36; + bra.uni $L__BB1_29; + +$L__BB1_36: + ld.local.u8 %r5955, [%rd3+-136]; + ld.local.u8 %r5956, [%rd3+-135]; + prmt.b32 %r5957, %r5956, %r5955, 30212; + ld.local.u8 %r5958, [%rd3+-134]; + ld.local.u8 %r5959, [%rd3+-133]; + prmt.b32 %r5960, %r5959, %r5958, 30212; + prmt.b32 %r11679, %r5960, %r5957, 4180; + ld.local.u8 %r5961, [%rd3+-132]; + ld.local.u8 %r5962, [%rd3+-131]; + prmt.b32 %r5963, %r5962, %r5961, 30212; + ld.local.u8 %r5964, [%rd3+-130]; + ld.local.u8 %r5965, [%rd3+-129]; + prmt.b32 %r5966, %r5965, %r5964, 30212; + prmt.b32 %r11678, %r5966, %r5963, 4180; + ld.local.u8 %r5967, [%rd3+-128]; + ld.local.u8 %r5968, [%rd3+-127]; + prmt.b32 %r5969, %r5968, %r5967, 30212; + ld.local.u8 %r5970, [%rd3+-126]; + ld.local.u8 %r5971, [%rd3+-125]; + prmt.b32 %r5972, %r5971, %r5970, 30212; + prmt.b32 %r11677, %r5972, %r5969, 4180; + ld.local.u8 %r5973, [%rd3+-124]; + ld.local.u8 %r5974, [%rd3+-123]; + prmt.b32 %r5975, %r5974, %r5973, 30212; + ld.local.u8 %r5976, [%rd3+-122]; + ld.local.u8 %r5977, [%rd3+-121]; + prmt.b32 %r5978, %r5977, %r5976, 30212; + prmt.b32 %r11676, %r5978, %r5975, 4180; + ld.local.u8 %r5979, [%rd3+-120]; + ld.local.u8 %r5980, [%rd3+-119]; + prmt.b32 %r5981, %r5980, %r5979, 30212; + ld.local.u8 %r5982, [%rd3+-118]; + ld.local.u8 %r5983, [%rd3+-117]; + prmt.b32 %r5984, %r5983, %r5982, 30212; + prmt.b32 %r11675, %r5984, %r5981, 4180; + ld.local.u8 %r5985, [%rd3+-116]; + ld.local.u8 %r5986, [%rd3+-115]; + prmt.b32 %r5987, %r5986, %r5985, 30212; + ld.local.u8 %r5988, [%rd3+-114]; + ld.local.u8 %r5989, [%rd3+-113]; + prmt.b32 %r5990, %r5989, %r5988, 30212; + prmt.b32 %r11674, %r5990, %r5987, 4180; + ld.local.u8 %r5991, [%rd3+-112]; + ld.local.u8 %r5992, [%rd3+-111]; + prmt.b32 %r5993, %r5992, %r5991, 30212; + ld.local.u8 %r5994, [%rd3+-110]; + ld.local.u8 %r5995, [%rd3+-109]; + prmt.b32 %r5996, %r5995, %r5994, 30212; + prmt.b32 %r11673, %r5996, %r5993, 4180; + ld.local.u8 %r5997, [%rd3+-108]; + ld.local.u8 %r5998, [%rd3+-107]; + prmt.b32 %r5999, %r5998, %r5997, 30212; + ld.local.u8 %r6000, [%rd3+-106]; + ld.local.u8 %r6001, [%rd3+-105]; + prmt.b32 %r6002, %r6001, %r6000, 30212; + prmt.b32 %r11672, %r6002, %r5999, 4180; + add.u64 %rd53, %SPL, 64; + mov.u32 %r6003, 0; + st.local.v2.u32 [%rd53], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+8], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+16], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+24], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+32], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+40], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+48], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+56], {%r6003, %r6003}; + mov.u16 %rs354, 0; + st.local.v2.u8 [%rd53+64], {%rs354, %rs354}; + st.local.u8 [%rd53+66], %rs14; + cvt.u32.u64 %r71, %rd250; + shr.u64 %rd184, %rd250, 32; + cvt.u32.u64 %r72, %rd184; + setp.lt.u64 %p31, %rd49, 65; + mov.u64 %rd257, %rd260; + mov.u64 %rd258, %rd49; + @%p31 bra $L__BB1_39; + + add.s64 %rd54, %rd53, 64; + mov.u16 %rs353, 0; + mov.u64 %rd258, %rd49; + mov.u64 %rd257, %rd260; + +$L__BB1_38: + and.b16 %rs213, %rs353, 255; + setp.eq.s16 %p32, %rs213, 0; + selp.u16 %rs214, 1, 0, %p32; + or.b16 %rs215, %rs14, %rs214; + ld.local.u8 %r6004, [%rd257]; + ld.local.u8 %r6005, [%rd257+1]; + prmt.b32 %r6006, %r6005, %r6004, 30212; + ld.local.u8 %r6007, [%rd257+2]; + prmt.b32 %r6008, %r6007, %r6006, 28756; + ld.local.u8 %r6009, [%rd257+3]; + prmt.b32 %r6010, %r6009, %r6008, 1620; + ld.local.u8 %r6011, [%rd257+4]; + ld.local.u8 %r6012, [%rd257+5]; + prmt.b32 %r6013, %r6012, %r6011, 30212; + 
ld.local.u8 %r6014, [%rd257+6]; + prmt.b32 %r6015, %r6014, %r6013, 28756; + ld.local.u8 %r6016, [%rd257+7]; + prmt.b32 %r6017, %r6016, %r6015, 1620; + ld.local.u8 %r6018, [%rd257+8]; + ld.local.u8 %r6019, [%rd257+9]; + prmt.b32 %r6020, %r6019, %r6018, 30212; + ld.local.u8 %r6021, [%rd257+10]; + prmt.b32 %r6022, %r6021, %r6020, 28756; + ld.local.u8 %r6023, [%rd257+11]; + prmt.b32 %r6024, %r6023, %r6022, 1620; + ld.local.u8 %r6025, [%rd257+12]; + ld.local.u8 %r6026, [%rd257+13]; + prmt.b32 %r6027, %r6026, %r6025, 30212; + ld.local.u8 %r6028, [%rd257+14]; + prmt.b32 %r6029, %r6028, %r6027, 28756; + ld.local.u8 %r6030, [%rd257+15]; + prmt.b32 %r6031, %r6030, %r6029, 1620; + ld.local.u8 %r6032, [%rd257+16]; + ld.local.u8 %r6033, [%rd257+17]; + prmt.b32 %r6034, %r6033, %r6032, 30212; + ld.local.u8 %r6035, [%rd257+18]; + prmt.b32 %r6036, %r6035, %r6034, 28756; + ld.local.u8 %r6037, [%rd257+19]; + prmt.b32 %r6038, %r6037, %r6036, 1620; + ld.local.u8 %r6039, [%rd257+20]; + ld.local.u8 %r6040, [%rd257+21]; + prmt.b32 %r6041, %r6040, %r6039, 30212; + ld.local.u8 %r6042, [%rd257+22]; + prmt.b32 %r6043, %r6042, %r6041, 28756; + ld.local.u8 %r6044, [%rd257+23]; + prmt.b32 %r6045, %r6044, %r6043, 1620; + ld.local.u8 %r6046, [%rd257+24]; + ld.local.u8 %r6047, [%rd257+25]; + prmt.b32 %r6048, %r6047, %r6046, 30212; + ld.local.u8 %r6049, [%rd257+26]; + prmt.b32 %r6050, %r6049, %r6048, 28756; + ld.local.u8 %r6051, [%rd257+27]; + prmt.b32 %r6052, %r6051, %r6050, 1620; + ld.local.u8 %r6053, [%rd257+28]; + ld.local.u8 %r6054, [%rd257+29]; + prmt.b32 %r6055, %r6054, %r6053, 30212; + ld.local.u8 %r6056, [%rd257+30]; + prmt.b32 %r6057, %r6056, %r6055, 28756; + ld.local.u8 %r6058, [%rd257+31]; + prmt.b32 %r6059, %r6058, %r6057, 1620; + ld.local.u8 %r6060, [%rd257+32]; + ld.local.u8 %r6061, [%rd257+33]; + prmt.b32 %r6062, %r6061, %r6060, 30212; + ld.local.u8 %r6063, [%rd257+34]; + prmt.b32 %r6064, %r6063, %r6062, 28756; + ld.local.u8 %r6065,
[%rd257+35]; + prmt.b32 %r6066, %r6065, %r6064, 1620; + ld.local.u8 %r6067, [%rd257+36]; + ld.local.u8 %r6068, [%rd257+37]; + prmt.b32 %r6069, %r6068, %r6067, 30212; + ld.local.u8 %r6070, [%rd257+38]; + prmt.b32 %r6071, %r6070, %r6069, 28756; + ld.local.u8 %r6072, [%rd257+39]; + prmt.b32 %r6073, %r6072, %r6071, 1620; + ld.local.u8 %r6074, [%rd257+40]; + ld.local.u8 %r6075, [%rd257+41]; + prmt.b32 %r6076, %r6075, %r6074, 30212; + ld.local.u8 %r6077, [%rd257+42]; + prmt.b32 %r6078, %r6077, %r6076, 28756; + ld.local.u8 %r6079, [%rd257+43]; + prmt.b32 %r6080, %r6079, %r6078, 1620; + ld.local.u8 %r6081, [%rd257+44]; + ld.local.u8 %r6082, [%rd257+45]; + prmt.b32 %r6083, %r6082, %r6081, 30212; + ld.local.u8 %r6084, [%rd257+46]; + prmt.b32 %r6085, %r6084, %r6083, 28756; + ld.local.u8 %r6086, [%rd257+47]; + prmt.b32 %r6087, %r6086, %r6085, 1620; + ld.local.u8 %r6088, [%rd257+48]; + ld.local.u8 %r6089, [%rd257+49]; + prmt.b32 %r6090, %r6089, %r6088, 30212; + ld.local.u8 %r6091, [%rd257+50]; + prmt.b32 %r6092, %r6091, %r6090, 28756; + ld.local.u8 %r6093, [%rd257+51]; + prmt.b32 %r6094, %r6093, %r6092, 1620; + ld.local.u8 %r6095, [%rd257+52]; + ld.local.u8 %r6096, [%rd257+53]; + prmt.b32 %r6097, %r6096, %r6095, 30212; + ld.local.u8 %r6098, [%rd257+54]; + prmt.b32 %r6099, %r6098, %r6097, 28756; + ld.local.u8 %r6100, [%rd257+55]; + prmt.b32 %r6101, %r6100, %r6099, 1620; + ld.local.u8 %r6102, [%rd257+56]; + ld.local.u8 %r6103, [%rd257+57]; + prmt.b32 %r6104, %r6103, %r6102, 30212; + ld.local.u8 %r6105, [%rd257+58]; + prmt.b32 %r6106, %r6105, %r6104, 28756; + ld.local.u8 %r6107, [%rd257+59]; + prmt.b32 %r6108, %r6107, %r6106, 1620; + ld.local.u8 %r6109, [%rd257+60]; + ld.local.u8 %r6110, [%rd257+61]; + prmt.b32 %r6111, %r6110, %r6109, 30212; + ld.local.u8 %r6112, [%rd257+62]; + prmt.b32 %r6113, %r6112, %r6111, 28756; + ld.local.u8 %r6114, [%rd257+63]; + prmt.b32 %r6115, %r6114, %r6113, 1620; + cvt.u32.u16 %r6116, %rs215; + and.b32 %r6117, %r6116, 255; + add.s32 %r6118, %r11679, %r11675; + add.s32 %r6119, %r6118, %r6010; + xor.b32 %r6120, %r6119, %r71; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 16; + add.s32 %r6122, %r6121, 1779033703; + xor.b32 %r6123, %r6122, %r11675; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 20; + add.s32 %r6125, %r6017, %r6119; + add.s32 %r6126, %r6125, %r6124; + xor.b32 %r6127, %r6126, %r6121; + shf.l.wrap.b32 %r6128, %r6127, %r6127, 24; + add.s32 %r6129, %r6128, %r6122; + xor.b32 %r6130, %r6129, %r6124; + shf.l.wrap.b32 %r6131, %r6130, %r6130, 25; + add.s32 %r6132, %r11678, %r11674; + add.s32 %r6133, %r6132, %r6024; + xor.b32 %r6134, %r6133, %r72; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 16; + add.s32 %r6136, %r6135, -1150833019; + xor.b32 %r6137, %r6136, %r11674; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 20; + add.s32 %r6139, %r6031, %r6133; + add.s32 %r6140, %r6139, %r6138; + xor.b32 %r6141, %r6140, %r6135; + shf.l.wrap.b32 %r6142, %r6141, %r6141, 24; + add.s32 %r6143, %r6142, %r6136; + xor.b32 %r6144, %r6143, %r6138; + shf.l.wrap.b32 %r6145, %r6144, %r6144, 25; + add.s32 %r6146, %r11677, %r11673; + add.s32 %r6147, %r6146, %r6038; + shr.u32 %r6148, %r6147, 16; + shl.b32 %r6149, %r6147, 16; + xor.b32 %r6150, %r6149, 4194304; + or.b32 %r6151, %r6150, %r6148; + add.s32 %r6152, %r6151, 1013904242; + xor.b32 %r6153, %r6152, %r11673; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6045, %r6147; + add.s32 %r6156, %r6155, %r6154; + xor.b32 %r6157, %r6156, %r6151; + shf.l.wrap.b32 %r6158, %r6157, %r6157, 24; + add.s32 %r6159, %r6158, %r6152; + xor.b32 %r6160, %r6159, %r6154; + 
shf.l.wrap.b32 %r6161, %r6160, %r6160, 25; + add.s32 %r6162, %r11676, %r11672; + add.s32 %r6163, %r6162, %r6052; + xor.b32 %r6164, %r6163, %r6117; + shr.u32 %r6165, %r6163, 16; + shl.b32 %r6166, %r6164, 16; + or.b32 %r6167, %r6166, %r6165; + add.s32 %r6168, %r6167, -1521486534; + xor.b32 %r6169, %r6168, %r11672; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6059, %r6163; + add.s32 %r6172, %r6171, %r6170; + xor.b32 %r6173, %r6172, %r6167; + shf.l.wrap.b32 %r6174, %r6173, %r6173, 24; + add.s32 %r6175, %r6174, %r6168; + xor.b32 %r6176, %r6175, %r6170; + shf.l.wrap.b32 %r6177, %r6176, %r6176, 25; + add.s32 %r6178, %r6145, %r6126; + add.s32 %r6179, %r6178, %r6066; + xor.b32 %r6180, %r6174, %r6179; + shf.l.wrap.b32 %r6181, %r6180, %r6180, 16; + add.s32 %r6182, %r6181, %r6159; + xor.b32 %r6183, %r6182, %r6145; + shf.l.wrap.b32 %r6184, %r6183, %r6183, 20; + add.s32 %r6185, %r6073, %r6179; + add.s32 %r6186, %r6185, %r6184; + xor.b32 %r6187, %r6186, %r6181; + shf.l.wrap.b32 %r6188, %r6187, %r6187, 24; + add.s32 %r6189, %r6188, %r6182; + xor.b32 %r6190, %r6189, %r6184; + shf.l.wrap.b32 %r6191, %r6190, %r6190, 25; + add.s32 %r6192, %r6161, %r6140; + add.s32 %r6193, %r6192, %r6080; + xor.b32 %r6194, %r6193, %r6128; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 16; + add.s32 %r6196, %r6195, %r6175; + xor.b32 %r6197, %r6196, %r6161; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 20; + add.s32 %r6199, %r6087, %r6193; + add.s32 %r6200, %r6199, %r6198; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 24; + add.s32 %r6203, %r6202, %r6196; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 25; + add.s32 %r6206, %r6177, %r6156; + add.s32 %r6207, %r6206, %r6094; + xor.b32 %r6208, %r6207, %r6142; + shf.l.wrap.b32 %r6209, %r6208, %r6208, 16; + add.s32 %r6210, %r6209, %r6129; + xor.b32 %r6211, %r6210, %r6177; + shf.l.wrap.b32 %r6212, %r6211, %r6211, 20; + add.s32 %r6213, %r6101, %r6207; + add.s32 %r6214, %r6213, %r6212; + xor.b32 %r6215, %r6214, %r6209; + shf.l.wrap.b32 %r6216, %r6215, %r6215, 24; + add.s32 %r6217, %r6216, %r6210; + xor.b32 %r6218, %r6217, %r6212; + shf.l.wrap.b32 %r6219, %r6218, %r6218, 25; + add.s32 %r6220, %r6172, %r6131; + add.s32 %r6221, %r6220, %r6108; + xor.b32 %r6222, %r6221, %r6158; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 16; + add.s32 %r6224, %r6223, %r6143; + xor.b32 %r6225, %r6224, %r6131; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 20; + add.s32 %r6227, %r6115, %r6221; + add.s32 %r6228, %r6227, %r6226; + xor.b32 %r6229, %r6228, %r6223; + shf.l.wrap.b32 %r6230, %r6229, %r6229, 24; + add.s32 %r6231, %r6230, %r6224; + xor.b32 %r6232, %r6231, %r6226; + shf.l.wrap.b32 %r6233, %r6232, %r6232, 25; + add.s32 %r6234, %r6186, %r6024; + add.s32 %r6235, %r6234, %r6233; + xor.b32 %r6236, %r6235, %r6202; + shf.l.wrap.b32 %r6237, %r6236, %r6236, 16; + add.s32 %r6238, %r6237, %r6217; + xor.b32 %r6239, %r6238, %r6233; + shf.l.wrap.b32 %r6240, %r6239, %r6239, 20; + add.s32 %r6241, %r6235, %r6052; + add.s32 %r6242, %r6241, %r6240; + xor.b32 %r6243, %r6242, %r6237; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6238; + xor.b32 %r6246, %r6245, %r6240; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6200, %r6031; + add.s32 %r6249, %r6248, %r6191; + xor.b32 %r6250, %r6216, %r6249; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6231, %r6251; + xor.b32 %r6253, %r6252, %r6191; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6080; + add.s32 %r6256, %r6255, %r6254; + 
xor.b32 %r6257, %r6256, %r6251; + shf.l.wrap.b32 %r6258, %r6257, %r6257, 24; + add.s32 %r6259, %r6258, %r6252; + xor.b32 %r6260, %r6259, %r6254; + shf.l.wrap.b32 %r6261, %r6260, %r6260, 25; + add.s32 %r6262, %r6205, %r6059; + add.s32 %r6263, %r6262, %r6214; + xor.b32 %r6264, %r6230, %r6263; + shf.l.wrap.b32 %r6265, %r6264, %r6264, 16; + add.s32 %r6266, %r6265, %r6189; + xor.b32 %r6267, %r6266, %r6205; + shf.l.wrap.b32 %r6268, %r6267, %r6267, 20; + add.s32 %r6269, %r6263, %r6010; + add.s32 %r6270, %r6269, %r6268; + xor.b32 %r6271, %r6270, %r6265; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6266; + xor.b32 %r6274, %r6273, %r6268; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6219, %r6038; + add.s32 %r6277, %r6276, %r6228; + xor.b32 %r6278, %r6277, %r6188; + shf.l.wrap.b32 %r6279, %r6278, %r6278, 16; + add.s32 %r6280, %r6279, %r6203; + xor.b32 %r6281, %r6280, %r6219; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 20; + add.s32 %r6283, %r6277, %r6101; + add.s32 %r6284, %r6283, %r6282; + xor.b32 %r6285, %r6284, %r6279; + shf.l.wrap.b32 %r6286, %r6285, %r6285, 24; + add.s32 %r6287, %r6286, %r6280; + xor.b32 %r6288, %r6287, %r6282; + shf.l.wrap.b32 %r6289, %r6288, %r6288, 25; + add.s32 %r6290, %r6242, %r6017; + add.s32 %r6291, %r6290, %r6261; + xor.b32 %r6292, %r6291, %r6286; + shf.l.wrap.b32 %r6293, %r6292, %r6292, 16; + add.s32 %r6294, %r6293, %r6273; + xor.b32 %r6295, %r6294, %r6261; + shf.l.wrap.b32 %r6296, %r6295, %r6295, 20; + add.s32 %r6297, %r6291, %r6087; + add.s32 %r6298, %r6297, %r6296; + xor.b32 %r6299, %r6298, %r6293; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 24; + add.s32 %r6301, %r6300, %r6294; + xor.b32 %r6302, %r6301, %r6296; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 25; + add.s32 %r6304, %r6256, %r6094; + add.s32 %r6305, %r6304, %r6275; + xor.b32 %r6306, %r6305, %r6244; + shf.l.wrap.b32 %r6307, %r6306, %r6306, 16; + add.s32 %r6308, %r6307, %r6287; + xor.b32 %r6309, %r6308, %r6275; + shf.l.wrap.b32 %r6310, %r6309, %r6309, 20; + add.s32 %r6311, %r6305, %r6045; + add.s32 %r6312, %r6311, %r6310; + xor.b32 %r6313, %r6312, %r6307; + shf.l.wrap.b32 %r6314, %r6313, %r6313, 24; + add.s32 %r6315, %r6314, %r6308; + xor.b32 %r6316, %r6315, %r6310; + shf.l.wrap.b32 %r6317, %r6316, %r6316, 25; + add.s32 %r6318, %r6270, %r6073; + add.s32 %r6319, %r6318, %r6289; + xor.b32 %r6320, %r6319, %r6258; + shf.l.wrap.b32 %r6321, %r6320, %r6320, 16; + add.s32 %r6322, %r6321, %r6245; + xor.b32 %r6323, %r6322, %r6289; + shf.l.wrap.b32 %r6324, %r6323, %r6323, 20; + add.s32 %r6325, %r6319, %r6108; + add.s32 %r6326, %r6325, %r6324; + xor.b32 %r6327, %r6326, %r6321; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 24; + add.s32 %r6329, %r6328, %r6322; + xor.b32 %r6330, %r6329, %r6324; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 25; + add.s32 %r6332, %r6284, %r6115; + add.s32 %r6333, %r6332, %r6247; + xor.b32 %r6334, %r6333, %r6272; + shf.l.wrap.b32 %r6335, %r6334, %r6334, 16; + add.s32 %r6336, %r6335, %r6259; + xor.b32 %r6337, %r6336, %r6247; + shf.l.wrap.b32 %r6338, %r6337, %r6337, 20; + add.s32 %r6339, %r6333, %r6066; + add.s32 %r6340, %r6339, %r6338; + xor.b32 %r6341, %r6340, %r6335; + shf.l.wrap.b32 %r6342, %r6341, %r6341, 24; + add.s32 %r6343, %r6342, %r6336; + xor.b32 %r6344, %r6343, %r6338; + shf.l.wrap.b32 %r6345, %r6344, %r6344, 25; + add.s32 %r6346, %r6298, %r6031; + add.s32 %r6347, %r6346, %r6345; + xor.b32 %r6348, %r6347, %r6314; + shf.l.wrap.b32 %r6349, %r6348, %r6348, 16; + add.s32 %r6350, %r6349, %r6329; + xor.b32 %r6351, %r6350, %r6345; + shf.l.wrap.b32 %r6352, 
%r6351, %r6351, 20; + add.s32 %r6353, %r6347, %r6038; + add.s32 %r6354, %r6353, %r6352; + xor.b32 %r6355, %r6354, %r6349; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6350; + xor.b32 %r6358, %r6357, %r6352; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6312, %r6080; + add.s32 %r6361, %r6360, %r6303; + xor.b32 %r6362, %r6361, %r6328; + shf.l.wrap.b32 %r6363, %r6362, %r6362, 16; + add.s32 %r6364, %r6363, %r6343; + xor.b32 %r6365, %r6364, %r6303; + shf.l.wrap.b32 %r6366, %r6365, %r6365, 20; + add.s32 %r6367, %r6361, %r6094; + add.s32 %r6368, %r6367, %r6366; + xor.b32 %r6369, %r6368, %r6363; + shf.l.wrap.b32 %r6370, %r6369, %r6369, 24; + add.s32 %r6371, %r6370, %r6364; + xor.b32 %r6372, %r6371, %r6366; + shf.l.wrap.b32 %r6373, %r6372, %r6372, 25; + add.s32 %r6374, %r6326, %r6101; + add.s32 %r6375, %r6374, %r6317; + xor.b32 %r6376, %r6375, %r6342; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6301; + xor.b32 %r6379, %r6378, %r6317; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6024; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6340, %r6059; + add.s32 %r6389, %r6388, %r6331; + xor.b32 %r6390, %r6389, %r6300; + shf.l.wrap.b32 %r6391, %r6390, %r6390, 16; + add.s32 %r6392, %r6391, %r6315; + xor.b32 %r6393, %r6392, %r6331; + shf.l.wrap.b32 %r6394, %r6393, %r6393, 20; + add.s32 %r6395, %r6389, %r6108; + add.s32 %r6396, %r6395, %r6394; + xor.b32 %r6397, %r6396, %r6391; + shf.l.wrap.b32 %r6398, %r6397, %r6397, 24; + add.s32 %r6399, %r6398, %r6392; + xor.b32 %r6400, %r6399, %r6394; + shf.l.wrap.b32 %r6401, %r6400, %r6400, 25; + add.s32 %r6402, %r6354, %r6052; + add.s32 %r6403, %r6402, %r6373; + xor.b32 %r6404, %r6403, %r6398; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 16; + add.s32 %r6406, %r6405, %r6385; + xor.b32 %r6407, %r6406, %r6373; + shf.l.wrap.b32 %r6408, %r6407, %r6407, 20; + add.s32 %r6409, %r6403, %r6045; + add.s32 %r6410, %r6409, %r6408; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 24; + add.s32 %r6413, %r6412, %r6406; + xor.b32 %r6414, %r6413, %r6408; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 25; + add.s32 %r6416, %r6368, %r6073; + add.s32 %r6417, %r6416, %r6387; + xor.b32 %r6418, %r6417, %r6356; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 16; + add.s32 %r6420, %r6419, %r6399; + xor.b32 %r6421, %r6420, %r6387; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 20; + add.s32 %r6423, %r6417, %r6010; + add.s32 %r6424, %r6423, %r6422; + xor.b32 %r6425, %r6424, %r6419; + shf.l.wrap.b32 %r6426, %r6425, %r6425, 24; + add.s32 %r6427, %r6426, %r6420; + xor.b32 %r6428, %r6427, %r6422; + shf.l.wrap.b32 %r6429, %r6428, %r6428, 25; + add.s32 %r6430, %r6382, %r6087; + add.s32 %r6431, %r6430, %r6401; + xor.b32 %r6432, %r6431, %r6370; + shf.l.wrap.b32 %r6433, %r6432, %r6432, 16; + add.s32 %r6434, %r6433, %r6357; + xor.b32 %r6435, %r6434, %r6401; + shf.l.wrap.b32 %r6436, %r6435, %r6435, 20; + add.s32 %r6437, %r6431, %r6115; + add.s32 %r6438, %r6437, %r6436; + xor.b32 %r6439, %r6438, %r6433; + shf.l.wrap.b32 %r6440, %r6439, %r6439, 24; + add.s32 %r6441, %r6440, %r6434; + xor.b32 %r6442, %r6441, %r6436; + shf.l.wrap.b32 %r6443, %r6442, %r6442, 25; + add.s32 %r6444, %r6396, %r6066; + add.s32 %r6445, %r6444, %r6359; + xor.b32 %r6446, %r6445, %r6384; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 16; + 
add.s32 %r6448, %r6447, %r6371; + xor.b32 %r6449, %r6448, %r6359; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 20; + add.s32 %r6451, %r6445, %r6017; + add.s32 %r6452, %r6451, %r6450; + xor.b32 %r6453, %r6452, %r6447; + shf.l.wrap.b32 %r6454, %r6453, %r6453, 24; + add.s32 %r6455, %r6454, %r6448; + xor.b32 %r6456, %r6455, %r6450; + shf.l.wrap.b32 %r6457, %r6456, %r6456, 25; + add.s32 %r6458, %r6410, %r6080; + add.s32 %r6459, %r6458, %r6457; + xor.b32 %r6460, %r6459, %r6426; + shf.l.wrap.b32 %r6461, %r6460, %r6460, 16; + add.s32 %r6462, %r6461, %r6441; + xor.b32 %r6463, %r6462, %r6457; + shf.l.wrap.b32 %r6464, %r6463, %r6463, 20; + add.s32 %r6465, %r6459, %r6059; + add.s32 %r6466, %r6465, %r6464; + xor.b32 %r6467, %r6466, %r6461; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6462; + xor.b32 %r6470, %r6469, %r6464; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6424, %r6094; + add.s32 %r6473, %r6472, %r6415; + xor.b32 %r6474, %r6473, %r6440; + shf.l.wrap.b32 %r6475, %r6474, %r6474, 16; + add.s32 %r6476, %r6475, %r6455; + xor.b32 %r6477, %r6476, %r6415; + shf.l.wrap.b32 %r6478, %r6477, %r6477, 20; + add.s32 %r6479, %r6473, %r6073; + add.s32 %r6480, %r6479, %r6478; + xor.b32 %r6481, %r6480, %r6475; + shf.l.wrap.b32 %r6482, %r6481, %r6481, 24; + add.s32 %r6483, %r6482, %r6476; + xor.b32 %r6484, %r6483, %r6478; + shf.l.wrap.b32 %r6485, %r6484, %r6484, 25; + add.s32 %r6486, %r6438, %r6108; + add.s32 %r6487, %r6486, %r6429; + xor.b32 %r6488, %r6487, %r6454; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6413; + xor.b32 %r6491, %r6490, %r6429; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6031; + add.s32 %r6494, %r6493, %r6492; + xor.b32 %r6495, %r6494, %r6489; + shf.l.wrap.b32 %r6496, %r6495, %r6495, 24; + add.s32 %r6497, %r6496, %r6490; + xor.b32 %r6498, %r6497, %r6492; + shf.l.wrap.b32 %r6499, %r6498, %r6498, 25; + add.s32 %r6500, %r6452, %r6101; + add.s32 %r6501, %r6500, %r6443; + xor.b32 %r6502, %r6501, %r6412; + shf.l.wrap.b32 %r6503, %r6502, %r6502, 16; + add.s32 %r6504, %r6503, %r6427; + xor.b32 %r6505, %r6504, %r6443; + shf.l.wrap.b32 %r6506, %r6505, %r6505, 20; + add.s32 %r6507, %r6501, %r6115; + add.s32 %r6508, %r6507, %r6506; + xor.b32 %r6509, %r6508, %r6503; + shf.l.wrap.b32 %r6510, %r6509, %r6509, 24; + add.s32 %r6511, %r6510, %r6504; + xor.b32 %r6512, %r6511, %r6506; + shf.l.wrap.b32 %r6513, %r6512, %r6512, 25; + add.s32 %r6514, %r6466, %r6038; + add.s32 %r6515, %r6514, %r6485; + xor.b32 %r6516, %r6515, %r6510; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 16; + add.s32 %r6518, %r6517, %r6497; + xor.b32 %r6519, %r6518, %r6485; + shf.l.wrap.b32 %r6520, %r6519, %r6519, 20; + add.s32 %r6521, %r6515, %r6010; + add.s32 %r6522, %r6521, %r6520; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 24; + add.s32 %r6525, %r6524, %r6518; + xor.b32 %r6526, %r6525, %r6520; + shf.l.wrap.b32 %r6527, %r6526, %r6526, 25; + add.s32 %r6528, %r6480, %r6087; + add.s32 %r6529, %r6528, %r6499; + xor.b32 %r6530, %r6529, %r6468; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 16; + add.s32 %r6532, %r6531, %r6511; + xor.b32 %r6533, %r6532, %r6499; + shf.l.wrap.b32 %r6534, %r6533, %r6533, 20; + add.s32 %r6535, %r6529, %r6024; + add.s32 %r6536, %r6535, %r6534; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 24; + add.s32 %r6539, %r6538, %r6532; + xor.b32 %r6540, %r6539, %r6534; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 25; + add.s32 %r6542, %r6494, %r6045; + add.s32 %r6543, %r6542, 
%r6513; + xor.b32 %r6544, %r6543, %r6482; + shf.l.wrap.b32 %r6545, %r6544, %r6544, 16; + add.s32 %r6546, %r6545, %r6469; + xor.b32 %r6547, %r6546, %r6513; + shf.l.wrap.b32 %r6548, %r6547, %r6547, 20; + add.s32 %r6549, %r6543, %r6066; + add.s32 %r6550, %r6549, %r6548; + xor.b32 %r6551, %r6550, %r6545; + shf.l.wrap.b32 %r6552, %r6551, %r6551, 24; + add.s32 %r6553, %r6552, %r6546; + xor.b32 %r6554, %r6553, %r6548; + shf.l.wrap.b32 %r6555, %r6554, %r6554, 25; + add.s32 %r6556, %r6508, %r6017; + add.s32 %r6557, %r6556, %r6471; + xor.b32 %r6558, %r6557, %r6496; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 16; + add.s32 %r6560, %r6559, %r6483; + xor.b32 %r6561, %r6560, %r6471; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 20; + add.s32 %r6563, %r6557, %r6052; + add.s32 %r6564, %r6563, %r6562; + xor.b32 %r6565, %r6564, %r6559; + shf.l.wrap.b32 %r6566, %r6565, %r6565, 24; + add.s32 %r6567, %r6566, %r6560; + xor.b32 %r6568, %r6567, %r6562; + shf.l.wrap.b32 %r6569, %r6568, %r6568, 25; + add.s32 %r6570, %r6522, %r6094; + add.s32 %r6571, %r6570, %r6569; + xor.b32 %r6572, %r6571, %r6538; + shf.l.wrap.b32 %r6573, %r6572, %r6572, 16; + add.s32 %r6574, %r6573, %r6553; + xor.b32 %r6575, %r6574, %r6569; + shf.l.wrap.b32 %r6576, %r6575, %r6575, 20; + add.s32 %r6577, %r6571, %r6101; + add.s32 %r6578, %r6577, %r6576; + xor.b32 %r6579, %r6578, %r6573; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6574; + xor.b32 %r6582, %r6581, %r6576; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6536, %r6073; + add.s32 %r6585, %r6584, %r6527; + xor.b32 %r6586, %r6585, %r6552; + shf.l.wrap.b32 %r6587, %r6586, %r6586, 16; + add.s32 %r6588, %r6587, %r6567; + xor.b32 %r6589, %r6588, %r6527; + shf.l.wrap.b32 %r6590, %r6589, %r6589, 20; + add.s32 %r6591, %r6585, %r6087; + add.s32 %r6592, %r6591, %r6590; + xor.b32 %r6593, %r6592, %r6587; + shf.l.wrap.b32 %r6594, %r6593, %r6593, 24; + add.s32 %r6595, %r6594, %r6588; + xor.b32 %r6596, %r6595, %r6590; + shf.l.wrap.b32 %r6597, %r6596, %r6596, 25; + add.s32 %r6598, %r6550, %r6115; + add.s32 %r6599, %r6598, %r6541; + xor.b32 %r6600, %r6599, %r6566; + shf.l.wrap.b32 %r6601, %r6600, %r6600, 16; + add.s32 %r6602, %r6601, %r6525; + xor.b32 %r6603, %r6602, %r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, %r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, %r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 
%r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 %r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 %r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 %r6720, %r6719, %r6719, 24; + add.s32 %r6721, %r6720, %r6714; + xor.b32 %r6722, %r6721, %r6716; + shf.l.wrap.b32 %r6723, %r6722, %r6722, 25; + add.s32 %r6724, %r6676, %r6115; + add.s32 %r6725, %r6724, %r6667; + xor.b32 %r6726, %r6725, %r6636; + shf.l.wrap.b32 %r6727, %r6726, %r6726, 16; + add.s32 %r6728, %r6727, %r6651; + xor.b32 %r6729, %r6728, %r6667; + shf.l.wrap.b32 %r6730, %r6729, %r6729, 20; + add.s32 %r6731, %r6725, %r6017; + add.s32 %r6732, %r6731, %r6730; + xor.b32 %r6733, %r6732, %r6727; + shf.l.wrap.b32 %r6734, %r6733, %r6733, 24; 
+ add.s32 %r6735, %r6734, %r6728; + xor.b32 %r6736, %r6735, %r6730; + shf.l.wrap.b32 %r6737, %r6736, %r6736, 25; + add.s32 %r6738, %r6690, %r6101; + add.s32 %r6739, %r6738, %r6709; + xor.b32 %r6740, %r6739, %r6734; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6721; + xor.b32 %r6743, %r6742, %r6709; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6031; + add.s32 %r6746, %r6745, %r6744; + xor.b32 %r6747, %r6746, %r6741; + shf.l.wrap.b32 %r6748, %r6747, %r6747, 24; + add.s32 %r6749, %r6748, %r6742; + xor.b32 %r6750, %r6749, %r6744; + shf.l.wrap.b32 %r6751, %r6750, %r6750, 25; + add.s32 %r6752, %r6704, %r6010; + add.s32 %r6753, %r6752, %r6723; + xor.b32 %r6754, %r6753, %r6692; + shf.l.wrap.b32 %r6755, %r6754, %r6754, 16; + add.s32 %r6756, %r6755, %r6735; + xor.b32 %r6757, %r6756, %r6723; + shf.l.wrap.b32 %r6758, %r6757, %r6757, 20; + add.s32 %r6759, %r6753, %r6080; + add.s32 %r6760, %r6759, %r6758; + xor.b32 %r6761, %r6760, %r6755; + shf.l.wrap.b32 %r6762, %r6761, %r6761, 24; + add.s32 %r6763, %r6762, %r6756; + xor.b32 %r6764, %r6763, %r6758; + shf.l.wrap.b32 %r6765, %r6764, %r6764, 25; + add.s32 %r6766, %r6718, %r6024; + add.s32 %r6767, %r6766, %r6737; + xor.b32 %r6768, %r6767, %r6706; + shf.l.wrap.b32 %r6769, %r6768, %r6768, 16; + add.s32 %r6770, %r6769, %r6693; + xor.b32 %r6771, %r6770, %r6737; + shf.l.wrap.b32 %r6772, %r6771, %r6771, 20; + add.s32 %r6773, %r6767, %r6052; + add.s32 %r6774, %r6773, %r6772; + xor.b32 %r6775, %r6774, %r6769; + shf.l.wrap.b32 %r6776, %r6775, %r6775, 24; + add.s32 %r6777, %r6776, %r6770; + xor.b32 %r6778, %r6777, %r6772; + shf.l.wrap.b32 %r6779, %r6778, %r6778, 25; + add.s32 %r6780, %r6732, %r6038; + add.s32 %r6781, %r6780, %r6695; + xor.b32 %r6782, %r6781, %r6720; + shf.l.wrap.b32 %r6783, %r6782, %r6782, 16; + add.s32 %r6784, %r6783, %r6707; + xor.b32 %r6785, %r6784, %r6695; + shf.l.wrap.b32 %r6786, %r6785, %r6785, 20; + add.s32 %r6787, %r6781, %r6059; + add.s32 %r6788, %r6787, %r6786; + xor.b32 %r6789, %r6788, %r6783; + shf.l.wrap.b32 %r6790, %r6789, %r6789, 24; + add.s32 %r6791, %r6790, %r6784; + xor.b32 %r6792, %r6791, %r6786; + shf.l.wrap.b32 %r6793, %r6792, %r6792, 25; + add.s32 %r6794, %r6746, %r6087; + add.s32 %r6795, %r6794, %r6793; + xor.b32 %r6796, %r6795, %r6762; + shf.l.wrap.b32 %r6797, %r6796, %r6796, 16; + add.s32 %r6798, %r6797, %r6777; + xor.b32 %r6799, %r6798, %r6793; + shf.l.wrap.b32 %r6800, %r6799, %r6799, 20; + add.s32 %r6801, %r6795, %r6115; + add.s32 %r6802, %r6801, %r6800; + xor.b32 %r6803, %r6802, %r6797; + shf.l.wrap.b32 %r6804, %r6803, %r6803, 24; + add.s32 %r6805, %r6804, %r6798; + xor.b32 %r6806, %r6805, %r6800; + shf.l.wrap.b32 %r6807, %r6806, %r6806, 25; + add.s32 %r6808, %r6760, %r6045; + add.s32 %r6809, %r6808, %r6751; + xor.b32 %r6810, %r6809, %r6776; + shf.l.wrap.b32 %r6811, %r6810, %r6810, 16; + add.s32 %r6812, %r6811, %r6791; + xor.b32 %r6813, %r6812, %r6751; + shf.l.wrap.b32 %r6814, %r6813, %r6813, 20; + add.s32 %r6815, %r6809, %r6010; + add.s32 %r6816, %r6815, %r6814; + xor.b32 %r6817, %r6816, %r6811; + shf.l.wrap.b32 %r6818, %r6817, %r6817, 24; + add.s32 %r6819, %r6818, %r6812; + xor.b32 %r6820, %r6819, %r6814; + shf.l.wrap.b32 %r6821, %r6820, %r6820, 25; + add.s32 %r6822, %r6774, %r6017; + add.s32 %r6823, %r6822, %r6765; + xor.b32 %r6824, %r6823, %r6790; + shf.l.wrap.b32 %r6825, %r6824, %r6824, 16; + add.s32 %r6826, %r6825, %r6749; + xor.b32 %r6827, %r6826, %r6765; + shf.l.wrap.b32 %r6828, %r6827, %r6827, 20; + add.s32 %r6829, %r6823, %r6073; + add.s32 %r6830, %r6829, 
%r6828; + xor.b32 %r6831, %r6830, %r6825; + shf.l.wrap.b32 %r6832, %r6831, %r6831, 24; + add.s32 %r6833, %r6832, %r6826; + xor.b32 %r6834, %r6833, %r6828; + shf.l.wrap.b32 %r6835, %r6834, %r6834, 25; + add.s32 %r6836, %r6788, %r6066; + add.s32 %r6837, %r6836, %r6779; + xor.b32 %r6838, %r6837, %r6748; + shf.l.wrap.b32 %r6839, %r6838, %r6838, 16; + add.s32 %r6840, %r6839, %r6763; + xor.b32 %r6841, %r6840, %r6779; + shf.l.wrap.b32 %r6842, %r6841, %r6841, 20; + add.s32 %r6843, %r6837, %r6052; + add.s32 %r6844, %r6843, %r6842; + xor.b32 %r6845, %r6844, %r6839; + shf.l.wrap.b32 %r6846, %r6845, %r6845, 24; + add.s32 %r6847, %r6846, %r6840; + xor.b32 %r6848, %r6847, %r6842; + shf.l.wrap.b32 %r6849, %r6848, %r6848, 25; + add.s32 %r6850, %r6802, %r6108; + add.s32 %r6851, %r6850, %r6821; + xor.b32 %r6852, %r6851, %r6846; + shf.l.wrap.b32 %r6853, %r6852, %r6852, 16; + add.s32 %r6854, %r6853, %r6833; + xor.b32 %r6855, %r6854, %r6821; + shf.l.wrap.b32 %r6856, %r6855, %r6855, 20; + add.s32 %r6857, %r6851, %r6080; + add.s32 %r6858, %r6857, %r6856; + xor.b32 %r6859, %r6858, %r6853; + shf.l.wrap.b32 %r6860, %r6859, %r6859, 24; + add.s32 %r6861, %r6860, %r6854; + xor.b32 %r6862, %r6861, %r6856; + shf.l.wrap.b32 %r6863, %r6862, %r6862, 25; + add.s32 %r6864, %r6816, %r6024; + add.s32 %r6865, %r6864, %r6835; + xor.b32 %r6866, %r6865, %r6804; + shf.l.wrap.b32 %r6867, %r6866, %r6866, 16; + add.s32 %r6868, %r6867, %r6847; + xor.b32 %r6869, %r6868, %r6835; + shf.l.wrap.b32 %r6870, %r6869, %r6869, 20; + add.s32 %r6871, %r6865, %r6094; + add.s32 %r6872, %r6871, %r6870; + xor.b32 %r6873, %r6872, %r6867; + shf.l.wrap.b32 %r6874, %r6873, %r6873, 24; + add.s32 %r6875, %r6874, %r6868; + xor.b32 %r6876, %r6875, %r6870; + shf.l.wrap.b32 %r6877, %r6876, %r6876, 25; + add.s32 %r6878, %r6830, %r6031; + add.s32 %r6879, %r6878, %r6849; + xor.b32 %r6880, %r6879, %r6818; + shf.l.wrap.b32 %r6881, %r6880, %r6880, 16; + add.s32 %r6882, %r6881, %r6805; + xor.b32 %r6883, %r6882, %r6849; + shf.l.wrap.b32 %r6884, %r6883, %r6883, 20; + add.s32 %r6885, %r6879, %r6038; + add.s32 %r6886, %r6885, %r6884; + xor.b32 %r6887, %r6886, %r6881; + shf.l.wrap.b32 %r6888, %r6887, %r6887, 24; + add.s32 %r6889, %r6888, %r6882; + xor.b32 %r6890, %r6889, %r6884; + shf.l.wrap.b32 %r6891, %r6890, %r6890, 25; + add.s32 %r6892, %r6844, %r6059; + add.s32 %r6893, %r6892, %r6807; + xor.b32 %r6894, %r6893, %r6832; + shf.l.wrap.b32 %r6895, %r6894, %r6894, 16; + add.s32 %r6896, %r6895, %r6819; + xor.b32 %r6897, %r6896, %r6807; + shf.l.wrap.b32 %r6898, %r6897, %r6897, 20; + add.s32 %r6899, %r6893, %r6101; + add.s32 %r6900, %r6899, %r6898; + xor.b32 %r6901, %r6900, %r6895; + shf.l.wrap.b32 %r6902, %r6901, %r6901, 24; + add.s32 %r6903, %r6902, %r6896; + xor.b32 %r6904, %r6903, %r6898; + shf.l.wrap.b32 %r6905, %r6904, %r6904, 25; + xor.b32 %r11679, %r6889, %r6858; + xor.b32 %r11678, %r6903, %r6872; + xor.b32 %r11677, %r6861, %r6886; + xor.b32 %r11676, %r6900, %r6875; + xor.b32 %r11675, %r6905, %r6874; + xor.b32 %r11674, %r6863, %r6888; + xor.b32 %r11673, %r6902, %r6877; + xor.b32 %r11672, %r6891, %r6860; + add.s16 %rs353, %rs353, 1; + st.local.u8 [%rd54+1], %rs353; + add.s64 %rd257, %rd257, 64; + add.s64 %rd258, %rd258, -64; + setp.gt.u64 %p33, %rd258, 64; + @%p33 bra $L__BB1_38; + +$L__BB1_39: + min.u64 %rd61, %rd258, 64; + setp.eq.s64 %p34, %rd61, 0; + mov.u16 %rs355, %rs354; + mov.u16 %rs356, %rs354; + mov.u16 %rs357, %rs354; + mov.u16 %rs358, %rs354; + mov.u16 %rs359, %rs354; + mov.u16 %rs360, %rs354; + mov.u16 %rs361, %rs354; + mov.u16 %rs362, %rs354; + mov.u16 
%rs363, %rs354; + mov.u16 %rs364, %rs354; + mov.u16 %rs365, %rs354; + mov.u16 %rs366, %rs354; + mov.u16 %rs367, %rs354; + mov.u16 %rs368, %rs354; + mov.u16 %rs369, %rs354; + mov.u16 %rs370, %rs354; + mov.u16 %rs371, %rs354; + mov.u16 %rs372, %rs354; + mov.u16 %rs373, %rs354; + mov.u16 %rs374, %rs354; + mov.u16 %rs375, %rs354; + mov.u16 %rs376, %rs354; + mov.u16 %rs377, %rs354; + mov.u16 %rs378, %rs354; + mov.u16 %rs379, %rs354; + mov.u16 %rs380, %rs354; + mov.u16 %rs381, %rs354; + mov.u16 %rs382, %rs354; + mov.u16 %rs383, %rs354; + mov.u16 %rs384, %rs354; + mov.u16 %rs385, %rs354; + mov.u16 %rs386, %rs354; + mov.u16 %rs387, %rs354; + @%p34 bra $L__BB1_43; + + mov.u64 %rd259, 0; + +$L__BB1_41: + add.s64 %rd186, %rd257, %rd259; + ld.local.u8 %rs251, [%rd186]; + add.s64 %rd187, %rd53, %rd259; + st.local.u8 [%rd187], %rs251; + add.s64 %rd259, %rd259, 1; + setp.lt.u64 %p35, %rd259, %rd61; + @%p35 bra $L__BB1_41; + + ld.local.v4.u16 {%rs384, %rs385, %rs386, %rs387}, [%rd53]; + ld.local.v4.u16 {%rs380, %rs381, %rs382, %rs383}, [%rd53+8]; + ld.local.v4.u16 {%rs376, %rs377, %rs378, %rs379}, [%rd53+16]; + ld.local.v4.u16 {%rs372, %rs373, %rs374, %rs375}, [%rd53+24]; + ld.local.v4.u16 {%rs368, %rs369, %rs370, %rs371}, [%rd53+32]; + ld.local.v4.u16 {%rs364, %rs365, %rs366, %rs367}, [%rd53+40]; + ld.local.v4.u16 {%rs360, %rs361, %rs362, %rs363}, [%rd53+48]; + ld.local.v4.u16 {%rs357, %rs358, %rs359, %rs283}, [%rd53+56]; + ld.local.u8 %rs356, [%rd53+61]; + ld.local.v2.u8 {%rs354, %rs355}, [%rd53+62]; + +$L__BB1_43: + ld.local.v4.u8 {%rs286, %rs287, %rs288, %rs289}, [%rd53+64]; + cvt.u16.u64 %rs292, %rd61; + add.s16 %rs293, %rs286, %rs292; + st.local.u8 [%rd53+64], %rs293; + setp.eq.s16 %p36, %rs287, 0; + selp.u16 %rs294, 1, 0, %p36; + or.b16 %rs295, %rs288, %rs294; + or.b16 %rs296, %rs295, 2; + shr.u16 %rs297, %rs384, 8; + shr.u16 %rs298, %rs385, 8; + shr.u16 %rs299, %rs386, 8; + shr.u16 %rs300, %rs387, 8; + shr.u16 %rs301, %rs380, 8; + shr.u16 %rs302, %rs381, 8; + shr.u16 %rs303, %rs382, 8; + shr.u16 %rs304, %rs383, 8; + shr.u16 %rs305, %rs376, 8; + shr.u16 %rs306, %rs377, 8; + shr.u16 %rs307, %rs378, 8; + shr.u16 %rs308, %rs379, 8; + shr.u16 %rs309, %rs372, 8; + shr.u16 %rs310, %rs373, 8; + shr.u16 %rs311, %rs374, 8; + shr.u16 %rs312, %rs375, 8; + shr.u16 %rs313, %rs368, 8; + shr.u16 %rs314, %rs369, 8; + shr.u16 %rs315, %rs370, 8; + shr.u16 %rs316, %rs371, 8; + shr.u16 %rs317, %rs364, 8; + shr.u16 %rs318, %rs365, 8; + shr.u16 %rs319, %rs366, 8; + shr.u16 %rs320, %rs367, 8; + shr.u16 %rs321, %rs360, 8; + shr.u16 %rs322, %rs361, 8; + shr.u16 %rs323, %rs362, 8; + shr.u16 %rs324, %rs363, 8; + shr.u16 %rs325, %rs357, 8; + shr.u16 %rs326, %rs358, 8; + cvt.u32.u16 %r6906, %rs384; + and.b32 %r6907, %r6906, 255; + cvt.u32.u16 %r6908, %rs297; + prmt.b32 %r6909, %r6908, %r6907, 30212; + cvt.u32.u16 %r6910, %rs385; + prmt.b32 %r6911, %r6910, %r6909, 28756; + cvt.u32.u16 %r6912, %rs298; + prmt.b32 %r6913, %r6912, %r6911, 1620; + cvt.u32.u16 %r6914, %rs386; + and.b32 %r6915, %r6914, 255; + cvt.u32.u16 %r6916, %rs299; + prmt.b32 %r6917, %r6916, %r6915, 30212; + cvt.u32.u16 %r6918, %rs387; + prmt.b32 %r6919, %r6918, %r6917, 28756; + cvt.u32.u16 %r6920, %rs300; + prmt.b32 %r6921, %r6920, %r6919, 1620; + cvt.u32.u16 %r6922, %rs380; + and.b32 %r6923, %r6922, 255; + cvt.u32.u16 %r6924, %rs301; + prmt.b32 %r6925, %r6924, %r6923, 30212; + cvt.u32.u16 %r6926, %rs381; + prmt.b32 %r6927, %r6926, %r6925, 28756; + cvt.u32.u16 %r6928, %rs302; + prmt.b32 %r6929, %r6928, %r6927, 1620; + cvt.u32.u16 %r6930, %rs382; + and.b32 
%r6931, %r6930, 255; + cvt.u32.u16 %r6932, %rs303; + prmt.b32 %r6933, %r6932, %r6931, 30212; + cvt.u32.u16 %r6934, %rs383; + prmt.b32 %r6935, %r6934, %r6933, 28756; + cvt.u32.u16 %r6936, %rs304; + prmt.b32 %r6937, %r6936, %r6935, 1620; + cvt.u32.u16 %r6938, %rs376; + and.b32 %r6939, %r6938, 255; + cvt.u32.u16 %r6940, %rs305; + prmt.b32 %r6941, %r6940, %r6939, 30212; + cvt.u32.u16 %r6942, %rs377; + prmt.b32 %r6943, %r6942, %r6941, 28756; + cvt.u32.u16 %r6944, %rs306; + prmt.b32 %r6945, %r6944, %r6943, 1620; + cvt.u32.u16 %r6946, %rs378; + and.b32 %r6947, %r6946, 255; + cvt.u32.u16 %r6948, %rs307; + prmt.b32 %r6949, %r6948, %r6947, 30212; + cvt.u32.u16 %r6950, %rs379; + prmt.b32 %r6951, %r6950, %r6949, 28756; + cvt.u32.u16 %r6952, %rs308; + prmt.b32 %r6953, %r6952, %r6951, 1620; + cvt.u32.u16 %r6954, %rs372; + and.b32 %r6955, %r6954, 255; + cvt.u32.u16 %r6956, %rs309; + prmt.b32 %r6957, %r6956, %r6955, 30212; + cvt.u32.u16 %r6958, %rs373; + prmt.b32 %r6959, %r6958, %r6957, 28756; + cvt.u32.u16 %r6960, %rs310; + prmt.b32 %r6961, %r6960, %r6959, 1620; + cvt.u32.u16 %r6962, %rs374; + and.b32 %r6963, %r6962, 255; + cvt.u32.u16 %r6964, %rs311; + prmt.b32 %r6965, %r6964, %r6963, 30212; + cvt.u32.u16 %r6966, %rs375; + prmt.b32 %r6967, %r6966, %r6965, 28756; + cvt.u32.u16 %r6968, %rs312; + prmt.b32 %r6969, %r6968, %r6967, 1620; + cvt.u32.u16 %r6970, %rs368; + and.b32 %r6971, %r6970, 255; + cvt.u32.u16 %r6972, %rs313; + prmt.b32 %r6973, %r6972, %r6971, 30212; + cvt.u32.u16 %r6974, %rs369; + prmt.b32 %r6975, %r6974, %r6973, 28756; + cvt.u32.u16 %r6976, %rs314; + prmt.b32 %r6977, %r6976, %r6975, 1620; + cvt.u32.u16 %r6978, %rs370; + and.b32 %r6979, %r6978, 255; + cvt.u32.u16 %r6980, %rs315; + prmt.b32 %r6981, %r6980, %r6979, 30212; + cvt.u32.u16 %r6982, %rs371; + prmt.b32 %r6983, %r6982, %r6981, 28756; + cvt.u32.u16 %r6984, %rs316; + prmt.b32 %r6985, %r6984, %r6983, 1620; + cvt.u32.u16 %r6986, %rs364; + and.b32 %r6987, %r6986, 255; + cvt.u32.u16 %r6988, %rs317; + prmt.b32 %r6989, %r6988, %r6987, 30212; + cvt.u32.u16 %r6990, %rs365; + prmt.b32 %r6991, %r6990, %r6989, 28756; + cvt.u32.u16 %r6992, %rs318; + prmt.b32 %r6993, %r6992, %r6991, 1620; + cvt.u32.u16 %r6994, %rs366; + and.b32 %r6995, %r6994, 255; + cvt.u32.u16 %r6996, %rs319; + prmt.b32 %r6997, %r6996, %r6995, 30212; + cvt.u32.u16 %r6998, %rs367; + prmt.b32 %r6999, %r6998, %r6997, 28756; + cvt.u32.u16 %r7000, %rs320; + prmt.b32 %r7001, %r7000, %r6999, 1620; + cvt.u32.u16 %r7002, %rs360; + and.b32 %r7003, %r7002, 255; + cvt.u32.u16 %r7004, %rs321; + prmt.b32 %r7005, %r7004, %r7003, 30212; + cvt.u32.u16 %r7006, %rs361; + prmt.b32 %r7007, %r7006, %r7005, 28756; + cvt.u32.u16 %r7008, %rs322; + prmt.b32 %r7009, %r7008, %r7007, 1620; + cvt.u32.u16 %r7010, %rs362; + and.b32 %r7011, %r7010, 255; + cvt.u32.u16 %r7012, %rs323; + prmt.b32 %r7013, %r7012, %r7011, 30212; + cvt.u32.u16 %r7014, %rs363; + prmt.b32 %r7015, %r7014, %r7013, 28756; + cvt.u32.u16 %r7016, %rs324; + prmt.b32 %r7017, %r7016, %r7015, 1620; + cvt.u32.u16 %r7018, %rs357; + and.b32 %r7019, %r7018, 255; + cvt.u32.u16 %r7020, %rs325; + prmt.b32 %r7021, %r7020, %r7019, 30212; + cvt.u32.u16 %r7022, %rs358; + prmt.b32 %r7023, %r7022, %r7021, 28756; + cvt.u32.u16 %r7024, %rs326; + prmt.b32 %r7025, %r7024, %r7023, 1620; + cvt.u32.u16 %r7026, %rs359; + and.b32 %r7027, %r7026, 255; + cvt.u32.u16 %r7028, %rs356; + prmt.b32 %r7029, %r7028, %r7027, 30212; + cvt.u32.u16 %r7030, %rs354; + shl.b32 %r7031, %r7030, 16; + and.b32 %r7032, %r7031, 16711680; + or.b32 %r7033, %r7029, %r7032; + cvt.u32.u16 
%r7034, %rs355; + shl.b32 %r7035, %r7034, 24; + or.b32 %r7036, %r7033, %r7035; + cvt.u32.u16 %r7037, %rs293; + and.b32 %r7038, %r7037, 255; + cvt.u32.u16 %r7039, %rs296; + and.b32 %r7040, %r7039, 255; + add.s32 %r7041, %r11675, %r11679; + add.s32 %r7042, %r7041, %r6913; + xor.b32 %r7043, %r7042, %r71; + shf.l.wrap.b32 %r7044, %r7043, %r7043, 16; + add.s32 %r7045, %r7044, 1779033703; + xor.b32 %r7046, %r7045, %r11675; + shf.l.wrap.b32 %r7047, %r7046, %r7046, 20; + add.s32 %r7048, %r6921, %r7042; + add.s32 %r7049, %r7048, %r7047; + xor.b32 %r7050, %r7049, %r7044; + shf.l.wrap.b32 %r7051, %r7050, %r7050, 24; + add.s32 %r7052, %r7051, %r7045; + xor.b32 %r7053, %r7052, %r7047; + shf.l.wrap.b32 %r7054, %r7053, %r7053, 25; + add.s32 %r7055, %r11674, %r11678; + add.s32 %r7056, %r7055, %r6929; + xor.b32 %r7057, %r7056, %r72; + shf.l.wrap.b32 %r7058, %r7057, %r7057, 16; + add.s32 %r7059, %r7058, -1150833019; + xor.b32 %r7060, %r7059, %r11674; + shf.l.wrap.b32 %r7061, %r7060, %r7060, 20; + add.s32 %r7062, %r6937, %r7056; + add.s32 %r7063, %r7062, %r7061; + xor.b32 %r7064, %r7063, %r7058; + shf.l.wrap.b32 %r7065, %r7064, %r7064, 24; + add.s32 %r7066, %r7065, %r7059; + xor.b32 %r7067, %r7066, %r7061; + shf.l.wrap.b32 %r7068, %r7067, %r7067, 25; + add.s32 %r7069, %r11673, %r11677; + add.s32 %r7070, %r7069, %r6945; + xor.b32 %r7071, %r7070, %r7038; + shr.u32 %r7072, %r7070, 16; + shl.b32 %r7073, %r7071, 16; + or.b32 %r7074, %r7073, %r7072; + add.s32 %r7075, %r7074, 1013904242; + xor.b32 %r7076, %r7075, %r11673; + shf.l.wrap.b32 %r7077, %r7076, %r7076, 20; + add.s32 %r7078, %r6953, %r7070; + add.s32 %r7079, %r7078, %r7077; + xor.b32 %r7080, %r7079, %r7074; + shf.l.wrap.b32 %r7081, %r7080, %r7080, 24; + add.s32 %r7082, %r7081, %r7075; + xor.b32 %r7083, %r7082, %r7077; + shf.l.wrap.b32 %r7084, %r7083, %r7083, 25; + add.s32 %r7085, %r11672, %r11676; + add.s32 %r7086, %r7085, %r6961; + xor.b32 %r7087, %r7086, %r7040; + shr.u32 %r7088, %r7086, 16; + shl.b32 %r7089, %r7087, 16; + or.b32 %r7090, %r7089, %r7088; + add.s32 %r7091, %r7090, -1521486534; + xor.b32 %r7092, %r7091, %r11672; + shf.l.wrap.b32 %r7093, %r7092, %r7092, 20; + add.s32 %r7094, %r6969, %r7086; + add.s32 %r7095, %r7094, %r7093; + xor.b32 %r7096, %r7095, %r7090; + shf.l.wrap.b32 %r7097, %r7096, %r7096, 24; + add.s32 %r7098, %r7097, %r7091; + xor.b32 %r7099, %r7098, %r7093; + shf.l.wrap.b32 %r7100, %r7099, %r7099, 25; + add.s32 %r7101, %r7068, %r7049; + add.s32 %r7102, %r7101, %r6977; + xor.b32 %r7103, %r7097, %r7102; + shf.l.wrap.b32 %r7104, %r7103, %r7103, 16; + add.s32 %r7105, %r7104, %r7082; + xor.b32 %r7106, %r7105, %r7068; + shf.l.wrap.b32 %r7107, %r7106, %r7106, 20; + add.s32 %r7108, %r6985, %r7102; + add.s32 %r7109, %r7108, %r7107; + xor.b32 %r7110, %r7109, %r7104; + shf.l.wrap.b32 %r7111, %r7110, %r7110, 24; + add.s32 %r7112, %r7111, %r7105; + xor.b32 %r7113, %r7112, %r7107; + shf.l.wrap.b32 %r7114, %r7113, %r7113, 25; + add.s32 %r7115, %r7084, %r7063; + add.s32 %r7116, %r7115, %r6993; + xor.b32 %r7117, %r7116, %r7051; + shf.l.wrap.b32 %r7118, %r7117, %r7117, 16; + add.s32 %r7119, %r7118, %r7098; + xor.b32 %r7120, %r7119, %r7084; + shf.l.wrap.b32 %r7121, %r7120, %r7120, 20; + add.s32 %r7122, %r7001, %r7116; + add.s32 %r7123, %r7122, %r7121; + xor.b32 %r7124, %r7123, %r7118; + shf.l.wrap.b32 %r7125, %r7124, %r7124, 24; + add.s32 %r7126, %r7125, %r7119; + xor.b32 %r7127, %r7126, %r7121; + shf.l.wrap.b32 %r7128, %r7127, %r7127, 25; + add.s32 %r7129, %r7100, %r7079; + add.s32 %r7130, %r7129, %r7009; + xor.b32 %r7131, %r7130, %r7065; + 
shf.l.wrap.b32 %r7132, %r7131, %r7131, 16; + add.s32 %r7133, %r7132, %r7052; + xor.b32 %r7134, %r7133, %r7100; + shf.l.wrap.b32 %r7135, %r7134, %r7134, 20; + add.s32 %r7136, %r7017, %r7130; + add.s32 %r7137, %r7136, %r7135; + xor.b32 %r7138, %r7137, %r7132; + shf.l.wrap.b32 %r7139, %r7138, %r7138, 24; + add.s32 %r7140, %r7139, %r7133; + xor.b32 %r7141, %r7140, %r7135; + shf.l.wrap.b32 %r7142, %r7141, %r7141, 25; + add.s32 %r7143, %r7095, %r7054; + add.s32 %r7144, %r7143, %r7025; + xor.b32 %r7145, %r7144, %r7081; + shf.l.wrap.b32 %r7146, %r7145, %r7145, 16; + add.s32 %r7147, %r7146, %r7066; + xor.b32 %r7148, %r7147, %r7054; + shf.l.wrap.b32 %r7149, %r7148, %r7148, 20; + add.s32 %r7150, %r7036, %r7144; + add.s32 %r7151, %r7150, %r7149; + xor.b32 %r7152, %r7151, %r7146; + shf.l.wrap.b32 %r7153, %r7152, %r7152, 24; + add.s32 %r7154, %r7153, %r7147; + xor.b32 %r7155, %r7154, %r7149; + shf.l.wrap.b32 %r7156, %r7155, %r7155, 25; + add.s32 %r7157, %r7109, %r6929; + add.s32 %r7158, %r7157, %r7156; + xor.b32 %r7159, %r7158, %r7125; + shf.l.wrap.b32 %r7160, %r7159, %r7159, 16; + add.s32 %r7161, %r7160, %r7140; + xor.b32 %r7162, %r7161, %r7156; + shf.l.wrap.b32 %r7163, %r7162, %r7162, 20; + add.s32 %r7164, %r7158, %r6961; + add.s32 %r7165, %r7164, %r7163; + xor.b32 %r7166, %r7165, %r7160; + shf.l.wrap.b32 %r7167, %r7166, %r7166, 24; + add.s32 %r7168, %r7167, %r7161; + xor.b32 %r7169, %r7168, %r7163; + shf.l.wrap.b32 %r7170, %r7169, %r7169, 25; + add.s32 %r7171, %r7123, %r6937; + add.s32 %r7172, %r7171, %r7114; + xor.b32 %r7173, %r7139, %r7172; + shf.l.wrap.b32 %r7174, %r7173, %r7173, 16; + add.s32 %r7175, %r7154, %r7174; + xor.b32 %r7176, %r7175, %r7114; + shf.l.wrap.b32 %r7177, %r7176, %r7176, 20; + add.s32 %r7178, %r7172, %r6993; + add.s32 %r7179, %r7178, %r7177; + xor.b32 %r7180, %r7179, %r7174; + shf.l.wrap.b32 %r7181, %r7180, %r7180, 24; + add.s32 %r7182, %r7181, %r7175; + xor.b32 %r7183, %r7182, %r7177; + shf.l.wrap.b32 %r7184, %r7183, %r7183, 25; + add.s32 %r7185, %r7128, %r6969; + add.s32 %r7186, %r7185, %r7137; + xor.b32 %r7187, %r7153, %r7186; + shf.l.wrap.b32 %r7188, %r7187, %r7187, 16; + add.s32 %r7189, %r7188, %r7112; + xor.b32 %r7190, %r7189, %r7128; + shf.l.wrap.b32 %r7191, %r7190, %r7190, 20; + add.s32 %r7192, %r7186, %r6913; + add.s32 %r7193, %r7192, %r7191; + xor.b32 %r7194, %r7193, %r7188; + shf.l.wrap.b32 %r7195, %r7194, %r7194, 24; + add.s32 %r7196, %r7195, %r7189; + xor.b32 %r7197, %r7196, %r7191; + shf.l.wrap.b32 %r7198, %r7197, %r7197, 25; + add.s32 %r7199, %r7142, %r6945; + add.s32 %r7200, %r7199, %r7151; + xor.b32 %r7201, %r7200, %r7111; + shf.l.wrap.b32 %r7202, %r7201, %r7201, 16; + add.s32 %r7203, %r7202, %r7126; + xor.b32 %r7204, %r7203, %r7142; + shf.l.wrap.b32 %r7205, %r7204, %r7204, 20; + add.s32 %r7206, %r7200, %r7017; + add.s32 %r7207, %r7206, %r7205; + xor.b32 %r7208, %r7207, %r7202; + shf.l.wrap.b32 %r7209, %r7208, %r7208, 24; + add.s32 %r7210, %r7209, %r7203; + xor.b32 %r7211, %r7210, %r7205; + shf.l.wrap.b32 %r7212, %r7211, %r7211, 25; + add.s32 %r7213, %r7184, %r6921; + add.s32 %r7214, %r7213, %r7165; + xor.b32 %r7215, %r7214, %r7209; + shf.l.wrap.b32 %r7216, %r7215, %r7215, 16; + add.s32 %r7217, %r7216, %r7196; + xor.b32 %r7218, %r7217, %r7184; + shf.l.wrap.b32 %r7219, %r7218, %r7218, 20; + add.s32 %r7220, %r7214, %r7001; + add.s32 %r7221, %r7220, %r7219; + xor.b32 %r7222, %r7221, %r7216; + shf.l.wrap.b32 %r7223, %r7222, %r7222, 24; + add.s32 %r7224, %r7223, %r7217; + xor.b32 %r7225, %r7224, %r7219; + shf.l.wrap.b32 %r7226, %r7225, %r7225, 25; + add.s32 %r7227, 
%r7179, %r7009; + add.s32 %r7228, %r7227, %r7198; + xor.b32 %r7229, %r7167, %r7228; + shf.l.wrap.b32 %r7230, %r7229, %r7229, 16; + add.s32 %r7231, %r7230, %r7210; + xor.b32 %r7232, %r7231, %r7198; + shf.l.wrap.b32 %r7233, %r7232, %r7232, 20; + add.s32 %r7234, %r7228, %r6953; + add.s32 %r7235, %r7234, %r7233; + xor.b32 %r7236, %r7235, %r7230; + shf.l.wrap.b32 %r7237, %r7236, %r7236, 24; + add.s32 %r7238, %r7237, %r7231; + xor.b32 %r7239, %r7238, %r7233; + shf.l.wrap.b32 %r7240, %r7239, %r7239, 25; + add.s32 %r7241, %r7193, %r6985; + add.s32 %r7242, %r7241, %r7212; + xor.b32 %r7243, %r7242, %r7181; + shf.l.wrap.b32 %r7244, %r7243, %r7243, 16; + add.s32 %r7245, %r7244, %r7168; + xor.b32 %r7246, %r7245, %r7212; + shf.l.wrap.b32 %r7247, %r7246, %r7246, 20; + add.s32 %r7248, %r7242, %r7025; + add.s32 %r7249, %r7248, %r7247; + xor.b32 %r7250, %r7249, %r7244; + shf.l.wrap.b32 %r7251, %r7250, %r7250, 24; + add.s32 %r7252, %r7251, %r7245; + xor.b32 %r7253, %r7252, %r7247; + shf.l.wrap.b32 %r7254, %r7253, %r7253, 25; + add.s32 %r7255, %r7207, %r7036; + add.s32 %r7256, %r7255, %r7170; + xor.b32 %r7257, %r7256, %r7195; + shf.l.wrap.b32 %r7258, %r7257, %r7257, 16; + add.s32 %r7259, %r7258, %r7182; + xor.b32 %r7260, %r7259, %r7170; + shf.l.wrap.b32 %r7261, %r7260, %r7260, 20; + add.s32 %r7262, %r7256, %r6977; + add.s32 %r7263, %r7262, %r7261; + xor.b32 %r7264, %r7263, %r7258; + shf.l.wrap.b32 %r7265, %r7264, %r7264, 24; + add.s32 %r7266, %r7265, %r7259; + xor.b32 %r7267, %r7266, %r7261; + shf.l.wrap.b32 %r7268, %r7267, %r7267, 25; + add.s32 %r7269, %r7221, %r6937; + add.s32 %r7270, %r7269, %r7268; + xor.b32 %r7271, %r7270, %r7237; + shf.l.wrap.b32 %r7272, %r7271, %r7271, 16; + add.s32 %r7273, %r7272, %r7252; + xor.b32 %r7274, %r7273, %r7268; + shf.l.wrap.b32 %r7275, %r7274, %r7274, 20; + add.s32 %r7276, %r7270, %r6945; + add.s32 %r7277, %r7276, %r7275; + xor.b32 %r7278, %r7277, %r7272; + shf.l.wrap.b32 %r7279, %r7278, %r7278, 24; + add.s32 %r7280, %r7279, %r7273; + xor.b32 %r7281, %r7280, %r7275; + shf.l.wrap.b32 %r7282, %r7281, %r7281, 25; + add.s32 %r7283, %r7235, %r6993; + add.s32 %r7284, %r7283, %r7226; + xor.b32 %r7285, %r7284, %r7251; + shf.l.wrap.b32 %r7286, %r7285, %r7285, 16; + add.s32 %r7287, %r7286, %r7266; + xor.b32 %r7288, %r7287, %r7226; + shf.l.wrap.b32 %r7289, %r7288, %r7288, 20; + add.s32 %r7290, %r7284, %r7009; + add.s32 %r7291, %r7290, %r7289; + xor.b32 %r7292, %r7291, %r7286; + shf.l.wrap.b32 %r7293, %r7292, %r7292, 24; + add.s32 %r7294, %r7293, %r7287; + xor.b32 %r7295, %r7294, %r7289; + shf.l.wrap.b32 %r7296, %r7295, %r7295, 25; + add.s32 %r7297, %r7249, %r7017; + add.s32 %r7298, %r7297, %r7240; + xor.b32 %r7299, %r7265, %r7298; + shf.l.wrap.b32 %r7300, %r7299, %r7299, 16; + add.s32 %r7301, %r7300, %r7224; + xor.b32 %r7302, %r7301, %r7240; + shf.l.wrap.b32 %r7303, %r7302, %r7302, 20; + add.s32 %r7304, %r7298, %r6929; + add.s32 %r7305, %r7304, %r7303; + xor.b32 %r7306, %r7305, %r7300; + shf.l.wrap.b32 %r7307, %r7306, %r7306, 24; + add.s32 %r7308, %r7307, %r7301; + xor.b32 %r7309, %r7308, %r7303; + shf.l.wrap.b32 %r7310, %r7309, %r7309, 25; + add.s32 %r7311, %r7254, %r6969; + add.s32 %r7312, %r7311, %r7263; + xor.b32 %r7313, %r7312, %r7223; + shf.l.wrap.b32 %r7314, %r7313, %r7313, 16; + add.s32 %r7315, %r7314, %r7238; + xor.b32 %r7316, %r7315, %r7254; + shf.l.wrap.b32 %r7317, %r7316, %r7316, 20; + add.s32 %r7318, %r7312, %r7025; + add.s32 %r7319, %r7318, %r7317; + xor.b32 %r7320, %r7319, %r7314; + shf.l.wrap.b32 %r7321, %r7320, %r7320, 24; + add.s32 %r7322, %r7321, %r7315; + xor.b32 
%r7323, %r7322, %r7317; + shf.l.wrap.b32 %r7324, %r7323, %r7323, 25; + add.s32 %r7325, %r7277, %r6961; + add.s32 %r7326, %r7325, %r7296; + xor.b32 %r7327, %r7326, %r7321; + shf.l.wrap.b32 %r7328, %r7327, %r7327, 16; + add.s32 %r7329, %r7328, %r7308; + xor.b32 %r7330, %r7329, %r7296; + shf.l.wrap.b32 %r7331, %r7330, %r7330, 20; + add.s32 %r7332, %r7326, %r6953; + add.s32 %r7333, %r7332, %r7331; + xor.b32 %r7334, %r7333, %r7328; + shf.l.wrap.b32 %r7335, %r7334, %r7334, 24; + add.s32 %r7336, %r7335, %r7329; + xor.b32 %r7337, %r7336, %r7331; + shf.l.wrap.b32 %r7338, %r7337, %r7337, 25; + add.s32 %r7339, %r7291, %r6985; + add.s32 %r7340, %r7339, %r7310; + xor.b32 %r7341, %r7279, %r7340; + shf.l.wrap.b32 %r7342, %r7341, %r7341, 16; + add.s32 %r7343, %r7342, %r7322; + xor.b32 %r7344, %r7343, %r7310; + shf.l.wrap.b32 %r7345, %r7344, %r7344, 20; + add.s32 %r7346, %r7340, %r6913; + add.s32 %r7347, %r7346, %r7345; + xor.b32 %r7348, %r7347, %r7342; + shf.l.wrap.b32 %r7349, %r7348, %r7348, 24; + add.s32 %r7350, %r7349, %r7343; + xor.b32 %r7351, %r7350, %r7345; + shf.l.wrap.b32 %r7352, %r7351, %r7351, 25; + add.s32 %r7353, %r7305, %r7001; + add.s32 %r7354, %r7353, %r7324; + xor.b32 %r7355, %r7354, %r7293; + shf.l.wrap.b32 %r7356, %r7355, %r7355, 16; + add.s32 %r7357, %r7356, %r7280; + xor.b32 %r7358, %r7357, %r7324; + shf.l.wrap.b32 %r7359, %r7358, %r7358, 20; + add.s32 %r7360, %r7354, %r7036; + add.s32 %r7361, %r7360, %r7359; + xor.b32 %r7362, %r7361, %r7356; + shf.l.wrap.b32 %r7363, %r7362, %r7362, 24; + add.s32 %r7364, %r7363, %r7357; + xor.b32 %r7365, %r7364, %r7359; + shf.l.wrap.b32 %r7366, %r7365, %r7365, 25; + add.s32 %r7367, %r7319, %r6977; + add.s32 %r7368, %r7367, %r7282; + xor.b32 %r7369, %r7368, %r7307; + shf.l.wrap.b32 %r7370, %r7369, %r7369, 16; + add.s32 %r7371, %r7370, %r7294; + xor.b32 %r7372, %r7371, %r7282; + shf.l.wrap.b32 %r7373, %r7372, %r7372, 20; + add.s32 %r7374, %r7368, %r6921; + add.s32 %r7375, %r7374, %r7373; + xor.b32 %r7376, %r7375, %r7370; + shf.l.wrap.b32 %r7377, %r7376, %r7376, 24; + add.s32 %r7378, %r7377, %r7371; + xor.b32 %r7379, %r7378, %r7373; + shf.l.wrap.b32 %r7380, %r7379, %r7379, 25; + add.s32 %r7381, %r7333, %r6993; + add.s32 %r7382, %r7381, %r7380; + xor.b32 %r7383, %r7382, %r7349; + shf.l.wrap.b32 %r7384, %r7383, %r7383, 16; + add.s32 %r7385, %r7384, %r7364; + xor.b32 %r7386, %r7385, %r7380; + shf.l.wrap.b32 %r7387, %r7386, %r7386, 20; + add.s32 %r7388, %r7382, %r6969; + add.s32 %r7389, %r7388, %r7387; + xor.b32 %r7390, %r7389, %r7384; + shf.l.wrap.b32 %r7391, %r7390, %r7390, 24; + add.s32 %r7392, %r7391, %r7385; + xor.b32 %r7393, %r7392, %r7387; + shf.l.wrap.b32 %r7394, %r7393, %r7393, 25; + add.s32 %r7395, %r7347, %r7009; + add.s32 %r7396, %r7395, %r7338; + xor.b32 %r7397, %r7396, %r7363; + shf.l.wrap.b32 %r7398, %r7397, %r7397, 16; + add.s32 %r7399, %r7398, %r7378; + xor.b32 %r7400, %r7399, %r7338; + shf.l.wrap.b32 %r7401, %r7400, %r7400, 20; + add.s32 %r7402, %r7396, %r6985; + add.s32 %r7403, %r7402, %r7401; + xor.b32 %r7404, %r7403, %r7398; + shf.l.wrap.b32 %r7405, %r7404, %r7404, 24; + add.s32 %r7406, %r7405, %r7399; + xor.b32 %r7407, %r7406, %r7401; + shf.l.wrap.b32 %r7408, %r7407, %r7407, 25; + add.s32 %r7409, %r7361, %r7025; + add.s32 %r7410, %r7409, %r7352; + xor.b32 %r7411, %r7377, %r7410; + shf.l.wrap.b32 %r7412, %r7411, %r7411, 16; + add.s32 %r7413, %r7412, %r7336; + xor.b32 %r7414, %r7413, %r7352; + shf.l.wrap.b32 %r7415, %r7414, %r7414, 20; + add.s32 %r7416, %r7410, %r6937; + add.s32 %r7417, %r7416, %r7415; + xor.b32 %r7418, %r7417, %r7412; + 
shf.l.wrap.b32 %r7419, %r7418, %r7418, 24; + add.s32 %r7420, %r7419, %r7413; + xor.b32 %r7421, %r7420, %r7415; + shf.l.wrap.b32 %r7422, %r7421, %r7421, 25; + add.s32 %r7423, %r7375, %r7017; + add.s32 %r7424, %r7423, %r7366; + xor.b32 %r7425, %r7424, %r7335; + shf.l.wrap.b32 %r7426, %r7425, %r7425, 16; + add.s32 %r7427, %r7426, %r7350; + xor.b32 %r7428, %r7427, %r7366; + shf.l.wrap.b32 %r7429, %r7428, %r7428, 20; + add.s32 %r7430, %r7424, %r7036; + add.s32 %r7431, %r7430, %r7429; + xor.b32 %r7432, %r7431, %r7426; + shf.l.wrap.b32 %r7433, %r7432, %r7432, 24; + add.s32 %r7434, %r7433, %r7427; + xor.b32 %r7435, %r7434, %r7429; + shf.l.wrap.b32 %r7436, %r7435, %r7435, 25; + add.s32 %r7437, %r7389, %r6945; + add.s32 %r7438, %r7437, %r7408; + xor.b32 %r7439, %r7438, %r7433; + shf.l.wrap.b32 %r7440, %r7439, %r7439, 16; + add.s32 %r7441, %r7440, %r7420; + xor.b32 %r7442, %r7441, %r7408; + shf.l.wrap.b32 %r7443, %r7442, %r7442, 20; + add.s32 %r7444, %r7438, %r6913; + add.s32 %r7445, %r7444, %r7443; + xor.b32 %r7446, %r7445, %r7440; + shf.l.wrap.b32 %r7447, %r7446, %r7446, 24; + add.s32 %r7448, %r7447, %r7441; + xor.b32 %r7449, %r7448, %r7443; + shf.l.wrap.b32 %r7450, %r7449, %r7449, 25; + add.s32 %r7451, %r7403, %r7001; + add.s32 %r7452, %r7451, %r7422; + xor.b32 %r7453, %r7391, %r7452; + shf.l.wrap.b32 %r7454, %r7453, %r7453, 16; + add.s32 %r7455, %r7454, %r7434; + xor.b32 %r7456, %r7455, %r7422; + shf.l.wrap.b32 %r7457, %r7456, %r7456, 20; + add.s32 %r7458, %r7452, %r6929; + add.s32 %r7459, %r7458, %r7457; + xor.b32 %r7460, %r7459, %r7454; + shf.l.wrap.b32 %r7461, %r7460, %r7460, 24; + add.s32 %r7462, %r7461, %r7455; + xor.b32 %r7463, %r7462, %r7457; + shf.l.wrap.b32 %r7464, %r7463, %r7463, 25; + add.s32 %r7465, %r7417, %r6953; + add.s32 %r7466, %r7465, %r7436; + xor.b32 %r7467, %r7466, %r7405; + shf.l.wrap.b32 %r7468, %r7467, %r7467, 16; + add.s32 %r7469, %r7468, %r7392; + xor.b32 %r7470, %r7469, %r7436; + shf.l.wrap.b32 %r7471, %r7470, %r7470, 20; + add.s32 %r7472, %r7466, %r6977; + add.s32 %r7473, %r7472, %r7471; + xor.b32 %r7474, %r7473, %r7468; + shf.l.wrap.b32 %r7475, %r7474, %r7474, 24; + add.s32 %r7476, %r7475, %r7469; + xor.b32 %r7477, %r7476, %r7471; + shf.l.wrap.b32 %r7478, %r7477, %r7477, 25; + add.s32 %r7479, %r7431, %r6921; + add.s32 %r7480, %r7479, %r7394; + xor.b32 %r7481, %r7480, %r7419; + shf.l.wrap.b32 %r7482, %r7481, %r7481, 16; + add.s32 %r7483, %r7482, %r7406; + xor.b32 %r7484, %r7483, %r7394; + shf.l.wrap.b32 %r7485, %r7484, %r7484, 20; + add.s32 %r7486, %r7480, %r6961; + add.s32 %r7487, %r7486, %r7485; + xor.b32 %r7488, %r7487, %r7482; + shf.l.wrap.b32 %r7489, %r7488, %r7488, 24; + add.s32 %r7490, %r7489, %r7483; + xor.b32 %r7491, %r7490, %r7485; + shf.l.wrap.b32 %r7492, %r7491, %r7491, 25; + add.s32 %r7493, %r7445, %r7009; + add.s32 %r7494, %r7493, %r7492; + xor.b32 %r7495, %r7494, %r7461; + shf.l.wrap.b32 %r7496, %r7495, %r7495, 16; + add.s32 %r7497, %r7496, %r7476; + xor.b32 %r7498, %r7497, %r7492; + shf.l.wrap.b32 %r7499, %r7498, %r7498, 20; + add.s32 %r7500, %r7494, %r7017; + add.s32 %r7501, %r7500, %r7499; + xor.b32 %r7502, %r7501, %r7496; + shf.l.wrap.b32 %r7503, %r7502, %r7502, 24; + add.s32 %r7504, %r7503, %r7497; + xor.b32 %r7505, %r7504, %r7499; + shf.l.wrap.b32 %r7506, %r7505, %r7505, 25; + add.s32 %r7507, %r7459, %r6985; + add.s32 %r7508, %r7507, %r7450; + xor.b32 %r7509, %r7508, %r7475; + shf.l.wrap.b32 %r7510, %r7509, %r7509, 16; + add.s32 %r7511, %r7510, %r7490; + xor.b32 %r7512, %r7511, %r7450; + shf.l.wrap.b32 %r7513, %r7512, %r7512, 20; + add.s32 %r7514, 
%r7508, %r7001; + add.s32 %r7515, %r7514, %r7513; + xor.b32 %r7516, %r7515, %r7510; + shf.l.wrap.b32 %r7517, %r7516, %r7516, 24; + add.s32 %r7518, %r7517, %r7511; + xor.b32 %r7519, %r7518, %r7513; + shf.l.wrap.b32 %r7520, %r7519, %r7519, 25; + add.s32 %r7521, %r7473, %r7036; + add.s32 %r7522, %r7521, %r7464; + xor.b32 %r7523, %r7489, %r7522; + shf.l.wrap.b32 %r7524, %r7523, %r7523, 16; + add.s32 %r7525, %r7524, %r7448; + xor.b32 %r7526, %r7525, %r7464; + shf.l.wrap.b32 %r7527, %r7526, %r7526, 20; + add.s32 %r7528, %r7522, %r6993; + add.s32 %r7529, %r7528, %r7527; + xor.b32 %r7530, %r7529, %r7524; + shf.l.wrap.b32 %r7531, %r7530, %r7530, 24; + add.s32 %r7532, %r7531, %r7525; + xor.b32 %r7533, %r7532, %r7527; + shf.l.wrap.b32 %r7534, %r7533, %r7533, 25; + add.s32 %r7535, %r7487, %r7025; + add.s32 %r7536, %r7535, %r7478; + xor.b32 %r7537, %r7536, %r7447; + shf.l.wrap.b32 %r7538, %r7537, %r7537, 16; + add.s32 %r7539, %r7538, %r7462; + xor.b32 %r7540, %r7539, %r7478; + shf.l.wrap.b32 %r7541, %r7540, %r7540, 20; + add.s32 %r7542, %r7536, %r6977; + add.s32 %r7543, %r7542, %r7541; + xor.b32 %r7544, %r7543, %r7538; + shf.l.wrap.b32 %r7545, %r7544, %r7544, 24; + add.s32 %r7546, %r7545, %r7539; + xor.b32 %r7547, %r7546, %r7541; + shf.l.wrap.b32 %r7548, %r7547, %r7547, 25; + add.s32 %r7549, %r7501, %r6969; + add.s32 %r7550, %r7549, %r7520; + xor.b32 %r7551, %r7550, %r7545; + shf.l.wrap.b32 %r7552, %r7551, %r7551, 16; + add.s32 %r7553, %r7552, %r7532; + xor.b32 %r7554, %r7553, %r7520; + shf.l.wrap.b32 %r7555, %r7554, %r7554, 20; + add.s32 %r7556, %r7550, %r6929; + add.s32 %r7557, %r7556, %r7555; + xor.b32 %r7558, %r7557, %r7552; + shf.l.wrap.b32 %r7559, %r7558, %r7558, 24; + add.s32 %r7560, %r7559, %r7553; + xor.b32 %r7561, %r7560, %r7555; + shf.l.wrap.b32 %r7562, %r7561, %r7561, 25; + add.s32 %r7563, %r7515, %r6953; + add.s32 %r7564, %r7563, %r7534; + xor.b32 %r7565, %r7503, %r7564; + shf.l.wrap.b32 %r7566, %r7565, %r7565, 16; + add.s32 %r7567, %r7566, %r7546; + xor.b32 %r7568, %r7567, %r7534; + shf.l.wrap.b32 %r7569, %r7568, %r7568, 20; + add.s32 %r7570, %r7564, %r6937; + add.s32 %r7571, %r7570, %r7569; + xor.b32 %r7572, %r7571, %r7566; + shf.l.wrap.b32 %r7573, %r7572, %r7572, 24; + add.s32 %r7574, %r7573, %r7567; + xor.b32 %r7575, %r7574, %r7569; + shf.l.wrap.b32 %r7576, %r7575, %r7575, 25; + add.s32 %r7577, %r7529, %r6913; + add.s32 %r7578, %r7577, %r7548; + xor.b32 %r7579, %r7578, %r7517; + shf.l.wrap.b32 %r7580, %r7579, %r7579, 16; + add.s32 %r7581, %r7580, %r7504; + xor.b32 %r7582, %r7581, %r7548; + shf.l.wrap.b32 %r7583, %r7582, %r7582, 20; + add.s32 %r7584, %r7578, %r6921; + add.s32 %r7585, %r7584, %r7583; + xor.b32 %r7586, %r7585, %r7580; + shf.l.wrap.b32 %r7587, %r7586, %r7586, 24; + add.s32 %r7588, %r7587, %r7581; + xor.b32 %r7589, %r7588, %r7583; + shf.l.wrap.b32 %r7590, %r7589, %r7589, 25; + add.s32 %r7591, %r7543, %r6961; + add.s32 %r7592, %r7591, %r7506; + xor.b32 %r7593, %r7592, %r7531; + shf.l.wrap.b32 %r7594, %r7593, %r7593, 16; + add.s32 %r7595, %r7594, %r7518; + xor.b32 %r7596, %r7595, %r7506; + shf.l.wrap.b32 %r7597, %r7596, %r7596, 20; + add.s32 %r7598, %r7592, %r6945; + add.s32 %r7599, %r7598, %r7597; + xor.b32 %r7600, %r7599, %r7594; + shf.l.wrap.b32 %r7601, %r7600, %r7600, 24; + add.s32 %r7602, %r7601, %r7595; + xor.b32 %r7603, %r7602, %r7597; + shf.l.wrap.b32 %r7604, %r7603, %r7603, 25; + add.s32 %r7605, %r7557, %r6985; + add.s32 %r7606, %r7605, %r7604; + xor.b32 %r7607, %r7606, %r7573; + shf.l.wrap.b32 %r7608, %r7607, %r7607, 16; + add.s32 %r7609, %r7608, %r7588; + xor.b32 
%r7610, %r7609, %r7604; + shf.l.wrap.b32 %r7611, %r7610, %r7610, 20; + add.s32 %r7612, %r7606, %r7025; + add.s32 %r7613, %r7612, %r7611; + xor.b32 %r7614, %r7613, %r7608; + shf.l.wrap.b32 %r7615, %r7614, %r7614, 24; + add.s32 %r7616, %r7615, %r7609; + xor.b32 %r7617, %r7616, %r7611; + shf.l.wrap.b32 %r7618, %r7617, %r7617, 25; + add.s32 %r7619, %r7571, %r7001; + add.s32 %r7620, %r7619, %r7562; + xor.b32 %r7621, %r7620, %r7587; + shf.l.wrap.b32 %r7622, %r7621, %r7621, 16; + add.s32 %r7623, %r7622, %r7602; + xor.b32 %r7624, %r7623, %r7562; + shf.l.wrap.b32 %r7625, %r7624, %r7624, 20; + add.s32 %r7626, %r7620, %r6953; + add.s32 %r7627, %r7626, %r7625; + xor.b32 %r7628, %r7627, %r7622; + shf.l.wrap.b32 %r7629, %r7628, %r7628, 24; + add.s32 %r7630, %r7629, %r7623; + xor.b32 %r7631, %r7630, %r7625; + shf.l.wrap.b32 %r7632, %r7631, %r7631, 25; + add.s32 %r7633, %r7585, %r6977; + add.s32 %r7634, %r7633, %r7576; + xor.b32 %r7635, %r7601, %r7634; + shf.l.wrap.b32 %r7636, %r7635, %r7635, 16; + add.s32 %r7637, %r7636, %r7560; + xor.b32 %r7638, %r7637, %r7576; + shf.l.wrap.b32 %r7639, %r7638, %r7638, 20; + add.s32 %r7640, %r7634, %r7009; + add.s32 %r7641, %r7640, %r7639; + xor.b32 %r7642, %r7641, %r7636; + shf.l.wrap.b32 %r7643, %r7642, %r7642, 24; + add.s32 %r7644, %r7643, %r7637; + xor.b32 %r7645, %r7644, %r7639; + shf.l.wrap.b32 %r7646, %r7645, %r7645, 25; + add.s32 %r7647, %r7599, %r7036; + add.s32 %r7648, %r7647, %r7590; + xor.b32 %r7649, %r7648, %r7559; + shf.l.wrap.b32 %r7650, %r7649, %r7649, 16; + add.s32 %r7651, %r7650, %r7574; + xor.b32 %r7652, %r7651, %r7590; + shf.l.wrap.b32 %r7653, %r7652, %r7652, 20; + add.s32 %r7654, %r7648, %r6921; + add.s32 %r7655, %r7654, %r7653; + xor.b32 %r7656, %r7655, %r7650; + shf.l.wrap.b32 %r7657, %r7656, %r7656, 24; + add.s32 %r7658, %r7657, %r7651; + xor.b32 %r7659, %r7658, %r7653; + shf.l.wrap.b32 %r7660, %r7659, %r7659, 25; + add.s32 %r7661, %r7613, %r7017; + add.s32 %r7662, %r7661, %r7632; + xor.b32 %r7663, %r7662, %r7657; + shf.l.wrap.b32 %r7664, %r7663, %r7663, 16; + add.s32 %r7665, %r7664, %r7644; + xor.b32 %r7666, %r7665, %r7632; + shf.l.wrap.b32 %r7667, %r7666, %r7666, 20; + add.s32 %r7668, %r7662, %r6937; + add.s32 %r7669, %r7668, %r7667; + xor.b32 %r7670, %r7669, %r7664; + shf.l.wrap.b32 %r7671, %r7670, %r7670, 24; + add.s32 %r7672, %r7671, %r7665; + xor.b32 %r7673, %r7672, %r7667; + shf.l.wrap.b32 %r7674, %r7673, %r7673, 25; + add.s32 %r7675, %r7627, %r6913; + add.s32 %r7676, %r7675, %r7646; + xor.b32 %r7677, %r7615, %r7676; + shf.l.wrap.b32 %r7678, %r7677, %r7677, 16; + add.s32 %r7679, %r7678, %r7658; + xor.b32 %r7680, %r7679, %r7646; + shf.l.wrap.b32 %r7681, %r7680, %r7680, 20; + add.s32 %r7682, %r7676, %r6993; + add.s32 %r7683, %r7682, %r7681; + xor.b32 %r7684, %r7683, %r7678; + shf.l.wrap.b32 %r7685, %r7684, %r7684, 24; + add.s32 %r7686, %r7685, %r7679; + xor.b32 %r7687, %r7686, %r7681; + shf.l.wrap.b32 %r7688, %r7687, %r7687, 25; + add.s32 %r7689, %r7641, %r6929; + add.s32 %r7690, %r7689, %r7660; + xor.b32 %r7691, %r7690, %r7629; + shf.l.wrap.b32 %r7692, %r7691, %r7691, 16; + add.s32 %r7693, %r7692, %r7616; + xor.b32 %r7694, %r7693, %r7660; + shf.l.wrap.b32 %r7695, %r7694, %r7694, 20; + add.s32 %r7696, %r7690, %r6961; + add.s32 %r7697, %r7696, %r7695; + xor.b32 %r7698, %r7697, %r7692; + shf.l.wrap.b32 %r7699, %r7698, %r7698, 24; + add.s32 %r7700, %r7699, %r7693; + xor.b32 %r7701, %r7700, %r7695; + shf.l.wrap.b32 %r7702, %r7701, %r7701, 25; + add.s32 %r7703, %r7655, %r6945; + add.s32 %r7704, %r7703, %r7618; + xor.b32 %r7705, %r7704, %r7643; + 
shf.l.wrap.b32 %r7706, %r7705, %r7705, 16; + add.s32 %r7707, %r7706, %r7630; + xor.b32 %r7708, %r7707, %r7618; + shf.l.wrap.b32 %r7709, %r7708, %r7708, 20; + add.s32 %r7710, %r7704, %r6969; + add.s32 %r7711, %r7710, %r7709; + xor.b32 %r7712, %r7711, %r7706; + shf.l.wrap.b32 %r7713, %r7712, %r7712, 24; + add.s32 %r7714, %r7713, %r7707; + xor.b32 %r7715, %r7714, %r7709; + shf.l.wrap.b32 %r7716, %r7715, %r7715, 25; + add.s32 %r7717, %r7669, %r7001; + add.s32 %r7718, %r7717, %r7716; + xor.b32 %r7719, %r7718, %r7685; + shf.l.wrap.b32 %r7720, %r7719, %r7719, 16; + add.s32 %r7721, %r7720, %r7700; + xor.b32 %r7722, %r7721, %r7716; + shf.l.wrap.b32 %r7723, %r7722, %r7722, 20; + add.s32 %r7724, %r7718, %r7036; + add.s32 %r7725, %r7724, %r7723; + xor.b32 %r7726, %r7725, %r7720; + shf.l.wrap.b32 %r7727, %r7726, %r7726, 24; + add.s32 %r7728, %r7727, %r7721; + xor.b32 %r7729, %r7728, %r7723; + shf.l.wrap.b32 %r7730, %r7729, %r7729, 25; + add.s32 %r7731, %r7683, %r6953; + add.s32 %r7732, %r7731, %r7674; + xor.b32 %r7733, %r7732, %r7699; + shf.l.wrap.b32 %r7734, %r7733, %r7733, 16; + add.s32 %r7735, %r7734, %r7714; + xor.b32 %r7736, %r7735, %r7674; + shf.l.wrap.b32 %r7737, %r7736, %r7736, 20; + add.s32 %r7738, %r7732, %r6913; + add.s32 %r7739, %r7738, %r7737; + xor.b32 %r7740, %r7739, %r7734; + shf.l.wrap.b32 %r7741, %r7740, %r7740, 24; + add.s32 %r7742, %r7741, %r7735; + xor.b32 %r7743, %r7742, %r7737; + shf.l.wrap.b32 %r7744, %r7743, %r7743, 25; + add.s32 %r7745, %r7697, %r6921; + add.s32 %r7746, %r7745, %r7688; + xor.b32 %r7747, %r7713, %r7746; + shf.l.wrap.b32 %r7748, %r7747, %r7747, 16; + add.s32 %r7749, %r7748, %r7672; + xor.b32 %r7750, %r7749, %r7688; + shf.l.wrap.b32 %r7751, %r7750, %r7750, 20; + add.s32 %r7752, %r7746, %r6985; + add.s32 %r7753, %r7752, %r7751; + xor.b32 %r7754, %r7753, %r7748; + shf.l.wrap.b32 %r7755, %r7754, %r7754, 24; + add.s32 %r7756, %r7755, %r7749; + xor.b32 %r7757, %r7756, %r7751; + shf.l.wrap.b32 %r7758, %r7757, %r7757, 25; + add.s32 %r7759, %r7711, %r6977; + add.s32 %r7760, %r7759, %r7702; + xor.b32 %r7761, %r7760, %r7671; + shf.l.wrap.b32 %r7762, %r7761, %r7761, 16; + add.s32 %r7763, %r7762, %r7686; + xor.b32 %r7764, %r7763, %r7702; + shf.l.wrap.b32 %r7765, %r7764, %r7764, 20; + add.s32 %r7766, %r7760, %r6961; + add.s32 %r7767, %r7766, %r7765; + xor.b32 %r7768, %r7767, %r7762; + shf.l.wrap.b32 %r7769, %r7768, %r7768, 24; + add.s32 %r7770, %r7769, %r7763; + xor.b32 %r7771, %r7770, %r7765; + shf.l.wrap.b32 %r7772, %r7771, %r7771, 25; + add.s32 %r7773, %r7725, %r7025; + add.s32 %r7774, %r7773, %r7744; + xor.b32 %r7775, %r7774, %r7769; + shf.l.wrap.b32 %r7776, %r7775, %r7775, 16; + add.s32 %r7777, %r7776, %r7756; + xor.b32 %r7778, %r7777, %r7744; + shf.l.wrap.b32 %r7779, %r7778, %r7778, 20; + add.s32 %r7780, %r7774, %r6993; + add.s32 %r7781, %r7780, %r7779; + xor.b32 %r7782, %r7781, %r7776; + shf.l.wrap.b32 %r7783, %r7782, %r7782, 24; + add.s32 %r7784, %r7783, %r7777; + xor.b32 %r7785, %r7784, %r7779; + shf.l.wrap.b32 %r7786, %r7785, %r7785, 25; + add.s32 %r7787, %r7739, %r6929; + add.s32 %r7788, %r7787, %r7758; + xor.b32 %r7789, %r7727, %r7788; + shf.l.wrap.b32 %r7790, %r7789, %r7789, 16; + add.s32 %r7791, %r7790, %r7770; + xor.b32 %r7792, %r7791, %r7758; + shf.l.wrap.b32 %r7793, %r7792, %r7792, 20; + add.s32 %r7794, %r7788, %r7009; + add.s32 %r7795, %r7794, %r7793; + xor.b32 %r7796, %r7795, %r7790; + shf.l.wrap.b32 %r7797, %r7796, %r7796, 24; + add.s32 %r7798, %r7797, %r7791; + xor.b32 %r7799, %r7798, %r7793; + shf.l.wrap.b32 %r7800, %r7799, %r7799, 25; + add.s32 %r7801, 
%r7753, %r6937; + add.s32 %r7802, %r7801, %r7772; + xor.b32 %r7803, %r7802, %r7741; + shf.l.wrap.b32 %r7804, %r7803, %r7803, 16; + add.s32 %r7805, %r7804, %r7728; + xor.b32 %r7806, %r7805, %r7772; + shf.l.wrap.b32 %r7807, %r7806, %r7806, 20; + add.s32 %r7808, %r7802, %r6945; + add.s32 %r7809, %r7808, %r7807; + xor.b32 %r7810, %r7809, %r7804; + shf.l.wrap.b32 %r7811, %r7810, %r7810, 24; + add.s32 %r7812, %r7811, %r7805; + xor.b32 %r7813, %r7812, %r7807; + shf.l.wrap.b32 %r7814, %r7813, %r7813, 25; + add.s32 %r7815, %r7767, %r6969; + add.s32 %r7816, %r7815, %r7730; + xor.b32 %r7817, %r7816, %r7755; + shf.l.wrap.b32 %r7818, %r7817, %r7817, 16; + add.s32 %r7819, %r7818, %r7742; + xor.b32 %r7820, %r7819, %r7730; + shf.l.wrap.b32 %r7821, %r7820, %r7820, 20; + add.s32 %r7822, %r7816, %r7017; + add.s32 %r7823, %r7822, %r7821; + xor.b32 %r7824, %r7823, %r7818; + shf.l.wrap.b32 %r7825, %r7824, %r7824, 24; + add.s32 %r7826, %r7825, %r7819; + xor.b32 %r7827, %r7826, %r7821; + shf.l.wrap.b32 %r7828, %r7827, %r7827, 25; + xor.b32 %r97, %r7812, %r7781; + xor.b32 %r98, %r7826, %r7795; + xor.b32 %r99, %r7784, %r7809; + xor.b32 %r100, %r7823, %r7798; + xor.b32 %r101, %r7828, %r7797; + xor.b32 %r102, %r7786, %r7811; + xor.b32 %r103, %r7825, %r7800; + xor.b32 %r104, %r7814, %r7783; + ld.local.u8 %rs327, [%rd3+8]; + cvt.u64.u16 %rd188, %rs327; + popc.b64 %r7829, %rd250; + cvt.u64.u32 %rd64, %r7829; + setp.ge.u64 %p37, %rd64, %rd188; + mul.wide.u16 %r11681, %rs327, 32; + @%p37 bra $L__BB1_46; + +$L__BB1_45: + popc.b64 %r11649, %rd250; + cvt.u64.u32 %rd229, %r11649; + add.s32 %r7830, %r11681, -64; + cvt.s64.s32 %rd189, %r7830; + add.s64 %rd190, %rd2, %rd189; + ld.local.u8 %r7831, [%rd3+2]; + ld.local.u8 %r7832, [%rd190+145]; + ld.local.u8 %r7833, [%rd190+146]; + prmt.b32 %r7834, %r7833, %r7832, 30212; + ld.local.u8 %r7835, [%rd190+147]; + prmt.b32 %r7836, %r7835, %r7834, 28756; + ld.local.u8 %r7837, [%rd190+148]; + prmt.b32 %r7838, %r7837, %r7836, 1620; + ld.local.u8 %r7839, [%rd190+149]; + ld.local.u8 %r7840, [%rd190+150]; + prmt.b32 %r7841, %r7840, %r7839, 30212; + ld.local.u8 %r7842, [%rd190+151]; + prmt.b32 %r7843, %r7842, %r7841, 28756; + ld.local.u8 %r7844, [%rd190+152]; + prmt.b32 %r7845, %r7844, %r7843, 1620; + ld.local.u8 %r7846, [%rd190+153]; + ld.local.u8 %r7847, [%rd190+154]; + prmt.b32 %r7848, %r7847, %r7846, 30212; + ld.local.u8 %r7849, [%rd190+155]; + prmt.b32 %r7850, %r7849, %r7848, 28756; + ld.local.u8 %r7851, [%rd190+156]; + prmt.b32 %r7852, %r7851, %r7850, 1620; + ld.local.u8 %r7853, [%rd190+157]; + ld.local.u8 %r7854, [%rd190+158]; + prmt.b32 %r7855, %r7854, %r7853, 30212; + ld.local.u8 %r7856, [%rd190+159]; + prmt.b32 %r7857, %r7856, %r7855, 28756; + ld.local.u8 %r7858, [%rd190+160]; + prmt.b32 %r7859, %r7858, %r7857, 1620; + ld.local.u8 %r7860, [%rd190+161]; + ld.local.u8 %r7861, [%rd190+162]; + prmt.b32 %r7862, %r7861, %r7860, 30212; + ld.local.u8 %r7863, [%rd190+163]; + prmt.b32 %r7864, %r7863, %r7862, 28756; + ld.local.u8 %r7865, [%rd190+164]; + prmt.b32 %r7866, %r7865, %r7864, 1620; + ld.local.u8 %r7867, [%rd190+165]; + ld.local.u8 %r7868, [%rd190+166]; + prmt.b32 %r7869, %r7868, %r7867, 30212; + ld.local.u8 %r7870, [%rd190+167]; + prmt.b32 %r7871, %r7870, %r7869, 28756; + ld.local.u8 %r7872, [%rd190+168]; + prmt.b32 %r7873, %r7872, %r7871, 1620; + ld.local.u8 %r7874, [%rd190+169]; + ld.local.u8 %r7875, [%rd190+170]; + prmt.b32 %r7876, %r7875, %r7874, 30212; + ld.local.u8 %r7877, [%rd190+171]; + prmt.b32 %r7878, %r7877, %r7876, 28756; + ld.local.u8 %r7879, [%rd190+172]; + prmt.b32 
%r7880, %r7879, %r7878, 1620; + ld.local.u8 %r7881, [%rd190+173]; + ld.local.u8 %r7882, [%rd190+174]; + prmt.b32 %r7883, %r7882, %r7881, 30212; + ld.local.u8 %r7884, [%rd190+175]; + prmt.b32 %r7885, %r7884, %r7883, 28756; + ld.local.u8 %r7886, [%rd190+176]; + prmt.b32 %r7887, %r7886, %r7885, 1620; + ld.local.u8 %r7888, [%rd190+177]; + ld.local.u8 %r7889, [%rd190+178]; + prmt.b32 %r7890, %r7889, %r7888, 30212; + ld.local.u8 %r7891, [%rd190+179]; + prmt.b32 %r7892, %r7891, %r7890, 28756; + ld.local.u8 %r7893, [%rd190+180]; + prmt.b32 %r7894, %r7893, %r7892, 1620; + ld.local.u8 %r7895, [%rd190+181]; + ld.local.u8 %r7896, [%rd190+182]; + prmt.b32 %r7897, %r7896, %r7895, 30212; + ld.local.u8 %r7898, [%rd190+183]; + prmt.b32 %r7899, %r7898, %r7897, 28756; + ld.local.u8 %r7900, [%rd190+184]; + prmt.b32 %r7901, %r7900, %r7899, 1620; + ld.local.u8 %r7902, [%rd190+185]; + ld.local.u8 %r7903, [%rd190+186]; + prmt.b32 %r7904, %r7903, %r7902, 30212; + ld.local.u8 %r7905, [%rd190+187]; + prmt.b32 %r7906, %r7905, %r7904, 28756; + ld.local.u8 %r7907, [%rd190+188]; + prmt.b32 %r7908, %r7907, %r7906, 1620; + ld.local.u8 %r7909, [%rd190+189]; + ld.local.u8 %r7910, [%rd190+190]; + prmt.b32 %r7911, %r7910, %r7909, 30212; + ld.local.u8 %r7912, [%rd190+191]; + prmt.b32 %r7913, %r7912, %r7911, 28756; + ld.local.u8 %r7914, [%rd190+192]; + prmt.b32 %r7915, %r7914, %r7913, 1620; + ld.local.u8 %r7916, [%rd190+193]; + ld.local.u8 %r7917, [%rd190+194]; + prmt.b32 %r7918, %r7917, %r7916, 30212; + ld.local.u8 %r7919, [%rd190+195]; + prmt.b32 %r7920, %r7919, %r7918, 28756; + ld.local.u8 %r7921, [%rd190+196]; + prmt.b32 %r7922, %r7921, %r7920, 1620; + ld.local.u8 %r7923, [%rd190+197]; + ld.local.u8 %r7924, [%rd190+198]; + prmt.b32 %r7925, %r7924, %r7923, 30212; + ld.local.u8 %r7926, [%rd190+199]; + prmt.b32 %r7927, %r7926, %r7925, 28756; + ld.local.u8 %r7928, [%rd190+200]; + prmt.b32 %r7929, %r7928, %r7927, 1620; + ld.local.u8 %r7930, [%rd190+201]; + ld.local.u8 %r7931, [%rd190+202]; + prmt.b32 %r7932, %r7931, %r7930, 30212; + ld.local.u8 %r7933, [%rd190+203]; + prmt.b32 %r7934, %r7933, %r7932, 28756; + ld.local.u8 %r7935, [%rd190+204]; + prmt.b32 %r7936, %r7935, %r7934, 1620; + ld.local.u8 %r7937, [%rd190+205]; + ld.local.u8 %r7938, [%rd190+206]; + prmt.b32 %r7939, %r7938, %r7937, 30212; + ld.local.u8 %r7940, [%rd190+207]; + prmt.b32 %r7941, %r7940, %r7939, 28756; + ld.local.u8 %r7942, [%rd190+208]; + prmt.b32 %r7943, %r7942, %r7941, 1620; + or.b32 %r7944, %r7831, 4; + ld.local.u8 %r7945, [%rd3+-120]; + ld.local.u8 %r7946, [%rd3+-119]; + prmt.b32 %r7947, %r7946, %r7945, 30212; + ld.local.u8 %r7948, [%rd3+-118]; + ld.local.u8 %r7949, [%rd3+-117]; + prmt.b32 %r7950, %r7949, %r7948, 30212; + prmt.b32 %r7951, %r7950, %r7947, 4180; + ld.local.u8 %r7952, [%rd3+-136]; + ld.local.u8 %r7953, [%rd3+-135]; + prmt.b32 %r7954, %r7953, %r7952, 30212; + ld.local.u8 %r7955, [%rd3+-134]; + ld.local.u8 %r7956, [%rd3+-133]; + prmt.b32 %r7957, %r7956, %r7955, 30212; + prmt.b32 %r7958, %r7957, %r7954, 4180; + add.s32 %r7959, %r7951, %r7958; + add.s32 %r7960, %r7959, %r7838; + shf.l.wrap.b32 %r7961, %r7960, %r7960, 16; + add.s32 %r7962, %r7961, 1779033703; + xor.b32 %r7963, %r7962, %r7951; + shf.l.wrap.b32 %r7964, %r7963, %r7963, 20; + add.s32 %r7965, %r7845, %r7960; + add.s32 %r7966, %r7965, %r7964; + xor.b32 %r7967, %r7966, %r7961; + shf.l.wrap.b32 %r7968, %r7967, %r7967, 24; + add.s32 %r7969, %r7968, %r7962; + xor.b32 %r7970, %r7969, %r7964; + shf.l.wrap.b32 %r7971, %r7970, %r7970, 25; + ld.local.u8 %r7972, [%rd3+-116]; + ld.local.u8 
%r7973, [%rd3+-115]; + prmt.b32 %r7974, %r7973, %r7972, 30212; + ld.local.u8 %r7975, [%rd3+-114]; + ld.local.u8 %r7976, [%rd3+-113]; + prmt.b32 %r7977, %r7976, %r7975, 30212; + prmt.b32 %r7978, %r7977, %r7974, 4180; + ld.local.u8 %r7979, [%rd3+-132]; + ld.local.u8 %r7980, [%rd3+-131]; + prmt.b32 %r7981, %r7980, %r7979, 30212; + ld.local.u8 %r7982, [%rd3+-130]; + ld.local.u8 %r7983, [%rd3+-129]; + prmt.b32 %r7984, %r7983, %r7982, 30212; + prmt.b32 %r7985, %r7984, %r7981, 4180; + add.s32 %r7986, %r7978, %r7985; + add.s32 %r7987, %r7986, %r7852; + shf.l.wrap.b32 %r7988, %r7987, %r7987, 16; + add.s32 %r7989, %r7988, -1150833019; + xor.b32 %r7990, %r7989, %r7978; + shf.l.wrap.b32 %r7991, %r7990, %r7990, 20; + add.s32 %r7992, %r7859, %r7987; + add.s32 %r7993, %r7992, %r7991; + xor.b32 %r7994, %r7993, %r7988; + shf.l.wrap.b32 %r7995, %r7994, %r7994, 24; + add.s32 %r7996, %r7995, %r7989; + xor.b32 %r7997, %r7996, %r7991; + shf.l.wrap.b32 %r7998, %r7997, %r7997, 25; + ld.local.u8 %r7999, [%rd3+-112]; + ld.local.u8 %r8000, [%rd3+-111]; + prmt.b32 %r8001, %r8000, %r7999, 30212; + ld.local.u8 %r8002, [%rd3+-110]; + ld.local.u8 %r8003, [%rd3+-109]; + prmt.b32 %r8004, %r8003, %r8002, 30212; + prmt.b32 %r8005, %r8004, %r8001, 4180; + ld.local.u8 %r8006, [%rd3+-128]; + ld.local.u8 %r8007, [%rd3+-127]; + prmt.b32 %r8008, %r8007, %r8006, 30212; + ld.local.u8 %r8009, [%rd3+-126]; + ld.local.u8 %r8010, [%rd3+-125]; + prmt.b32 %r8011, %r8010, %r8009, 30212; + prmt.b32 %r8012, %r8011, %r8008, 4180; + add.s32 %r8013, %r8005, %r8012; + add.s32 %r8014, %r8013, %r7866; + shr.u32 %r8015, %r8014, 16; + shl.b32 %r8016, %r8014, 16; + xor.b32 %r8017, %r8016, 4194304; + or.b32 %r8018, %r8017, %r8015; + add.s32 %r8019, %r8018, 1013904242; + xor.b32 %r8020, %r8019, %r8005; + shf.l.wrap.b32 %r8021, %r8020, %r8020, 20; + add.s32 %r8022, %r7873, %r8014; + add.s32 %r8023, %r8022, %r8021; + xor.b32 %r8024, %r8023, %r8018; + shf.l.wrap.b32 %r8025, %r8024, %r8024, 24; + add.s32 %r8026, %r8025, %r8019; + xor.b32 %r8027, %r8026, %r8021; + shf.l.wrap.b32 %r8028, %r8027, %r8027, 25; + ld.local.u8 %r8029, [%rd3+-108]; + ld.local.u8 %r8030, [%rd3+-107]; + prmt.b32 %r8031, %r8030, %r8029, 30212; + ld.local.u8 %r8032, [%rd3+-106]; + ld.local.u8 %r8033, [%rd3+-105]; + prmt.b32 %r8034, %r8033, %r8032, 30212; + prmt.b32 %r8035, %r8034, %r8031, 4180; + ld.local.u8 %r8036, [%rd3+-124]; + ld.local.u8 %r8037, [%rd3+-123]; + prmt.b32 %r8038, %r8037, %r8036, 30212; + ld.local.u8 %r8039, [%rd3+-122]; + ld.local.u8 %r8040, [%rd3+-121]; + prmt.b32 %r8041, %r8040, %r8039, 30212; + prmt.b32 %r8042, %r8041, %r8038, 4180; + add.s32 %r8043, %r8035, %r8042; + add.s32 %r8044, %r8043, %r7880; + xor.b32 %r8045, %r8044, %r7944; + shr.u32 %r8046, %r8044, 16; + shl.b32 %r8047, %r8045, 16; + or.b32 %r8048, %r8047, %r8046; + add.s32 %r8049, %r8048, -1521486534; + xor.b32 %r8050, %r8049, %r8035; + shf.l.wrap.b32 %r8051, %r8050, %r8050, 20; + add.s32 %r8052, %r7887, %r8044; + add.s32 %r8053, %r8052, %r8051; + xor.b32 %r8054, %r8053, %r8048; + shf.l.wrap.b32 %r8055, %r8054, %r8054, 24; + add.s32 %r8056, %r8055, %r8049; + xor.b32 %r8057, %r8056, %r8051; + shf.l.wrap.b32 %r8058, %r8057, %r8057, 25; + add.s32 %r8059, %r7998, %r7966; + add.s32 %r8060, %r8059, %r7894; + xor.b32 %r8061, %r8055, %r8060; + shf.l.wrap.b32 %r8062, %r8061, %r8061, 16; + add.s32 %r8063, %r8062, %r8026; + xor.b32 %r8064, %r8063, %r7998; + shf.l.wrap.b32 %r8065, %r8064, %r8064, 20; + add.s32 %r8066, %r7901, %r8060; + add.s32 %r8067, %r8066, %r8065; + xor.b32 %r8068, %r8067, %r8062; + 
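+// [annotation, not compiler output] Each repeating add/xor/shf.l.wrap group above
+// and below is one BLAKE3 G mixing step on four 32-bit state words; in C terms (a sketch):
+//   a += b + mx;  d = rotr32(d ^ a, 16);
+//   c += d;       b = rotr32(b ^ c, 12);
+//   a += b + my;  d = rotr32(d ^ a, 8);
+//   c += d;       b = rotr32(b ^ c, 7);
+// shf.l.wrap.b32 x,x,x,k is rotl32(x,k), so rotl by 16/20/24/25 == rotr by 16/12/8/7.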
shf.l.wrap.b32 %r8069, %r8068, %r8068, 24; + add.s32 %r8070, %r8069, %r8063; + xor.b32 %r8071, %r8070, %r8065; + shf.l.wrap.b32 %r8072, %r8071, %r8071, 25; + add.s32 %r8073, %r8028, %r7993; + add.s32 %r8074, %r8073, %r7908; + xor.b32 %r8075, %r8074, %r7968; + shf.l.wrap.b32 %r8076, %r8075, %r8075, 16; + add.s32 %r8077, %r8076, %r8056; + xor.b32 %r8078, %r8077, %r8028; + shf.l.wrap.b32 %r8079, %r8078, %r8078, 20; + add.s32 %r8080, %r7915, %r8074; + add.s32 %r8081, %r8080, %r8079; + xor.b32 %r8082, %r8081, %r8076; + shf.l.wrap.b32 %r8083, %r8082, %r8082, 24; + add.s32 %r8084, %r8083, %r8077; + xor.b32 %r8085, %r8084, %r8079; + shf.l.wrap.b32 %r8086, %r8085, %r8085, 25; + add.s32 %r8087, %r8058, %r8023; + add.s32 %r8088, %r8087, %r7922; + xor.b32 %r8089, %r8088, %r7995; + shf.l.wrap.b32 %r8090, %r8089, %r8089, 16; + add.s32 %r8091, %r8090, %r7969; + xor.b32 %r8092, %r8091, %r8058; + shf.l.wrap.b32 %r8093, %r8092, %r8092, 20; + add.s32 %r8094, %r7929, %r8088; + add.s32 %r8095, %r8094, %r8093; + xor.b32 %r8096, %r8095, %r8090; + shf.l.wrap.b32 %r8097, %r8096, %r8096, 24; + add.s32 %r8098, %r8097, %r8091; + xor.b32 %r8099, %r8098, %r8093; + shf.l.wrap.b32 %r8100, %r8099, %r8099, 25; + add.s32 %r8101, %r8053, %r7971; + add.s32 %r8102, %r8101, %r7936; + xor.b32 %r8103, %r8102, %r8025; + shf.l.wrap.b32 %r8104, %r8103, %r8103, 16; + add.s32 %r8105, %r8104, %r7996; + xor.b32 %r8106, %r8105, %r7971; + shf.l.wrap.b32 %r8107, %r8106, %r8106, 20; + add.s32 %r8108, %r7943, %r8102; + add.s32 %r8109, %r8108, %r8107; + xor.b32 %r8110, %r8109, %r8104; + shf.l.wrap.b32 %r8111, %r8110, %r8110, 24; + add.s32 %r8112, %r8111, %r8105; + xor.b32 %r8113, %r8112, %r8107; + shf.l.wrap.b32 %r8114, %r8113, %r8113, 25; + add.s32 %r8115, %r8067, %r7852; + add.s32 %r8116, %r8115, %r8114; + xor.b32 %r8117, %r8116, %r8083; + shf.l.wrap.b32 %r8118, %r8117, %r8117, 16; + add.s32 %r8119, %r8118, %r8098; + xor.b32 %r8120, %r8119, %r8114; + shf.l.wrap.b32 %r8121, %r8120, %r8120, 20; + add.s32 %r8122, %r8116, %r7880; + add.s32 %r8123, %r8122, %r8121; + xor.b32 %r8124, %r8123, %r8118; + shf.l.wrap.b32 %r8125, %r8124, %r8124, 24; + add.s32 %r8126, %r8125, %r8119; + xor.b32 %r8127, %r8126, %r8121; + shf.l.wrap.b32 %r8128, %r8127, %r8127, 25; + add.s32 %r8129, %r8081, %r7859; + add.s32 %r8130, %r8129, %r8072; + xor.b32 %r8131, %r8097, %r8130; + shf.l.wrap.b32 %r8132, %r8131, %r8131, 16; + add.s32 %r8133, %r8112, %r8132; + xor.b32 %r8134, %r8133, %r8072; + shf.l.wrap.b32 %r8135, %r8134, %r8134, 20; + add.s32 %r8136, %r8130, %r7908; + add.s32 %r8137, %r8136, %r8135; + xor.b32 %r8138, %r8137, %r8132; + shf.l.wrap.b32 %r8139, %r8138, %r8138, 24; + add.s32 %r8140, %r8139, %r8133; + xor.b32 %r8141, %r8140, %r8135; + shf.l.wrap.b32 %r8142, %r8141, %r8141, 25; + add.s32 %r8143, %r8086, %r7887; + add.s32 %r8144, %r8143, %r8095; + xor.b32 %r8145, %r8111, %r8144; + shf.l.wrap.b32 %r8146, %r8145, %r8145, 16; + add.s32 %r8147, %r8146, %r8070; + xor.b32 %r8148, %r8147, %r8086; + shf.l.wrap.b32 %r8149, %r8148, %r8148, 20; + add.s32 %r8150, %r8144, %r7838; + add.s32 %r8151, %r8150, %r8149; + xor.b32 %r8152, %r8151, %r8146; + shf.l.wrap.b32 %r8153, %r8152, %r8152, 24; + add.s32 %r8154, %r8153, %r8147; + xor.b32 %r8155, %r8154, %r8149; + shf.l.wrap.b32 %r8156, %r8155, %r8155, 25; + add.s32 %r8157, %r8100, %r7866; + add.s32 %r8158, %r8157, %r8109; + xor.b32 %r8159, %r8158, %r8069; + shf.l.wrap.b32 %r8160, %r8159, %r8159, 16; + add.s32 %r8161, %r8160, %r8084; + xor.b32 %r8162, %r8161, %r8100; + shf.l.wrap.b32 %r8163, %r8162, %r8162, 20; + add.s32 %r8164, 
%r8158, %r7929; + add.s32 %r8165, %r8164, %r8163; + xor.b32 %r8166, %r8165, %r8160; + shf.l.wrap.b32 %r8167, %r8166, %r8166, 24; + add.s32 %r8168, %r8167, %r8161; + xor.b32 %r8169, %r8168, %r8163; + shf.l.wrap.b32 %r8170, %r8169, %r8169, 25; + add.s32 %r8171, %r8142, %r7845; + add.s32 %r8172, %r8171, %r8123; + xor.b32 %r8173, %r8172, %r8167; + shf.l.wrap.b32 %r8174, %r8173, %r8173, 16; + add.s32 %r8175, %r8174, %r8154; + xor.b32 %r8176, %r8175, %r8142; + shf.l.wrap.b32 %r8177, %r8176, %r8176, 20; + add.s32 %r8178, %r8172, %r7915; + add.s32 %r8179, %r8178, %r8177; + xor.b32 %r8180, %r8179, %r8174; + shf.l.wrap.b32 %r8181, %r8180, %r8180, 24; + add.s32 %r8182, %r8181, %r8175; + xor.b32 %r8183, %r8182, %r8177; + shf.l.wrap.b32 %r8184, %r8183, %r8183, 25; + add.s32 %r8185, %r8137, %r7922; + add.s32 %r8186, %r8185, %r8156; + xor.b32 %r8187, %r8125, %r8186; + shf.l.wrap.b32 %r8188, %r8187, %r8187, 16; + add.s32 %r8189, %r8188, %r8168; + xor.b32 %r8190, %r8189, %r8156; + shf.l.wrap.b32 %r8191, %r8190, %r8190, 20; + add.s32 %r8192, %r8186, %r7873; + add.s32 %r8193, %r8192, %r8191; + xor.b32 %r8194, %r8193, %r8188; + shf.l.wrap.b32 %r8195, %r8194, %r8194, 24; + add.s32 %r8196, %r8195, %r8189; + xor.b32 %r8197, %r8196, %r8191; + shf.l.wrap.b32 %r8198, %r8197, %r8197, 25; + add.s32 %r8199, %r8151, %r7901; + add.s32 %r8200, %r8199, %r8170; + xor.b32 %r8201, %r8200, %r8139; + shf.l.wrap.b32 %r8202, %r8201, %r8201, 16; + add.s32 %r8203, %r8202, %r8126; + xor.b32 %r8204, %r8203, %r8170; + shf.l.wrap.b32 %r8205, %r8204, %r8204, 20; + add.s32 %r8206, %r8200, %r7936; + add.s32 %r8207, %r8206, %r8205; + xor.b32 %r8208, %r8207, %r8202; + shf.l.wrap.b32 %r8209, %r8208, %r8208, 24; + add.s32 %r8210, %r8209, %r8203; + xor.b32 %r8211, %r8210, %r8205; + shf.l.wrap.b32 %r8212, %r8211, %r8211, 25; + add.s32 %r8213, %r8165, %r7943; + add.s32 %r8214, %r8213, %r8128; + xor.b32 %r8215, %r8214, %r8153; + shf.l.wrap.b32 %r8216, %r8215, %r8215, 16; + add.s32 %r8217, %r8216, %r8140; + xor.b32 %r8218, %r8217, %r8128; + shf.l.wrap.b32 %r8219, %r8218, %r8218, 20; + add.s32 %r8220, %r8214, %r7894; + add.s32 %r8221, %r8220, %r8219; + xor.b32 %r8222, %r8221, %r8216; + shf.l.wrap.b32 %r8223, %r8222, %r8222, 24; + add.s32 %r8224, %r8223, %r8217; + xor.b32 %r8225, %r8224, %r8219; + shf.l.wrap.b32 %r8226, %r8225, %r8225, 25; + add.s32 %r8227, %r8179, %r7859; + add.s32 %r8228, %r8227, %r8226; + xor.b32 %r8229, %r8228, %r8195; + shf.l.wrap.b32 %r8230, %r8229, %r8229, 16; + add.s32 %r8231, %r8230, %r8210; + xor.b32 %r8232, %r8231, %r8226; + shf.l.wrap.b32 %r8233, %r8232, %r8232, 20; + add.s32 %r8234, %r8228, %r7866; + add.s32 %r8235, %r8234, %r8233; + xor.b32 %r8236, %r8235, %r8230; + shf.l.wrap.b32 %r8237, %r8236, %r8236, 24; + add.s32 %r8238, %r8237, %r8231; + xor.b32 %r8239, %r8238, %r8233; + shf.l.wrap.b32 %r8240, %r8239, %r8239, 25; + add.s32 %r8241, %r8193, %r7908; + add.s32 %r8242, %r8241, %r8184; + xor.b32 %r8243, %r8242, %r8209; + shf.l.wrap.b32 %r8244, %r8243, %r8243, 16; + add.s32 %r8245, %r8244, %r8224; + xor.b32 %r8246, %r8245, %r8184; + shf.l.wrap.b32 %r8247, %r8246, %r8246, 20; + add.s32 %r8248, %r8242, %r7922; + add.s32 %r8249, %r8248, %r8247; + xor.b32 %r8250, %r8249, %r8244; + shf.l.wrap.b32 %r8251, %r8250, %r8250, 24; + add.s32 %r8252, %r8251, %r8245; + xor.b32 %r8253, %r8252, %r8247; + shf.l.wrap.b32 %r8254, %r8253, %r8253, 25; + add.s32 %r8255, %r8207, %r7929; + add.s32 %r8256, %r8255, %r8198; + xor.b32 %r8257, %r8223, %r8256; + shf.l.wrap.b32 %r8258, %r8257, %r8257, 16; + add.s32 %r8259, %r8258, %r8182; + xor.b32 
%r8260, %r8259, %r8198; + shf.l.wrap.b32 %r8261, %r8260, %r8260, 20; + add.s32 %r8262, %r8256, %r7852; + add.s32 %r8263, %r8262, %r8261; + xor.b32 %r8264, %r8263, %r8258; + shf.l.wrap.b32 %r8265, %r8264, %r8264, 24; + add.s32 %r8266, %r8265, %r8259; + xor.b32 %r8267, %r8266, %r8261; + shf.l.wrap.b32 %r8268, %r8267, %r8267, 25; + add.s32 %r8269, %r8212, %r7887; + add.s32 %r8270, %r8269, %r8221; + xor.b32 %r8271, %r8270, %r8181; + shf.l.wrap.b32 %r8272, %r8271, %r8271, 16; + add.s32 %r8273, %r8272, %r8196; + xor.b32 %r8274, %r8273, %r8212; + shf.l.wrap.b32 %r8275, %r8274, %r8274, 20; + add.s32 %r8276, %r8270, %r7936; + add.s32 %r8277, %r8276, %r8275; + xor.b32 %r8278, %r8277, %r8272; + shf.l.wrap.b32 %r8279, %r8278, %r8278, 24; + add.s32 %r8280, %r8279, %r8273; + xor.b32 %r8281, %r8280, %r8275; + shf.l.wrap.b32 %r8282, %r8281, %r8281, 25; + add.s32 %r8283, %r8235, %r7880; + add.s32 %r8284, %r8283, %r8254; + xor.b32 %r8285, %r8284, %r8279; + shf.l.wrap.b32 %r8286, %r8285, %r8285, 16; + add.s32 %r8287, %r8286, %r8266; + xor.b32 %r8288, %r8287, %r8254; + shf.l.wrap.b32 %r8289, %r8288, %r8288, 20; + add.s32 %r8290, %r8284, %r7873; + add.s32 %r8291, %r8290, %r8289; + xor.b32 %r8292, %r8291, %r8286; + shf.l.wrap.b32 %r8293, %r8292, %r8292, 24; + add.s32 %r8294, %r8293, %r8287; + xor.b32 %r8295, %r8294, %r8289; + shf.l.wrap.b32 %r8296, %r8295, %r8295, 25; + add.s32 %r8297, %r8249, %r7901; + add.s32 %r8298, %r8297, %r8268; + xor.b32 %r8299, %r8237, %r8298; + shf.l.wrap.b32 %r8300, %r8299, %r8299, 16; + add.s32 %r8301, %r8300, %r8280; + xor.b32 %r8302, %r8301, %r8268; + shf.l.wrap.b32 %r8303, %r8302, %r8302, 20; + add.s32 %r8304, %r8298, %r7838; + add.s32 %r8305, %r8304, %r8303; + xor.b32 %r8306, %r8305, %r8300; + shf.l.wrap.b32 %r8307, %r8306, %r8306, 24; + add.s32 %r8308, %r8307, %r8301; + xor.b32 %r8309, %r8308, %r8303; + shf.l.wrap.b32 %r8310, %r8309, %r8309, 25; + add.s32 %r8311, %r8263, %r7915; + add.s32 %r8312, %r8311, %r8282; + xor.b32 %r8313, %r8312, %r8251; + shf.l.wrap.b32 %r8314, %r8313, %r8313, 16; + add.s32 %r8315, %r8314, %r8238; + xor.b32 %r8316, %r8315, %r8282; + shf.l.wrap.b32 %r8317, %r8316, %r8316, 20; + add.s32 %r8318, %r8312, %r7943; + add.s32 %r8319, %r8318, %r8317; + xor.b32 %r8320, %r8319, %r8314; + shf.l.wrap.b32 %r8321, %r8320, %r8320, 24; + add.s32 %r8322, %r8321, %r8315; + xor.b32 %r8323, %r8322, %r8317; + shf.l.wrap.b32 %r8324, %r8323, %r8323, 25; + add.s32 %r8325, %r8277, %r7894; + add.s32 %r8326, %r8325, %r8240; + xor.b32 %r8327, %r8326, %r8265; + shf.l.wrap.b32 %r8328, %r8327, %r8327, 16; + add.s32 %r8329, %r8328, %r8252; + xor.b32 %r8330, %r8329, %r8240; + shf.l.wrap.b32 %r8331, %r8330, %r8330, 20; + add.s32 %r8332, %r8326, %r7845; + add.s32 %r8333, %r8332, %r8331; + xor.b32 %r8334, %r8333, %r8328; + shf.l.wrap.b32 %r8335, %r8334, %r8334, 24; + add.s32 %r8336, %r8335, %r8329; + xor.b32 %r8337, %r8336, %r8331; + shf.l.wrap.b32 %r8338, %r8337, %r8337, 25; + add.s32 %r8339, %r8291, %r7908; + add.s32 %r8340, %r8339, %r8338; + xor.b32 %r8341, %r8340, %r8307; + shf.l.wrap.b32 %r8342, %r8341, %r8341, 16; + add.s32 %r8343, %r8342, %r8322; + xor.b32 %r8344, %r8343, %r8338; + shf.l.wrap.b32 %r8345, %r8344, %r8344, 20; + add.s32 %r8346, %r8340, %r7887; + add.s32 %r8347, %r8346, %r8345; + xor.b32 %r8348, %r8347, %r8342; + shf.l.wrap.b32 %r8349, %r8348, %r8348, 24; + add.s32 %r8350, %r8349, %r8343; + xor.b32 %r8351, %r8350, %r8345; + shf.l.wrap.b32 %r8352, %r8351, %r8351, 25; + add.s32 %r8353, %r8305, %r7922; + add.s32 %r8354, %r8353, %r8296; + xor.b32 %r8355, %r8354, %r8321; + 
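+// [annotation] The prmt.b32 chains above (selectors 30212/28756/1620) gather four
+// consecutive bytes into one little-endian u32, yielding the 16 message words
+// %r7838..%r7943 (the 64-byte block at [%rd190+145..208]); the seven compression
+// rounds are fully unrolled and differ only in which of these words feed each G,
+// per the BLAKE3 message permutation.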
shf.l.wrap.b32 %r8356, %r8355, %r8355, 16; + add.s32 %r8357, %r8356, %r8336; + xor.b32 %r8358, %r8357, %r8296; + shf.l.wrap.b32 %r8359, %r8358, %r8358, 20; + add.s32 %r8360, %r8354, %r7901; + add.s32 %r8361, %r8360, %r8359; + xor.b32 %r8362, %r8361, %r8356; + shf.l.wrap.b32 %r8363, %r8362, %r8362, 24; + add.s32 %r8364, %r8363, %r8357; + xor.b32 %r8365, %r8364, %r8359; + shf.l.wrap.b32 %r8366, %r8365, %r8365, 25; + add.s32 %r8367, %r8319, %r7936; + add.s32 %r8368, %r8367, %r8310; + xor.b32 %r8369, %r8335, %r8368; + shf.l.wrap.b32 %r8370, %r8369, %r8369, 16; + add.s32 %r8371, %r8370, %r8294; + xor.b32 %r8372, %r8371, %r8310; + shf.l.wrap.b32 %r8373, %r8372, %r8372, 20; + add.s32 %r8374, %r8368, %r7859; + add.s32 %r8375, %r8374, %r8373; + xor.b32 %r8376, %r8375, %r8370; + shf.l.wrap.b32 %r8377, %r8376, %r8376, 24; + add.s32 %r8378, %r8377, %r8371; + xor.b32 %r8379, %r8378, %r8373; + shf.l.wrap.b32 %r8380, %r8379, %r8379, 25; + add.s32 %r8381, %r8333, %r7929; + add.s32 %r8382, %r8381, %r8324; + xor.b32 %r8383, %r8382, %r8293; + shf.l.wrap.b32 %r8384, %r8383, %r8383, 16; + add.s32 %r8385, %r8384, %r8308; + xor.b32 %r8386, %r8385, %r8324; + shf.l.wrap.b32 %r8387, %r8386, %r8386, 20; + add.s32 %r8388, %r8382, %r7943; + add.s32 %r8389, %r8388, %r8387; + xor.b32 %r8390, %r8389, %r8384; + shf.l.wrap.b32 %r8391, %r8390, %r8390, 24; + add.s32 %r8392, %r8391, %r8385; + xor.b32 %r8393, %r8392, %r8387; + shf.l.wrap.b32 %r8394, %r8393, %r8393, 25; + add.s32 %r8395, %r8347, %r7866; + add.s32 %r8396, %r8395, %r8366; + xor.b32 %r8397, %r8396, %r8391; + shf.l.wrap.b32 %r8398, %r8397, %r8397, 16; + add.s32 %r8399, %r8398, %r8378; + xor.b32 %r8400, %r8399, %r8366; + shf.l.wrap.b32 %r8401, %r8400, %r8400, 20; + add.s32 %r8402, %r8396, %r7838; + add.s32 %r8403, %r8402, %r8401; + xor.b32 %r8404, %r8403, %r8398; + shf.l.wrap.b32 %r8405, %r8404, %r8404, 24; + add.s32 %r8406, %r8405, %r8399; + xor.b32 %r8407, %r8406, %r8401; + shf.l.wrap.b32 %r8408, %r8407, %r8407, 25; + add.s32 %r8409, %r8361, %r7915; + add.s32 %r8410, %r8409, %r8380; + xor.b32 %r8411, %r8349, %r8410; + shf.l.wrap.b32 %r8412, %r8411, %r8411, 16; + add.s32 %r8413, %r8412, %r8392; + xor.b32 %r8414, %r8413, %r8380; + shf.l.wrap.b32 %r8415, %r8414, %r8414, 20; + add.s32 %r8416, %r8410, %r7852; + add.s32 %r8417, %r8416, %r8415; + xor.b32 %r8418, %r8417, %r8412; + shf.l.wrap.b32 %r8419, %r8418, %r8418, 24; + add.s32 %r8420, %r8419, %r8413; + xor.b32 %r8421, %r8420, %r8415; + shf.l.wrap.b32 %r8422, %r8421, %r8421, 25; + add.s32 %r8423, %r8375, %r7873; + add.s32 %r8424, %r8423, %r8394; + xor.b32 %r8425, %r8424, %r8363; + shf.l.wrap.b32 %r8426, %r8425, %r8425, 16; + add.s32 %r8427, %r8426, %r8350; + xor.b32 %r8428, %r8427, %r8394; + shf.l.wrap.b32 %r8429, %r8428, %r8428, 20; + add.s32 %r8430, %r8424, %r7894; + add.s32 %r8431, %r8430, %r8429; + xor.b32 %r8432, %r8431, %r8426; + shf.l.wrap.b32 %r8433, %r8432, %r8432, 24; + add.s32 %r8434, %r8433, %r8427; + xor.b32 %r8435, %r8434, %r8429; + shf.l.wrap.b32 %r8436, %r8435, %r8435, 25; + add.s32 %r8437, %r8389, %r7845; + add.s32 %r8438, %r8437, %r8352; + xor.b32 %r8439, %r8438, %r8377; + shf.l.wrap.b32 %r8440, %r8439, %r8439, 16; + add.s32 %r8441, %r8440, %r8364; + xor.b32 %r8442, %r8441, %r8352; + shf.l.wrap.b32 %r8443, %r8442, %r8442, 20; + add.s32 %r8444, %r8438, %r7880; + add.s32 %r8445, %r8444, %r8443; + xor.b32 %r8446, %r8445, %r8440; + shf.l.wrap.b32 %r8447, %r8446, %r8446, 24; + add.s32 %r8448, %r8447, %r8441; + xor.b32 %r8449, %r8448, %r8443; + shf.l.wrap.b32 %r8450, %r8449, %r8449, 25; + add.s32 %r8451, 
%r8403, %r7922; + add.s32 %r8452, %r8451, %r8450; + xor.b32 %r8453, %r8452, %r8419; + shf.l.wrap.b32 %r8454, %r8453, %r8453, 16; + add.s32 %r8455, %r8454, %r8434; + xor.b32 %r8456, %r8455, %r8450; + shf.l.wrap.b32 %r8457, %r8456, %r8456, 20; + add.s32 %r8458, %r8452, %r7929; + add.s32 %r8459, %r8458, %r8457; + xor.b32 %r8460, %r8459, %r8454; + shf.l.wrap.b32 %r8461, %r8460, %r8460, 24; + add.s32 %r8462, %r8461, %r8455; + xor.b32 %r8463, %r8462, %r8457; + shf.l.wrap.b32 %r8464, %r8463, %r8463, 25; + add.s32 %r8465, %r8417, %r7901; + add.s32 %r8466, %r8465, %r8408; + xor.b32 %r8467, %r8466, %r8433; + shf.l.wrap.b32 %r8468, %r8467, %r8467, 16; + add.s32 %r8469, %r8468, %r8448; + xor.b32 %r8470, %r8469, %r8408; + shf.l.wrap.b32 %r8471, %r8470, %r8470, 20; + add.s32 %r8472, %r8466, %r7915; + add.s32 %r8473, %r8472, %r8471; + xor.b32 %r8474, %r8473, %r8468; + shf.l.wrap.b32 %r8475, %r8474, %r8474, 24; + add.s32 %r8476, %r8475, %r8469; + xor.b32 %r8477, %r8476, %r8471; + shf.l.wrap.b32 %r8478, %r8477, %r8477, 25; + add.s32 %r8479, %r8431, %r7943; + add.s32 %r8480, %r8479, %r8422; + xor.b32 %r8481, %r8447, %r8480; + shf.l.wrap.b32 %r8482, %r8481, %r8481, 16; + add.s32 %r8483, %r8482, %r8406; + xor.b32 %r8484, %r8483, %r8422; + shf.l.wrap.b32 %r8485, %r8484, %r8484, 20; + add.s32 %r8486, %r8480, %r7908; + add.s32 %r8487, %r8486, %r8485; + xor.b32 %r8488, %r8487, %r8482; + shf.l.wrap.b32 %r8489, %r8488, %r8488, 24; + add.s32 %r8490, %r8489, %r8483; + xor.b32 %r8491, %r8490, %r8485; + shf.l.wrap.b32 %r8492, %r8491, %r8491, 25; + add.s32 %r8493, %r8445, %r7936; + add.s32 %r8494, %r8493, %r8436; + xor.b32 %r8495, %r8494, %r8405; + shf.l.wrap.b32 %r8496, %r8495, %r8495, 16; + add.s32 %r8497, %r8496, %r8420; + xor.b32 %r8498, %r8497, %r8436; + shf.l.wrap.b32 %r8499, %r8498, %r8498, 20; + add.s32 %r8500, %r8494, %r7894; + add.s32 %r8501, %r8500, %r8499; + xor.b32 %r8502, %r8501, %r8496; + shf.l.wrap.b32 %r8503, %r8502, %r8502, 24; + add.s32 %r8504, %r8503, %r8497; + xor.b32 %r8505, %r8504, %r8499; + shf.l.wrap.b32 %r8506, %r8505, %r8505, 25; + add.s32 %r8507, %r8459, %r7887; + add.s32 %r8508, %r8507, %r8478; + xor.b32 %r8509, %r8508, %r8503; + shf.l.wrap.b32 %r8510, %r8509, %r8509, 16; + add.s32 %r8511, %r8510, %r8490; + xor.b32 %r8512, %r8511, %r8478; + shf.l.wrap.b32 %r8513, %r8512, %r8512, 20; + add.s32 %r8514, %r8508, %r7852; + add.s32 %r8515, %r8514, %r8513; + xor.b32 %r8516, %r8515, %r8510; + shf.l.wrap.b32 %r8517, %r8516, %r8516, 24; + add.s32 %r8518, %r8517, %r8511; + xor.b32 %r8519, %r8518, %r8513; + shf.l.wrap.b32 %r8520, %r8519, %r8519, 25; + add.s32 %r8521, %r8473, %r7873; + add.s32 %r8522, %r8521, %r8492; + xor.b32 %r8523, %r8461, %r8522; + shf.l.wrap.b32 %r8524, %r8523, %r8523, 16; + add.s32 %r8525, %r8524, %r8504; + xor.b32 %r8526, %r8525, %r8492; + shf.l.wrap.b32 %r8527, %r8526, %r8526, 20; + add.s32 %r8528, %r8522, %r7859; + add.s32 %r8529, %r8528, %r8527; + xor.b32 %r8530, %r8529, %r8524; + shf.l.wrap.b32 %r8531, %r8530, %r8530, 24; + add.s32 %r8532, %r8531, %r8525; + xor.b32 %r8533, %r8532, %r8527; + shf.l.wrap.b32 %r8534, %r8533, %r8533, 25; + add.s32 %r8535, %r8487, %r7838; + add.s32 %r8536, %r8535, %r8506; + xor.b32 %r8537, %r8536, %r8475; + shf.l.wrap.b32 %r8538, %r8537, %r8537, 16; + add.s32 %r8539, %r8538, %r8462; + xor.b32 %r8540, %r8539, %r8506; + shf.l.wrap.b32 %r8541, %r8540, %r8540, 20; + add.s32 %r8542, %r8536, %r7845; + add.s32 %r8543, %r8542, %r8541; + xor.b32 %r8544, %r8543, %r8538; + shf.l.wrap.b32 %r8545, %r8544, %r8544, 24; + add.s32 %r8546, %r8545, %r8539; + xor.b32 
%r8547, %r8546, %r8541; + shf.l.wrap.b32 %r8548, %r8547, %r8547, 25; + add.s32 %r8549, %r8501, %r7880; + add.s32 %r8550, %r8549, %r8464; + xor.b32 %r8551, %r8550, %r8489; + shf.l.wrap.b32 %r8552, %r8551, %r8551, 16; + add.s32 %r8553, %r8552, %r8476; + xor.b32 %r8554, %r8553, %r8464; + shf.l.wrap.b32 %r8555, %r8554, %r8554, 20; + add.s32 %r8556, %r8550, %r7866; + add.s32 %r8557, %r8556, %r8555; + xor.b32 %r8558, %r8557, %r8552; + shf.l.wrap.b32 %r8559, %r8558, %r8558, 24; + add.s32 %r8560, %r8559, %r8553; + xor.b32 %r8561, %r8560, %r8555; + shf.l.wrap.b32 %r8562, %r8561, %r8561, 25; + add.s32 %r8563, %r8515, %r7901; + add.s32 %r8564, %r8563, %r8562; + xor.b32 %r8565, %r8564, %r8531; + shf.l.wrap.b32 %r8566, %r8565, %r8565, 16; + add.s32 %r8567, %r8566, %r8546; + xor.b32 %r8568, %r8567, %r8562; + shf.l.wrap.b32 %r8569, %r8568, %r8568, 20; + add.s32 %r8570, %r8564, %r7936; + add.s32 %r8571, %r8570, %r8569; + xor.b32 %r8572, %r8571, %r8566; + shf.l.wrap.b32 %r8573, %r8572, %r8572, 24; + add.s32 %r8574, %r8573, %r8567; + xor.b32 %r8575, %r8574, %r8569; + shf.l.wrap.b32 %r8576, %r8575, %r8575, 25; + add.s32 %r8577, %r8529, %r7915; + add.s32 %r8578, %r8577, %r8520; + xor.b32 %r8579, %r8578, %r8545; + shf.l.wrap.b32 %r8580, %r8579, %r8579, 16; + add.s32 %r8581, %r8580, %r8560; + xor.b32 %r8582, %r8581, %r8520; + shf.l.wrap.b32 %r8583, %r8582, %r8582, 20; + add.s32 %r8584, %r8578, %r7873; + add.s32 %r8585, %r8584, %r8583; + xor.b32 %r8586, %r8585, %r8580; + shf.l.wrap.b32 %r8587, %r8586, %r8586, 24; + add.s32 %r8588, %r8587, %r8581; + xor.b32 %r8589, %r8588, %r8583; + shf.l.wrap.b32 %r8590, %r8589, %r8589, 25; + add.s32 %r8591, %r8543, %r7894; + add.s32 %r8592, %r8591, %r8534; + xor.b32 %r8593, %r8559, %r8592; + shf.l.wrap.b32 %r8594, %r8593, %r8593, 16; + add.s32 %r8595, %r8594, %r8518; + xor.b32 %r8596, %r8595, %r8534; + shf.l.wrap.b32 %r8597, %r8596, %r8596, 20; + add.s32 %r8598, %r8592, %r7922; + add.s32 %r8599, %r8598, %r8597; + xor.b32 %r8600, %r8599, %r8594; + shf.l.wrap.b32 %r8601, %r8600, %r8600, 24; + add.s32 %r8602, %r8601, %r8595; + xor.b32 %r8603, %r8602, %r8597; + shf.l.wrap.b32 %r8604, %r8603, %r8603, 25; + add.s32 %r8605, %r8557, %r7943; + add.s32 %r8606, %r8605, %r8548; + xor.b32 %r8607, %r8606, %r8517; + shf.l.wrap.b32 %r8608, %r8607, %r8607, 16; + add.s32 %r8609, %r8608, %r8532; + xor.b32 %r8610, %r8609, %r8548; + shf.l.wrap.b32 %r8611, %r8610, %r8610, 20; + add.s32 %r8612, %r8606, %r7845; + add.s32 %r8613, %r8612, %r8611; + xor.b32 %r8614, %r8613, %r8608; + shf.l.wrap.b32 %r8615, %r8614, %r8614, 24; + add.s32 %r8616, %r8615, %r8609; + xor.b32 %r8617, %r8616, %r8611; + shf.l.wrap.b32 %r8618, %r8617, %r8617, 25; + add.s32 %r8619, %r8571, %r7929; + add.s32 %r8620, %r8619, %r8590; + xor.b32 %r8621, %r8620, %r8615; + shf.l.wrap.b32 %r8622, %r8621, %r8621, 16; + add.s32 %r8623, %r8622, %r8602; + xor.b32 %r8624, %r8623, %r8590; + shf.l.wrap.b32 %r8625, %r8624, %r8624, 20; + add.s32 %r8626, %r8620, %r7859; + add.s32 %r8627, %r8626, %r8625; + xor.b32 %r8628, %r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + 
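+// [annotation] After the seventh round the two 8-word state halves are XOR-folded
+// (the xor.b32 run below) into the parent chaining value, which is stored byte-wise
+// back over the two child CVs at [%rd190+145..176]; each $L__BB1_45 iteration thus
+// appears to merge the top two CV-stack entries into one.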
shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; + add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, %r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 %r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; + add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, 
%r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, %r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd190+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd190+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd190+147], %r8796; + shr.u32 %r8797, %r8787, 24; + st.local.u8 [%rd190+148], %r8797; + st.local.u8 [%rd190+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd190+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd190+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd190+152], %r8800; + st.local.u8 [%rd190+153], %r8789; + shr.u32 %r8801, %r8789, 8; + st.local.u8 [%rd190+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd190+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd190+156], %r8803; + st.local.u8 [%rd190+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd190+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd190+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd190+160], %r8806; + st.local.u8 [%rd190+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd190+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd190+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd190+164], %r8809; + st.local.u8 [%rd190+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd190+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd190+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd190+168], %r8812; + st.local.u8 [%rd190+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 
[%rd190+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd190+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd190+172], %r8815; + st.local.u8 [%rd190+173], %r8794; + shr.u32 %r8816, %r8794, 8; + st.local.u8 [%rd190+174], %r8816; + shr.u32 %r8817, %r8794, 16; + st.local.u8 [%rd190+175], %r8817; + shr.u32 %r8818, %r8794, 24; + st.local.u8 [%rd190+176], %r8818; + ld.local.u8 %rs328, [%rd3+8]; + add.s16 %rs329, %rs328, -1; + st.local.u8 [%rd3+8], %rs329; + cvt.u64.u16 %rd191, %rs329; + and.b64 %rd192, %rd191, 255; + setp.lt.u64 %p38, %rd229, %rd192; + and.b16 %rs330, %rs329, 255; + mul.wide.u16 %r11681, %rs330, 32; + @%p38 bra $L__BB1_45; + +$L__BB1_46: + cvt.s64.s32 %rd193, %r11681; + add.s64 %rd194, %rd2, %rd193; + st.local.u8 [%rd194+145], %r97; + shr.u32 %r8819, %r97, 8; + st.local.u8 [%rd194+146], %r8819; + shr.u32 %r8820, %r97, 16; + st.local.u8 [%rd194+147], %r8820; + shr.u32 %r8821, %r97, 24; + st.local.u8 [%rd194+148], %r8821; + st.local.u8 [%rd194+149], %r98; + shr.u32 %r8822, %r98, 8; + st.local.u8 [%rd194+150], %r8822; + shr.u32 %r8823, %r98, 16; + st.local.u8 [%rd194+151], %r8823; + shr.u32 %r8824, %r98, 24; + st.local.u8 [%rd194+152], %r8824; + st.local.u8 [%rd194+153], %r99; + shr.u32 %r8825, %r99, 8; + st.local.u8 [%rd194+154], %r8825; + shr.u32 %r8826, %r99, 16; + st.local.u8 [%rd194+155], %r8826; + shr.u32 %r8827, %r99, 24; + st.local.u8 [%rd194+156], %r8827; + st.local.u8 [%rd194+157], %r100; + shr.u32 %r8828, %r100, 8; + st.local.u8 [%rd194+158], %r8828; + shr.u32 %r8829, %r100, 16; + st.local.u8 [%rd194+159], %r8829; + shr.u32 %r8830, %r100, 24; + st.local.u8 [%rd194+160], %r8830; + st.local.u8 [%rd194+161], %r101; + shr.u32 %r8831, %r101, 8; + st.local.u8 [%rd194+162], %r8831; + shr.u32 %r8832, %r101, 16; + st.local.u8 [%rd194+163], %r8832; + shr.u32 %r8833, %r101, 24; + st.local.u8 [%rd194+164], %r8833; + st.local.u8 [%rd194+165], %r102; + shr.u32 %r8834, %r102, 8; + st.local.u8 [%rd194+166], %r8834; + shr.u32 %r8835, %r102, 16; + st.local.u8 [%rd194+167], %r8835; + shr.u32 %r8836, %r102, 24; + st.local.u8 [%rd194+168], %r8836; + st.local.u8 [%rd194+169], %r103; + shr.u32 %r8837, %r103, 8; + st.local.u8 [%rd194+170], %r8837; + shr.u32 %r8838, %r103, 16; + st.local.u8 [%rd194+171], %r8838; + shr.u32 %r8839, %r103, 24; + st.local.u8 [%rd194+172], %r8839; + st.local.u8 [%rd194+173], %r104; + shr.u32 %r8840, %r104, 8; + st.local.u8 [%rd194+174], %r8840; + shr.u32 %r8841, %r104, 16; + st.local.u8 [%rd194+175], %r8841; + shr.u32 %r8842, %r104, 24; + st.local.u8 [%rd194+176], %r8842; + ld.local.u8 %rs388, [%rd3+8]; + bra.uni $L__BB1_47; + +$L__BB1_29: + cvt.u32.u16 %r3957, %rs14; + and.b32 %r3958, %r3957, 255; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd253; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b64 param2; + st.param.b64 [param2+0], %rd98; + .param .b64 param3; + st.param.b64 [param3+0], %rd250; + .param .b32 param4; + st.param.b32 [param4+0], %r3958; + .param .b64 param5; + st.param.b64 [param5+0], %rd142; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd163, [retval0+0]; + } // callseq 2 + ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42]; + ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16]; + ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32]; + ld.local.v4.u32 
{%r3971, %r3972, %r3973, %r3974}, [%rd42+48]; + ld.local.u64 %rd164, [%rd3+-72]; + popc.b64 %r3975, %rd164; + cvt.u64.u32 %rd51, %r3975; + ld.local.u8 %rs137, [%rd3+8]; + cvt.u64.u16 %rd165, %rs137; + setp.ge.u64 %p27, %rd51, %rd165; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd164; + cvt.u64.u32 %rd225, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd166, %r3976; + add.s64 %rd167, %rd2, %rd166; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd167+145]; + ld.local.u8 %r3979, [%rd167+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd167+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd167+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd167+149]; + ld.local.u8 %r3986, [%rd167+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd167+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd167+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd167+153]; + ld.local.u8 %r3993, [%rd167+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd167+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd167+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd167+157]; + ld.local.u8 %r4000, [%rd167+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd167+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd167+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd167+161]; + ld.local.u8 %r4007, [%rd167+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd167+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd167+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd167+165]; + ld.local.u8 %r4014, [%rd167+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd167+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd167+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd167+169]; + ld.local.u8 %r4021, [%rd167+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, [%rd167+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd167+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd167+173]; + ld.local.u8 %r4028, [%rd167+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd167+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd167+176]; + prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd167+177]; + ld.local.u8 %r4035, [%rd167+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd167+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd167+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd167+181]; + ld.local.u8 %r4042, [%rd167+182]; + prmt.b32 %r4043, %r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd167+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd167+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd167+185]; + ld.local.u8 %r4049, [%rd167+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd167+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd167+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd167+189]; + ld.local.u8 %r4056, [%rd167+190]; + prmt.b32 %r4057, %r4056, 
%r4055, 30212; + ld.local.u8 %r4058, [%rd167+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd167+192]; + prmt.b32 %r4061, %r4060, %r4059, 1620; + ld.local.u8 %r4062, [%rd167+193]; + ld.local.u8 %r4063, [%rd167+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd167+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd167+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd167+197]; + ld.local.u8 %r4070, [%rd167+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd167+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd167+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd167+201]; + ld.local.u8 %r4077, [%rd167+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd167+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd167+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd167+205]; + ld.local.u8 %r4084, [%rd167+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd167+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd167+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, %r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 %r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, %r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 
%r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 %r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 %r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 %r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, 
%r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, %r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; + add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, %r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 
%r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, %r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, %r4402; + shf.l.wrap.b32 %r4404, %r4403, %r4403, 16; + add.s32 %r4405, %r4404, %r4328; + xor.b32 %r4406, %r4405, %r4344; + shf.l.wrap.b32 %r4407, %r4406, %r4406, 20; + add.s32 %r4408, %r4402, %r3998; + add.s32 %r4409, %r4408, %r4407; + xor.b32 %r4410, %r4409, %r4404; + shf.l.wrap.b32 %r4411, %r4410, %r4410, 24; + add.s32 %r4412, %r4411, %r4405; + xor.b32 %r4413, %r4412, %r4407; + shf.l.wrap.b32 %r4414, %r4413, %r4413, 25; + add.s32 %r4415, %r4358, %r4033; + add.s32 %r4416, %r4415, %r4367; + xor.b32 %r4417, %r4416, %r4327; + shf.l.wrap.b32 %r4418, %r4417, %r4417, 16; + add.s32 %r4419, %r4418, %r4342; + xor.b32 %r4420, %r4419, %r4358; + shf.l.wrap.b32 %r4421, %r4420, %r4420, 20; + add.s32 %r4422, %r4416, %r4082; + add.s32 %r4423, %r4422, %r4421; + xor.b32 %r4424, %r4423, %r4418; + shf.l.wrap.b32 %r4425, %r4424, %r4424, 24; + add.s32 %r4426, %r4425, %r4419; + xor.b32 %r4427, %r4426, %r4421; + shf.l.wrap.b32 %r4428, %r4427, %r4427, 25; + add.s32 %r4429, %r4381, %r4026; + add.s32 %r4430, %r4429, %r4400; + xor.b32 %r4431, %r4430, %r4425; + shf.l.wrap.b32 %r4432, %r4431, %r4431, 16; + add.s32 %r4433, %r4432, %r4412; + xor.b32 %r4434, %r4433, %r4400; + shf.l.wrap.b32 %r4435, %r4434, %r4434, 20; + add.s32 %r4436, %r4430, %r4019; + add.s32 %r4437, %r4436, %r4435; + xor.b32 %r4438, %r4437, %r4432; + 
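+// [annotation] This block ($L__BB1_31, entered from $L__BB1_29 after callseq 2 into
+// blake3_compress_subtree_wide) repeats the same parent-merge compression loop; the
+// "or.b32 ..., 4" above sets what appears to be the BLAKE3 PARENT flag in the block
+// flags byte.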
shf.l.wrap.b32 %r4439, %r4438, %r4438, 24; + add.s32 %r4440, %r4439, %r4433; + xor.b32 %r4441, %r4440, %r4435; + shf.l.wrap.b32 %r4442, %r4441, %r4441, 25; + add.s32 %r4443, %r4395, %r4047; + add.s32 %r4444, %r4443, %r4414; + xor.b32 %r4445, %r4383, %r4444; + shf.l.wrap.b32 %r4446, %r4445, %r4445, 16; + add.s32 %r4447, %r4446, %r4426; + xor.b32 %r4448, %r4447, %r4414; + shf.l.wrap.b32 %r4449, %r4448, %r4448, 20; + add.s32 %r4450, %r4444, %r3984; + add.s32 %r4451, %r4450, %r4449; + xor.b32 %r4452, %r4451, %r4446; + shf.l.wrap.b32 %r4453, %r4452, %r4452, 24; + add.s32 %r4454, %r4453, %r4447; + xor.b32 %r4455, %r4454, %r4449; + shf.l.wrap.b32 %r4456, %r4455, %r4455, 25; + add.s32 %r4457, %r4409, %r4061; + add.s32 %r4458, %r4457, %r4428; + xor.b32 %r4459, %r4458, %r4397; + shf.l.wrap.b32 %r4460, %r4459, %r4459, 16; + add.s32 %r4461, %r4460, %r4384; + xor.b32 %r4462, %r4461, %r4428; + shf.l.wrap.b32 %r4463, %r4462, %r4462, 20; + add.s32 %r4464, %r4458, %r4089; + add.s32 %r4465, %r4464, %r4463; + xor.b32 %r4466, %r4465, %r4460; + shf.l.wrap.b32 %r4467, %r4466, %r4466, 24; + add.s32 %r4468, %r4467, %r4461; + xor.b32 %r4469, %r4468, %r4463; + shf.l.wrap.b32 %r4470, %r4469, %r4469, 25; + add.s32 %r4471, %r4423, %r4040; + add.s32 %r4472, %r4471, %r4386; + xor.b32 %r4473, %r4472, %r4411; + shf.l.wrap.b32 %r4474, %r4473, %r4473, 16; + add.s32 %r4475, %r4474, %r4398; + xor.b32 %r4476, %r4475, %r4386; + shf.l.wrap.b32 %r4477, %r4476, %r4476, 20; + add.s32 %r4478, %r4472, %r3991; + add.s32 %r4479, %r4478, %r4477; + xor.b32 %r4480, %r4479, %r4474; + shf.l.wrap.b32 %r4481, %r4480, %r4480, 24; + add.s32 %r4482, %r4481, %r4475; + xor.b32 %r4483, %r4482, %r4477; + shf.l.wrap.b32 %r4484, %r4483, %r4483, 25; + add.s32 %r4485, %r4437, %r4054; + add.s32 %r4486, %r4485, %r4484; + xor.b32 %r4487, %r4486, %r4453; + shf.l.wrap.b32 %r4488, %r4487, %r4487, 16; + add.s32 %r4489, %r4488, %r4468; + xor.b32 %r4490, %r4489, %r4484; + shf.l.wrap.b32 %r4491, %r4490, %r4490, 20; + add.s32 %r4492, %r4486, %r4033; + add.s32 %r4493, %r4492, %r4491; + xor.b32 %r4494, %r4493, %r4488; + shf.l.wrap.b32 %r4495, %r4494, %r4494, 24; + add.s32 %r4496, %r4495, %r4489; + xor.b32 %r4497, %r4496, %r4491; + shf.l.wrap.b32 %r4498, %r4497, %r4497, 25; + add.s32 %r4499, %r4451, %r4068; + add.s32 %r4500, %r4499, %r4442; + xor.b32 %r4501, %r4500, %r4467; + shf.l.wrap.b32 %r4502, %r4501, %r4501, 16; + add.s32 %r4503, %r4502, %r4482; + xor.b32 %r4504, %r4503, %r4442; + shf.l.wrap.b32 %r4505, %r4504, %r4504, 20; + add.s32 %r4506, %r4500, %r4047; + add.s32 %r4507, %r4506, %r4505; + xor.b32 %r4508, %r4507, %r4502; + shf.l.wrap.b32 %r4509, %r4508, %r4508, 24; + add.s32 %r4510, %r4509, %r4503; + xor.b32 %r4511, %r4510, %r4505; + shf.l.wrap.b32 %r4512, %r4511, %r4511, 25; + add.s32 %r4513, %r4465, %r4082; + add.s32 %r4514, %r4513, %r4456; + xor.b32 %r4515, %r4481, %r4514; + shf.l.wrap.b32 %r4516, %r4515, %r4515, 16; + add.s32 %r4517, %r4516, %r4440; + xor.b32 %r4518, %r4517, %r4456; + shf.l.wrap.b32 %r4519, %r4518, %r4518, 20; + add.s32 %r4520, %r4514, %r4005; + add.s32 %r4521, %r4520, %r4519; + xor.b32 %r4522, %r4521, %r4516; + shf.l.wrap.b32 %r4523, %r4522, %r4522, 24; + add.s32 %r4524, %r4523, %r4517; + xor.b32 %r4525, %r4524, %r4519; + shf.l.wrap.b32 %r4526, %r4525, %r4525, 25; + add.s32 %r4527, %r4479, %r4075; + add.s32 %r4528, %r4527, %r4470; + xor.b32 %r4529, %r4528, %r4439; + shf.l.wrap.b32 %r4530, %r4529, %r4529, 16; + add.s32 %r4531, %r4530, %r4454; + xor.b32 %r4532, %r4531, %r4470; + shf.l.wrap.b32 %r4533, %r4532, %r4532, 20; + add.s32 %r4534, 
%r4528, %r4089; + add.s32 %r4535, %r4534, %r4533; + xor.b32 %r4536, %r4535, %r4530; + shf.l.wrap.b32 %r4537, %r4536, %r4536, 24; + add.s32 %r4538, %r4537, %r4531; + xor.b32 %r4539, %r4538, %r4533; + shf.l.wrap.b32 %r4540, %r4539, %r4539, 25; + add.s32 %r4541, %r4493, %r4012; + add.s32 %r4542, %r4541, %r4512; + xor.b32 %r4543, %r4542, %r4537; + shf.l.wrap.b32 %r4544, %r4543, %r4543, 16; + add.s32 %r4545, %r4544, %r4524; + xor.b32 %r4546, %r4545, %r4512; + shf.l.wrap.b32 %r4547, %r4546, %r4546, 20; + add.s32 %r4548, %r4542, %r3984; + add.s32 %r4549, %r4548, %r4547; + xor.b32 %r4550, %r4549, %r4544; + shf.l.wrap.b32 %r4551, %r4550, %r4550, 24; + add.s32 %r4552, %r4551, %r4545; + xor.b32 %r4553, %r4552, %r4547; + shf.l.wrap.b32 %r4554, %r4553, %r4553, 25; + add.s32 %r4555, %r4507, %r4061; + add.s32 %r4556, %r4555, %r4526; + xor.b32 %r4557, %r4495, %r4556; + shf.l.wrap.b32 %r4558, %r4557, %r4557, 16; + add.s32 %r4559, %r4558, %r4538; + xor.b32 %r4560, %r4559, %r4526; + shf.l.wrap.b32 %r4561, %r4560, %r4560, 20; + add.s32 %r4562, %r4556, %r3998; + add.s32 %r4563, %r4562, %r4561; + xor.b32 %r4564, %r4563, %r4558; + shf.l.wrap.b32 %r4565, %r4564, %r4564, 24; + add.s32 %r4566, %r4565, %r4559; + xor.b32 %r4567, %r4566, %r4561; + shf.l.wrap.b32 %r4568, %r4567, %r4567, 25; + add.s32 %r4569, %r4521, %r4019; + add.s32 %r4570, %r4569, %r4540; + xor.b32 %r4571, %r4570, %r4509; + shf.l.wrap.b32 %r4572, %r4571, %r4571, 16; + add.s32 %r4573, %r4572, %r4496; + xor.b32 %r4574, %r4573, %r4540; + shf.l.wrap.b32 %r4575, %r4574, %r4574, 20; + add.s32 %r4576, %r4570, %r4040; + add.s32 %r4577, %r4576, %r4575; + xor.b32 %r4578, %r4577, %r4572; + shf.l.wrap.b32 %r4579, %r4578, %r4578, 24; + add.s32 %r4580, %r4579, %r4573; + xor.b32 %r4581, %r4580, %r4575; + shf.l.wrap.b32 %r4582, %r4581, %r4581, 25; + add.s32 %r4583, %r4535, %r3991; + add.s32 %r4584, %r4583, %r4498; + xor.b32 %r4585, %r4584, %r4523; + shf.l.wrap.b32 %r4586, %r4585, %r4585, 16; + add.s32 %r4587, %r4586, %r4510; + xor.b32 %r4588, %r4587, %r4498; + shf.l.wrap.b32 %r4589, %r4588, %r4588, 20; + add.s32 %r4590, %r4584, %r4026; + add.s32 %r4591, %r4590, %r4589; + xor.b32 %r4592, %r4591, %r4586; + shf.l.wrap.b32 %r4593, %r4592, %r4592, 24; + add.s32 %r4594, %r4593, %r4587; + xor.b32 %r4595, %r4594, %r4589; + shf.l.wrap.b32 %r4596, %r4595, %r4595, 25; + add.s32 %r4597, %r4549, %r4068; + add.s32 %r4598, %r4597, %r4596; + xor.b32 %r4599, %r4598, %r4565; + shf.l.wrap.b32 %r4600, %r4599, %r4599, 16; + add.s32 %r4601, %r4600, %r4580; + xor.b32 %r4602, %r4601, %r4596; + shf.l.wrap.b32 %r4603, %r4602, %r4602, 20; + add.s32 %r4604, %r4598, %r4075; + add.s32 %r4605, %r4604, %r4603; + xor.b32 %r4606, %r4605, %r4600; + shf.l.wrap.b32 %r4607, %r4606, %r4606, 24; + add.s32 %r4608, %r4607, %r4601; + xor.b32 %r4609, %r4608, %r4603; + shf.l.wrap.b32 %r4610, %r4609, %r4609, 25; + add.s32 %r4611, %r4563, %r4047; + add.s32 %r4612, %r4611, %r4554; + xor.b32 %r4613, %r4612, %r4579; + shf.l.wrap.b32 %r4614, %r4613, %r4613, 16; + add.s32 %r4615, %r4614, %r4594; + xor.b32 %r4616, %r4615, %r4554; + shf.l.wrap.b32 %r4617, %r4616, %r4616, 20; + add.s32 %r4618, %r4612, %r4061; + add.s32 %r4619, %r4618, %r4617; + xor.b32 %r4620, %r4619, %r4614; + shf.l.wrap.b32 %r4621, %r4620, %r4620, 24; + add.s32 %r4622, %r4621, %r4615; + xor.b32 %r4623, %r4622, %r4617; + shf.l.wrap.b32 %r4624, %r4623, %r4623, 25; + add.s32 %r4625, %r4577, %r4089; + add.s32 %r4626, %r4625, %r4568; + xor.b32 %r4627, %r4593, %r4626; + shf.l.wrap.b32 %r4628, %r4627, %r4627, 16; + add.s32 %r4629, %r4628, %r4552; + xor.b32 
%r4630, %r4629, %r4568; + shf.l.wrap.b32 %r4631, %r4630, %r4630, 20; + add.s32 %r4632, %r4626, %r4054; + add.s32 %r4633, %r4632, %r4631; + xor.b32 %r4634, %r4633, %r4628; + shf.l.wrap.b32 %r4635, %r4634, %r4634, 24; + add.s32 %r4636, %r4635, %r4629; + xor.b32 %r4637, %r4636, %r4631; + shf.l.wrap.b32 %r4638, %r4637, %r4637, 25; + add.s32 %r4639, %r4591, %r4082; + add.s32 %r4640, %r4639, %r4582; + xor.b32 %r4641, %r4640, %r4551; + shf.l.wrap.b32 %r4642, %r4641, %r4641, 16; + add.s32 %r4643, %r4642, %r4566; + xor.b32 %r4644, %r4643, %r4582; + shf.l.wrap.b32 %r4645, %r4644, %r4644, 20; + add.s32 %r4646, %r4640, %r4040; + add.s32 %r4647, %r4646, %r4645; + xor.b32 %r4648, %r4647, %r4642; + shf.l.wrap.b32 %r4649, %r4648, %r4648, 24; + add.s32 %r4650, %r4649, %r4643; + xor.b32 %r4651, %r4650, %r4645; + shf.l.wrap.b32 %r4652, %r4651, %r4651, 25; + add.s32 %r4653, %r4605, %r4033; + add.s32 %r4654, %r4653, %r4624; + xor.b32 %r4655, %r4654, %r4649; + shf.l.wrap.b32 %r4656, %r4655, %r4655, 16; + add.s32 %r4657, %r4656, %r4636; + xor.b32 %r4658, %r4657, %r4624; + shf.l.wrap.b32 %r4659, %r4658, %r4658, 20; + add.s32 %r4660, %r4654, %r3998; + add.s32 %r4661, %r4660, %r4659; + xor.b32 %r4662, %r4661, %r4656; + shf.l.wrap.b32 %r4663, %r4662, %r4662, 24; + add.s32 %r4664, %r4663, %r4657; + xor.b32 %r4665, %r4664, %r4659; + shf.l.wrap.b32 %r4666, %r4665, %r4665, 25; + add.s32 %r4667, %r4619, %r4019; + add.s32 %r4668, %r4667, %r4638; + xor.b32 %r4669, %r4607, %r4668; + shf.l.wrap.b32 %r4670, %r4669, %r4669, 16; + add.s32 %r4671, %r4670, %r4650; + xor.b32 %r4672, %r4671, %r4638; + shf.l.wrap.b32 %r4673, %r4672, %r4672, 20; + add.s32 %r4674, %r4668, %r4005; + add.s32 %r4675, %r4674, %r4673; + xor.b32 %r4676, %r4675, %r4670; + shf.l.wrap.b32 %r4677, %r4676, %r4676, 24; + add.s32 %r4678, %r4677, %r4671; + xor.b32 %r4679, %r4678, %r4673; + shf.l.wrap.b32 %r4680, %r4679, %r4679, 25; + add.s32 %r4681, %r4633, %r3984; + add.s32 %r4682, %r4681, %r4652; + xor.b32 %r4683, %r4682, %r4621; + shf.l.wrap.b32 %r4684, %r4683, %r4683, 16; + add.s32 %r4685, %r4684, %r4608; + xor.b32 %r4686, %r4685, %r4652; + shf.l.wrap.b32 %r4687, %r4686, %r4686, 20; + add.s32 %r4688, %r4682, %r3991; + add.s32 %r4689, %r4688, %r4687; + xor.b32 %r4690, %r4689, %r4684; + shf.l.wrap.b32 %r4691, %r4690, %r4690, 24; + add.s32 %r4692, %r4691, %r4685; + xor.b32 %r4693, %r4692, %r4687; + shf.l.wrap.b32 %r4694, %r4693, %r4693, 25; + add.s32 %r4695, %r4647, %r4026; + add.s32 %r4696, %r4695, %r4610; + xor.b32 %r4697, %r4696, %r4635; + shf.l.wrap.b32 %r4698, %r4697, %r4697, 16; + add.s32 %r4699, %r4698, %r4622; + xor.b32 %r4700, %r4699, %r4610; + shf.l.wrap.b32 %r4701, %r4700, %r4700, 20; + add.s32 %r4702, %r4696, %r4012; + add.s32 %r4703, %r4702, %r4701; + xor.b32 %r4704, %r4703, %r4698; + shf.l.wrap.b32 %r4705, %r4704, %r4704, 24; + add.s32 %r4706, %r4705, %r4699; + xor.b32 %r4707, %r4706, %r4701; + shf.l.wrap.b32 %r4708, %r4707, %r4707, 25; + add.s32 %r4709, %r4661, %r4047; + add.s32 %r4710, %r4709, %r4708; + xor.b32 %r4711, %r4710, %r4677; + shf.l.wrap.b32 %r4712, %r4711, %r4711, 16; + add.s32 %r4713, %r4712, %r4692; + xor.b32 %r4714, %r4713, %r4708; + shf.l.wrap.b32 %r4715, %r4714, %r4714, 20; + add.s32 %r4716, %r4710, %r4082; + add.s32 %r4717, %r4716, %r4715; + xor.b32 %r4718, %r4717, %r4712; + shf.l.wrap.b32 %r4719, %r4718, %r4718, 24; + add.s32 %r4720, %r4719, %r4713; + xor.b32 %r4721, %r4720, %r4715; + shf.l.wrap.b32 %r4722, %r4721, %r4721, 25; + add.s32 %r4723, %r4675, %r4061; + add.s32 %r4724, %r4723, %r4666; + xor.b32 %r4725, %r4724, %r4691; + 
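+ // [editorial annotation, not part of the generated output] The repeating
+ // add.s32 / xor.b32 / shf.l.wrap.b32 pattern above and below is consistent with
+ // one quarter of a BLAKE3 G mixing step: a funnel-shift left-rotate by 16, 20,
+ // 24 or 25 of a value with itself equals a right-rotate by 16, 12, 8 or 7,
+ // which are BLAKE3's G rotation constants. The compiler has fully unrolled the
+ // rounds, which is why this region repeats at such length.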
shf.l.wrap.b32 %r4726, %r4725, %r4725, 16; + add.s32 %r4727, %r4726, %r4706; + xor.b32 %r4728, %r4727, %r4666; + shf.l.wrap.b32 %r4729, %r4728, %r4728, 20; + add.s32 %r4730, %r4724, %r4019; + add.s32 %r4731, %r4730, %r4729; + xor.b32 %r4732, %r4731, %r4726; + shf.l.wrap.b32 %r4733, %r4732, %r4732, 24; + add.s32 %r4734, %r4733, %r4727; + xor.b32 %r4735, %r4734, %r4729; + shf.l.wrap.b32 %r4736, %r4735, %r4735, 25; + add.s32 %r4737, %r4689, %r4040; + add.s32 %r4738, %r4737, %r4680; + xor.b32 %r4739, %r4705, %r4738; + shf.l.wrap.b32 %r4740, %r4739, %r4739, 16; + add.s32 %r4741, %r4740, %r4664; + xor.b32 %r4742, %r4741, %r4680; + shf.l.wrap.b32 %r4743, %r4742, %r4742, 20; + add.s32 %r4744, %r4738, %r4068; + add.s32 %r4745, %r4744, %r4743; + xor.b32 %r4746, %r4745, %r4740; + shf.l.wrap.b32 %r4747, %r4746, %r4746, 24; + add.s32 %r4748, %r4747, %r4741; + xor.b32 %r4749, %r4748, %r4743; + shf.l.wrap.b32 %r4750, %r4749, %r4749, 25; + add.s32 %r4751, %r4703, %r4089; + add.s32 %r4752, %r4751, %r4694; + xor.b32 %r4753, %r4752, %r4663; + shf.l.wrap.b32 %r4754, %r4753, %r4753, 16; + add.s32 %r4755, %r4754, %r4678; + xor.b32 %r4756, %r4755, %r4694; + shf.l.wrap.b32 %r4757, %r4756, %r4756, 20; + add.s32 %r4758, %r4752, %r3991; + add.s32 %r4759, %r4758, %r4757; + xor.b32 %r4760, %r4759, %r4754; + shf.l.wrap.b32 %r4761, %r4760, %r4760, 24; + add.s32 %r4762, %r4761, %r4755; + xor.b32 %r4763, %r4762, %r4757; + shf.l.wrap.b32 %r4764, %r4763, %r4763, 25; + add.s32 %r4765, %r4717, %r4075; + add.s32 %r4766, %r4765, %r4736; + xor.b32 %r4767, %r4766, %r4761; + shf.l.wrap.b32 %r4768, %r4767, %r4767, 16; + add.s32 %r4769, %r4768, %r4748; + xor.b32 %r4770, %r4769, %r4736; + shf.l.wrap.b32 %r4771, %r4770, %r4770, 20; + add.s32 %r4772, %r4766, %r4005; + add.s32 %r4773, %r4772, %r4771; + xor.b32 %r4774, %r4773, %r4768; + shf.l.wrap.b32 %r4775, %r4774, %r4774, 24; + add.s32 %r4776, %r4775, %r4769; + xor.b32 %r4777, %r4776, %r4771; + shf.l.wrap.b32 %r4778, %r4777, %r4777, 25; + add.s32 %r4779, %r4731, %r3984; + add.s32 %r4780, %r4779, %r4750; + xor.b32 %r4781, %r4719, %r4780; + shf.l.wrap.b32 %r4782, %r4781, %r4781, 16; + add.s32 %r4783, %r4782, %r4762; + xor.b32 %r4784, %r4783, %r4750; + shf.l.wrap.b32 %r4785, %r4784, %r4784, 20; + add.s32 %r4786, %r4780, %r4054; + add.s32 %r4787, %r4786, %r4785; + xor.b32 %r4788, %r4787, %r4782; + shf.l.wrap.b32 %r4789, %r4788, %r4788, 24; + add.s32 %r4790, %r4789, %r4783; + xor.b32 %r4791, %r4790, %r4785; + shf.l.wrap.b32 %r4792, %r4791, %r4791, 25; + add.s32 %r4793, %r4745, %r3998; + add.s32 %r4794, %r4793, %r4764; + xor.b32 %r4795, %r4794, %r4733; + shf.l.wrap.b32 %r4796, %r4795, %r4795, 16; + add.s32 %r4797, %r4796, %r4720; + xor.b32 %r4798, %r4797, %r4764; + shf.l.wrap.b32 %r4799, %r4798, %r4798, 20; + add.s32 %r4800, %r4794, %r4026; + add.s32 %r4801, %r4800, %r4799; + xor.b32 %r4802, %r4801, %r4796; + shf.l.wrap.b32 %r4803, %r4802, %r4802, 24; + add.s32 %r4804, %r4803, %r4797; + xor.b32 %r4805, %r4804, %r4799; + shf.l.wrap.b32 %r4806, %r4805, %r4805, 25; + add.s32 %r4807, %r4759, %r4012; + add.s32 %r4808, %r4807, %r4722; + xor.b32 %r4809, %r4808, %r4747; + shf.l.wrap.b32 %r4810, %r4809, %r4809, 16; + add.s32 %r4811, %r4810, %r4734; + xor.b32 %r4812, %r4811, %r4722; + shf.l.wrap.b32 %r4813, %r4812, %r4812, 20; + add.s32 %r4814, %r4808, %r4033; + add.s32 %r4815, %r4814, %r4813; + xor.b32 %r4816, %r4815, %r4810; + shf.l.wrap.b32 %r4817, %r4816, %r4816, 24; + add.s32 %r4818, %r4817, %r4811; + xor.b32 %r4819, %r4818, %r4813; + shf.l.wrap.b32 %r4820, %r4819, %r4819, 25; + add.s32 %r4821, 
%r4773, %r4061; + add.s32 %r4822, %r4821, %r4820; + xor.b32 %r4823, %r4822, %r4789; + shf.l.wrap.b32 %r4824, %r4823, %r4823, 16; + add.s32 %r4825, %r4824, %r4804; + xor.b32 %r4826, %r4825, %r4820; + shf.l.wrap.b32 %r4827, %r4826, %r4826, 20; + add.s32 %r4828, %r4822, %r4089; + add.s32 %r4829, %r4828, %r4827; + xor.b32 %r4830, %r4829, %r4824; + shf.l.wrap.b32 %r4831, %r4830, %r4830, 24; + add.s32 %r4832, %r4831, %r4825; + xor.b32 %r4833, %r4832, %r4827; + shf.l.wrap.b32 %r4834, %r4833, %r4833, 25; + add.s32 %r4835, %r4787, %r4019; + add.s32 %r4836, %r4835, %r4778; + xor.b32 %r4837, %r4836, %r4803; + shf.l.wrap.b32 %r4838, %r4837, %r4837, 16; + add.s32 %r4839, %r4838, %r4818; + xor.b32 %r4840, %r4839, %r4778; + shf.l.wrap.b32 %r4841, %r4840, %r4840, 20; + add.s32 %r4842, %r4836, %r3984; + add.s32 %r4843, %r4842, %r4841; + xor.b32 %r4844, %r4843, %r4838; + shf.l.wrap.b32 %r4845, %r4844, %r4844, 24; + add.s32 %r4846, %r4845, %r4839; + xor.b32 %r4847, %r4846, %r4841; + shf.l.wrap.b32 %r4848, %r4847, %r4847, 25; + add.s32 %r4849, %r4801, %r3991; + add.s32 %r4850, %r4849, %r4792; + xor.b32 %r4851, %r4817, %r4850; + shf.l.wrap.b32 %r4852, %r4851, %r4851, 16; + add.s32 %r4853, %r4852, %r4776; + xor.b32 %r4854, %r4853, %r4792; + shf.l.wrap.b32 %r4855, %r4854, %r4854, 20; + add.s32 %r4856, %r4850, %r4047; + add.s32 %r4857, %r4856, %r4855; + xor.b32 %r4858, %r4857, %r4852; + shf.l.wrap.b32 %r4859, %r4858, %r4858, 24; + add.s32 %r4860, %r4859, %r4853; + xor.b32 %r4861, %r4860, %r4855; + shf.l.wrap.b32 %r4862, %r4861, %r4861, 25; + add.s32 %r4863, %r4815, %r4040; + add.s32 %r4864, %r4863, %r4806; + xor.b32 %r4865, %r4864, %r4775; + shf.l.wrap.b32 %r4866, %r4865, %r4865, 16; + add.s32 %r4867, %r4866, %r4790; + xor.b32 %r4868, %r4867, %r4806; + shf.l.wrap.b32 %r4869, %r4868, %r4868, 20; + add.s32 %r4870, %r4864, %r4026; + add.s32 %r4871, %r4870, %r4869; + xor.b32 %r4872, %r4871, %r4866; + shf.l.wrap.b32 %r4873, %r4872, %r4872, 24; + add.s32 %r4874, %r4873, %r4867; + xor.b32 %r4875, %r4874, %r4869; + shf.l.wrap.b32 %r4876, %r4875, %r4875, 25; + add.s32 %r4877, %r4829, %r4082; + add.s32 %r4878, %r4877, %r4848; + xor.b32 %r4879, %r4878, %r4873; + shf.l.wrap.b32 %r4880, %r4879, %r4879, 16; + add.s32 %r4881, %r4880, %r4860; + xor.b32 %r4882, %r4881, %r4848; + shf.l.wrap.b32 %r4883, %r4882, %r4882, 20; + add.s32 %r4884, %r4878, %r4054; + add.s32 %r4885, %r4884, %r4883; + xor.b32 %r4886, %r4885, %r4880; + shf.l.wrap.b32 %r4887, %r4886, %r4886, 24; + add.s32 %r4888, %r4887, %r4881; + xor.b32 %r4889, %r4888, %r4883; + shf.l.wrap.b32 %r4890, %r4889, %r4889, 25; + add.s32 %r4891, %r4843, %r3998; + add.s32 %r4892, %r4891, %r4862; + xor.b32 %r4893, %r4831, %r4892; + shf.l.wrap.b32 %r4894, %r4893, %r4893, 16; + add.s32 %r4895, %r4894, %r4874; + xor.b32 %r4896, %r4895, %r4862; + shf.l.wrap.b32 %r4897, %r4896, %r4896, 20; + add.s32 %r4898, %r4892, %r4068; + add.s32 %r4899, %r4898, %r4897; + xor.b32 %r4900, %r4899, %r4894; + shf.l.wrap.b32 %r4901, %r4900, %r4900, 24; + add.s32 %r4902, %r4901, %r4895; + xor.b32 %r4903, %r4902, %r4897; + shf.l.wrap.b32 %r4904, %r4903, %r4903, 25; + add.s32 %r4905, %r4857, %r4005; + add.s32 %r4906, %r4905, %r4876; + xor.b32 %r4907, %r4906, %r4845; + shf.l.wrap.b32 %r4908, %r4907, %r4907, 16; + add.s32 %r4909, %r4908, %r4832; + xor.b32 %r4910, %r4909, %r4876; + shf.l.wrap.b32 %r4911, %r4910, %r4910, 20; + add.s32 %r4912, %r4906, %r4012; + add.s32 %r4913, %r4912, %r4911; + xor.b32 %r4914, %r4913, %r4908; + shf.l.wrap.b32 %r4915, %r4914, %r4914, 24; + add.s32 %r4916, %r4915, %r4909; + xor.b32 
%r4917, %r4916, %r4911; + shf.l.wrap.b32 %r4918, %r4917, %r4917, 25; + add.s32 %r4919, %r4871, %r4033; + add.s32 %r4920, %r4919, %r4834; + xor.b32 %r4921, %r4920, %r4859; + shf.l.wrap.b32 %r4922, %r4921, %r4921, 16; + add.s32 %r4923, %r4922, %r4846; + xor.b32 %r4924, %r4923, %r4834; + shf.l.wrap.b32 %r4925, %r4924, %r4924, 20; + add.s32 %r4926, %r4920, %r4075; + add.s32 %r4927, %r4926, %r4925; + xor.b32 %r4928, %r4927, %r4922; + shf.l.wrap.b32 %r4929, %r4928, %r4928, 24; + add.s32 %r4930, %r4929, %r4923; + xor.b32 %r4931, %r4930, %r4925; + shf.l.wrap.b32 %r4932, %r4931, %r4931, 25; + xor.b32 %r4933, %r4916, %r4885; + xor.b32 %r4934, %r4930, %r4899; + xor.b32 %r4935, %r4888, %r4913; + xor.b32 %r4936, %r4927, %r4902; + xor.b32 %r4937, %r4932, %r4901; + xor.b32 %r4938, %r4890, %r4915; + xor.b32 %r4939, %r4929, %r4904; + xor.b32 %r4940, %r4918, %r4887; + st.local.u8 [%rd167+145], %r4933; + shr.u32 %r4941, %r4933, 8; + st.local.u8 [%rd167+146], %r4941; + shr.u32 %r4942, %r4933, 16; + st.local.u8 [%rd167+147], %r4942; + shr.u32 %r4943, %r4933, 24; + st.local.u8 [%rd167+148], %r4943; + st.local.u8 [%rd167+149], %r4934; + shr.u32 %r4944, %r4934, 8; + st.local.u8 [%rd167+150], %r4944; + shr.u32 %r4945, %r4934, 16; + st.local.u8 [%rd167+151], %r4945; + shr.u32 %r4946, %r4934, 24; + st.local.u8 [%rd167+152], %r4946; + st.local.u8 [%rd167+153], %r4935; + shr.u32 %r4947, %r4935, 8; + st.local.u8 [%rd167+154], %r4947; + shr.u32 %r4948, %r4935, 16; + st.local.u8 [%rd167+155], %r4948; + shr.u32 %r4949, %r4935, 24; + st.local.u8 [%rd167+156], %r4949; + st.local.u8 [%rd167+157], %r4936; + shr.u32 %r4950, %r4936, 8; + st.local.u8 [%rd167+158], %r4950; + shr.u32 %r4951, %r4936, 16; + st.local.u8 [%rd167+159], %r4951; + shr.u32 %r4952, %r4936, 24; + st.local.u8 [%rd167+160], %r4952; + st.local.u8 [%rd167+161], %r4937; + shr.u32 %r4953, %r4937, 8; + st.local.u8 [%rd167+162], %r4953; + shr.u32 %r4954, %r4937, 16; + st.local.u8 [%rd167+163], %r4954; + shr.u32 %r4955, %r4937, 24; + st.local.u8 [%rd167+164], %r4955; + st.local.u8 [%rd167+165], %r4938; + shr.u32 %r4956, %r4938, 8; + st.local.u8 [%rd167+166], %r4956; + shr.u32 %r4957, %r4938, 16; + st.local.u8 [%rd167+167], %r4957; + shr.u32 %r4958, %r4938, 24; + st.local.u8 [%rd167+168], %r4958; + st.local.u8 [%rd167+169], %r4939; + shr.u32 %r4959, %r4939, 8; + st.local.u8 [%rd167+170], %r4959; + shr.u32 %r4960, %r4939, 16; + st.local.u8 [%rd167+171], %r4960; + shr.u32 %r4961, %r4939, 24; + st.local.u8 [%rd167+172], %r4961; + st.local.u8 [%rd167+173], %r4940; + shr.u32 %r4962, %r4940, 8; + st.local.u8 [%rd167+174], %r4962; + shr.u32 %r4963, %r4940, 16; + st.local.u8 [%rd167+175], %r4963; + shr.u32 %r4964, %r4940, 24; + st.local.u8 [%rd167+176], %r4964; + ld.local.u8 %rs138, [%rd3+8]; + add.s16 %rs139, %rs138, -1; + st.local.u8 [%rd3+8], %rs139; + cvt.u64.u16 %rd168, %rs139; + and.b64 %rd169, %rd168, 255; + setp.lt.u64 %p28, %rd225, %rd169; + and.b16 %rs140, %rs139, 255; + mul.wide.u16 %r11661, %rs140, 32; + @%p28 bra $L__BB1_31; + +$L__BB1_32: + cvt.s64.s32 %rd170, %r11661; + add.s64 %rd171, %rd2, %rd170; + mov.b32 {%rs141, %rs142}, %r3959; + st.local.u8 [%rd171+145], %rs141; + shr.u16 %rs143, %rs141, 8; + st.local.u8 [%rd171+146], %rs143; + st.local.u8 [%rd171+147], %rs142; + shr.u16 %rs144, %rs142, 8; + st.local.u8 [%rd171+148], %rs144; + mov.b32 {%rs145, %rs146}, %r3960; + st.local.u8 [%rd171+149], %rs145; + shr.u16 %rs147, %rs145, 8; + st.local.u8 [%rd171+150], %rs147; + st.local.u8 [%rd171+151], %rs146; + shr.u16 %rs148, %rs146, 8; + st.local.u8 [%rd171+152], 
%rs148; + mov.b32 {%rs149, %rs150}, %r3961; + st.local.u8 [%rd171+153], %rs149; + shr.u16 %rs151, %rs149, 8; + st.local.u8 [%rd171+154], %rs151; + st.local.u8 [%rd171+155], %rs150; + shr.u16 %rs152, %rs150, 8; + st.local.u8 [%rd171+156], %rs152; + mov.b32 {%rs153, %rs154}, %r3962; + st.local.u8 [%rd171+157], %rs153; + shr.u16 %rs155, %rs153, 8; + st.local.u8 [%rd171+158], %rs155; + st.local.u8 [%rd171+159], %rs154; + shr.u16 %rs156, %rs154, 8; + st.local.u8 [%rd171+160], %rs156; + mov.b32 {%rs157, %rs158}, %r3963; + st.local.u8 [%rd171+161], %rs157; + shr.u16 %rs159, %rs157, 8; + st.local.u8 [%rd171+162], %rs159; + st.local.u8 [%rd171+163], %rs158; + shr.u16 %rs160, %rs158, 8; + st.local.u8 [%rd171+164], %rs160; + mov.b32 {%rs161, %rs162}, %r3964; + st.local.u8 [%rd171+165], %rs161; + shr.u16 %rs163, %rs161, 8; + st.local.u8 [%rd171+166], %rs163; + st.local.u8 [%rd171+167], %rs162; + shr.u16 %rs164, %rs162, 8; + st.local.u8 [%rd171+168], %rs164; + mov.b32 {%rs165, %rs166}, %r3965; + st.local.u8 [%rd171+169], %rs165; + shr.u16 %rs167, %rs165, 8; + st.local.u8 [%rd171+170], %rs167; + st.local.u8 [%rd171+171], %rs166; + shr.u16 %rs168, %rs166, 8; + st.local.u8 [%rd171+172], %rs168; + mov.b32 {%rs169, %rs170}, %r3966; + st.local.u8 [%rd171+173], %rs169; + shr.u16 %rs171, %rs169, 8; + st.local.u8 [%rd171+174], %rs171; + st.local.u8 [%rd171+175], %rs170; + shr.u16 %rs172, %rs170, 8; + st.local.u8 [%rd171+176], %rs172; + ld.local.u8 %rs173, [%rd3+8]; + add.s16 %rs174, %rs173, 1; + st.local.u8 [%rd3+8], %rs174; + shr.u64 %rd172, %rd49, 11; + ld.local.u64 %rd173, [%rd3+-72]; + add.s64 %rd174, %rd173, %rd172; + popc.b64 %r4965, %rd174; + cvt.u64.u32 %rd52, %r4965; + cvt.u64.u16 %rd175, %rs174; + and.b64 %rd176, %rd175, 255; + setp.ge.u64 %p29, %rd52, %rd176; + and.b16 %rs175, %rs174, 255; + mul.wide.u16 %r11663, %rs175, 32; + @%p29 bra $L__BB1_35; + +$L__BB1_34: + shr.u64 %rd228, %rd49, 11; + add.s64 %rd227, %rd173, %rd228; + popc.b64 %r11648, %rd227; + cvt.u64.u32 %rd226, %r11648; + add.s32 %r4966, %r11663, -64; + cvt.s64.s32 %rd177, %r4966; + add.s64 %rd178, %rd2, %rd177; + ld.local.u8 %r4967, [%rd3+2]; + ld.local.u8 %r4968, [%rd178+145]; + ld.local.u8 %r4969, [%rd178+146]; + prmt.b32 %r4970, %r4969, %r4968, 30212; + ld.local.u8 %r4971, [%rd178+147]; + prmt.b32 %r4972, %r4971, %r4970, 28756; + ld.local.u8 %r4973, [%rd178+148]; + prmt.b32 %r4974, %r4973, %r4972, 1620; + ld.local.u8 %r4975, [%rd178+149]; + ld.local.u8 %r4976, [%rd178+150]; + prmt.b32 %r4977, %r4976, %r4975, 30212; + ld.local.u8 %r4978, [%rd178+151]; + prmt.b32 %r4979, %r4978, %r4977, 28756; + ld.local.u8 %r4980, [%rd178+152]; + prmt.b32 %r4981, %r4980, %r4979, 1620; + ld.local.u8 %r4982, [%rd178+153]; + ld.local.u8 %r4983, [%rd178+154]; + prmt.b32 %r4984, %r4983, %r4982, 30212; + ld.local.u8 %r4985, [%rd178+155]; + prmt.b32 %r4986, %r4985, %r4984, 28756; + ld.local.u8 %r4987, [%rd178+156]; + prmt.b32 %r4988, %r4987, %r4986, 1620; + ld.local.u8 %r4989, [%rd178+157]; + ld.local.u8 %r4990, [%rd178+158]; + prmt.b32 %r4991, %r4990, %r4989, 30212; + ld.local.u8 %r4992, [%rd178+159]; + prmt.b32 %r4993, %r4992, %r4991, 28756; + ld.local.u8 %r4994, [%rd178+160]; + prmt.b32 %r4995, %r4994, %r4993, 1620; + ld.local.u8 %r4996, [%rd178+161]; + ld.local.u8 %r4997, [%rd178+162]; + prmt.b32 %r4998, %r4997, %r4996, 30212; + ld.local.u8 %r4999, [%rd178+163]; + prmt.b32 %r5000, %r4999, %r4998, 28756; + ld.local.u8 %r5001, [%rd178+164]; + prmt.b32 %r5002, %r5001, %r5000, 1620; + ld.local.u8 %r5003, [%rd178+165]; + ld.local.u8 %r5004, [%rd178+166]; + 
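+ // [editorial annotation] Above: the run of eight xor.b32 results stored byte
+ // by byte via st.local.u8 appears to be the BLAKE3 feed-forward
+ // (v[i] ^= v[i+8]) writing a 32-byte chaining value out, and the popc.b64 on
+ // the chunk counter matches BLAKE3's rule that subtree chaining values are
+ // merged until the stack height equals the popcount of the chunk counter.
+ // Below: each ld.local.u8 / prmt.b32 sequence packs four loaded bytes into a
+ // little-endian 32-bit word (the prmt selectors 30212, 28756 and 1620 merge
+ // one byte at a time into the accumulated word).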
prmt.b32 %r5005, %r5004, %r5003, 30212; + ld.local.u8 %r5006, [%rd178+167]; + prmt.b32 %r5007, %r5006, %r5005, 28756; + ld.local.u8 %r5008, [%rd178+168]; + prmt.b32 %r5009, %r5008, %r5007, 1620; + ld.local.u8 %r5010, [%rd178+169]; + ld.local.u8 %r5011, [%rd178+170]; + prmt.b32 %r5012, %r5011, %r5010, 30212; + ld.local.u8 %r5013, [%rd178+171]; + prmt.b32 %r5014, %r5013, %r5012, 28756; + ld.local.u8 %r5015, [%rd178+172]; + prmt.b32 %r5016, %r5015, %r5014, 1620; + ld.local.u8 %r5017, [%rd178+173]; + ld.local.u8 %r5018, [%rd178+174]; + prmt.b32 %r5019, %r5018, %r5017, 30212; + ld.local.u8 %r5020, [%rd178+175]; + prmt.b32 %r5021, %r5020, %r5019, 28756; + ld.local.u8 %r5022, [%rd178+176]; + prmt.b32 %r5023, %r5022, %r5021, 1620; + ld.local.u8 %r5024, [%rd178+177]; + ld.local.u8 %r5025, [%rd178+178]; + prmt.b32 %r5026, %r5025, %r5024, 30212; + ld.local.u8 %r5027, [%rd178+179]; + prmt.b32 %r5028, %r5027, %r5026, 28756; + ld.local.u8 %r5029, [%rd178+180]; + prmt.b32 %r5030, %r5029, %r5028, 1620; + ld.local.u8 %r5031, [%rd178+181]; + ld.local.u8 %r5032, [%rd178+182]; + prmt.b32 %r5033, %r5032, %r5031, 30212; + ld.local.u8 %r5034, [%rd178+183]; + prmt.b32 %r5035, %r5034, %r5033, 28756; + ld.local.u8 %r5036, [%rd178+184]; + prmt.b32 %r5037, %r5036, %r5035, 1620; + ld.local.u8 %r5038, [%rd178+185]; + ld.local.u8 %r5039, [%rd178+186]; + prmt.b32 %r5040, %r5039, %r5038, 30212; + ld.local.u8 %r5041, [%rd178+187]; + prmt.b32 %r5042, %r5041, %r5040, 28756; + ld.local.u8 %r5043, [%rd178+188]; + prmt.b32 %r5044, %r5043, %r5042, 1620; + ld.local.u8 %r5045, [%rd178+189]; + ld.local.u8 %r5046, [%rd178+190]; + prmt.b32 %r5047, %r5046, %r5045, 30212; + ld.local.u8 %r5048, [%rd178+191]; + prmt.b32 %r5049, %r5048, %r5047, 28756; + ld.local.u8 %r5050, [%rd178+192]; + prmt.b32 %r5051, %r5050, %r5049, 1620; + ld.local.u8 %r5052, [%rd178+193]; + ld.local.u8 %r5053, [%rd178+194]; + prmt.b32 %r5054, %r5053, %r5052, 30212; + ld.local.u8 %r5055, [%rd178+195]; + prmt.b32 %r5056, %r5055, %r5054, 28756; + ld.local.u8 %r5057, [%rd178+196]; + prmt.b32 %r5058, %r5057, %r5056, 1620; + ld.local.u8 %r5059, [%rd178+197]; + ld.local.u8 %r5060, [%rd178+198]; + prmt.b32 %r5061, %r5060, %r5059, 30212; + ld.local.u8 %r5062, [%rd178+199]; + prmt.b32 %r5063, %r5062, %r5061, 28756; + ld.local.u8 %r5064, [%rd178+200]; + prmt.b32 %r5065, %r5064, %r5063, 1620; + ld.local.u8 %r5066, [%rd178+201]; + ld.local.u8 %r5067, [%rd178+202]; + prmt.b32 %r5068, %r5067, %r5066, 30212; + ld.local.u8 %r5069, [%rd178+203]; + prmt.b32 %r5070, %r5069, %r5068, 28756; + ld.local.u8 %r5071, [%rd178+204]; + prmt.b32 %r5072, %r5071, %r5070, 1620; + ld.local.u8 %r5073, [%rd178+205]; + ld.local.u8 %r5074, [%rd178+206]; + prmt.b32 %r5075, %r5074, %r5073, 30212; + ld.local.u8 %r5076, [%rd178+207]; + prmt.b32 %r5077, %r5076, %r5075, 28756; + ld.local.u8 %r5078, [%rd178+208]; + prmt.b32 %r5079, %r5078, %r5077, 1620; + or.b32 %r5080, %r4967, 4; + ld.local.u8 %r5081, [%rd3+-120]; + ld.local.u8 %r5082, [%rd3+-119]; + prmt.b32 %r5083, %r5082, %r5081, 30212; + ld.local.u8 %r5084, [%rd3+-118]; + ld.local.u8 %r5085, [%rd3+-117]; + prmt.b32 %r5086, %r5085, %r5084, 30212; + prmt.b32 %r5087, %r5086, %r5083, 4180; + ld.local.u8 %r5088, [%rd3+-136]; + ld.local.u8 %r5089, [%rd3+-135]; + prmt.b32 %r5090, %r5089, %r5088, 30212; + ld.local.u8 %r5091, [%rd3+-134]; + ld.local.u8 %r5092, [%rd3+-133]; + prmt.b32 %r5093, %r5092, %r5091, 30212; + prmt.b32 %r5094, %r5093, %r5090, 4180; + add.s32 %r5095, %r5087, %r5094; + add.s32 %r5096, %r5095, %r4974; + shf.l.wrap.b32 %r5097, %r5096, %r5096, 
16; + add.s32 %r5098, %r5097, 1779033703; + xor.b32 %r5099, %r5098, %r5087; + shf.l.wrap.b32 %r5100, %r5099, %r5099, 20; + add.s32 %r5101, %r4981, %r5096; + add.s32 %r5102, %r5101, %r5100; + xor.b32 %r5103, %r5102, %r5097; + shf.l.wrap.b32 %r5104, %r5103, %r5103, 24; + add.s32 %r5105, %r5104, %r5098; + xor.b32 %r5106, %r5105, %r5100; + shf.l.wrap.b32 %r5107, %r5106, %r5106, 25; + ld.local.u8 %r5108, [%rd3+-116]; + ld.local.u8 %r5109, [%rd3+-115]; + prmt.b32 %r5110, %r5109, %r5108, 30212; + ld.local.u8 %r5111, [%rd3+-114]; + ld.local.u8 %r5112, [%rd3+-113]; + prmt.b32 %r5113, %r5112, %r5111, 30212; + prmt.b32 %r5114, %r5113, %r5110, 4180; + ld.local.u8 %r5115, [%rd3+-132]; + ld.local.u8 %r5116, [%rd3+-131]; + prmt.b32 %r5117, %r5116, %r5115, 30212; + ld.local.u8 %r5118, [%rd3+-130]; + ld.local.u8 %r5119, [%rd3+-129]; + prmt.b32 %r5120, %r5119, %r5118, 30212; + prmt.b32 %r5121, %r5120, %r5117, 4180; + add.s32 %r5122, %r5114, %r5121; + add.s32 %r5123, %r5122, %r4988; + shf.l.wrap.b32 %r5124, %r5123, %r5123, 16; + add.s32 %r5125, %r5124, -1150833019; + xor.b32 %r5126, %r5125, %r5114; + shf.l.wrap.b32 %r5127, %r5126, %r5126, 20; + add.s32 %r5128, %r4995, %r5123; + add.s32 %r5129, %r5128, %r5127; + xor.b32 %r5130, %r5129, %r5124; + shf.l.wrap.b32 %r5131, %r5130, %r5130, 24; + add.s32 %r5132, %r5131, %r5125; + xor.b32 %r5133, %r5132, %r5127; + shf.l.wrap.b32 %r5134, %r5133, %r5133, 25; + ld.local.u8 %r5135, [%rd3+-112]; + ld.local.u8 %r5136, [%rd3+-111]; + prmt.b32 %r5137, %r5136, %r5135, 30212; + ld.local.u8 %r5138, [%rd3+-110]; + ld.local.u8 %r5139, [%rd3+-109]; + prmt.b32 %r5140, %r5139, %r5138, 30212; + prmt.b32 %r5141, %r5140, %r5137, 4180; + ld.local.u8 %r5142, [%rd3+-128]; + ld.local.u8 %r5143, [%rd3+-127]; + prmt.b32 %r5144, %r5143, %r5142, 30212; + ld.local.u8 %r5145, [%rd3+-126]; + ld.local.u8 %r5146, [%rd3+-125]; + prmt.b32 %r5147, %r5146, %r5145, 30212; + prmt.b32 %r5148, %r5147, %r5144, 4180; + add.s32 %r5149, %r5141, %r5148; + add.s32 %r5150, %r5149, %r5002; + shr.u32 %r5151, %r5150, 16; + shl.b32 %r5152, %r5150, 16; + xor.b32 %r5153, %r5152, 4194304; + or.b32 %r5154, %r5153, %r5151; + add.s32 %r5155, %r5154, 1013904242; + xor.b32 %r5156, %r5155, %r5141; + shf.l.wrap.b32 %r5157, %r5156, %r5156, 20; + add.s32 %r5158, %r5009, %r5150; + add.s32 %r5159, %r5158, %r5157; + xor.b32 %r5160, %r5159, %r5154; + shf.l.wrap.b32 %r5161, %r5160, %r5160, 24; + add.s32 %r5162, %r5161, %r5155; + xor.b32 %r5163, %r5162, %r5157; + shf.l.wrap.b32 %r5164, %r5163, %r5163, 25; + ld.local.u8 %r5165, [%rd3+-108]; + ld.local.u8 %r5166, [%rd3+-107]; + prmt.b32 %r5167, %r5166, %r5165, 30212; + ld.local.u8 %r5168, [%rd3+-106]; + ld.local.u8 %r5169, [%rd3+-105]; + prmt.b32 %r5170, %r5169, %r5168, 30212; + prmt.b32 %r5171, %r5170, %r5167, 4180; + ld.local.u8 %r5172, [%rd3+-124]; + ld.local.u8 %r5173, [%rd3+-123]; + prmt.b32 %r5174, %r5173, %r5172, 30212; + ld.local.u8 %r5175, [%rd3+-122]; + ld.local.u8 %r5176, [%rd3+-121]; + prmt.b32 %r5177, %r5176, %r5175, 30212; + prmt.b32 %r5178, %r5177, %r5174, 4180; + add.s32 %r5179, %r5171, %r5178; + add.s32 %r5180, %r5179, %r5016; + xor.b32 %r5181, %r5180, %r5080; + shr.u32 %r5182, %r5180, 16; + shl.b32 %r5183, %r5181, 16; + or.b32 %r5184, %r5183, %r5182; + add.s32 %r5185, %r5184, -1521486534; + xor.b32 %r5186, %r5185, %r5171; + shf.l.wrap.b32 %r5187, %r5186, %r5186, 20; + add.s32 %r5188, %r5023, %r5180; + add.s32 %r5189, %r5188, %r5187; + xor.b32 %r5190, %r5189, %r5184; + shf.l.wrap.b32 %r5191, %r5190, %r5190, 24; + add.s32 %r5192, %r5191, %r5185; + xor.b32 %r5193, %r5192, 
%r5187; + shf.l.wrap.b32 %r5194, %r5193, %r5193, 25; + add.s32 %r5195, %r5134, %r5102; + add.s32 %r5196, %r5195, %r5030; + xor.b32 %r5197, %r5191, %r5196; + shf.l.wrap.b32 %r5198, %r5197, %r5197, 16; + add.s32 %r5199, %r5198, %r5162; + xor.b32 %r5200, %r5199, %r5134; + shf.l.wrap.b32 %r5201, %r5200, %r5200, 20; + add.s32 %r5202, %r5037, %r5196; + add.s32 %r5203, %r5202, %r5201; + xor.b32 %r5204, %r5203, %r5198; + shf.l.wrap.b32 %r5205, %r5204, %r5204, 24; + add.s32 %r5206, %r5205, %r5199; + xor.b32 %r5207, %r5206, %r5201; + shf.l.wrap.b32 %r5208, %r5207, %r5207, 25; + add.s32 %r5209, %r5164, %r5129; + add.s32 %r5210, %r5209, %r5044; + xor.b32 %r5211, %r5210, %r5104; + shf.l.wrap.b32 %r5212, %r5211, %r5211, 16; + add.s32 %r5213, %r5212, %r5192; + xor.b32 %r5214, %r5213, %r5164; + shf.l.wrap.b32 %r5215, %r5214, %r5214, 20; + add.s32 %r5216, %r5051, %r5210; + add.s32 %r5217, %r5216, %r5215; + xor.b32 %r5218, %r5217, %r5212; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 24; + add.s32 %r5220, %r5219, %r5213; + xor.b32 %r5221, %r5220, %r5215; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 25; + add.s32 %r5223, %r5194, %r5159; + add.s32 %r5224, %r5223, %r5058; + xor.b32 %r5225, %r5224, %r5131; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 16; + add.s32 %r5227, %r5226, %r5105; + xor.b32 %r5228, %r5227, %r5194; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 20; + add.s32 %r5230, %r5065, %r5224; + add.s32 %r5231, %r5230, %r5229; + xor.b32 %r5232, %r5231, %r5226; + shf.l.wrap.b32 %r5233, %r5232, %r5232, 24; + add.s32 %r5234, %r5233, %r5227; + xor.b32 %r5235, %r5234, %r5229; + shf.l.wrap.b32 %r5236, %r5235, %r5235, 25; + add.s32 %r5237, %r5189, %r5107; + add.s32 %r5238, %r5237, %r5072; + xor.b32 %r5239, %r5238, %r5161; + shf.l.wrap.b32 %r5240, %r5239, %r5239, 16; + add.s32 %r5241, %r5240, %r5132; + xor.b32 %r5242, %r5241, %r5107; + shf.l.wrap.b32 %r5243, %r5242, %r5242, 20; + add.s32 %r5244, %r5079, %r5238; + add.s32 %r5245, %r5244, %r5243; + xor.b32 %r5246, %r5245, %r5240; + shf.l.wrap.b32 %r5247, %r5246, %r5246, 24; + add.s32 %r5248, %r5247, %r5241; + xor.b32 %r5249, %r5248, %r5243; + shf.l.wrap.b32 %r5250, %r5249, %r5249, 25; + add.s32 %r5251, %r5203, %r4988; + add.s32 %r5252, %r5251, %r5250; + xor.b32 %r5253, %r5252, %r5219; + shf.l.wrap.b32 %r5254, %r5253, %r5253, 16; + add.s32 %r5255, %r5254, %r5234; + xor.b32 %r5256, %r5255, %r5250; + shf.l.wrap.b32 %r5257, %r5256, %r5256, 20; + add.s32 %r5258, %r5252, %r5016; + add.s32 %r5259, %r5258, %r5257; + xor.b32 %r5260, %r5259, %r5254; + shf.l.wrap.b32 %r5261, %r5260, %r5260, 24; + add.s32 %r5262, %r5261, %r5255; + xor.b32 %r5263, %r5262, %r5257; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 25; + add.s32 %r5265, %r5217, %r4995; + add.s32 %r5266, %r5265, %r5208; + xor.b32 %r5267, %r5233, %r5266; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 16; + add.s32 %r5269, %r5248, %r5268; + xor.b32 %r5270, %r5269, %r5208; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 20; + add.s32 %r5272, %r5266, %r5044; + add.s32 %r5273, %r5272, %r5271; + xor.b32 %r5274, %r5273, %r5268; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 24; + add.s32 %r5276, %r5275, %r5269; + xor.b32 %r5277, %r5276, %r5271; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 25; + add.s32 %r5279, %r5222, %r5023; + add.s32 %r5280, %r5279, %r5231; + xor.b32 %r5281, %r5247, %r5280; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 16; + add.s32 %r5283, %r5282, %r5206; + xor.b32 %r5284, %r5283, %r5222; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 20; + add.s32 %r5286, %r5280, %r4974; + add.s32 %r5287, %r5286, %r5285; + xor.b32 %r5288, %r5287, %r5282; + shf.l.wrap.b32 
%r5289, %r5288, %r5288, 24; + add.s32 %r5290, %r5289, %r5283; + xor.b32 %r5291, %r5290, %r5285; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 25; + add.s32 %r5293, %r5236, %r5002; + add.s32 %r5294, %r5293, %r5245; + xor.b32 %r5295, %r5294, %r5205; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 16; + add.s32 %r5297, %r5296, %r5220; + xor.b32 %r5298, %r5297, %r5236; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 20; + add.s32 %r5300, %r5294, %r5065; + add.s32 %r5301, %r5300, %r5299; + xor.b32 %r5302, %r5301, %r5296; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 24; + add.s32 %r5304, %r5303, %r5297; + xor.b32 %r5305, %r5304, %r5299; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 25; + add.s32 %r5307, %r5278, %r4981; + add.s32 %r5308, %r5307, %r5259; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 16; + add.s32 %r5311, %r5310, %r5290; + xor.b32 %r5312, %r5311, %r5278; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 20; + add.s32 %r5314, %r5308, %r5051; + add.s32 %r5315, %r5314, %r5313; + xor.b32 %r5316, %r5315, %r5310; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 24; + add.s32 %r5318, %r5317, %r5311; + xor.b32 %r5319, %r5318, %r5313; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 25; + add.s32 %r5321, %r5273, %r5058; + add.s32 %r5322, %r5321, %r5292; + xor.b32 %r5323, %r5261, %r5322; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 16; + add.s32 %r5325, %r5324, %r5304; + xor.b32 %r5326, %r5325, %r5292; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 20; + add.s32 %r5328, %r5322, %r5009; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5324; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 24; + add.s32 %r5332, %r5331, %r5325; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 25; + add.s32 %r5335, %r5287, %r5037; + add.s32 %r5336, %r5335, %r5306; + xor.b32 %r5337, %r5336, %r5275; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 16; + add.s32 %r5339, %r5338, %r5262; + xor.b32 %r5340, %r5339, %r5306; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 20; + add.s32 %r5342, %r5336, %r5072; + add.s32 %r5343, %r5342, %r5341; + xor.b32 %r5344, %r5343, %r5338; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 24; + add.s32 %r5346, %r5345, %r5339; + xor.b32 %r5347, %r5346, %r5341; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 25; + add.s32 %r5349, %r5301, %r5079; + add.s32 %r5350, %r5349, %r5264; + xor.b32 %r5351, %r5350, %r5289; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 16; + add.s32 %r5353, %r5352, %r5276; + xor.b32 %r5354, %r5353, %r5264; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 20; + add.s32 %r5356, %r5350, %r5030; + add.s32 %r5357, %r5356, %r5355; + xor.b32 %r5358, %r5357, %r5352; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 24; + add.s32 %r5360, %r5359, %r5353; + xor.b32 %r5361, %r5360, %r5355; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 25; + add.s32 %r5363, %r5315, %r4995; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5331; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 16; + add.s32 %r5367, %r5366, %r5346; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 20; + add.s32 %r5370, %r5364, %r5002; + add.s32 %r5371, %r5370, %r5369; + xor.b32 %r5372, %r5371, %r5366; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 24; + add.s32 %r5374, %r5373, %r5367; + xor.b32 %r5375, %r5374, %r5369; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 25; + add.s32 %r5377, %r5329, %r5044; + add.s32 %r5378, %r5377, %r5320; + xor.b32 %r5379, %r5378, %r5345; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 16; + add.s32 %r5381, %r5380, %r5360; + xor.b32 %r5382, %r5381, %r5320; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 20; + add.s32 %r5384, %r5378, %r5058; 
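+ // [editorial annotation] The 32-bit literals 1779033703, -1150833019,
+ // 1013904242 and -1521486534 seen in this region are 0x6A09E667, 0xBB67AE85,
+ // 0x3C6EF372 and 0xA54FF53A -- the first four BLAKE3 IV words -- which appear
+ // to be constant-folded initializers for state words v[8..11] of the
+ // compression function's first round.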
+ add.s32 %r5385, %r5384, %r5383; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 24; + add.s32 %r5388, %r5387, %r5381; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 25; + add.s32 %r5391, %r5343, %r5065; + add.s32 %r5392, %r5391, %r5334; + xor.b32 %r5393, %r5359, %r5392; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 16; + add.s32 %r5395, %r5394, %r5318; + xor.b32 %r5396, %r5395, %r5334; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 20; + add.s32 %r5398, %r5392, %r4988; + add.s32 %r5399, %r5398, %r5397; + xor.b32 %r5400, %r5399, %r5394; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 24; + add.s32 %r5402, %r5401, %r5395; + xor.b32 %r5403, %r5402, %r5397; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 25; + add.s32 %r5405, %r5348, %r5023; + add.s32 %r5406, %r5405, %r5357; + xor.b32 %r5407, %r5406, %r5317; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 16; + add.s32 %r5409, %r5408, %r5332; + xor.b32 %r5410, %r5409, %r5348; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 20; + add.s32 %r5412, %r5406, %r5072; + add.s32 %r5413, %r5412, %r5411; + xor.b32 %r5414, %r5413, %r5408; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 24; + add.s32 %r5416, %r5415, %r5409; + xor.b32 %r5417, %r5416, %r5411; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 25; + add.s32 %r5419, %r5371, %r5016; + add.s32 %r5420, %r5419, %r5390; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 16; + add.s32 %r5423, %r5422, %r5402; + xor.b32 %r5424, %r5423, %r5390; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 20; + add.s32 %r5426, %r5420, %r5009; + add.s32 %r5427, %r5426, %r5425; + xor.b32 %r5428, %r5427, %r5422; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 24; + add.s32 %r5430, %r5429, %r5423; + xor.b32 %r5431, %r5430, %r5425; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 25; + add.s32 %r5433, %r5385, %r5037; + add.s32 %r5434, %r5433, %r5404; + xor.b32 %r5435, %r5373, %r5434; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 16; + add.s32 %r5437, %r5436, %r5416; + xor.b32 %r5438, %r5437, %r5404; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 20; + add.s32 %r5440, %r5434, %r4974; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5436; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 24; + add.s32 %r5444, %r5443, %r5437; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 25; + add.s32 %r5447, %r5399, %r5051; + add.s32 %r5448, %r5447, %r5418; + xor.b32 %r5449, %r5448, %r5387; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 16; + add.s32 %r5451, %r5450, %r5374; + xor.b32 %r5452, %r5451, %r5418; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 20; + add.s32 %r5454, %r5448, %r5079; + add.s32 %r5455, %r5454, %r5453; + xor.b32 %r5456, %r5455, %r5450; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 24; + add.s32 %r5458, %r5457, %r5451; + xor.b32 %r5459, %r5458, %r5453; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 25; + add.s32 %r5461, %r5413, %r5030; + add.s32 %r5462, %r5461, %r5376; + xor.b32 %r5463, %r5462, %r5401; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 16; + add.s32 %r5465, %r5464, %r5388; + xor.b32 %r5466, %r5465, %r5376; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 20; + add.s32 %r5468, %r5462, %r4981; + add.s32 %r5469, %r5468, %r5467; + xor.b32 %r5470, %r5469, %r5464; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 24; + add.s32 %r5472, %r5471, %r5465; + xor.b32 %r5473, %r5472, %r5467; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 25; + add.s32 %r5475, %r5427, %r5044; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5443; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 16; + add.s32 %r5479, %r5478, %r5458; + xor.b32 %r5480, %r5479, 
%r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 20; + add.s32 %r5482, %r5476, %r5023; + add.s32 %r5483, %r5482, %r5481; + xor.b32 %r5484, %r5483, %r5478; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 24; + add.s32 %r5486, %r5485, %r5479; + xor.b32 %r5487, %r5486, %r5481; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 25; + add.s32 %r5489, %r5441, %r5058; + add.s32 %r5490, %r5489, %r5432; + xor.b32 %r5491, %r5490, %r5457; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 16; + add.s32 %r5493, %r5492, %r5472; + xor.b32 %r5494, %r5493, %r5432; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 20; + add.s32 %r5496, %r5490, %r5037; + add.s32 %r5497, %r5496, %r5495; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 24; + add.s32 %r5500, %r5499, %r5493; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 25; + add.s32 %r5503, %r5455, %r5072; + add.s32 %r5504, %r5503, %r5446; + xor.b32 %r5505, %r5471, %r5504; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 16; + add.s32 %r5507, %r5506, %r5430; + xor.b32 %r5508, %r5507, %r5446; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 20; + add.s32 %r5510, %r5504, %r4995; + add.s32 %r5511, %r5510, %r5509; + xor.b32 %r5512, %r5511, %r5506; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 24; + add.s32 %r5514, %r5513, %r5507; + xor.b32 %r5515, %r5514, %r5509; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 25; + add.s32 %r5517, %r5469, %r5065; + add.s32 %r5518, %r5517, %r5460; + xor.b32 %r5519, %r5518, %r5429; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 16; + add.s32 %r5521, %r5520, %r5444; + xor.b32 %r5522, %r5521, %r5460; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 20; + add.s32 %r5524, %r5518, %r5079; + add.s32 %r5525, %r5524, %r5523; + xor.b32 %r5526, %r5525, %r5520; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 24; + add.s32 %r5528, %r5527, %r5521; + xor.b32 %r5529, %r5528, %r5523; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 25; + add.s32 %r5531, %r5483, %r5002; + add.s32 %r5532, %r5531, %r5502; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 16; + add.s32 %r5535, %r5534, %r5514; + xor.b32 %r5536, %r5535, %r5502; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 20; + add.s32 %r5538, %r5532, %r4974; + add.s32 %r5539, %r5538, %r5537; + xor.b32 %r5540, %r5539, %r5534; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 24; + add.s32 %r5542, %r5541, %r5535; + xor.b32 %r5543, %r5542, %r5537; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 25; + add.s32 %r5545, %r5497, %r5051; + add.s32 %r5546, %r5545, %r5516; + xor.b32 %r5547, %r5485, %r5546; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 16; + add.s32 %r5549, %r5548, %r5528; + xor.b32 %r5550, %r5549, %r5516; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 20; + add.s32 %r5552, %r5546, %r4988; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5548; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 24; + add.s32 %r5556, %r5555, %r5549; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 25; + add.s32 %r5559, %r5511, %r5009; + add.s32 %r5560, %r5559, %r5530; + xor.b32 %r5561, %r5560, %r5499; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 16; + add.s32 %r5563, %r5562, %r5486; + xor.b32 %r5564, %r5563, %r5530; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 20; + add.s32 %r5566, %r5560, %r5030; + add.s32 %r5567, %r5566, %r5565; + xor.b32 %r5568, %r5567, %r5562; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 24; + add.s32 %r5570, %r5569, %r5563; + xor.b32 %r5571, %r5570, %r5565; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 25; + add.s32 %r5573, %r5525, %r4981; + add.s32 %r5574, %r5573, %r5488; + xor.b32 %r5575, %r5574, %r5513; + shf.l.wrap.b32 
%r5576, %r5575, %r5575, 16; + add.s32 %r5577, %r5576, %r5500; + xor.b32 %r5578, %r5577, %r5488; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 20; + add.s32 %r5580, %r5574, %r5016; + add.s32 %r5581, %r5580, %r5579; + xor.b32 %r5582, %r5581, %r5576; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 24; + add.s32 %r5584, %r5583, %r5577; + xor.b32 %r5585, %r5584, %r5579; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 25; + add.s32 %r5587, %r5539, %r5058; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5555; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 16; + add.s32 %r5591, %r5590, %r5570; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 20; + add.s32 %r5594, %r5588, %r5065; + add.s32 %r5595, %r5594, %r5593; + xor.b32 %r5596, %r5595, %r5590; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 24; + add.s32 %r5598, %r5597, %r5591; + xor.b32 %r5599, %r5598, %r5593; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 25; + add.s32 %r5601, %r5553, %r5037; + add.s32 %r5602, %r5601, %r5544; + xor.b32 %r5603, %r5602, %r5569; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 16; + add.s32 %r5605, %r5604, %r5584; + xor.b32 %r5606, %r5605, %r5544; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 20; + add.s32 %r5608, %r5602, %r5051; + add.s32 %r5609, %r5608, %r5607; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 24; + add.s32 %r5612, %r5611, %r5605; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 25; + add.s32 %r5615, %r5567, %r5079; + add.s32 %r5616, %r5615, %r5558; + xor.b32 %r5617, %r5583, %r5616; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 16; + add.s32 %r5619, %r5618, %r5542; + xor.b32 %r5620, %r5619, %r5558; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 20; + add.s32 %r5622, %r5616, %r5044; + add.s32 %r5623, %r5622, %r5621; + xor.b32 %r5624, %r5623, %r5618; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 24; + add.s32 %r5626, %r5625, %r5619; + xor.b32 %r5627, %r5626, %r5621; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 25; + add.s32 %r5629, %r5581, %r5072; + add.s32 %r5630, %r5629, %r5572; + xor.b32 %r5631, %r5630, %r5541; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 16; + add.s32 %r5633, %r5632, %r5556; + xor.b32 %r5634, %r5633, %r5572; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 20; + add.s32 %r5636, %r5630, %r5030; + add.s32 %r5637, %r5636, %r5635; + xor.b32 %r5638, %r5637, %r5632; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 24; + add.s32 %r5640, %r5639, %r5633; + xor.b32 %r5641, %r5640, %r5635; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 25; + add.s32 %r5643, %r5595, %r5023; + add.s32 %r5644, %r5643, %r5614; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 16; + add.s32 %r5647, %r5646, %r5626; + xor.b32 %r5648, %r5647, %r5614; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 20; + add.s32 %r5650, %r5644, %r4988; + add.s32 %r5651, %r5650, %r5649; + xor.b32 %r5652, %r5651, %r5646; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 24; + add.s32 %r5654, %r5653, %r5647; + xor.b32 %r5655, %r5654, %r5649; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 25; + add.s32 %r5657, %r5609, %r5009; + add.s32 %r5658, %r5657, %r5628; + xor.b32 %r5659, %r5597, %r5658; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 16; + add.s32 %r5661, %r5660, %r5640; + xor.b32 %r5662, %r5661, %r5628; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 20; + add.s32 %r5664, %r5658, %r4995; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5660; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 24; + add.s32 %r5668, %r5667, %r5661; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 25; + add.s32 %r5671, %r5623, %r4974; 
+ add.s32 %r5672, %r5671, %r5642; + xor.b32 %r5673, %r5672, %r5611; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 16; + add.s32 %r5675, %r5674, %r5598; + xor.b32 %r5676, %r5675, %r5642; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 20; + add.s32 %r5678, %r5672, %r4981; + add.s32 %r5679, %r5678, %r5677; + xor.b32 %r5680, %r5679, %r5674; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 24; + add.s32 %r5682, %r5681, %r5675; + xor.b32 %r5683, %r5682, %r5677; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 25; + add.s32 %r5685, %r5637, %r5016; + add.s32 %r5686, %r5685, %r5600; + xor.b32 %r5687, %r5686, %r5625; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 16; + add.s32 %r5689, %r5688, %r5612; + xor.b32 %r5690, %r5689, %r5600; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 20; + add.s32 %r5692, %r5686, %r5002; + add.s32 %r5693, %r5692, %r5691; + xor.b32 %r5694, %r5693, %r5688; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 24; + add.s32 %r5696, %r5695, %r5689; + xor.b32 %r5697, %r5696, %r5691; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 25; + add.s32 %r5699, %r5651, %r5037; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5667; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 16; + add.s32 %r5703, %r5702, %r5682; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 20; + add.s32 %r5706, %r5700, %r5072; + add.s32 %r5707, %r5706, %r5705; + xor.b32 %r5708, %r5707, %r5702; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 24; + add.s32 %r5710, %r5709, %r5703; + xor.b32 %r5711, %r5710, %r5705; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 25; + add.s32 %r5713, %r5665, %r5051; + add.s32 %r5714, %r5713, %r5656; + xor.b32 %r5715, %r5714, %r5681; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 16; + add.s32 %r5717, %r5716, %r5696; + xor.b32 %r5718, %r5717, %r5656; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 20; + add.s32 %r5720, %r5714, %r5009; + add.s32 %r5721, %r5720, %r5719; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 24; + add.s32 %r5724, %r5723, %r5717; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 25; + add.s32 %r5727, %r5679, %r5030; + add.s32 %r5728, %r5727, %r5670; + xor.b32 %r5729, %r5695, %r5728; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 16; + add.s32 %r5731, %r5730, %r5654; + xor.b32 %r5732, %r5731, %r5670; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 20; + add.s32 %r5734, %r5728, %r5058; + add.s32 %r5735, %r5734, %r5733; + xor.b32 %r5736, %r5735, %r5730; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 24; + add.s32 %r5738, %r5737, %r5731; + xor.b32 %r5739, %r5738, %r5733; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 25; + add.s32 %r5741, %r5693, %r5079; + add.s32 %r5742, %r5741, %r5684; + xor.b32 %r5743, %r5742, %r5653; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 16; + add.s32 %r5745, %r5744, %r5668; + xor.b32 %r5746, %r5745, %r5684; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 20; + add.s32 %r5748, %r5742, %r4981; + add.s32 %r5749, %r5748, %r5747; + xor.b32 %r5750, %r5749, %r5744; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 24; + add.s32 %r5752, %r5751, %r5745; + xor.b32 %r5753, %r5752, %r5747; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 25; + add.s32 %r5755, %r5707, %r5065; + add.s32 %r5756, %r5755, %r5726; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 16; + add.s32 %r5759, %r5758, %r5738; + xor.b32 %r5760, %r5759, %r5726; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 20; + add.s32 %r5762, %r5756, %r4995; + add.s32 %r5763, %r5762, %r5761; + xor.b32 %r5764, %r5763, %r5758; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 24; + add.s32 %r5766, %r5765, %r5759; + xor.b32 %r5767, %r5766, 
%r5761; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 25; + add.s32 %r5769, %r5721, %r4974; + add.s32 %r5770, %r5769, %r5740; + xor.b32 %r5771, %r5709, %r5770; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 16; + add.s32 %r5773, %r5772, %r5752; + xor.b32 %r5774, %r5773, %r5740; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 20; + add.s32 %r5776, %r5770, %r5044; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5772; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 24; + add.s32 %r5780, %r5779, %r5773; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 25; + add.s32 %r5783, %r5735, %r4988; + add.s32 %r5784, %r5783, %r5754; + xor.b32 %r5785, %r5784, %r5723; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 16; + add.s32 %r5787, %r5786, %r5710; + xor.b32 %r5788, %r5787, %r5754; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 20; + add.s32 %r5790, %r5784, %r5016; + add.s32 %r5791, %r5790, %r5789; + xor.b32 %r5792, %r5791, %r5786; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 24; + add.s32 %r5794, %r5793, %r5787; + xor.b32 %r5795, %r5794, %r5789; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 25; + add.s32 %r5797, %r5749, %r5002; + add.s32 %r5798, %r5797, %r5712; + xor.b32 %r5799, %r5798, %r5737; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 16; + add.s32 %r5801, %r5800, %r5724; + xor.b32 %r5802, %r5801, %r5712; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 20; + add.s32 %r5804, %r5798, %r5023; + add.s32 %r5805, %r5804, %r5803; + xor.b32 %r5806, %r5805, %r5800; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 24; + add.s32 %r5808, %r5807, %r5801; + xor.b32 %r5809, %r5808, %r5803; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 25; + add.s32 %r5811, %r5763, %r5051; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5779; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 16; + add.s32 %r5815, %r5814, %r5794; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 20; + add.s32 %r5818, %r5812, %r5079; + add.s32 %r5819, %r5818, %r5817; + xor.b32 %r5820, %r5819, %r5814; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 24; + add.s32 %r5822, %r5821, %r5815; + xor.b32 %r5823, %r5822, %r5817; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 25; + add.s32 %r5825, %r5777, %r5009; + add.s32 %r5826, %r5825, %r5768; + xor.b32 %r5827, %r5826, %r5793; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 16; + add.s32 %r5829, %r5828, %r5808; + xor.b32 %r5830, %r5829, %r5768; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 20; + add.s32 %r5832, %r5826, %r4974; + add.s32 %r5833, %r5832, %r5831; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 24; + add.s32 %r5836, %r5835, %r5829; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 25; + add.s32 %r5839, %r5791, %r4981; + add.s32 %r5840, %r5839, %r5782; + xor.b32 %r5841, %r5807, %r5840; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 16; + add.s32 %r5843, %r5842, %r5766; + xor.b32 %r5844, %r5843, %r5782; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 20; + add.s32 %r5846, %r5840, %r5037; + add.s32 %r5847, %r5846, %r5845; + xor.b32 %r5848, %r5847, %r5842; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 24; + add.s32 %r5850, %r5849, %r5843; + xor.b32 %r5851, %r5850, %r5845; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 25; + add.s32 %r5853, %r5805, %r5030; + add.s32 %r5854, %r5853, %r5796; + xor.b32 %r5855, %r5854, %r5765; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 16; + add.s32 %r5857, %r5856, %r5780; + xor.b32 %r5858, %r5857, %r5796; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 20; + add.s32 %r5860, %r5854, %r5016; + add.s32 %r5861, %r5860, %r5859; + xor.b32 %r5862, %r5861, %r5856; + shf.l.wrap.b32 
%r5863, %r5862, %r5862, 24; + add.s32 %r5864, %r5863, %r5857; + xor.b32 %r5865, %r5864, %r5859; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 25; + add.s32 %r5867, %r5819, %r5072; + add.s32 %r5868, %r5867, %r5838; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 16; + add.s32 %r5871, %r5870, %r5850; + xor.b32 %r5872, %r5871, %r5838; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 20; + add.s32 %r5874, %r5868, %r5044; + add.s32 %r5875, %r5874, %r5873; + xor.b32 %r5876, %r5875, %r5870; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 24; + add.s32 %r5878, %r5877, %r5871; + xor.b32 %r5879, %r5878, %r5873; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 25; + add.s32 %r5881, %r5833, %r4988; + add.s32 %r5882, %r5881, %r5852; + xor.b32 %r5883, %r5821, %r5882; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 16; + add.s32 %r5885, %r5884, %r5864; + xor.b32 %r5886, %r5885, %r5852; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 20; + add.s32 %r5888, %r5882, %r5058; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5884; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 24; + add.s32 %r5892, %r5891, %r5885; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 25; + add.s32 %r5895, %r5847, %r4995; + add.s32 %r5896, %r5895, %r5866; + xor.b32 %r5897, %r5896, %r5835; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 16; + add.s32 %r5899, %r5898, %r5822; + xor.b32 %r5900, %r5899, %r5866; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 20; + add.s32 %r5902, %r5896, %r5002; + add.s32 %r5903, %r5902, %r5901; + xor.b32 %r5904, %r5903, %r5898; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 24; + add.s32 %r5906, %r5905, %r5899; + xor.b32 %r5907, %r5906, %r5901; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 25; + add.s32 %r5909, %r5861, %r5023; + add.s32 %r5910, %r5909, %r5824; + xor.b32 %r5911, %r5910, %r5849; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 16; + add.s32 %r5913, %r5912, %r5836; + xor.b32 %r5914, %r5913, %r5824; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 20; + add.s32 %r5916, %r5910, %r5065; + add.s32 %r5917, %r5916, %r5915; + xor.b32 %r5918, %r5917, %r5912; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 24; + add.s32 %r5920, %r5919, %r5913; + xor.b32 %r5921, %r5920, %r5915; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 25; + xor.b32 %r5923, %r5906, %r5875; + xor.b32 %r5924, %r5920, %r5889; + xor.b32 %r5925, %r5878, %r5903; + xor.b32 %r5926, %r5917, %r5892; + xor.b32 %r5927, %r5922, %r5891; + xor.b32 %r5928, %r5880, %r5905; + xor.b32 %r5929, %r5919, %r5894; + xor.b32 %r5930, %r5908, %r5877; + st.local.u8 [%rd178+145], %r5923; + shr.u32 %r5931, %r5923, 8; + st.local.u8 [%rd178+146], %r5931; + shr.u32 %r5932, %r5923, 16; + st.local.u8 [%rd178+147], %r5932; + shr.u32 %r5933, %r5923, 24; + st.local.u8 [%rd178+148], %r5933; + st.local.u8 [%rd178+149], %r5924; + shr.u32 %r5934, %r5924, 8; + st.local.u8 [%rd178+150], %r5934; + shr.u32 %r5935, %r5924, 16; + st.local.u8 [%rd178+151], %r5935; + shr.u32 %r5936, %r5924, 24; + st.local.u8 [%rd178+152], %r5936; + st.local.u8 [%rd178+153], %r5925; + shr.u32 %r5937, %r5925, 8; + st.local.u8 [%rd178+154], %r5937; + shr.u32 %r5938, %r5925, 16; + st.local.u8 [%rd178+155], %r5938; + shr.u32 %r5939, %r5925, 24; + st.local.u8 [%rd178+156], %r5939; + st.local.u8 [%rd178+157], %r5926; + shr.u32 %r5940, %r5926, 8; + st.local.u8 [%rd178+158], %r5940; + shr.u32 %r5941, %r5926, 16; + st.local.u8 [%rd178+159], %r5941; + shr.u32 %r5942, %r5926, 24; + st.local.u8 [%rd178+160], %r5942; + st.local.u8 [%rd178+161], %r5927; + shr.u32 %r5943, %r5927, 8; + st.local.u8 [%rd178+162], %r5943; + shr.u32 %r5944, %r5927, 
16; + st.local.u8 [%rd178+163], %r5944; + shr.u32 %r5945, %r5927, 24; + st.local.u8 [%rd178+164], %r5945; + st.local.u8 [%rd178+165], %r5928; + shr.u32 %r5946, %r5928, 8; + st.local.u8 [%rd178+166], %r5946; + shr.u32 %r5947, %r5928, 16; + st.local.u8 [%rd178+167], %r5947; + shr.u32 %r5948, %r5928, 24; + st.local.u8 [%rd178+168], %r5948; + st.local.u8 [%rd178+169], %r5929; + shr.u32 %r5949, %r5929, 8; + st.local.u8 [%rd178+170], %r5949; + shr.u32 %r5950, %r5929, 16; + st.local.u8 [%rd178+171], %r5950; + shr.u32 %r5951, %r5929, 24; + st.local.u8 [%rd178+172], %r5951; + st.local.u8 [%rd178+173], %r5930; + shr.u32 %r5952, %r5930, 8; + st.local.u8 [%rd178+174], %r5952; + shr.u32 %r5953, %r5930, 16; + st.local.u8 [%rd178+175], %r5953; + shr.u32 %r5954, %r5930, 24; + st.local.u8 [%rd178+176], %r5954; + ld.local.u8 %rs176, [%rd3+8]; + add.s16 %rs177, %rs176, -1; + st.local.u8 [%rd3+8], %rs177; + cvt.u64.u16 %rd179, %rs177; + and.b64 %rd180, %rd179, 255; + setp.lt.u64 %p30, %rd226, %rd180; + and.b16 %rs178, %rs177, 255; + mul.wide.u16 %r11663, %rs178, 32; + @%p30 bra $L__BB1_34; + +$L__BB1_35: + cvt.s64.s32 %rd181, %r11663; + add.s64 %rd182, %rd2, %rd181; + mov.b32 {%rs179, %rs180}, %r3967; + st.local.u8 [%rd182+145], %rs179; + shr.u16 %rs181, %rs179, 8; + st.local.u8 [%rd182+146], %rs181; + st.local.u8 [%rd182+147], %rs180; + shr.u16 %rs182, %rs180, 8; + st.local.u8 [%rd182+148], %rs182; + mov.b32 {%rs183, %rs184}, %r3968; + st.local.u8 [%rd182+149], %rs183; + shr.u16 %rs185, %rs183, 8; + st.local.u8 [%rd182+150], %rs185; + st.local.u8 [%rd182+151], %rs184; + shr.u16 %rs186, %rs184, 8; + st.local.u8 [%rd182+152], %rs186; + mov.b32 {%rs187, %rs188}, %r3969; + st.local.u8 [%rd182+153], %rs187; + shr.u16 %rs189, %rs187, 8; + st.local.u8 [%rd182+154], %rs189; + st.local.u8 [%rd182+155], %rs188; + shr.u16 %rs190, %rs188, 8; + st.local.u8 [%rd182+156], %rs190; + mov.b32 {%rs191, %rs192}, %r3970; + st.local.u8 [%rd182+157], %rs191; + shr.u16 %rs193, %rs191, 8; + st.local.u8 [%rd182+158], %rs193; + st.local.u8 [%rd182+159], %rs192; + shr.u16 %rs194, %rs192, 8; + st.local.u8 [%rd182+160], %rs194; + mov.b32 {%rs195, %rs196}, %r3971; + st.local.u8 [%rd182+161], %rs195; + shr.u16 %rs197, %rs195, 8; + st.local.u8 [%rd182+162], %rs197; + st.local.u8 [%rd182+163], %rs196; + shr.u16 %rs198, %rs196, 8; + st.local.u8 [%rd182+164], %rs198; + mov.b32 {%rs199, %rs200}, %r3972; + st.local.u8 [%rd182+165], %rs199; + shr.u16 %rs201, %rs199, 8; + st.local.u8 [%rd182+166], %rs201; + st.local.u8 [%rd182+167], %rs200; + shr.u16 %rs202, %rs200, 8; + st.local.u8 [%rd182+168], %rs202; + mov.b32 {%rs203, %rs204}, %r3973; + st.local.u8 [%rd182+169], %rs203; + shr.u16 %rs205, %rs203, 8; + st.local.u8 [%rd182+170], %rs205; + st.local.u8 [%rd182+171], %rs204; + shr.u16 %rs206, %rs204, 8; + st.local.u8 [%rd182+172], %rs206; + mov.b32 {%rs207, %rs208}, %r3974; + st.local.u8 [%rd182+173], %rs207; + shr.u16 %rs209, %rs207, 8; + st.local.u8 [%rd182+174], %rs209; + st.local.u8 [%rd182+175], %rs208; + shr.u16 %rs210, %rs208, 8; + st.local.u8 [%rd182+176], %rs210; + ld.local.u8 %rs388, [%rd3+8]; + +$L__BB1_47: + add.s16 %rs331, %rs388, 1; + st.local.u8 [%rd3+8], %rs331; + ld.local.u64 %rd195, [%rd3+-72]; + shr.u64 %rd196, %rd49, 10; + add.s64 %rd250, %rd195, %rd196; + st.local.u64 [%rd3+-72], %rd250; + add.s64 %rd260, %rd260, %rd49; + add.s64 %rd253, %rd253, %rd49; + sub.s64 %rd261, %rd261, %rd49; + setp.gt.u64 %p39, %rd261, 1024; + @%p39 bra $L__BB1_26; + +$L__BB1_48: + setp.eq.s64 %p40, %rd261, 0; + @%p40 bra $L__BB1_68; + + ld.local.u8 
%rs389, [%rd3]; + cvt.u64.u16 %rd71, %rs389; + setp.eq.s16 %p41, %rs389, 0; + mov.u16 %rs390, 0; + mov.u64 %rd270, %rd261; + @%p41 bra $L__BB1_57; + + mov.u64 %rd197, 64; + sub.s64 %rd198, %rd197, %rd71; + min.u64 %rd72, %rd198, %rd261; + setp.eq.s64 %p42, %rd72, 0; + @%p42 bra $L__BB1_54; + + add.s64 %rd200, %rd2, %rd71; + add.s64 %rd73, %rd200, 72; + mov.u64 %rd262, 0; + +$L__BB1_52: + add.s64 %rd201, %rd260, %rd262; + ld.local.u8 %rs333, [%rd201]; + add.s64 %rd202, %rd73, %rd262; + st.local.u8 [%rd202], %rs333; + add.s64 %rd262, %rd262, 1; + setp.lt.u64 %p43, %rd262, %rd72; + @%p43 bra $L__BB1_52; + + ld.local.u8 %rs389, [%rd3]; + +$L__BB1_54: + cvt.u16.u64 %rs334, %rd72; + add.s16 %rs390, %rs389, %rs334; + mov.u64 %rd270, 0; + st.local.u8 [%rd3], %rs390; + add.s64 %rd260, %rd260, %rd72; + sub.s64 %rd77, %rd261, %rd72; + setp.eq.s64 %p44, %rd77, 0; + @%p44 bra $L__BB1_57; + + add.s64 %rd78, %rd2, 72; + ld.local.u8 %rs335, [%rd3+1]; + mov.u64 %rd263, 0; + setp.eq.s16 %p45, %rs335, 0; + mov.u16 %rs390, 0; + selp.u16 %rs337, 1, 0, %p45; + ld.local.u8 %rs338, [%rd3+2]; + or.b16 %rs339, %rs338, %rs337; + ld.local.u8 %r8843, [%rd3+-64]; + ld.local.u8 %r8844, [%rd3+-63]; + prmt.b32 %r8845, %r8844, %r8843, 30212; + ld.local.u8 %r8846, [%rd3+-62]; + prmt.b32 %r8847, %r8846, %r8845, 28756; + ld.local.u8 %r8848, [%rd3+-61]; + prmt.b32 %r8849, %r8848, %r8847, 1620; + ld.local.u8 %r8850, [%rd3+-60]; + ld.local.u8 %r8851, [%rd3+-59]; + prmt.b32 %r8852, %r8851, %r8850, 30212; + ld.local.u8 %r8853, [%rd3+-58]; + prmt.b32 %r8854, %r8853, %r8852, 28756; + ld.local.u8 %r8855, [%rd3+-57]; + prmt.b32 %r8856, %r8855, %r8854, 1620; + ld.local.u8 %r8857, [%rd3+-56]; + ld.local.u8 %r8858, [%rd3+-55]; + prmt.b32 %r8859, %r8858, %r8857, 30212; + ld.local.u8 %r8860, [%rd3+-54]; + prmt.b32 %r8861, %r8860, %r8859, 28756; + ld.local.u8 %r8862, [%rd3+-53]; + prmt.b32 %r8863, %r8862, %r8861, 1620; + ld.local.u8 %r8864, [%rd3+-52]; + ld.local.u8 %r8865, [%rd3+-51]; + prmt.b32 %r8866, %r8865, %r8864, 30212; + ld.local.u8 %r8867, [%rd3+-50]; + prmt.b32 %r8868, %r8867, %r8866, 28756; + ld.local.u8 %r8869, [%rd3+-49]; + prmt.b32 %r8870, %r8869, %r8868, 1620; + ld.local.u8 %r8871, [%rd3+-48]; + ld.local.u8 %r8872, [%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, %r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, [%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, 
%r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd205, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd205; + shr.u64 %rd206, %rd205, 32; + cvt.u32.u64 %r8956, %rd206; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 %r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, 
%r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, %r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 %r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + 
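+ // --- Editor annotation (not compiler output) ---------------------------------
+ // The mixing pattern in this unrolled region -- shf.l.wrap.b32 by 16, 20, 24
+ // and 25 (equivalent to rotate-right by 16, 12, 8 and 7) combined with the IV
+ // words 1779033703 (0x6A09E667), -1150833019 (0xBB67AE85), 1013904242
+ // (0x3C6EF372) and -1521486534 (0xA54FF53A) -- matches the BLAKE3 G function,
+ // so this block appears to be the compiler's fully unrolled BLAKE3 compression.
+ // ------------------------------------------------------------------------------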
shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, %r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, %r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, %r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, 
%r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 %r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, %r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 
%r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, %r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, %r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + 
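+ // Editor sketch (assumption, expressed as CUDA/C pseudocode in comments; it is
+ // not part of the generated PTX): each add/xor/shf.l.wrap quarter-round above
+ // corresponds to one BLAKE3 G call on the 16-word state v with message words mx, my:
+ //   __device__ void g(uint32_t v[16], int a, int b, int c, int d,
+ //                     uint32_t mx, uint32_t my) {
+ //       v[a] = v[a] + v[b] + mx;  v[d] = __funnelshift_r(v[d] ^ v[a], v[d] ^ v[a], 16);
+ //       v[c] = v[c] + v[d];       v[b] = __funnelshift_r(v[b] ^ v[c], v[b] ^ v[c], 12);
+ //       v[a] = v[a] + v[b] + my;  v[d] = __funnelshift_r(v[d] ^ v[a], v[d] ^ v[a], 8);
+ //       v[c] = v[c] + v[d];       v[b] = __funnelshift_r(v[b] ^ v[c], v[b] ^ v[c], 7);
+ //   }
+ // (__funnelshift_r(x, x, n) is a 32-bit rotate-right by n.) Each round applies
+ // g to the four columns and then the four diagonals of the 4x4 state, which is
+ // exactly the add/xor/rotate ladder the compiler has flattened into
+ // straight-line PTX here.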
shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, %r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 %r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, %r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, %r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, 
%r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; + add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; + add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 %r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 
%r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, %r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, %r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, %r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; + add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + 
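+ // Editor annotation (assumption): the xor.b32 / st.local.u32 pairs that close
+ // this round sequence (the stores to [%rd3+-104] .. [%rd3+-76] just below)
+ // look like the BLAKE3 feed-forward cv[i] = v[i] ^ v[i+8], writing the updated
+ // 8-word chaining value back into the hasher's local state.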
shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 %r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 %r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, %r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, %r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 
[%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd207, %rd78, %rd263; + st.local.u8 [%rd207], %rs390; + add.s64 %rd263, %rd263, 1; + setp.lt.u64 %p46, %rd263, 64; + mov.u64 %rd270, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd270, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd268, [%rd3+-72]; + cvt.u32.u64 %r117, %rd268; + shr.u64 %rd208, %rd268, 32; + cvt.u32.u64 %r118, %rd208; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd260]; + ld.local.u8 %r9764, [%rd260+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd260+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd260+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd260+4]; + ld.local.u8 %r9771, [%rd260+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd260+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd260+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd260+8]; + ld.local.u8 %r9778, [%rd260+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd260+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd260+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd260+12]; + ld.local.u8 %r9785, [%rd260+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd260+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd260+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd260+16]; + ld.local.u8 %r9792, [%rd260+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd260+18]; + prmt.b32 %r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd260+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd260+20]; + ld.local.u8 %r9799, [%rd260+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd260+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd260+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + ld.local.u8 %r9805, [%rd260+24]; + ld.local.u8 %r9806, [%rd260+25]; + prmt.b32 %r9807, %r9806, %r9805, 30212; + ld.local.u8 %r9808, [%rd260+26]; + prmt.b32 %r9809, %r9808, %r9807, 28756; + ld.local.u8 %r9810, [%rd260+27]; + prmt.b32 %r9811, %r9810, %r9809, 1620; + ld.local.u8 %r9812, [%rd260+28]; + ld.local.u8 %r9813, [%rd260+29]; + prmt.b32 %r9814, %r9813, %r9812, 30212; + ld.local.u8 %r9815, [%rd260+30]; + prmt.b32 %r9816, %r9815, %r9814, 28756; + ld.local.u8 %r9817, [%rd260+31]; + prmt.b32 %r9818, %r9817, %r9816, 1620; + ld.local.u8 %r9819, [%rd260+32]; + ld.local.u8 %r9820, [%rd260+33]; + prmt.b32 %r9821, %r9820, %r9819, 30212; + ld.local.u8 %r9822, [%rd260+34]; + prmt.b32 %r9823, %r9822, %r9821, 28756; + ld.local.u8 %r9824, [%rd260+35]; + prmt.b32 %r9825, %r9824, %r9823, 1620; + ld.local.u8 %r9826, [%rd260+36]; + ld.local.u8 %r9827, [%rd260+37]; + prmt.b32 %r9828, %r9827, %r9826, 30212; + ld.local.u8 %r9829, [%rd260+38]; + prmt.b32 %r9830, %r9829, 
%r9828, 28756; + ld.local.u8 %r9831, [%rd260+39]; + prmt.b32 %r9832, %r9831, %r9830, 1620; + ld.local.u8 %r9833, [%rd260+40]; + ld.local.u8 %r9834, [%rd260+41]; + prmt.b32 %r9835, %r9834, %r9833, 30212; + ld.local.u8 %r9836, [%rd260+42]; + prmt.b32 %r9837, %r9836, %r9835, 28756; + ld.local.u8 %r9838, [%rd260+43]; + prmt.b32 %r9839, %r9838, %r9837, 1620; + ld.local.u8 %r9840, [%rd260+44]; + ld.local.u8 %r9841, [%rd260+45]; + prmt.b32 %r9842, %r9841, %r9840, 30212; + ld.local.u8 %r9843, [%rd260+46]; + prmt.b32 %r9844, %r9843, %r9842, 28756; + ld.local.u8 %r9845, [%rd260+47]; + prmt.b32 %r9846, %r9845, %r9844, 1620; + ld.local.u8 %r9847, [%rd260+48]; + ld.local.u8 %r9848, [%rd260+49]; + prmt.b32 %r9849, %r9848, %r9847, 30212; + ld.local.u8 %r9850, [%rd260+50]; + prmt.b32 %r9851, %r9850, %r9849, 28756; + ld.local.u8 %r9852, [%rd260+51]; + prmt.b32 %r9853, %r9852, %r9851, 1620; + ld.local.u8 %r9854, [%rd260+52]; + ld.local.u8 %r9855, [%rd260+53]; + prmt.b32 %r9856, %r9855, %r9854, 30212; + ld.local.u8 %r9857, [%rd260+54]; + prmt.b32 %r9858, %r9857, %r9856, 28756; + ld.local.u8 %r9859, [%rd260+55]; + prmt.b32 %r9860, %r9859, %r9858, 1620; + ld.local.u8 %r9861, [%rd260+56]; + ld.local.u8 %r9862, [%rd260+57]; + prmt.b32 %r9863, %r9862, %r9861, 30212; + ld.local.u8 %r9864, [%rd260+58]; + prmt.b32 %r9865, %r9864, %r9863, 28756; + ld.local.u8 %r9866, [%rd260+59]; + prmt.b32 %r9867, %r9866, %r9865, 1620; + ld.local.u8 %r9868, [%rd260+60]; + ld.local.u8 %r9869, [%rd260+61]; + prmt.b32 %r9870, %r9869, %r9868, 30212; + ld.local.u8 %r9871, [%rd260+62]; + prmt.b32 %r9872, %r9871, %r9870, 28756; + ld.local.u8 %r9873, [%rd260+63]; + prmt.b32 %r9874, %r9873, %r9872, 1620; + cvt.u32.u16 %r9875, %rs344; + and.b32 %r9876, %r9875, 255; + add.s32 %r9877, %r11689, %r11685; + add.s32 %r9878, %r9877, %r9769; + xor.b32 %r9879, %r9878, %r117; + shf.l.wrap.b32 %r9880, %r9879, %r9879, 16; + add.s32 %r9881, %r9880, 1779033703; + xor.b32 %r9882, %r9881, %r11685; + shf.l.wrap.b32 %r9883, %r9882, %r9882, 20; + add.s32 %r9884, %r9776, %r9878; + add.s32 %r9885, %r9884, %r9883; + xor.b32 %r9886, %r9885, %r9880; + shf.l.wrap.b32 %r9887, %r9886, %r9886, 24; + add.s32 %r9888, %r9887, %r9881; + xor.b32 %r9889, %r9888, %r9883; + shf.l.wrap.b32 %r9890, %r9889, %r9889, 25; + add.s32 %r9891, %r11688, %r11684; + add.s32 %r9892, %r9891, %r9783; + xor.b32 %r9893, %r9892, %r118; + shf.l.wrap.b32 %r9894, %r9893, %r9893, 16; + add.s32 %r9895, %r9894, -1150833019; + xor.b32 %r9896, %r9895, %r11684; + shf.l.wrap.b32 %r9897, %r9896, %r9896, 20; + add.s32 %r9898, %r9790, %r9892; + add.s32 %r9899, %r9898, %r9897; + xor.b32 %r9900, %r9899, %r9894; + shf.l.wrap.b32 %r9901, %r9900, %r9900, 24; + add.s32 %r9902, %r9901, %r9895; + xor.b32 %r9903, %r9902, %r9897; + shf.l.wrap.b32 %r9904, %r9903, %r9903, 25; + add.s32 %r9905, %r11687, %r11683; + add.s32 %r9906, %r9905, %r9797; + shr.u32 %r9907, %r9906, 16; + shl.b32 %r9908, %r9906, 16; + xor.b32 %r9909, %r9908, 4194304; + or.b32 %r9910, %r9909, %r9907; + add.s32 %r9911, %r9910, 1013904242; + xor.b32 %r9912, %r9911, %r11683; + shf.l.wrap.b32 %r9913, %r9912, %r9912, 20; + add.s32 %r9914, %r9804, %r9906; + add.s32 %r9915, %r9914, %r9913; + xor.b32 %r9916, %r9915, %r9910; + shf.l.wrap.b32 %r9917, %r9916, %r9916, 24; + add.s32 %r9918, %r9917, %r9911; + xor.b32 %r9919, %r9918, %r9913; + shf.l.wrap.b32 %r9920, %r9919, %r9919, 25; + add.s32 %r9921, %r11686, %r11682; + add.s32 %r9922, %r9921, %r9811; + xor.b32 %r9923, %r9922, %r9876; + shr.u32 %r9924, %r9922, 16; + shl.b32 %r9925, %r9923, 16; + or.b32 %r9926, 
%r9925, %r9924; + add.s32 %r9927, %r9926, -1521486534; + xor.b32 %r9928, %r9927, %r11682; + shf.l.wrap.b32 %r9929, %r9928, %r9928, 20; + add.s32 %r9930, %r9818, %r9922; + add.s32 %r9931, %r9930, %r9929; + xor.b32 %r9932, %r9931, %r9926; + shf.l.wrap.b32 %r9933, %r9932, %r9932, 24; + add.s32 %r9934, %r9933, %r9927; + xor.b32 %r9935, %r9934, %r9929; + shf.l.wrap.b32 %r9936, %r9935, %r9935, 25; + add.s32 %r9937, %r9904, %r9885; + add.s32 %r9938, %r9937, %r9825; + xor.b32 %r9939, %r9933, %r9938; + shf.l.wrap.b32 %r9940, %r9939, %r9939, 16; + add.s32 %r9941, %r9940, %r9918; + xor.b32 %r9942, %r9941, %r9904; + shf.l.wrap.b32 %r9943, %r9942, %r9942, 20; + add.s32 %r9944, %r9832, %r9938; + add.s32 %r9945, %r9944, %r9943; + xor.b32 %r9946, %r9945, %r9940; + shf.l.wrap.b32 %r9947, %r9946, %r9946, 24; + add.s32 %r9948, %r9947, %r9941; + xor.b32 %r9949, %r9948, %r9943; + shf.l.wrap.b32 %r9950, %r9949, %r9949, 25; + add.s32 %r9951, %r9920, %r9899; + add.s32 %r9952, %r9951, %r9839; + xor.b32 %r9953, %r9952, %r9887; + shf.l.wrap.b32 %r9954, %r9953, %r9953, 16; + add.s32 %r9955, %r9954, %r9934; + xor.b32 %r9956, %r9955, %r9920; + shf.l.wrap.b32 %r9957, %r9956, %r9956, 20; + add.s32 %r9958, %r9846, %r9952; + add.s32 %r9959, %r9958, %r9957; + xor.b32 %r9960, %r9959, %r9954; + shf.l.wrap.b32 %r9961, %r9960, %r9960, 24; + add.s32 %r9962, %r9961, %r9955; + xor.b32 %r9963, %r9962, %r9957; + shf.l.wrap.b32 %r9964, %r9963, %r9963, 25; + add.s32 %r9965, %r9936, %r9915; + add.s32 %r9966, %r9965, %r9853; + xor.b32 %r9967, %r9966, %r9901; + shf.l.wrap.b32 %r9968, %r9967, %r9967, 16; + add.s32 %r9969, %r9968, %r9888; + xor.b32 %r9970, %r9969, %r9936; + shf.l.wrap.b32 %r9971, %r9970, %r9970, 20; + add.s32 %r9972, %r9860, %r9966; + add.s32 %r9973, %r9972, %r9971; + xor.b32 %r9974, %r9973, %r9968; + shf.l.wrap.b32 %r9975, %r9974, %r9974, 24; + add.s32 %r9976, %r9975, %r9969; + xor.b32 %r9977, %r9976, %r9971; + shf.l.wrap.b32 %r9978, %r9977, %r9977, 25; + add.s32 %r9979, %r9931, %r9890; + add.s32 %r9980, %r9979, %r9867; + xor.b32 %r9981, %r9980, %r9917; + shf.l.wrap.b32 %r9982, %r9981, %r9981, 16; + add.s32 %r9983, %r9982, %r9902; + xor.b32 %r9984, %r9983, %r9890; + shf.l.wrap.b32 %r9985, %r9984, %r9984, 20; + add.s32 %r9986, %r9874, %r9980; + add.s32 %r9987, %r9986, %r9985; + xor.b32 %r9988, %r9987, %r9982; + shf.l.wrap.b32 %r9989, %r9988, %r9988, 24; + add.s32 %r9990, %r9989, %r9983; + xor.b32 %r9991, %r9990, %r9985; + shf.l.wrap.b32 %r9992, %r9991, %r9991, 25; + add.s32 %r9993, %r9945, %r9783; + add.s32 %r9994, %r9993, %r9992; + xor.b32 %r9995, %r9994, %r9961; + shf.l.wrap.b32 %r9996, %r9995, %r9995, 16; + add.s32 %r9997, %r9996, %r9976; + xor.b32 %r9998, %r9997, %r9992; + shf.l.wrap.b32 %r9999, %r9998, %r9998, 20; + add.s32 %r10000, %r9994, %r9811; + add.s32 %r10001, %r10000, %r9999; + xor.b32 %r10002, %r10001, %r9996; + shf.l.wrap.b32 %r10003, %r10002, %r10002, 24; + add.s32 %r10004, %r10003, %r9997; + xor.b32 %r10005, %r10004, %r9999; + shf.l.wrap.b32 %r10006, %r10005, %r10005, 25; + add.s32 %r10007, %r9959, %r9790; + add.s32 %r10008, %r10007, %r9950; + xor.b32 %r10009, %r9975, %r10008; + shf.l.wrap.b32 %r10010, %r10009, %r10009, 16; + add.s32 %r10011, %r9990, %r10010; + xor.b32 %r10012, %r10011, %r9950; + shf.l.wrap.b32 %r10013, %r10012, %r10012, 20; + add.s32 %r10014, %r10008, %r9839; + add.s32 %r10015, %r10014, %r10013; + xor.b32 %r10016, %r10015, %r10010; + shf.l.wrap.b32 %r10017, %r10016, %r10016, 24; + add.s32 %r10018, %r10017, %r10011; + xor.b32 %r10019, %r10018, %r10013; + shf.l.wrap.b32 %r10020, %r10019, 
%r10019, 25; + add.s32 %r10021, %r9964, %r9818; + add.s32 %r10022, %r10021, %r9973; + xor.b32 %r10023, %r9989, %r10022; + shf.l.wrap.b32 %r10024, %r10023, %r10023, 16; + add.s32 %r10025, %r10024, %r9948; + xor.b32 %r10026, %r10025, %r9964; + shf.l.wrap.b32 %r10027, %r10026, %r10026, 20; + add.s32 %r10028, %r10022, %r9769; + add.s32 %r10029, %r10028, %r10027; + xor.b32 %r10030, %r10029, %r10024; + shf.l.wrap.b32 %r10031, %r10030, %r10030, 24; + add.s32 %r10032, %r10031, %r10025; + xor.b32 %r10033, %r10032, %r10027; + shf.l.wrap.b32 %r10034, %r10033, %r10033, 25; + add.s32 %r10035, %r9978, %r9797; + add.s32 %r10036, %r10035, %r9987; + xor.b32 %r10037, %r10036, %r9947; + shf.l.wrap.b32 %r10038, %r10037, %r10037, 16; + add.s32 %r10039, %r10038, %r9962; + xor.b32 %r10040, %r10039, %r9978; + shf.l.wrap.b32 %r10041, %r10040, %r10040, 20; + add.s32 %r10042, %r10036, %r9860; + add.s32 %r10043, %r10042, %r10041; + xor.b32 %r10044, %r10043, %r10038; + shf.l.wrap.b32 %r10045, %r10044, %r10044, 24; + add.s32 %r10046, %r10045, %r10039; + xor.b32 %r10047, %r10046, %r10041; + shf.l.wrap.b32 %r10048, %r10047, %r10047, 25; + add.s32 %r10049, %r10001, %r9776; + add.s32 %r10050, %r10049, %r10020; + xor.b32 %r10051, %r10050, %r10045; + shf.l.wrap.b32 %r10052, %r10051, %r10051, 16; + add.s32 %r10053, %r10052, %r10032; + xor.b32 %r10054, %r10053, %r10020; + shf.l.wrap.b32 %r10055, %r10054, %r10054, 20; + add.s32 %r10056, %r10050, %r9846; + add.s32 %r10057, %r10056, %r10055; + xor.b32 %r10058, %r10057, %r10052; + shf.l.wrap.b32 %r10059, %r10058, %r10058, 24; + add.s32 %r10060, %r10059, %r10053; + xor.b32 %r10061, %r10060, %r10055; + shf.l.wrap.b32 %r10062, %r10061, %r10061, 25; + add.s32 %r10063, %r10015, %r9853; + add.s32 %r10064, %r10063, %r10034; + xor.b32 %r10065, %r10064, %r10003; + shf.l.wrap.b32 %r10066, %r10065, %r10065, 16; + add.s32 %r10067, %r10066, %r10046; + xor.b32 %r10068, %r10067, %r10034; + shf.l.wrap.b32 %r10069, %r10068, %r10068, 20; + add.s32 %r10070, %r10064, %r9804; + add.s32 %r10071, %r10070, %r10069; + xor.b32 %r10072, %r10071, %r10066; + shf.l.wrap.b32 %r10073, %r10072, %r10072, 24; + add.s32 %r10074, %r10073, %r10067; + xor.b32 %r10075, %r10074, %r10069; + shf.l.wrap.b32 %r10076, %r10075, %r10075, 25; + add.s32 %r10077, %r10029, %r9832; + add.s32 %r10078, %r10077, %r10048; + xor.b32 %r10079, %r10078, %r10017; + shf.l.wrap.b32 %r10080, %r10079, %r10079, 16; + add.s32 %r10081, %r10080, %r10004; + xor.b32 %r10082, %r10081, %r10048; + shf.l.wrap.b32 %r10083, %r10082, %r10082, 20; + add.s32 %r10084, %r10078, %r9867; + add.s32 %r10085, %r10084, %r10083; + xor.b32 %r10086, %r10085, %r10080; + shf.l.wrap.b32 %r10087, %r10086, %r10086, 24; + add.s32 %r10088, %r10087, %r10081; + xor.b32 %r10089, %r10088, %r10083; + shf.l.wrap.b32 %r10090, %r10089, %r10089, 25; + add.s32 %r10091, %r10043, %r9874; + add.s32 %r10092, %r10091, %r10006; + xor.b32 %r10093, %r10092, %r10031; + shf.l.wrap.b32 %r10094, %r10093, %r10093, 16; + add.s32 %r10095, %r10094, %r10018; + xor.b32 %r10096, %r10095, %r10006; + shf.l.wrap.b32 %r10097, %r10096, %r10096, 20; + add.s32 %r10098, %r10092, %r9825; + add.s32 %r10099, %r10098, %r10097; + xor.b32 %r10100, %r10099, %r10094; + shf.l.wrap.b32 %r10101, %r10100, %r10100, 24; + add.s32 %r10102, %r10101, %r10095; + xor.b32 %r10103, %r10102, %r10097; + shf.l.wrap.b32 %r10104, %r10103, %r10103, 25; + add.s32 %r10105, %r10057, %r9790; + add.s32 %r10106, %r10105, %r10104; + xor.b32 %r10107, %r10106, %r10073; + shf.l.wrap.b32 %r10108, %r10107, %r10107, 16; + add.s32 %r10109, %r10108, 
%r10088; + xor.b32 %r10110, %r10109, %r10104; + shf.l.wrap.b32 %r10111, %r10110, %r10110, 20; + add.s32 %r10112, %r10106, %r9797; + add.s32 %r10113, %r10112, %r10111; + xor.b32 %r10114, %r10113, %r10108; + shf.l.wrap.b32 %r10115, %r10114, %r10114, 24; + add.s32 %r10116, %r10115, %r10109; + xor.b32 %r10117, %r10116, %r10111; + shf.l.wrap.b32 %r10118, %r10117, %r10117, 25; + add.s32 %r10119, %r10071, %r9839; + add.s32 %r10120, %r10119, %r10062; + xor.b32 %r10121, %r10120, %r10087; + shf.l.wrap.b32 %r10122, %r10121, %r10121, 16; + add.s32 %r10123, %r10122, %r10102; + xor.b32 %r10124, %r10123, %r10062; + shf.l.wrap.b32 %r10125, %r10124, %r10124, 20; + add.s32 %r10126, %r10120, %r9853; + add.s32 %r10127, %r10126, %r10125; + xor.b32 %r10128, %r10127, %r10122; + shf.l.wrap.b32 %r10129, %r10128, %r10128, 24; + add.s32 %r10130, %r10129, %r10123; + xor.b32 %r10131, %r10130, %r10125; + shf.l.wrap.b32 %r10132, %r10131, %r10131, 25; + add.s32 %r10133, %r10085, %r9860; + add.s32 %r10134, %r10133, %r10076; + xor.b32 %r10135, %r10134, %r10101; + shf.l.wrap.b32 %r10136, %r10135, %r10135, 16; + add.s32 %r10137, %r10136, %r10060; + xor.b32 %r10138, %r10137, %r10076; + shf.l.wrap.b32 %r10139, %r10138, %r10138, 20; + add.s32 %r10140, %r10134, %r9783; + add.s32 %r10141, %r10140, %r10139; + xor.b32 %r10142, %r10141, %r10136; + shf.l.wrap.b32 %r10143, %r10142, %r10142, 24; + add.s32 %r10144, %r10143, %r10137; + xor.b32 %r10145, %r10144, %r10139; + shf.l.wrap.b32 %r10146, %r10145, %r10145, 25; + add.s32 %r10147, %r10099, %r9818; + add.s32 %r10148, %r10147, %r10090; + xor.b32 %r10149, %r10148, %r10059; + shf.l.wrap.b32 %r10150, %r10149, %r10149, 16; + add.s32 %r10151, %r10150, %r10074; + xor.b32 %r10152, %r10151, %r10090; + shf.l.wrap.b32 %r10153, %r10152, %r10152, 20; + add.s32 %r10154, %r10148, %r9867; + add.s32 %r10155, %r10154, %r10153; + xor.b32 %r10156, %r10155, %r10150; + shf.l.wrap.b32 %r10157, %r10156, %r10156, 24; + add.s32 %r10158, %r10157, %r10151; + xor.b32 %r10159, %r10158, %r10153; + shf.l.wrap.b32 %r10160, %r10159, %r10159, 25; + add.s32 %r10161, %r10113, %r9811; + add.s32 %r10162, %r10161, %r10132; + xor.b32 %r10163, %r10162, %r10157; + shf.l.wrap.b32 %r10164, %r10163, %r10163, 16; + add.s32 %r10165, %r10164, %r10144; + xor.b32 %r10166, %r10165, %r10132; + shf.l.wrap.b32 %r10167, %r10166, %r10166, 20; + add.s32 %r10168, %r10162, %r9804; + add.s32 %r10169, %r10168, %r10167; + xor.b32 %r10170, %r10169, %r10164; + shf.l.wrap.b32 %r10171, %r10170, %r10170, 24; + add.s32 %r10172, %r10171, %r10165; + xor.b32 %r10173, %r10172, %r10167; + shf.l.wrap.b32 %r10174, %r10173, %r10173, 25; + add.s32 %r10175, %r10127, %r9832; + add.s32 %r10176, %r10175, %r10146; + xor.b32 %r10177, %r10176, %r10115; + shf.l.wrap.b32 %r10178, %r10177, %r10177, 16; + add.s32 %r10179, %r10178, %r10158; + xor.b32 %r10180, %r10179, %r10146; + shf.l.wrap.b32 %r10181, %r10180, %r10180, 20; + add.s32 %r10182, %r10176, %r9769; + add.s32 %r10183, %r10182, %r10181; + xor.b32 %r10184, %r10183, %r10178; + shf.l.wrap.b32 %r10185, %r10184, %r10184, 24; + add.s32 %r10186, %r10185, %r10179; + xor.b32 %r10187, %r10186, %r10181; + shf.l.wrap.b32 %r10188, %r10187, %r10187, 25; + add.s32 %r10189, %r10141, %r9846; + add.s32 %r10190, %r10189, %r10160; + xor.b32 %r10191, %r10190, %r10129; + shf.l.wrap.b32 %r10192, %r10191, %r10191, 16; + add.s32 %r10193, %r10192, %r10116; + xor.b32 %r10194, %r10193, %r10160; + shf.l.wrap.b32 %r10195, %r10194, %r10194, 20; + add.s32 %r10196, %r10190, %r9874; + add.s32 %r10197, %r10196, %r10195; + xor.b32 %r10198, %r10197, 
%r10192; + shf.l.wrap.b32 %r10199, %r10198, %r10198, 24; + add.s32 %r10200, %r10199, %r10193; + xor.b32 %r10201, %r10200, %r10195; + shf.l.wrap.b32 %r10202, %r10201, %r10201, 25; + add.s32 %r10203, %r10155, %r9825; + add.s32 %r10204, %r10203, %r10118; + xor.b32 %r10205, %r10204, %r10143; + shf.l.wrap.b32 %r10206, %r10205, %r10205, 16; + add.s32 %r10207, %r10206, %r10130; + xor.b32 %r10208, %r10207, %r10118; + shf.l.wrap.b32 %r10209, %r10208, %r10208, 20; + add.s32 %r10210, %r10204, %r9776; + add.s32 %r10211, %r10210, %r10209; + xor.b32 %r10212, %r10211, %r10206; + shf.l.wrap.b32 %r10213, %r10212, %r10212, 24; + add.s32 %r10214, %r10213, %r10207; + xor.b32 %r10215, %r10214, %r10209; + shf.l.wrap.b32 %r10216, %r10215, %r10215, 25; + add.s32 %r10217, %r10169, %r9839; + add.s32 %r10218, %r10217, %r10216; + xor.b32 %r10219, %r10218, %r10185; + shf.l.wrap.b32 %r10220, %r10219, %r10219, 16; + add.s32 %r10221, %r10220, %r10200; + xor.b32 %r10222, %r10221, %r10216; + shf.l.wrap.b32 %r10223, %r10222, %r10222, 20; + add.s32 %r10224, %r10218, %r9818; + add.s32 %r10225, %r10224, %r10223; + xor.b32 %r10226, %r10225, %r10220; + shf.l.wrap.b32 %r10227, %r10226, %r10226, 24; + add.s32 %r10228, %r10227, %r10221; + xor.b32 %r10229, %r10228, %r10223; + shf.l.wrap.b32 %r10230, %r10229, %r10229, 25; + add.s32 %r10231, %r10183, %r9853; + add.s32 %r10232, %r10231, %r10174; + xor.b32 %r10233, %r10232, %r10199; + shf.l.wrap.b32 %r10234, %r10233, %r10233, 16; + add.s32 %r10235, %r10234, %r10214; + xor.b32 %r10236, %r10235, %r10174; + shf.l.wrap.b32 %r10237, %r10236, %r10236, 20; + add.s32 %r10238, %r10232, %r9832; + add.s32 %r10239, %r10238, %r10237; + xor.b32 %r10240, %r10239, %r10234; + shf.l.wrap.b32 %r10241, %r10240, %r10240, 24; + add.s32 %r10242, %r10241, %r10235; + xor.b32 %r10243, %r10242, %r10237; + shf.l.wrap.b32 %r10244, %r10243, %r10243, 25; + add.s32 %r10245, %r10197, %r9867; + add.s32 %r10246, %r10245, %r10188; + xor.b32 %r10247, %r10246, %r10213; + shf.l.wrap.b32 %r10248, %r10247, %r10247, 16; + add.s32 %r10249, %r10248, %r10172; + xor.b32 %r10250, %r10249, %r10188; + shf.l.wrap.b32 %r10251, %r10250, %r10250, 20; + add.s32 %r10252, %r10246, %r9790; + add.s32 %r10253, %r10252, %r10251; + xor.b32 %r10254, %r10253, %r10248; + shf.l.wrap.b32 %r10255, %r10254, %r10254, 24; + add.s32 %r10256, %r10255, %r10249; + xor.b32 %r10257, %r10256, %r10251; + shf.l.wrap.b32 %r10258, %r10257, %r10257, 25; + add.s32 %r10259, %r10211, %r9860; + add.s32 %r10260, %r10259, %r10202; + xor.b32 %r10261, %r10260, %r10171; + shf.l.wrap.b32 %r10262, %r10261, %r10261, 16; + add.s32 %r10263, %r10262, %r10186; + xor.b32 %r10264, %r10263, %r10202; + shf.l.wrap.b32 %r10265, %r10264, %r10264, 20; + add.s32 %r10266, %r10260, %r9874; + add.s32 %r10267, %r10266, %r10265; + xor.b32 %r10268, %r10267, %r10262; + shf.l.wrap.b32 %r10269, %r10268, %r10268, 24; + add.s32 %r10270, %r10269, %r10263; + xor.b32 %r10271, %r10270, %r10265; + shf.l.wrap.b32 %r10272, %r10271, %r10271, 25; + add.s32 %r10273, %r10225, %r9797; + add.s32 %r10274, %r10273, %r10244; + xor.b32 %r10275, %r10274, %r10269; + shf.l.wrap.b32 %r10276, %r10275, %r10275, 16; + add.s32 %r10277, %r10276, %r10256; + xor.b32 %r10278, %r10277, %r10244; + shf.l.wrap.b32 %r10279, %r10278, %r10278, 20; + add.s32 %r10280, %r10274, %r9769; + add.s32 %r10281, %r10280, %r10279; + xor.b32 %r10282, %r10281, %r10276; + shf.l.wrap.b32 %r10283, %r10282, %r10282, 24; + add.s32 %r10284, %r10283, %r10277; + xor.b32 %r10285, %r10284, %r10279; + shf.l.wrap.b32 %r10286, %r10285, %r10285, 25; + add.s32 
%r10287, %r10239, %r9846; + add.s32 %r10288, %r10287, %r10258; + xor.b32 %r10289, %r10288, %r10227; + shf.l.wrap.b32 %r10290, %r10289, %r10289, 16; + add.s32 %r10291, %r10290, %r10270; + xor.b32 %r10292, %r10291, %r10258; + shf.l.wrap.b32 %r10293, %r10292, %r10292, 20; + add.s32 %r10294, %r10288, %r9783; + add.s32 %r10295, %r10294, %r10293; + xor.b32 %r10296, %r10295, %r10290; + shf.l.wrap.b32 %r10297, %r10296, %r10296, 24; + add.s32 %r10298, %r10297, %r10291; + xor.b32 %r10299, %r10298, %r10293; + shf.l.wrap.b32 %r10300, %r10299, %r10299, 25; + add.s32 %r10301, %r10253, %r9804; + add.s32 %r10302, %r10301, %r10272; + xor.b32 %r10303, %r10302, %r10241; + shf.l.wrap.b32 %r10304, %r10303, %r10303, 16; + add.s32 %r10305, %r10304, %r10228; + xor.b32 %r10306, %r10305, %r10272; + shf.l.wrap.b32 %r10307, %r10306, %r10306, 20; + add.s32 %r10308, %r10302, %r9825; + add.s32 %r10309, %r10308, %r10307; + xor.b32 %r10310, %r10309, %r10304; + shf.l.wrap.b32 %r10311, %r10310, %r10310, 24; + add.s32 %r10312, %r10311, %r10305; + xor.b32 %r10313, %r10312, %r10307; + shf.l.wrap.b32 %r10314, %r10313, %r10313, 25; + add.s32 %r10315, %r10267, %r9776; + add.s32 %r10316, %r10315, %r10230; + xor.b32 %r10317, %r10316, %r10255; + shf.l.wrap.b32 %r10318, %r10317, %r10317, 16; + add.s32 %r10319, %r10318, %r10242; + xor.b32 %r10320, %r10319, %r10230; + shf.l.wrap.b32 %r10321, %r10320, %r10320, 20; + add.s32 %r10322, %r10316, %r9811; + add.s32 %r10323, %r10322, %r10321; + xor.b32 %r10324, %r10323, %r10318; + shf.l.wrap.b32 %r10325, %r10324, %r10324, 24; + add.s32 %r10326, %r10325, %r10319; + xor.b32 %r10327, %r10326, %r10321; + shf.l.wrap.b32 %r10328, %r10327, %r10327, 25; + add.s32 %r10329, %r10281, %r9853; + add.s32 %r10330, %r10329, %r10328; + xor.b32 %r10331, %r10330, %r10297; + shf.l.wrap.b32 %r10332, %r10331, %r10331, 16; + add.s32 %r10333, %r10332, %r10312; + xor.b32 %r10334, %r10333, %r10328; + shf.l.wrap.b32 %r10335, %r10334, %r10334, 20; + add.s32 %r10336, %r10330, %r9860; + add.s32 %r10337, %r10336, %r10335; + xor.b32 %r10338, %r10337, %r10332; + shf.l.wrap.b32 %r10339, %r10338, %r10338, 24; + add.s32 %r10340, %r10339, %r10333; + xor.b32 %r10341, %r10340, %r10335; + shf.l.wrap.b32 %r10342, %r10341, %r10341, 25; + add.s32 %r10343, %r10295, %r9832; + add.s32 %r10344, %r10343, %r10286; + xor.b32 %r10345, %r10344, %r10311; + shf.l.wrap.b32 %r10346, %r10345, %r10345, 16; + add.s32 %r10347, %r10346, %r10326; + xor.b32 %r10348, %r10347, %r10286; + shf.l.wrap.b32 %r10349, %r10348, %r10348, 20; + add.s32 %r10350, %r10344, %r9846; + add.s32 %r10351, %r10350, %r10349; + xor.b32 %r10352, %r10351, %r10346; + shf.l.wrap.b32 %r10353, %r10352, %r10352, 24; + add.s32 %r10354, %r10353, %r10347; + xor.b32 %r10355, %r10354, %r10349; + shf.l.wrap.b32 %r10356, %r10355, %r10355, 25; + add.s32 %r10357, %r10309, %r9874; + add.s32 %r10358, %r10357, %r10300; + xor.b32 %r10359, %r10358, %r10325; + shf.l.wrap.b32 %r10360, %r10359, %r10359, 16; + add.s32 %r10361, %r10360, %r10284; + xor.b32 %r10362, %r10361, %r10300; + shf.l.wrap.b32 %r10363, %r10362, %r10362, 20; + add.s32 %r10364, %r10358, %r9839; + add.s32 %r10365, %r10364, %r10363; + xor.b32 %r10366, %r10365, %r10360; + shf.l.wrap.b32 %r10367, %r10366, %r10366, 24; + add.s32 %r10368, %r10367, %r10361; + xor.b32 %r10369, %r10368, %r10363; + shf.l.wrap.b32 %r10370, %r10369, %r10369, 25; + add.s32 %r10371, %r10323, %r9867; + add.s32 %r10372, %r10371, %r10314; + xor.b32 %r10373, %r10372, %r10283; + shf.l.wrap.b32 %r10374, %r10373, %r10373, 16; + add.s32 %r10375, %r10374, %r10298; + xor.b32 
%r10376, %r10375, %r10314; + shf.l.wrap.b32 %r10377, %r10376, %r10376, 20; + add.s32 %r10378, %r10372, %r9825; + add.s32 %r10379, %r10378, %r10377; + xor.b32 %r10380, %r10379, %r10374; + shf.l.wrap.b32 %r10381, %r10380, %r10380, 24; + add.s32 %r10382, %r10381, %r10375; + xor.b32 %r10383, %r10382, %r10377; + shf.l.wrap.b32 %r10384, %r10383, %r10383, 25; + add.s32 %r10385, %r10337, %r9818; + add.s32 %r10386, %r10385, %r10356; + xor.b32 %r10387, %r10386, %r10381; + shf.l.wrap.b32 %r10388, %r10387, %r10387, 16; + add.s32 %r10389, %r10388, %r10368; + xor.b32 %r10390, %r10389, %r10356; + shf.l.wrap.b32 %r10391, %r10390, %r10390, 20; + add.s32 %r10392, %r10386, %r9783; + add.s32 %r10393, %r10392, %r10391; + xor.b32 %r10394, %r10393, %r10388; + shf.l.wrap.b32 %r10395, %r10394, %r10394, 24; + add.s32 %r10396, %r10395, %r10389; + xor.b32 %r10397, %r10396, %r10391; + shf.l.wrap.b32 %r10398, %r10397, %r10397, 25; + add.s32 %r10399, %r10351, %r9804; + add.s32 %r10400, %r10399, %r10370; + xor.b32 %r10401, %r10400, %r10339; + shf.l.wrap.b32 %r10402, %r10401, %r10401, 16; + add.s32 %r10403, %r10402, %r10382; + xor.b32 %r10404, %r10403, %r10370; + shf.l.wrap.b32 %r10405, %r10404, %r10404, 20; + add.s32 %r10406, %r10400, %r9790; + add.s32 %r10407, %r10406, %r10405; + xor.b32 %r10408, %r10407, %r10402; + shf.l.wrap.b32 %r10409, %r10408, %r10408, 24; + add.s32 %r10410, %r10409, %r10403; + xor.b32 %r10411, %r10410, %r10405; + shf.l.wrap.b32 %r10412, %r10411, %r10411, 25; + add.s32 %r10413, %r10365, %r9769; + add.s32 %r10414, %r10413, %r10384; + xor.b32 %r10415, %r10414, %r10353; + shf.l.wrap.b32 %r10416, %r10415, %r10415, 16; + add.s32 %r10417, %r10416, %r10340; + xor.b32 %r10418, %r10417, %r10384; + shf.l.wrap.b32 %r10419, %r10418, %r10418, 20; + add.s32 %r10420, %r10414, %r9776; + add.s32 %r10421, %r10420, %r10419; + xor.b32 %r10422, %r10421, %r10416; + shf.l.wrap.b32 %r10423, %r10422, %r10422, 24; + add.s32 %r10424, %r10423, %r10417; + xor.b32 %r10425, %r10424, %r10419; + shf.l.wrap.b32 %r10426, %r10425, %r10425, 25; + add.s32 %r10427, %r10379, %r9811; + add.s32 %r10428, %r10427, %r10342; + xor.b32 %r10429, %r10428, %r10367; + shf.l.wrap.b32 %r10430, %r10429, %r10429, 16; + add.s32 %r10431, %r10430, %r10354; + xor.b32 %r10432, %r10431, %r10342; + shf.l.wrap.b32 %r10433, %r10432, %r10432, 20; + add.s32 %r10434, %r10428, %r9797; + add.s32 %r10435, %r10434, %r10433; + xor.b32 %r10436, %r10435, %r10430; + shf.l.wrap.b32 %r10437, %r10436, %r10436, 24; + add.s32 %r10438, %r10437, %r10431; + xor.b32 %r10439, %r10438, %r10433; + shf.l.wrap.b32 %r10440, %r10439, %r10439, 25; + add.s32 %r10441, %r10393, %r9832; + add.s32 %r10442, %r10441, %r10440; + xor.b32 %r10443, %r10442, %r10409; + shf.l.wrap.b32 %r10444, %r10443, %r10443, 16; + add.s32 %r10445, %r10444, %r10424; + xor.b32 %r10446, %r10445, %r10440; + shf.l.wrap.b32 %r10447, %r10446, %r10446, 20; + add.s32 %r10448, %r10442, %r9867; + add.s32 %r10449, %r10448, %r10447; + xor.b32 %r10450, %r10449, %r10444; + shf.l.wrap.b32 %r10451, %r10450, %r10450, 24; + add.s32 %r10452, %r10451, %r10445; + xor.b32 %r10453, %r10452, %r10447; + shf.l.wrap.b32 %r10454, %r10453, %r10453, 25; + add.s32 %r10455, %r10407, %r9846; + add.s32 %r10456, %r10455, %r10398; + xor.b32 %r10457, %r10456, %r10423; + shf.l.wrap.b32 %r10458, %r10457, %r10457, 16; + add.s32 %r10459, %r10458, %r10438; + xor.b32 %r10460, %r10459, %r10398; + shf.l.wrap.b32 %r10461, %r10460, %r10460, 20; + add.s32 %r10462, %r10456, %r9804; + add.s32 %r10463, %r10462, %r10461; + xor.b32 %r10464, %r10463, %r10458; + 
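+ // Editor annotation (assumption): loop $L__BB1_60 above appears to consume one
+ // full 64-byte block per iteration; the chained prmt.b32 sequences (selectors
+ // 30212/28756/1620) gather four consecutive input bytes into one little-endian
+ // u32 message word, after which the same unrolled compression rounds repeat
+ // for each block.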
shf.l.wrap.b32 %r10465, %r10464, %r10464, 24; + add.s32 %r10466, %r10465, %r10459; + xor.b32 %r10467, %r10466, %r10461; + shf.l.wrap.b32 %r10468, %r10467, %r10467, 25; + add.s32 %r10469, %r10421, %r9825; + add.s32 %r10470, %r10469, %r10412; + xor.b32 %r10471, %r10470, %r10437; + shf.l.wrap.b32 %r10472, %r10471, %r10471, 16; + add.s32 %r10473, %r10472, %r10396; + xor.b32 %r10474, %r10473, %r10412; + shf.l.wrap.b32 %r10475, %r10474, %r10474, 20; + add.s32 %r10476, %r10470, %r9853; + add.s32 %r10477, %r10476, %r10475; + xor.b32 %r10478, %r10477, %r10472; + shf.l.wrap.b32 %r10479, %r10478, %r10478, 24; + add.s32 %r10480, %r10479, %r10473; + xor.b32 %r10481, %r10480, %r10475; + shf.l.wrap.b32 %r10482, %r10481, %r10481, 25; + add.s32 %r10483, %r10435, %r9874; + add.s32 %r10484, %r10483, %r10426; + xor.b32 %r10485, %r10484, %r10395; + shf.l.wrap.b32 %r10486, %r10485, %r10485, 16; + add.s32 %r10487, %r10486, %r10410; + xor.b32 %r10488, %r10487, %r10426; + shf.l.wrap.b32 %r10489, %r10488, %r10488, 20; + add.s32 %r10490, %r10484, %r9776; + add.s32 %r10491, %r10490, %r10489; + xor.b32 %r10492, %r10491, %r10486; + shf.l.wrap.b32 %r10493, %r10492, %r10492, 24; + add.s32 %r10494, %r10493, %r10487; + xor.b32 %r10495, %r10494, %r10489; + shf.l.wrap.b32 %r10496, %r10495, %r10495, 25; + add.s32 %r10497, %r10449, %r9860; + add.s32 %r10498, %r10497, %r10468; + xor.b32 %r10499, %r10498, %r10493; + shf.l.wrap.b32 %r10500, %r10499, %r10499, 16; + add.s32 %r10501, %r10500, %r10480; + xor.b32 %r10502, %r10501, %r10468; + shf.l.wrap.b32 %r10503, %r10502, %r10502, 20; + add.s32 %r10504, %r10498, %r9790; + add.s32 %r10505, %r10504, %r10503; + xor.b32 %r10506, %r10505, %r10500; + shf.l.wrap.b32 %r10507, %r10506, %r10506, 24; + add.s32 %r10508, %r10507, %r10501; + xor.b32 %r10509, %r10508, %r10503; + shf.l.wrap.b32 %r10510, %r10509, %r10509, 25; + add.s32 %r10511, %r10463, %r9769; + add.s32 %r10512, %r10511, %r10482; + xor.b32 %r10513, %r10512, %r10451; + shf.l.wrap.b32 %r10514, %r10513, %r10513, 16; + add.s32 %r10515, %r10514, %r10494; + xor.b32 %r10516, %r10515, %r10482; + shf.l.wrap.b32 %r10517, %r10516, %r10516, 20; + add.s32 %r10518, %r10512, %r9839; + add.s32 %r10519, %r10518, %r10517; + xor.b32 %r10520, %r10519, %r10514; + shf.l.wrap.b32 %r10521, %r10520, %r10520, 24; + add.s32 %r10522, %r10521, %r10515; + xor.b32 %r10523, %r10522, %r10517; + shf.l.wrap.b32 %r10524, %r10523, %r10523, 25; + add.s32 %r10525, %r10477, %r9783; + add.s32 %r10526, %r10525, %r10496; + xor.b32 %r10527, %r10526, %r10465; + shf.l.wrap.b32 %r10528, %r10527, %r10527, 16; + add.s32 %r10529, %r10528, %r10452; + xor.b32 %r10530, %r10529, %r10496; + shf.l.wrap.b32 %r10531, %r10530, %r10530, 20; + add.s32 %r10532, %r10526, %r9811; + add.s32 %r10533, %r10532, %r10531; + xor.b32 %r10534, %r10533, %r10528; + shf.l.wrap.b32 %r10535, %r10534, %r10534, 24; + add.s32 %r10536, %r10535, %r10529; + xor.b32 %r10537, %r10536, %r10531; + shf.l.wrap.b32 %r10538, %r10537, %r10537, 25; + add.s32 %r10539, %r10491, %r9797; + add.s32 %r10540, %r10539, %r10454; + xor.b32 %r10541, %r10540, %r10479; + shf.l.wrap.b32 %r10542, %r10541, %r10541, 16; + add.s32 %r10543, %r10542, %r10466; + xor.b32 %r10544, %r10543, %r10454; + shf.l.wrap.b32 %r10545, %r10544, %r10544, 20; + add.s32 %r10546, %r10540, %r9818; + add.s32 %r10547, %r10546, %r10545; + xor.b32 %r10548, %r10547, %r10542; + shf.l.wrap.b32 %r10549, %r10548, %r10548, 24; + add.s32 %r10550, %r10549, %r10543; + xor.b32 %r10551, %r10550, %r10545; + shf.l.wrap.b32 %r10552, %r10551, %r10551, 25; + add.s32 %r10553, 
%r10505, %r9846; + add.s32 %r10554, %r10553, %r10552; + xor.b32 %r10555, %r10554, %r10521; + shf.l.wrap.b32 %r10556, %r10555, %r10555, 16; + add.s32 %r10557, %r10556, %r10536; + xor.b32 %r10558, %r10557, %r10552; + shf.l.wrap.b32 %r10559, %r10558, %r10558, 20; + add.s32 %r10560, %r10554, %r9874; + add.s32 %r10561, %r10560, %r10559; + xor.b32 %r10562, %r10561, %r10556; + shf.l.wrap.b32 %r10563, %r10562, %r10562, 24; + add.s32 %r10564, %r10563, %r10557; + xor.b32 %r10565, %r10564, %r10559; + shf.l.wrap.b32 %r10566, %r10565, %r10565, 25; + add.s32 %r10567, %r10519, %r9804; + add.s32 %r10568, %r10567, %r10510; + xor.b32 %r10569, %r10568, %r10535; + shf.l.wrap.b32 %r10570, %r10569, %r10569, 16; + add.s32 %r10571, %r10570, %r10550; + xor.b32 %r10572, %r10571, %r10510; + shf.l.wrap.b32 %r10573, %r10572, %r10572, 20; + add.s32 %r10574, %r10568, %r9769; + add.s32 %r10575, %r10574, %r10573; + xor.b32 %r10576, %r10575, %r10570; + shf.l.wrap.b32 %r10577, %r10576, %r10576, 24; + add.s32 %r10578, %r10577, %r10571; + xor.b32 %r10579, %r10578, %r10573; + shf.l.wrap.b32 %r10580, %r10579, %r10579, 25; + add.s32 %r10581, %r10533, %r9776; + add.s32 %r10582, %r10581, %r10524; + xor.b32 %r10583, %r10582, %r10549; + shf.l.wrap.b32 %r10584, %r10583, %r10583, 16; + add.s32 %r10585, %r10584, %r10508; + xor.b32 %r10586, %r10585, %r10524; + shf.l.wrap.b32 %r10587, %r10586, %r10586, 20; + add.s32 %r10588, %r10582, %r9832; + add.s32 %r10589, %r10588, %r10587; + xor.b32 %r10590, %r10589, %r10584; + shf.l.wrap.b32 %r10591, %r10590, %r10590, 24; + add.s32 %r10592, %r10591, %r10585; + xor.b32 %r10593, %r10592, %r10587; + shf.l.wrap.b32 %r10594, %r10593, %r10593, 25; + add.s32 %r10595, %r10547, %r9825; + add.s32 %r10596, %r10595, %r10538; + xor.b32 %r10597, %r10596, %r10507; + shf.l.wrap.b32 %r10598, %r10597, %r10597, 16; + add.s32 %r10599, %r10598, %r10522; + xor.b32 %r10600, %r10599, %r10538; + shf.l.wrap.b32 %r10601, %r10600, %r10600, 20; + add.s32 %r10602, %r10596, %r9811; + add.s32 %r10603, %r10602, %r10601; + xor.b32 %r10604, %r10603, %r10598; + shf.l.wrap.b32 %r10605, %r10604, %r10604, 24; + add.s32 %r10606, %r10605, %r10599; + xor.b32 %r10607, %r10606, %r10601; + shf.l.wrap.b32 %r10608, %r10607, %r10607, 25; + add.s32 %r10609, %r10561, %r9867; + add.s32 %r10610, %r10609, %r10580; + xor.b32 %r10611, %r10610, %r10605; + shf.l.wrap.b32 %r10612, %r10611, %r10611, 16; + add.s32 %r10613, %r10612, %r10592; + xor.b32 %r10614, %r10613, %r10580; + shf.l.wrap.b32 %r10615, %r10614, %r10614, 20; + add.s32 %r10616, %r10610, %r9839; + add.s32 %r10617, %r10616, %r10615; + xor.b32 %r10618, %r10617, %r10612; + shf.l.wrap.b32 %r10619, %r10618, %r10618, 24; + add.s32 %r10620, %r10619, %r10613; + xor.b32 %r10621, %r10620, %r10615; + shf.l.wrap.b32 %r10622, %r10621, %r10621, 25; + add.s32 %r10623, %r10575, %r9783; + add.s32 %r10624, %r10623, %r10594; + xor.b32 %r10625, %r10624, %r10563; + shf.l.wrap.b32 %r10626, %r10625, %r10625, 16; + add.s32 %r10627, %r10626, %r10606; + xor.b32 %r10628, %r10627, %r10594; + shf.l.wrap.b32 %r10629, %r10628, %r10628, 20; + add.s32 %r10630, %r10624, %r9853; + add.s32 %r10631, %r10630, %r10629; + xor.b32 %r10632, %r10631, %r10626; + shf.l.wrap.b32 %r10633, %r10632, %r10632, 24; + add.s32 %r10634, %r10633, %r10627; + xor.b32 %r10635, %r10634, %r10629; + shf.l.wrap.b32 %r10636, %r10635, %r10635, 25; + add.s32 %r10637, %r10589, %r9790; + add.s32 %r10638, %r10637, %r10608; + xor.b32 %r10639, %r10638, %r10577; + shf.l.wrap.b32 %r10640, %r10639, %r10639, 16; + add.s32 %r10641, %r10640, %r10564; + xor.b32 %r10642, 
%r10641, %r10608; + shf.l.wrap.b32 %r10643, %r10642, %r10642, 20; + add.s32 %r10644, %r10638, %r9797; + add.s32 %r10645, %r10644, %r10643; + xor.b32 %r10646, %r10645, %r10640; + shf.l.wrap.b32 %r10647, %r10646, %r10646, 24; + add.s32 %r10648, %r10647, %r10641; + xor.b32 %r10649, %r10648, %r10643; + shf.l.wrap.b32 %r10650, %r10649, %r10649, 25; + add.s32 %r10651, %r10603, %r9818; + add.s32 %r10652, %r10651, %r10566; + xor.b32 %r10653, %r10652, %r10591; + shf.l.wrap.b32 %r10654, %r10653, %r10653, 16; + add.s32 %r10655, %r10654, %r10578; + xor.b32 %r10656, %r10655, %r10566; + shf.l.wrap.b32 %r10657, %r10656, %r10656, 20; + add.s32 %r10658, %r10652, %r9860; + add.s32 %r10659, %r10658, %r10657; + xor.b32 %r10660, %r10659, %r10654; + shf.l.wrap.b32 %r10661, %r10660, %r10660, 24; + add.s32 %r10662, %r10661, %r10655; + xor.b32 %r10663, %r10662, %r10657; + shf.l.wrap.b32 %r10664, %r10663, %r10663, 25; + xor.b32 %r11689, %r10648, %r10617; + st.local.u32 [%rd3+-104], %r11689; + xor.b32 %r11688, %r10662, %r10631; + st.local.u32 [%rd3+-100], %r11688; + xor.b32 %r11687, %r10620, %r10645; + st.local.u32 [%rd3+-96], %r11687; + xor.b32 %r11686, %r10659, %r10634; + st.local.u32 [%rd3+-92], %r11686; + xor.b32 %r11685, %r10664, %r10633; + st.local.u32 [%rd3+-88], %r11685; + xor.b32 %r11684, %r10622, %r10647; + st.local.u32 [%rd3+-84], %r11684; + xor.b32 %r11683, %r10661, %r10636; + st.local.u32 [%rd3+-80], %r11683; + xor.b32 %r11682, %r10650, %r10619; + st.local.u32 [%rd3+-76], %r11682; + add.s16 %rs391, %rs391, 1; + st.local.u8 [%rd3+1], %rs391; + add.s64 %rd260, %rd260, 64; + add.s64 %rd270, %rd270, -64; + setp.gt.u64 %p49, %rd270, 64; + @%p49 bra $L__BB1_60; + bra.uni $L__BB1_61; + +$L__BB1_58: + ld.local.u64 %rd268, [%rd3+-72]; + +$L__BB1_61: + cvt.u64.u16 %rd209, %rs390; + and.b64 %rd92, %rd209, 255; + mov.u64 %rd210, 64; + sub.s64 %rd211, %rd210, %rd92; + min.u64 %rd93, %rd211, %rd270; + setp.eq.s64 %p50, %rd93, 0; + @%p50 bra $L__BB1_64; + + add.s64 %rd213, %rd2, %rd92; + add.s64 %rd94, %rd213, 72; + mov.u64 %rd271, 0; + +$L__BB1_63: + add.s64 %rd214, %rd260, %rd271; + ld.local.u8 %rs345, [%rd214]; + add.s64 %rd215, %rd94, %rd271; + st.local.u8 [%rd215], %rs345; + add.s64 %rd271, %rd271, 1; + setp.lt.u64 %p51, %rd271, %rd93; + @%p51 bra $L__BB1_63; + +$L__BB1_64: + cvt.u16.u64 %rs346, %rd93; + ld.local.u8 %rs347, [%rd3]; + add.s16 %rs348, %rs347, %rs346; + st.local.u8 [%rd3], %rs348; + ld.local.u8 %rs392, [%rd3+8]; + cvt.u64.u16 %rd216, %rs392; + and.b64 %rd217, %rd216, 255; + popc.b64 %r10665, %rd268; + cvt.u64.u32 %rd97, %r10665; + setp.ge.u64 %p52, %rd97, %rd217; + @%p52 bra $L__BB1_68; + + ld.local.u8 %r10666, [%rd3+2]; + or.b32 %r135, %r10666, 4; + ld.local.u8 %r10667, [%rd3+-120]; + ld.local.u8 %r10668, [%rd3+-119]; + prmt.b32 %r10669, %r10668, %r10667, 30212; + ld.local.u8 %r10670, [%rd3+-118]; + ld.local.u8 %r10671, [%rd3+-117]; + prmt.b32 %r10672, %r10671, %r10670, 30212; + prmt.b32 %r136, %r10672, %r10669, 4180; + ld.local.u8 %r10673, [%rd3+-136]; + ld.local.u8 %r10674, [%rd3+-135]; + prmt.b32 %r10675, %r10674, %r10673, 30212; + ld.local.u8 %r10676, [%rd3+-134]; + ld.local.u8 %r10677, [%rd3+-133]; + prmt.b32 %r10678, %r10677, %r10676, 30212; + prmt.b32 %r10679, %r10678, %r10675, 4180; + add.s32 %r137, %r136, %r10679; + ld.local.u8 %r10680, [%rd3+-116]; + ld.local.u8 %r10681, [%rd3+-115]; + prmt.b32 %r10682, %r10681, %r10680, 30212; + ld.local.u8 %r10683, [%rd3+-114]; + ld.local.u8 %r10684, [%rd3+-113]; + prmt.b32 %r10685, %r10684, %r10683, 30212; + prmt.b32 %r138, %r10685, %r10682, 4180; + 
ld.local.u8 %r10686, [%rd3+-132]; + ld.local.u8 %r10687, [%rd3+-131]; + prmt.b32 %r10688, %r10687, %r10686, 30212; + ld.local.u8 %r10689, [%rd3+-130]; + ld.local.u8 %r10690, [%rd3+-129]; + prmt.b32 %r10691, %r10690, %r10689, 30212; + prmt.b32 %r10692, %r10691, %r10688, 4180; + add.s32 %r139, %r138, %r10692; + ld.local.u8 %r10693, [%rd3+-112]; + ld.local.u8 %r10694, [%rd3+-111]; + prmt.b32 %r10695, %r10694, %r10693, 30212; + ld.local.u8 %r10696, [%rd3+-110]; + ld.local.u8 %r10697, [%rd3+-109]; + prmt.b32 %r10698, %r10697, %r10696, 30212; + prmt.b32 %r140, %r10698, %r10695, 4180; + ld.local.u8 %r10699, [%rd3+-128]; + ld.local.u8 %r10700, [%rd3+-127]; + prmt.b32 %r10701, %r10700, %r10699, 30212; + ld.local.u8 %r10702, [%rd3+-126]; + ld.local.u8 %r10703, [%rd3+-125]; + prmt.b32 %r10704, %r10703, %r10702, 30212; + prmt.b32 %r10705, %r10704, %r10701, 4180; + add.s32 %r141, %r140, %r10705; + ld.local.u8 %r10706, [%rd3+-108]; + ld.local.u8 %r10707, [%rd3+-107]; + prmt.b32 %r10708, %r10707, %r10706, 30212; + ld.local.u8 %r10709, [%rd3+-106]; + ld.local.u8 %r10710, [%rd3+-105]; + prmt.b32 %r10711, %r10710, %r10709, 30212; + prmt.b32 %r142, %r10711, %r10708, 4180; + ld.local.u8 %r10712, [%rd3+-124]; + ld.local.u8 %r10713, [%rd3+-123]; + prmt.b32 %r10714, %r10713, %r10712, 30212; + ld.local.u8 %r10715, [%rd3+-122]; + ld.local.u8 %r10716, [%rd3+-121]; + prmt.b32 %r10717, %r10716, %r10715, 30212; + prmt.b32 %r10718, %r10717, %r10714, 4180; + add.s32 %r143, %r142, %r10718; + +$L__BB1_66: + and.b16 %rs349, %rs392, 255; + mul.wide.u16 %r10719, %rs349, 32; + add.s32 %r10720, %r10719, -64; + cvt.s64.s32 %rd218, %r10720; + add.s64 %rd219, %rd2, %rd218; + ld.local.u8 %r10721, [%rd219+145]; + ld.local.u8 %r10722, [%rd219+146]; + prmt.b32 %r10723, %r10722, %r10721, 30212; + ld.local.u8 %r10724, [%rd219+147]; + prmt.b32 %r10725, %r10724, %r10723, 28756; + ld.local.u8 %r10726, [%rd219+148]; + prmt.b32 %r10727, %r10726, %r10725, 1620; + ld.local.u8 %r10728, [%rd219+149]; + ld.local.u8 %r10729, [%rd219+150]; + prmt.b32 %r10730, %r10729, %r10728, 30212; + ld.local.u8 %r10731, [%rd219+151]; + prmt.b32 %r10732, %r10731, %r10730, 28756; + ld.local.u8 %r10733, [%rd219+152]; + prmt.b32 %r10734, %r10733, %r10732, 1620; + ld.local.u8 %r10735, [%rd219+153]; + ld.local.u8 %r10736, [%rd219+154]; + prmt.b32 %r10737, %r10736, %r10735, 30212; + ld.local.u8 %r10738, [%rd219+155]; + prmt.b32 %r10739, %r10738, %r10737, 28756; + ld.local.u8 %r10740, [%rd219+156]; + prmt.b32 %r10741, %r10740, %r10739, 1620; + ld.local.u8 %r10742, [%rd219+157]; + ld.local.u8 %r10743, [%rd219+158]; + prmt.b32 %r10744, %r10743, %r10742, 30212; + ld.local.u8 %r10745, [%rd219+159]; + prmt.b32 %r10746, %r10745, %r10744, 28756; + ld.local.u8 %r10747, [%rd219+160]; + prmt.b32 %r10748, %r10747, %r10746, 1620; + ld.local.u8 %r10749, [%rd219+161]; + ld.local.u8 %r10750, [%rd219+162]; + prmt.b32 %r10751, %r10750, %r10749, 30212; + ld.local.u8 %r10752, [%rd219+163]; + prmt.b32 %r10753, %r10752, %r10751, 28756; + ld.local.u8 %r10754, [%rd219+164]; + prmt.b32 %r10755, %r10754, %r10753, 1620; + ld.local.u8 %r10756, [%rd219+165]; + ld.local.u8 %r10757, [%rd219+166]; + prmt.b32 %r10758, %r10757, %r10756, 30212; + ld.local.u8 %r10759, [%rd219+167]; + prmt.b32 %r10760, %r10759, %r10758, 28756; + ld.local.u8 %r10761, [%rd219+168]; + prmt.b32 %r10762, %r10761, %r10760, 1620; + ld.local.u8 %r10763, [%rd219+169]; + ld.local.u8 %r10764, [%rd219+170]; + prmt.b32 %r10765, %r10764, %r10763, 30212; + ld.local.u8 %r10766, [%rd219+171]; + prmt.b32 %r10767, %r10766, %r10765, 28756; 
+ ld.local.u8 %r10768, [%rd219+172]; + prmt.b32 %r10769, %r10768, %r10767, 1620; + ld.local.u8 %r10770, [%rd219+173]; + ld.local.u8 %r10771, [%rd219+174]; + prmt.b32 %r10772, %r10771, %r10770, 30212; + ld.local.u8 %r10773, [%rd219+175]; + prmt.b32 %r10774, %r10773, %r10772, 28756; + ld.local.u8 %r10775, [%rd219+176]; + prmt.b32 %r10776, %r10775, %r10774, 1620; + ld.local.u8 %r10777, [%rd219+177]; + ld.local.u8 %r10778, [%rd219+178]; + prmt.b32 %r10779, %r10778, %r10777, 30212; + ld.local.u8 %r10780, [%rd219+179]; + prmt.b32 %r10781, %r10780, %r10779, 28756; + ld.local.u8 %r10782, [%rd219+180]; + prmt.b32 %r10783, %r10782, %r10781, 1620; + ld.local.u8 %r10784, [%rd219+181]; + ld.local.u8 %r10785, [%rd219+182]; + prmt.b32 %r10786, %r10785, %r10784, 30212; + ld.local.u8 %r10787, [%rd219+183]; + prmt.b32 %r10788, %r10787, %r10786, 28756; + ld.local.u8 %r10789, [%rd219+184]; + prmt.b32 %r10790, %r10789, %r10788, 1620; + ld.local.u8 %r10791, [%rd219+185]; + ld.local.u8 %r10792, [%rd219+186]; + prmt.b32 %r10793, %r10792, %r10791, 30212; + ld.local.u8 %r10794, [%rd219+187]; + prmt.b32 %r10795, %r10794, %r10793, 28756; + ld.local.u8 %r10796, [%rd219+188]; + prmt.b32 %r10797, %r10796, %r10795, 1620; + ld.local.u8 %r10798, [%rd219+189]; + ld.local.u8 %r10799, [%rd219+190]; + prmt.b32 %r10800, %r10799, %r10798, 30212; + ld.local.u8 %r10801, [%rd219+191]; + prmt.b32 %r10802, %r10801, %r10800, 28756; + ld.local.u8 %r10803, [%rd219+192]; + prmt.b32 %r10804, %r10803, %r10802, 1620; + ld.local.u8 %r10805, [%rd219+193]; + ld.local.u8 %r10806, [%rd219+194]; + prmt.b32 %r10807, %r10806, %r10805, 30212; + ld.local.u8 %r10808, [%rd219+195]; + prmt.b32 %r10809, %r10808, %r10807, 28756; + ld.local.u8 %r10810, [%rd219+196]; + prmt.b32 %r10811, %r10810, %r10809, 1620; + ld.local.u8 %r10812, [%rd219+197]; + ld.local.u8 %r10813, [%rd219+198]; + prmt.b32 %r10814, %r10813, %r10812, 30212; + ld.local.u8 %r10815, [%rd219+199]; + prmt.b32 %r10816, %r10815, %r10814, 28756; + ld.local.u8 %r10817, [%rd219+200]; + prmt.b32 %r10818, %r10817, %r10816, 1620; + ld.local.u8 %r10819, [%rd219+201]; + ld.local.u8 %r10820, [%rd219+202]; + prmt.b32 %r10821, %r10820, %r10819, 30212; + ld.local.u8 %r10822, [%rd219+203]; + prmt.b32 %r10823, %r10822, %r10821, 28756; + ld.local.u8 %r10824, [%rd219+204]; + prmt.b32 %r10825, %r10824, %r10823, 1620; + ld.local.u8 %r10826, [%rd219+205]; + ld.local.u8 %r10827, [%rd219+206]; + prmt.b32 %r10828, %r10827, %r10826, 30212; + ld.local.u8 %r10829, [%rd219+207]; + prmt.b32 %r10830, %r10829, %r10828, 28756; + ld.local.u8 %r10831, [%rd219+208]; + prmt.b32 %r10832, %r10831, %r10830, 1620; + add.s32 %r10833, %r137, %r10727; + shf.l.wrap.b32 %r10834, %r10833, %r10833, 16; + add.s32 %r10835, %r10834, 1779033703; + xor.b32 %r10836, %r10835, %r136; + shf.l.wrap.b32 %r10837, %r10836, %r10836, 20; + add.s32 %r10838, %r10734, %r10833; + add.s32 %r10839, %r10838, %r10837; + xor.b32 %r10840, %r10839, %r10834; + shf.l.wrap.b32 %r10841, %r10840, %r10840, 24; + add.s32 %r10842, %r10841, %r10835; + xor.b32 %r10843, %r10842, %r10837; + shf.l.wrap.b32 %r10844, %r10843, %r10843, 25; + add.s32 %r10845, %r139, %r10741; + shf.l.wrap.b32 %r10846, %r10845, %r10845, 16; + add.s32 %r10847, %r10846, -1150833019; + xor.b32 %r10848, %r10847, %r138; + shf.l.wrap.b32 %r10849, %r10848, %r10848, 20; + add.s32 %r10850, %r10748, %r10845; + add.s32 %r10851, %r10850, %r10849; + xor.b32 %r10852, %r10851, %r10846; + shf.l.wrap.b32 %r10853, %r10852, %r10852, 24; + add.s32 %r10854, %r10853, %r10847; + xor.b32 %r10855, %r10854, %r10849; + 
shf.l.wrap.b32 %r10856, %r10855, %r10855, 25; + add.s32 %r10857, %r141, %r10755; + shr.u32 %r10858, %r10857, 16; + shl.b32 %r10859, %r10857, 16; + xor.b32 %r10860, %r10859, 4194304; + or.b32 %r10861, %r10860, %r10858; + add.s32 %r10862, %r10861, 1013904242; + xor.b32 %r10863, %r10862, %r140; + shf.l.wrap.b32 %r10864, %r10863, %r10863, 20; + add.s32 %r10865, %r10762, %r10857; + add.s32 %r10866, %r10865, %r10864; + xor.b32 %r10867, %r10866, %r10861; + shf.l.wrap.b32 %r10868, %r10867, %r10867, 24; + add.s32 %r10869, %r10868, %r10862; + xor.b32 %r10870, %r10869, %r10864; + shf.l.wrap.b32 %r10871, %r10870, %r10870, 25; + add.s32 %r10872, %r143, %r10769; + xor.b32 %r10873, %r10872, %r135; + shr.u32 %r10874, %r10872, 16; + shl.b32 %r10875, %r10873, 16; + or.b32 %r10876, %r10875, %r10874; + add.s32 %r10877, %r10876, -1521486534; + xor.b32 %r10878, %r10877, %r142; + shf.l.wrap.b32 %r10879, %r10878, %r10878, 20; + add.s32 %r10880, %r10776, %r10872; + add.s32 %r10881, %r10880, %r10879; + xor.b32 %r10882, %r10881, %r10876; + shf.l.wrap.b32 %r10883, %r10882, %r10882, 24; + add.s32 %r10884, %r10883, %r10877; + xor.b32 %r10885, %r10884, %r10879; + shf.l.wrap.b32 %r10886, %r10885, %r10885, 25; + add.s32 %r10887, %r10856, %r10839; + add.s32 %r10888, %r10887, %r10783; + xor.b32 %r10889, %r10883, %r10888; + shf.l.wrap.b32 %r10890, %r10889, %r10889, 16; + add.s32 %r10891, %r10890, %r10869; + xor.b32 %r10892, %r10891, %r10856; + shf.l.wrap.b32 %r10893, %r10892, %r10892, 20; + add.s32 %r10894, %r10790, %r10888; + add.s32 %r10895, %r10894, %r10893; + xor.b32 %r10896, %r10895, %r10890; + shf.l.wrap.b32 %r10897, %r10896, %r10896, 24; + add.s32 %r10898, %r10897, %r10891; + xor.b32 %r10899, %r10898, %r10893; + shf.l.wrap.b32 %r10900, %r10899, %r10899, 25; + add.s32 %r10901, %r10871, %r10851; + add.s32 %r10902, %r10901, %r10797; + xor.b32 %r10903, %r10902, %r10841; + shf.l.wrap.b32 %r10904, %r10903, %r10903, 16; + add.s32 %r10905, %r10904, %r10884; + xor.b32 %r10906, %r10905, %r10871; + shf.l.wrap.b32 %r10907, %r10906, %r10906, 20; + add.s32 %r10908, %r10804, %r10902; + add.s32 %r10909, %r10908, %r10907; + xor.b32 %r10910, %r10909, %r10904; + shf.l.wrap.b32 %r10911, %r10910, %r10910, 24; + add.s32 %r10912, %r10911, %r10905; + xor.b32 %r10913, %r10912, %r10907; + shf.l.wrap.b32 %r10914, %r10913, %r10913, 25; + add.s32 %r10915, %r10886, %r10866; + add.s32 %r10916, %r10915, %r10811; + xor.b32 %r10917, %r10916, %r10853; + shf.l.wrap.b32 %r10918, %r10917, %r10917, 16; + add.s32 %r10919, %r10918, %r10842; + xor.b32 %r10920, %r10919, %r10886; + shf.l.wrap.b32 %r10921, %r10920, %r10920, 20; + add.s32 %r10922, %r10818, %r10916; + add.s32 %r10923, %r10922, %r10921; + xor.b32 %r10924, %r10923, %r10918; + shf.l.wrap.b32 %r10925, %r10924, %r10924, 24; + add.s32 %r10926, %r10925, %r10919; + xor.b32 %r10927, %r10926, %r10921; + shf.l.wrap.b32 %r10928, %r10927, %r10927, 25; + add.s32 %r10929, %r10881, %r10844; + add.s32 %r10930, %r10929, %r10825; + xor.b32 %r10931, %r10930, %r10868; + shf.l.wrap.b32 %r10932, %r10931, %r10931, 16; + add.s32 %r10933, %r10932, %r10854; + xor.b32 %r10934, %r10933, %r10844; + shf.l.wrap.b32 %r10935, %r10934, %r10934, 20; + add.s32 %r10936, %r10832, %r10930; + add.s32 %r10937, %r10936, %r10935; + xor.b32 %r10938, %r10937, %r10932; + shf.l.wrap.b32 %r10939, %r10938, %r10938, 24; + add.s32 %r10940, %r10939, %r10933; + xor.b32 %r10941, %r10940, %r10935; + shf.l.wrap.b32 %r10942, %r10941, %r10941, 25; + add.s32 %r10943, %r10895, %r10741; + add.s32 %r10944, %r10943, %r10942; + xor.b32 %r10945, %r10944, 
%r10911; + shf.l.wrap.b32 %r10946, %r10945, %r10945, 16; + add.s32 %r10947, %r10946, %r10926; + xor.b32 %r10948, %r10947, %r10942; + shf.l.wrap.b32 %r10949, %r10948, %r10948, 20; + add.s32 %r10950, %r10944, %r10769; + add.s32 %r10951, %r10950, %r10949; + xor.b32 %r10952, %r10951, %r10946; + shf.l.wrap.b32 %r10953, %r10952, %r10952, 24; + add.s32 %r10954, %r10953, %r10947; + xor.b32 %r10955, %r10954, %r10949; + shf.l.wrap.b32 %r10956, %r10955, %r10955, 25; + add.s32 %r10957, %r10909, %r10748; + add.s32 %r10958, %r10957, %r10900; + xor.b32 %r10959, %r10925, %r10958; + shf.l.wrap.b32 %r10960, %r10959, %r10959, 16; + add.s32 %r10961, %r10940, %r10960; + xor.b32 %r10962, %r10961, %r10900; + shf.l.wrap.b32 %r10963, %r10962, %r10962, 20; + add.s32 %r10964, %r10958, %r10797; + add.s32 %r10965, %r10964, %r10963; + xor.b32 %r10966, %r10965, %r10960; + shf.l.wrap.b32 %r10967, %r10966, %r10966, 24; + add.s32 %r10968, %r10967, %r10961; + xor.b32 %r10969, %r10968, %r10963; + shf.l.wrap.b32 %r10970, %r10969, %r10969, 25; + add.s32 %r10971, %r10914, %r10776; + add.s32 %r10972, %r10971, %r10923; + xor.b32 %r10973, %r10939, %r10972; + shf.l.wrap.b32 %r10974, %r10973, %r10973, 16; + add.s32 %r10975, %r10974, %r10898; + xor.b32 %r10976, %r10975, %r10914; + shf.l.wrap.b32 %r10977, %r10976, %r10976, 20; + add.s32 %r10978, %r10972, %r10727; + add.s32 %r10979, %r10978, %r10977; + xor.b32 %r10980, %r10979, %r10974; + shf.l.wrap.b32 %r10981, %r10980, %r10980, 24; + add.s32 %r10982, %r10981, %r10975; + xor.b32 %r10983, %r10982, %r10977; + shf.l.wrap.b32 %r10984, %r10983, %r10983, 25; + add.s32 %r10985, %r10928, %r10755; + add.s32 %r10986, %r10985, %r10937; + xor.b32 %r10987, %r10986, %r10897; + shf.l.wrap.b32 %r10988, %r10987, %r10987, 16; + add.s32 %r10989, %r10988, %r10912; + xor.b32 %r10990, %r10989, %r10928; + shf.l.wrap.b32 %r10991, %r10990, %r10990, 20; + add.s32 %r10992, %r10986, %r10818; + add.s32 %r10993, %r10992, %r10991; + xor.b32 %r10994, %r10993, %r10988; + shf.l.wrap.b32 %r10995, %r10994, %r10994, 24; + add.s32 %r10996, %r10995, %r10989; + xor.b32 %r10997, %r10996, %r10991; + shf.l.wrap.b32 %r10998, %r10997, %r10997, 25; + add.s32 %r10999, %r10970, %r10734; + add.s32 %r11000, %r10999, %r10951; + xor.b32 %r11001, %r11000, %r10995; + shf.l.wrap.b32 %r11002, %r11001, %r11001, 16; + add.s32 %r11003, %r11002, %r10982; + xor.b32 %r11004, %r11003, %r10970; + shf.l.wrap.b32 %r11005, %r11004, %r11004, 20; + add.s32 %r11006, %r11000, %r10804; + add.s32 %r11007, %r11006, %r11005; + xor.b32 %r11008, %r11007, %r11002; + shf.l.wrap.b32 %r11009, %r11008, %r11008, 24; + add.s32 %r11010, %r11009, %r11003; + xor.b32 %r11011, %r11010, %r11005; + shf.l.wrap.b32 %r11012, %r11011, %r11011, 25; + add.s32 %r11013, %r10965, %r10811; + add.s32 %r11014, %r11013, %r10984; + xor.b32 %r11015, %r10953, %r11014; + shf.l.wrap.b32 %r11016, %r11015, %r11015, 16; + add.s32 %r11017, %r11016, %r10996; + xor.b32 %r11018, %r11017, %r10984; + shf.l.wrap.b32 %r11019, %r11018, %r11018, 20; + add.s32 %r11020, %r11014, %r10762; + add.s32 %r11021, %r11020, %r11019; + xor.b32 %r11022, %r11021, %r11016; + shf.l.wrap.b32 %r11023, %r11022, %r11022, 24; + add.s32 %r11024, %r11023, %r11017; + xor.b32 %r11025, %r11024, %r11019; + shf.l.wrap.b32 %r11026, %r11025, %r11025, 25; + add.s32 %r11027, %r10979, %r10790; + add.s32 %r11028, %r11027, %r10998; + xor.b32 %r11029, %r11028, %r10967; + shf.l.wrap.b32 %r11030, %r11029, %r11029, 16; + add.s32 %r11031, %r11030, %r10954; + xor.b32 %r11032, %r11031, %r10998; + shf.l.wrap.b32 %r11033, %r11032, %r11032, 20; + 
add.s32 %r11034, %r11028, %r10825; + add.s32 %r11035, %r11034, %r11033; + xor.b32 %r11036, %r11035, %r11030; + shf.l.wrap.b32 %r11037, %r11036, %r11036, 24; + add.s32 %r11038, %r11037, %r11031; + xor.b32 %r11039, %r11038, %r11033; + shf.l.wrap.b32 %r11040, %r11039, %r11039, 25; + add.s32 %r11041, %r10993, %r10832; + add.s32 %r11042, %r11041, %r10956; + xor.b32 %r11043, %r11042, %r10981; + shf.l.wrap.b32 %r11044, %r11043, %r11043, 16; + add.s32 %r11045, %r11044, %r10968; + xor.b32 %r11046, %r11045, %r10956; + shf.l.wrap.b32 %r11047, %r11046, %r11046, 20; + add.s32 %r11048, %r11042, %r10783; + add.s32 %r11049, %r11048, %r11047; + xor.b32 %r11050, %r11049, %r11044; + shf.l.wrap.b32 %r11051, %r11050, %r11050, 24; + add.s32 %r11052, %r11051, %r11045; + xor.b32 %r11053, %r11052, %r11047; + shf.l.wrap.b32 %r11054, %r11053, %r11053, 25; + add.s32 %r11055, %r11007, %r10748; + add.s32 %r11056, %r11055, %r11054; + xor.b32 %r11057, %r11056, %r11023; + shf.l.wrap.b32 %r11058, %r11057, %r11057, 16; + add.s32 %r11059, %r11058, %r11038; + xor.b32 %r11060, %r11059, %r11054; + shf.l.wrap.b32 %r11061, %r11060, %r11060, 20; + add.s32 %r11062, %r11056, %r10755; + add.s32 %r11063, %r11062, %r11061; + xor.b32 %r11064, %r11063, %r11058; + shf.l.wrap.b32 %r11065, %r11064, %r11064, 24; + add.s32 %r11066, %r11065, %r11059; + xor.b32 %r11067, %r11066, %r11061; + shf.l.wrap.b32 %r11068, %r11067, %r11067, 25; + add.s32 %r11069, %r11021, %r10797; + add.s32 %r11070, %r11069, %r11012; + xor.b32 %r11071, %r11070, %r11037; + shf.l.wrap.b32 %r11072, %r11071, %r11071, 16; + add.s32 %r11073, %r11072, %r11052; + xor.b32 %r11074, %r11073, %r11012; + shf.l.wrap.b32 %r11075, %r11074, %r11074, 20; + add.s32 %r11076, %r11070, %r10811; + add.s32 %r11077, %r11076, %r11075; + xor.b32 %r11078, %r11077, %r11072; + shf.l.wrap.b32 %r11079, %r11078, %r11078, 24; + add.s32 %r11080, %r11079, %r11073; + xor.b32 %r11081, %r11080, %r11075; + shf.l.wrap.b32 %r11082, %r11081, %r11081, 25; + add.s32 %r11083, %r11035, %r10818; + add.s32 %r11084, %r11083, %r11026; + xor.b32 %r11085, %r11051, %r11084; + shf.l.wrap.b32 %r11086, %r11085, %r11085, 16; + add.s32 %r11087, %r11086, %r11010; + xor.b32 %r11088, %r11087, %r11026; + shf.l.wrap.b32 %r11089, %r11088, %r11088, 20; + add.s32 %r11090, %r11084, %r10741; + add.s32 %r11091, %r11090, %r11089; + xor.b32 %r11092, %r11091, %r11086; + shf.l.wrap.b32 %r11093, %r11092, %r11092, 24; + add.s32 %r11094, %r11093, %r11087; + xor.b32 %r11095, %r11094, %r11089; + shf.l.wrap.b32 %r11096, %r11095, %r11095, 25; + add.s32 %r11097, %r11040, %r10776; + add.s32 %r11098, %r11097, %r11049; + xor.b32 %r11099, %r11098, %r11009; + shf.l.wrap.b32 %r11100, %r11099, %r11099, 16; + add.s32 %r11101, %r11100, %r11024; + xor.b32 %r11102, %r11101, %r11040; + shf.l.wrap.b32 %r11103, %r11102, %r11102, 20; + add.s32 %r11104, %r11098, %r10825; + add.s32 %r11105, %r11104, %r11103; + xor.b32 %r11106, %r11105, %r11100; + shf.l.wrap.b32 %r11107, %r11106, %r11106, 24; + add.s32 %r11108, %r11107, %r11101; + xor.b32 %r11109, %r11108, %r11103; + shf.l.wrap.b32 %r11110, %r11109, %r11109, 25; + add.s32 %r11111, %r11082, %r10769; + add.s32 %r11112, %r11111, %r11063; + xor.b32 %r11113, %r11112, %r11107; + shf.l.wrap.b32 %r11114, %r11113, %r11113, 16; + add.s32 %r11115, %r11114, %r11094; + xor.b32 %r11116, %r11115, %r11082; + shf.l.wrap.b32 %r11117, %r11116, %r11116, 20; + add.s32 %r11118, %r11112, %r10762; + add.s32 %r11119, %r11118, %r11117; + xor.b32 %r11120, %r11119, %r11114; + shf.l.wrap.b32 %r11121, %r11120, %r11120, 24; + add.s32 %r11122, 
%r11121, %r11115; + xor.b32 %r11123, %r11122, %r11117; + shf.l.wrap.b32 %r11124, %r11123, %r11123, 25; + add.s32 %r11125, %r11077, %r10790; + add.s32 %r11126, %r11125, %r11096; + xor.b32 %r11127, %r11065, %r11126; + shf.l.wrap.b32 %r11128, %r11127, %r11127, 16; + add.s32 %r11129, %r11128, %r11108; + xor.b32 %r11130, %r11129, %r11096; + shf.l.wrap.b32 %r11131, %r11130, %r11130, 20; + add.s32 %r11132, %r11126, %r10727; + add.s32 %r11133, %r11132, %r11131; + xor.b32 %r11134, %r11133, %r11128; + shf.l.wrap.b32 %r11135, %r11134, %r11134, 24; + add.s32 %r11136, %r11135, %r11129; + xor.b32 %r11137, %r11136, %r11131; + shf.l.wrap.b32 %r11138, %r11137, %r11137, 25; + add.s32 %r11139, %r11091, %r10804; + add.s32 %r11140, %r11139, %r11110; + xor.b32 %r11141, %r11140, %r11079; + shf.l.wrap.b32 %r11142, %r11141, %r11141, 16; + add.s32 %r11143, %r11142, %r11066; + xor.b32 %r11144, %r11143, %r11110; + shf.l.wrap.b32 %r11145, %r11144, %r11144, 20; + add.s32 %r11146, %r11140, %r10832; + add.s32 %r11147, %r11146, %r11145; + xor.b32 %r11148, %r11147, %r11142; + shf.l.wrap.b32 %r11149, %r11148, %r11148, 24; + add.s32 %r11150, %r11149, %r11143; + xor.b32 %r11151, %r11150, %r11145; + shf.l.wrap.b32 %r11152, %r11151, %r11151, 25; + add.s32 %r11153, %r11105, %r10783; + add.s32 %r11154, %r11153, %r11068; + xor.b32 %r11155, %r11154, %r11093; + shf.l.wrap.b32 %r11156, %r11155, %r11155, 16; + add.s32 %r11157, %r11156, %r11080; + xor.b32 %r11158, %r11157, %r11068; + shf.l.wrap.b32 %r11159, %r11158, %r11158, 20; + add.s32 %r11160, %r11154, %r10734; + add.s32 %r11161, %r11160, %r11159; + xor.b32 %r11162, %r11161, %r11156; + shf.l.wrap.b32 %r11163, %r11162, %r11162, 24; + add.s32 %r11164, %r11163, %r11157; + xor.b32 %r11165, %r11164, %r11159; + shf.l.wrap.b32 %r11166, %r11165, %r11165, 25; + add.s32 %r11167, %r11119, %r10797; + add.s32 %r11168, %r11167, %r11166; + xor.b32 %r11169, %r11168, %r11135; + shf.l.wrap.b32 %r11170, %r11169, %r11169, 16; + add.s32 %r11171, %r11170, %r11150; + xor.b32 %r11172, %r11171, %r11166; + shf.l.wrap.b32 %r11173, %r11172, %r11172, 20; + add.s32 %r11174, %r11168, %r10776; + add.s32 %r11175, %r11174, %r11173; + xor.b32 %r11176, %r11175, %r11170; + shf.l.wrap.b32 %r11177, %r11176, %r11176, 24; + add.s32 %r11178, %r11177, %r11171; + xor.b32 %r11179, %r11178, %r11173; + shf.l.wrap.b32 %r11180, %r11179, %r11179, 25; + add.s32 %r11181, %r11133, %r10811; + add.s32 %r11182, %r11181, %r11124; + xor.b32 %r11183, %r11182, %r11149; + shf.l.wrap.b32 %r11184, %r11183, %r11183, 16; + add.s32 %r11185, %r11184, %r11164; + xor.b32 %r11186, %r11185, %r11124; + shf.l.wrap.b32 %r11187, %r11186, %r11186, 20; + add.s32 %r11188, %r11182, %r10790; + add.s32 %r11189, %r11188, %r11187; + xor.b32 %r11190, %r11189, %r11184; + shf.l.wrap.b32 %r11191, %r11190, %r11190, 24; + add.s32 %r11192, %r11191, %r11185; + xor.b32 %r11193, %r11192, %r11187; + shf.l.wrap.b32 %r11194, %r11193, %r11193, 25; + add.s32 %r11195, %r11147, %r10825; + add.s32 %r11196, %r11195, %r11138; + xor.b32 %r11197, %r11163, %r11196; + shf.l.wrap.b32 %r11198, %r11197, %r11197, 16; + add.s32 %r11199, %r11198, %r11122; + xor.b32 %r11200, %r11199, %r11138; + shf.l.wrap.b32 %r11201, %r11200, %r11200, 20; + add.s32 %r11202, %r11196, %r10748; + add.s32 %r11203, %r11202, %r11201; + xor.b32 %r11204, %r11203, %r11198; + shf.l.wrap.b32 %r11205, %r11204, %r11204, 24; + add.s32 %r11206, %r11205, %r11199; + xor.b32 %r11207, %r11206, %r11201; + shf.l.wrap.b32 %r11208, %r11207, %r11207, 25; + add.s32 %r11209, %r11152, %r10818; + add.s32 %r11210, %r11209, %r11161; + 
xor.b32 %r11211, %r11210, %r11121; + shf.l.wrap.b32 %r11212, %r11211, %r11211, 16; + add.s32 %r11213, %r11212, %r11136; + xor.b32 %r11214, %r11213, %r11152; + shf.l.wrap.b32 %r11215, %r11214, %r11214, 20; + add.s32 %r11216, %r11210, %r10832; + add.s32 %r11217, %r11216, %r11215; + xor.b32 %r11218, %r11217, %r11212; + shf.l.wrap.b32 %r11219, %r11218, %r11218, 24; + add.s32 %r11220, %r11219, %r11213; + xor.b32 %r11221, %r11220, %r11215; + shf.l.wrap.b32 %r11222, %r11221, %r11221, 25; + add.s32 %r11223, %r11194, %r10755; + add.s32 %r11224, %r11223, %r11175; + xor.b32 %r11225, %r11224, %r11219; + shf.l.wrap.b32 %r11226, %r11225, %r11225, 16; + add.s32 %r11227, %r11226, %r11206; + xor.b32 %r11228, %r11227, %r11194; + shf.l.wrap.b32 %r11229, %r11228, %r11228, 20; + add.s32 %r11230, %r11224, %r10727; + add.s32 %r11231, %r11230, %r11229; + xor.b32 %r11232, %r11231, %r11226; + shf.l.wrap.b32 %r11233, %r11232, %r11232, 24; + add.s32 %r11234, %r11233, %r11227; + xor.b32 %r11235, %r11234, %r11229; + shf.l.wrap.b32 %r11236, %r11235, %r11235, 25; + add.s32 %r11237, %r11189, %r10804; + add.s32 %r11238, %r11237, %r11208; + xor.b32 %r11239, %r11177, %r11238; + shf.l.wrap.b32 %r11240, %r11239, %r11239, 16; + add.s32 %r11241, %r11240, %r11220; + xor.b32 %r11242, %r11241, %r11208; + shf.l.wrap.b32 %r11243, %r11242, %r11242, 20; + add.s32 %r11244, %r11238, %r10741; + add.s32 %r11245, %r11244, %r11243; + xor.b32 %r11246, %r11245, %r11240; + shf.l.wrap.b32 %r11247, %r11246, %r11246, 24; + add.s32 %r11248, %r11247, %r11241; + xor.b32 %r11249, %r11248, %r11243; + shf.l.wrap.b32 %r11250, %r11249, %r11249, 25; + add.s32 %r11251, %r11203, %r10762; + add.s32 %r11252, %r11251, %r11222; + xor.b32 %r11253, %r11252, %r11191; + shf.l.wrap.b32 %r11254, %r11253, %r11253, 16; + add.s32 %r11255, %r11254, %r11178; + xor.b32 %r11256, %r11255, %r11222; + shf.l.wrap.b32 %r11257, %r11256, %r11256, 20; + add.s32 %r11258, %r11252, %r10783; + add.s32 %r11259, %r11258, %r11257; + xor.b32 %r11260, %r11259, %r11254; + shf.l.wrap.b32 %r11261, %r11260, %r11260, 24; + add.s32 %r11262, %r11261, %r11255; + xor.b32 %r11263, %r11262, %r11257; + shf.l.wrap.b32 %r11264, %r11263, %r11263, 25; + add.s32 %r11265, %r11217, %r10734; + add.s32 %r11266, %r11265, %r11180; + xor.b32 %r11267, %r11266, %r11205; + shf.l.wrap.b32 %r11268, %r11267, %r11267, 16; + add.s32 %r11269, %r11268, %r11192; + xor.b32 %r11270, %r11269, %r11180; + shf.l.wrap.b32 %r11271, %r11270, %r11270, 20; + add.s32 %r11272, %r11266, %r10769; + add.s32 %r11273, %r11272, %r11271; + xor.b32 %r11274, %r11273, %r11268; + shf.l.wrap.b32 %r11275, %r11274, %r11274, 24; + add.s32 %r11276, %r11275, %r11269; + xor.b32 %r11277, %r11276, %r11271; + shf.l.wrap.b32 %r11278, %r11277, %r11277, 25; + add.s32 %r11279, %r11231, %r10811; + add.s32 %r11280, %r11279, %r11278; + xor.b32 %r11281, %r11280, %r11247; + shf.l.wrap.b32 %r11282, %r11281, %r11281, 16; + add.s32 %r11283, %r11282, %r11262; + xor.b32 %r11284, %r11283, %r11278; + shf.l.wrap.b32 %r11285, %r11284, %r11284, 20; + add.s32 %r11286, %r11280, %r10818; + add.s32 %r11287, %r11286, %r11285; + xor.b32 %r11288, %r11287, %r11282; + shf.l.wrap.b32 %r11289, %r11288, %r11288, 24; + add.s32 %r11290, %r11289, %r11283; + xor.b32 %r11291, %r11290, %r11285; + shf.l.wrap.b32 %r11292, %r11291, %r11291, 25; + add.s32 %r11293, %r11245, %r10790; + add.s32 %r11294, %r11293, %r11236; + xor.b32 %r11295, %r11294, %r11261; + shf.l.wrap.b32 %r11296, %r11295, %r11295, 16; + add.s32 %r11297, %r11296, %r11276; + xor.b32 %r11298, %r11297, %r11236; + shf.l.wrap.b32 %r11299, 
%r11298, %r11298, 20; + add.s32 %r11300, %r11294, %r10804; + add.s32 %r11301, %r11300, %r11299; + xor.b32 %r11302, %r11301, %r11296; + shf.l.wrap.b32 %r11303, %r11302, %r11302, 24; + add.s32 %r11304, %r11303, %r11297; + xor.b32 %r11305, %r11304, %r11299; + shf.l.wrap.b32 %r11306, %r11305, %r11305, 25; + add.s32 %r11307, %r11259, %r10832; + add.s32 %r11308, %r11307, %r11250; + xor.b32 %r11309, %r11275, %r11308; + shf.l.wrap.b32 %r11310, %r11309, %r11309, 16; + add.s32 %r11311, %r11310, %r11234; + xor.b32 %r11312, %r11311, %r11250; + shf.l.wrap.b32 %r11313, %r11312, %r11312, 20; + add.s32 %r11314, %r11308, %r10797; + add.s32 %r11315, %r11314, %r11313; + xor.b32 %r11316, %r11315, %r11310; + shf.l.wrap.b32 %r11317, %r11316, %r11316, 24; + add.s32 %r11318, %r11317, %r11311; + xor.b32 %r11319, %r11318, %r11313; + shf.l.wrap.b32 %r11320, %r11319, %r11319, 25; + add.s32 %r11321, %r11264, %r10825; + add.s32 %r11322, %r11321, %r11273; + xor.b32 %r11323, %r11322, %r11233; + shf.l.wrap.b32 %r11324, %r11323, %r11323, 16; + add.s32 %r11325, %r11324, %r11248; + xor.b32 %r11326, %r11325, %r11264; + shf.l.wrap.b32 %r11327, %r11326, %r11326, 20; + add.s32 %r11328, %r11322, %r10783; + add.s32 %r11329, %r11328, %r11327; + xor.b32 %r11330, %r11329, %r11324; + shf.l.wrap.b32 %r11331, %r11330, %r11330, 24; + add.s32 %r11332, %r11331, %r11325; + xor.b32 %r11333, %r11332, %r11327; + shf.l.wrap.b32 %r11334, %r11333, %r11333, 25; + add.s32 %r11335, %r11306, %r10776; + add.s32 %r11336, %r11335, %r11287; + xor.b32 %r11337, %r11336, %r11331; + shf.l.wrap.b32 %r11338, %r11337, %r11337, 16; + add.s32 %r11339, %r11338, %r11318; + xor.b32 %r11340, %r11339, %r11306; + shf.l.wrap.b32 %r11341, %r11340, %r11340, 20; + add.s32 %r11342, %r11336, %r10741; + add.s32 %r11343, %r11342, %r11341; + xor.b32 %r11344, %r11343, %r11338; + shf.l.wrap.b32 %r11345, %r11344, %r11344, 24; + add.s32 %r11346, %r11345, %r11339; + xor.b32 %r11347, %r11346, %r11341; + shf.l.wrap.b32 %r11348, %r11347, %r11347, 25; + add.s32 %r11349, %r11301, %r10762; + add.s32 %r11350, %r11349, %r11320; + xor.b32 %r11351, %r11289, %r11350; + shf.l.wrap.b32 %r11352, %r11351, %r11351, 16; + add.s32 %r11353, %r11352, %r11332; + xor.b32 %r11354, %r11353, %r11320; + shf.l.wrap.b32 %r11355, %r11354, %r11354, 20; + add.s32 %r11356, %r11350, %r10748; + add.s32 %r11357, %r11356, %r11355; + xor.b32 %r11358, %r11357, %r11352; + shf.l.wrap.b32 %r11359, %r11358, %r11358, 24; + add.s32 %r11360, %r11359, %r11353; + xor.b32 %r11361, %r11360, %r11355; + shf.l.wrap.b32 %r11362, %r11361, %r11361, 25; + add.s32 %r11363, %r11315, %r10727; + add.s32 %r11364, %r11363, %r11334; + xor.b32 %r11365, %r11364, %r11303; + shf.l.wrap.b32 %r11366, %r11365, %r11365, 16; + add.s32 %r11367, %r11366, %r11290; + xor.b32 %r11368, %r11367, %r11334; + shf.l.wrap.b32 %r11369, %r11368, %r11368, 20; + add.s32 %r11370, %r11364, %r10734; + add.s32 %r11371, %r11370, %r11369; + xor.b32 %r11372, %r11371, %r11366; + shf.l.wrap.b32 %r11373, %r11372, %r11372, 24; + add.s32 %r11374, %r11373, %r11367; + xor.b32 %r11375, %r11374, %r11369; + shf.l.wrap.b32 %r11376, %r11375, %r11375, 25; + add.s32 %r11377, %r11329, %r10769; + add.s32 %r11378, %r11377, %r11292; + xor.b32 %r11379, %r11378, %r11317; + shf.l.wrap.b32 %r11380, %r11379, %r11379, 16; + add.s32 %r11381, %r11380, %r11304; + xor.b32 %r11382, %r11381, %r11292; + shf.l.wrap.b32 %r11383, %r11382, %r11382, 20; + add.s32 %r11384, %r11378, %r10755; + add.s32 %r11385, %r11384, %r11383; + xor.b32 %r11386, %r11385, %r11380; + shf.l.wrap.b32 %r11387, %r11386, %r11386, 24; + 
add.s32 %r11388, %r11387, %r11381; + xor.b32 %r11389, %r11388, %r11383; + shf.l.wrap.b32 %r11390, %r11389, %r11389, 25; + add.s32 %r11391, %r11343, %r10790; + add.s32 %r11392, %r11391, %r11390; + xor.b32 %r11393, %r11392, %r11359; + shf.l.wrap.b32 %r11394, %r11393, %r11393, 16; + add.s32 %r11395, %r11394, %r11374; + xor.b32 %r11396, %r11395, %r11390; + shf.l.wrap.b32 %r11397, %r11396, %r11396, 20; + add.s32 %r11398, %r11392, %r10825; + add.s32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r11399, %r11394; + shf.l.wrap.b32 %r11401, %r11400, %r11400, 24; + add.s32 %r11402, %r11401, %r11395; + xor.b32 %r11403, %r11402, %r11397; + shf.l.wrap.b32 %r11404, %r11403, %r11403, 25; + add.s32 %r11405, %r11357, %r10804; + add.s32 %r11406, %r11405, %r11348; + xor.b32 %r11407, %r11406, %r11373; + shf.l.wrap.b32 %r11408, %r11407, %r11407, 16; + add.s32 %r11409, %r11408, %r11388; + xor.b32 %r11410, %r11409, %r11348; + shf.l.wrap.b32 %r11411, %r11410, %r11410, 20; + add.s32 %r11412, %r11406, %r10762; + add.s32 %r11413, %r11412, %r11411; + xor.b32 %r11414, %r11413, %r11408; + shf.l.wrap.b32 %r11415, %r11414, %r11414, 24; + add.s32 %r11416, %r11415, %r11409; + xor.b32 %r11417, %r11416, %r11411; + shf.l.wrap.b32 %r11418, %r11417, %r11417, 25; + add.s32 %r11419, %r11371, %r10783; + add.s32 %r11420, %r11419, %r11362; + xor.b32 %r11421, %r11387, %r11420; + shf.l.wrap.b32 %r11422, %r11421, %r11421, 16; + add.s32 %r11423, %r11422, %r11346; + xor.b32 %r11424, %r11423, %r11362; + shf.l.wrap.b32 %r11425, %r11424, %r11424, 20; + add.s32 %r11426, %r11420, %r10811; + add.s32 %r11427, %r11426, %r11425; + xor.b32 %r11428, %r11427, %r11422; + shf.l.wrap.b32 %r11429, %r11428, %r11428, 24; + add.s32 %r11430, %r11429, %r11423; + xor.b32 %r11431, %r11430, %r11425; + shf.l.wrap.b32 %r11432, %r11431, %r11431, 25; + add.s32 %r11433, %r11376, %r10832; + add.s32 %r11434, %r11433, %r11385; + xor.b32 %r11435, %r11434, %r11345; + shf.l.wrap.b32 %r11436, %r11435, %r11435, 16; + add.s32 %r11437, %r11436, %r11360; + xor.b32 %r11438, %r11437, %r11376; + shf.l.wrap.b32 %r11439, %r11438, %r11438, 20; + add.s32 %r11440, %r11434, %r10734; + add.s32 %r11441, %r11440, %r11439; + xor.b32 %r11442, %r11441, %r11436; + shf.l.wrap.b32 %r11443, %r11442, %r11442, 24; + add.s32 %r11444, %r11443, %r11437; + xor.b32 %r11445, %r11444, %r11439; + shf.l.wrap.b32 %r11446, %r11445, %r11445, 25; + add.s32 %r11447, %r11418, %r10818; + add.s32 %r11448, %r11447, %r11399; + xor.b32 %r11449, %r11448, %r11443; + shf.l.wrap.b32 %r11450, %r11449, %r11449, 16; + add.s32 %r11451, %r11450, %r11430; + xor.b32 %r11452, %r11451, %r11418; + shf.l.wrap.b32 %r11453, %r11452, %r11452, 20; + add.s32 %r11454, %r11448, %r10748; + add.s32 %r11455, %r11454, %r11453; + xor.b32 %r11456, %r11455, %r11450; + shf.l.wrap.b32 %r11457, %r11456, %r11456, 24; + add.s32 %r11458, %r11457, %r11451; + xor.b32 %r11459, %r11458, %r11453; + shf.l.wrap.b32 %r11460, %r11459, %r11459, 25; + add.s32 %r11461, %r11413, %r10727; + add.s32 %r11462, %r11461, %r11432; + xor.b32 %r11463, %r11401, %r11462; + shf.l.wrap.b32 %r11464, %r11463, %r11463, 16; + add.s32 %r11465, %r11464, %r11444; + xor.b32 %r11466, %r11465, %r11432; + shf.l.wrap.b32 %r11467, %r11466, %r11466, 20; + add.s32 %r11468, %r11462, %r10797; + add.s32 %r11469, %r11468, %r11467; + xor.b32 %r11470, %r11469, %r11464; + shf.l.wrap.b32 %r11471, %r11470, %r11470, 24; + add.s32 %r11472, %r11471, %r11465; + xor.b32 %r11473, %r11472, %r11467; + shf.l.wrap.b32 %r11474, %r11473, %r11473, 25; + add.s32 %r11475, %r11427, %r10741; + add.s32 %r11476, 
%r11475, %r11446; + xor.b32 %r11477, %r11476, %r11415; + shf.l.wrap.b32 %r11478, %r11477, %r11477, 16; + add.s32 %r11479, %r11478, %r11402; + xor.b32 %r11480, %r11479, %r11446; + shf.l.wrap.b32 %r11481, %r11480, %r11480, 20; + add.s32 %r11482, %r11476, %r10769; + add.s32 %r11483, %r11482, %r11481; + xor.b32 %r11484, %r11483, %r11478; + shf.l.wrap.b32 %r11485, %r11484, %r11484, 24; + add.s32 %r11486, %r11485, %r11479; + xor.b32 %r11487, %r11486, %r11481; + shf.l.wrap.b32 %r11488, %r11487, %r11487, 25; + add.s32 %r11489, %r11441, %r10755; + add.s32 %r11490, %r11489, %r11404; + xor.b32 %r11491, %r11490, %r11429; + shf.l.wrap.b32 %r11492, %r11491, %r11491, 16; + add.s32 %r11493, %r11492, %r11416; + xor.b32 %r11494, %r11493, %r11404; + shf.l.wrap.b32 %r11495, %r11494, %r11494, 20; + add.s32 %r11496, %r11490, %r10776; + add.s32 %r11497, %r11496, %r11495; + xor.b32 %r11498, %r11497, %r11492; + shf.l.wrap.b32 %r11499, %r11498, %r11498, 24; + add.s32 %r11500, %r11499, %r11493; + xor.b32 %r11501, %r11500, %r11495; + shf.l.wrap.b32 %r11502, %r11501, %r11501, 25; + add.s32 %r11503, %r11455, %r10804; + add.s32 %r11504, %r11503, %r11502; + xor.b32 %r11505, %r11504, %r11471; + shf.l.wrap.b32 %r11506, %r11505, %r11505, 16; + add.s32 %r11507, %r11506, %r11486; + xor.b32 %r11508, %r11507, %r11502; + shf.l.wrap.b32 %r11509, %r11508, %r11508, 20; + add.s32 %r11510, %r11504, %r10832; + add.s32 %r11511, %r11510, %r11509; + xor.b32 %r11512, %r11511, %r11506; + shf.l.wrap.b32 %r11513, %r11512, %r11512, 24; + add.s32 %r11514, %r11513, %r11507; + xor.b32 %r11515, %r11514, %r11509; + shf.l.wrap.b32 %r11516, %r11515, %r11515, 25; + add.s32 %r11517, %r11469, %r10762; + add.s32 %r11518, %r11517, %r11460; + xor.b32 %r11519, %r11518, %r11485; + shf.l.wrap.b32 %r11520, %r11519, %r11519, 16; + add.s32 %r11521, %r11520, %r11500; + xor.b32 %r11522, %r11521, %r11460; + shf.l.wrap.b32 %r11523, %r11522, %r11522, 20; + add.s32 %r11524, %r11518, %r10727; + add.s32 %r11525, %r11524, %r11523; + xor.b32 %r11526, %r11525, %r11520; + shf.l.wrap.b32 %r11527, %r11526, %r11526, 24; + add.s32 %r11528, %r11527, %r11521; + xor.b32 %r11529, %r11528, %r11523; + shf.l.wrap.b32 %r11530, %r11529, %r11529, 25; + add.s32 %r11531, %r11483, %r10734; + add.s32 %r11532, %r11531, %r11474; + xor.b32 %r11533, %r11499, %r11532; + shf.l.wrap.b32 %r11534, %r11533, %r11533, 16; + add.s32 %r11535, %r11534, %r11458; + xor.b32 %r11536, %r11535, %r11474; + shf.l.wrap.b32 %r11537, %r11536, %r11536, 20; + add.s32 %r11538, %r11532, %r10790; + add.s32 %r11539, %r11538, %r11537; + xor.b32 %r11540, %r11539, %r11534; + shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, %r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + 
shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + xor.b32 %r11620, %r11572, %r11597; + xor.b32 %r11621, %r11611, %r11586; + xor.b32 %r11622, %r11600, %r11569; + st.local.u8 [%rd219+145], %r11615; + shr.u32 %r11623, %r11615, 8; + st.local.u8 [%rd219+146], %r11623; + shr.u32 %r11624, %r11615, 16; + st.local.u8 [%rd219+147], %r11624; + shr.u32 %r11625, %r11615, 24; + st.local.u8 [%rd219+148], %r11625; + st.local.u8 [%rd219+149], %r11616; + shr.u32 %r11626, %r11616, 8; + st.local.u8 [%rd219+150], %r11626; + shr.u32 %r11627, %r11616, 16; + st.local.u8 [%rd219+151], %r11627; + shr.u32 %r11628, %r11616, 24; + st.local.u8 [%rd219+152], %r11628; + st.local.u8 [%rd219+153], %r11617; + shr.u32 %r11629, %r11617, 8; + st.local.u8 [%rd219+154], %r11629; + shr.u32 %r11630, %r11617, 16; + st.local.u8 [%rd219+155], %r11630; + shr.u32 %r11631, %r11617, 24; + st.local.u8 [%rd219+156], %r11631; + st.local.u8 [%rd219+157], %r11618; + shr.u32 %r11632, %r11618, 8; + st.local.u8 [%rd219+158], %r11632; + shr.u32 %r11633, %r11618, 16; + st.local.u8 [%rd219+159], %r11633; + shr.u32 %r11634, %r11618, 24; + st.local.u8 [%rd219+160], %r11634; + st.local.u8 [%rd219+161], %r11619; + shr.u32 %r11635, %r11619, 8; + st.local.u8 [%rd219+162], %r11635; + shr.u32 %r11636, %r11619, 16; + st.local.u8 [%rd219+163], %r11636; + shr.u32 %r11637, %r11619, 24; + st.local.u8 [%rd219+164], %r11637; + st.local.u8 [%rd219+165], 
%r11620; + shr.u32 %r11638, %r11620, 8; + st.local.u8 [%rd219+166], %r11638; + shr.u32 %r11639, %r11620, 16; + st.local.u8 [%rd219+167], %r11639; + shr.u32 %r11640, %r11620, 24; + st.local.u8 [%rd219+168], %r11640; + st.local.u8 [%rd219+169], %r11621; + shr.u32 %r11641, %r11621, 8; + st.local.u8 [%rd219+170], %r11641; + shr.u32 %r11642, %r11621, 16; + st.local.u8 [%rd219+171], %r11642; + shr.u32 %r11643, %r11621, 24; + st.local.u8 [%rd219+172], %r11643; + st.local.u8 [%rd219+173], %r11622; + shr.u32 %r11644, %r11622, 8; + st.local.u8 [%rd219+174], %r11644; + shr.u32 %r11645, %r11622, 16; + st.local.u8 [%rd219+175], %r11645; + shr.u32 %r11646, %r11622, 24; + st.local.u8 [%rd219+176], %r11646; + add.s16 %rs392, %rs392, -1; + cvt.u64.u16 %rd220, %rs392; + and.b64 %rd221, %rd220, 255; + setp.lt.u64 %p53, %rd97, %rd221; + @%p53 bra $L__BB1_66; + + ld.param.u64 %rd232, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + cvta.to.local.u64 %rd231, %rd232; + add.s64 %rd230, %rd231, 136; + st.local.u8 [%rd230+8], %rs392; + +$L__BB1_68: + ret; + +} + // .globl heavy_hash +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5, + .param .u64 heavy_hash_param_6, + .param .u64 heavy_hash_param_7 +) +{ + .local .align 16 .b8 __local_depot2[2096]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<64>; + .reg .b16 %rs<866>; + .reg .b32 %r<30985>; + .reg .b64 %rd<1282>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs409, [heavy_hash_param_3]; + ld.param.u64 %rd349, [heavy_hash_param_0]; + ld.param.u64 %rd350, [heavy_hash_param_1]; + ld.param.u64 %rd351, [heavy_hash_param_2]; + ld.param.u64 %rd355, [heavy_hash_param_4]; + ld.param.u64 %rd352, [heavy_hash_param_5]; + ld.param.u64 %rd353, [heavy_hash_param_6]; + ld.param.u64 %rd354, [heavy_hash_param_7]; + cvta.to.global.u64 %rd1254, %rd353; + cvta.to.global.u64 %rd2, %rd355; + add.u64 %rd356, %SP, 0; + add.u64 %rd3, %SPL, 0; + add.u64 %rd4, %SPL, 2016; + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %tid.x; + or.b32 %r5023, %r2, %r1; + setp.ne.s32 %p6, %r5023, 0; + @%p6 bra $L__BB2_8; + + add.u64 %rd358, %SP, 2000; + add.u64 %rd359, %SPL, 2000; + mov.u32 %r29535, 0; + mov.u64 %rd360, 0; + st.local.v2.u32 [%rd359], {%r2, %r1}; + mov.u64 %rd361, $str$2; + cvta.global.u64 %rd362, %rd361; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd362; + .param .b64 param1; + st.param.b64 [param1+0], %rd358; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5025, [retval0+0]; + } // callseq 3 + mov.u64 %rd363, $str$3; + cvta.global.u64 %rd364, %rd363; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd364; + .param .b64 param1; + st.param.b64 [param1+0], %rd360; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5026, [retval0+0]; + } // callseq 4 + mov.u64 %rd1252, %rd1254; + +$L__BB2_2: + ld.global.u8 %r5027, [%rd1252+1280]; + st.local.u32 [%rd3], %r5027; + mov.u64 %rd368, $str; + cvta.global.u64 %rd369, %rd368; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 
%r5028, [retval0+0]; + } // callseq 5 + add.s64 %rd1252, %rd1252, 1; + add.s32 %r29535, %r29535, 1; + setp.lt.u32 %p7, %r29535, 128; + @%p7 bra $L__BB2_2; + + mov.u64 %rd371, $str$1; + cvta.global.u64 %rd372, %rd371; + mov.u32 %r29536, 0; + mov.u64 %rd373, 0; + { // callseq 6, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5030, [retval0+0]; + } // callseq 6 + mov.u64 %rd374, $str$4; + cvta.global.u64 %rd375, %rd374; + { // callseq 7, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd375; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5031, [retval0+0]; + } // callseq 7 + mov.u64 %rd1253, %rd1254; + +$L__BB2_4: + ld.global.u8 %r5032, [%rd1253+5376]; + st.local.u32 [%rd3], %r5032; + { // callseq 8, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5033, [retval0+0]; + } // callseq 8 + add.s64 %rd1253, %rd1253, 1; + add.s32 %r29536, %r29536, 1; + setp.lt.u32 %p8, %r29536, 128; + @%p8 bra $L__BB2_4; + + mov.u32 %r29537, 0; + mov.u64 %rd381, 0; + { // callseq 9, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5035, [retval0+0]; + } // callseq 9 + mov.u64 %rd382, $str$5; + cvta.global.u64 %rd383, %rd382; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd383; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5036, [retval0+0]; + } // callseq 10 + +$L__BB2_6: + ld.global.u8 %r5037, [%rd1254+1580160]; + st.local.u32 [%rd3], %r5037; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5038, [retval0+0]; + } // callseq 11 + add.s64 %rd1254, %rd1254, 1; + add.s32 %r29537, %r29537, 1; + setp.lt.u32 %p9, %r29537, 128; + @%p9 bra $L__BB2_6; + + mov.u64 %rd389, 0; + { // callseq 12, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd389; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5039, [retval0+0]; + } // callseq 12 + +$L__BB2_8: + mov.u32 %r5040, %ntid.x; + mad.lo.s32 %r5041, %r1, %r5040, %r2; + cvt.s64.s32 %rd14, %r5041; + setp.ge.u64 %p10, %rd14, %rd351; + @%p10 bra $L__BB2_113; + + cvt.u32.u64 %r5042, %rd14; + setp.ne.s32 %p11, %r5042, 0; + @%p11 bra $L__BB2_11; + + cvta.to.global.u64 %rd390, %rd352; + mov.u64 %rd391, 0; + st.global.u64 [%rd390], %rd391; + +$L__BB2_11: + setp.eq.s16 %p12, %rs409, 0; + @%p12 bra $L__BB2_13; + + shl.b64 %rd392, %rd14, 5; + add.s64 %rd393, %rd2, %rd392; + ld.global.v2.u64 {%rd394, %rd395}, [%rd393]; + mul.lo.s64 %rd398, %rd395, 5; + { + .reg 
.b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd398, 7; + shr.b64 %rhs, %rd398, 57; + add.u64 %rd399, %lhs, %rhs; + } + mul.lo.s64 %rd1255, %rd399, 9; + shl.b64 %rd400, %rd395, 17; + ld.global.v2.u64 {%rd401, %rd402}, [%rd393+16]; + xor.b64 %rd405, %rd401, %rd394; + xor.b64 %rd406, %rd402, %rd395; + xor.b64 %rd407, %rd395, %rd405; + xor.b64 %rd408, %rd394, %rd406; + st.global.v2.u64 [%rd393], {%rd408, %rd407}; + { + .reg .b32 %dummy; + mov.b64 {%r5043,%dummy}, %rd406; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r5044}, %rd406; + } + shf.r.wrap.b32 %r5045, %r5044, %r5043, 19; + shf.r.wrap.b32 %r5046, %r5043, %r5044, 19; + mov.b64 %rd409, {%r5046, %r5045}; + xor.b64 %rd410, %rd405, %rd400; + st.global.v2.u64 [%rd393+16], {%rd410, %rd409}; + bra.uni $L__BB2_14; + +$L__BB2_13: + ld.global.u64 %rd411, [%rd2]; + xor.b64 %rd1255, %rd411, %rd14; + +$L__BB2_14: + and.b64 %rd413, %rd1255, %rd349; + or.b64 %rd18, %rd413, %rd350; + mov.u64 %rd1256, 0; + mov.u64 %rd414, hash_header; + +$L__BB2_15: + add.s64 %rd415, %rd414, %rd1256; + ld.const.u8 %rs410, [%rd415]; + add.s64 %rd416, %rd4, %rd1256; + st.local.u8 [%rd416], %rs410; + add.s64 %rd1256, %rd1256, 1; + setp.lt.u64 %p13, %rd1256, 72; + @%p13 bra $L__BB2_15; + + mov.u64 %rd417, 0; + st.local.u64 [%rd4+72], %rd18; + mov.u32 %r5047, -1150833019; + mov.u32 %r5048, 1779033703; + st.local.v2.u32 [%rd3], {%r5048, %r5047}; + mov.u32 %r5049, -1521486534; + mov.u32 %r5050, 1013904242; + st.local.v2.u32 [%rd3+8], {%r5050, %r5049}; + mov.u32 %r5051, -1694144372; + mov.u32 %r5052, 1359893119; + st.local.v2.u32 [%rd3+16], {%r5052, %r5051}; + mov.u32 %r5053, 1541459225; + mov.u32 %r5054, 528734635; + st.local.v2.u32 [%rd3+24], {%r5054, %r5053}; + st.local.v2.u32 [%rd3+32], {%r5048, %r5047}; + st.local.v2.u32 [%rd3+40], {%r5050, %r5049}; + st.local.v2.u32 [%rd3+48], {%r5052, %r5051}; + st.local.v2.u32 [%rd3+56], {%r5054, %r5053}; + st.local.u64 [%rd3+64], %rd417; + mov.u32 %r5055, 0; + st.local.v2.u32 [%rd3+72], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+80], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+88], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+96], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+104], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+112], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+120], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+128], {%r5055, %r5055}; + mov.u16 %rs411, 0; + st.local.v2.u8 [%rd3+136], {%rs411, %rs411}; + st.local.u8 [%rd3+138], %rs411; + st.local.u8 [%rd3+144], %rs411; + ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd3+136]; + setp.eq.s16 %p14, %rs413, 0; + selp.u16 %rs419, 1, 0, %p14; + or.b16 %rs420, %rs414, %rs419; + ld.local.v4.u32 {%r5056, %r5057, %r5058, %r5059}, [%rd4]; + mov.b32 {%rs421, %rs422}, %r5056; + shr.u16 %rs423, %rs421, 8; + shr.u16 %rs424, %rs422, 8; + mov.b32 {%rs425, %rs426}, %r5057; + shr.u16 %rs427, %rs425, 8; + shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5058; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5059; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5064, %rs421; + and.b32 %r5065, %r5064, 255; + cvt.u32.u16 %r5066, %rs423; + prmt.b32 %r5067, %r5066, %r5065, 30212; + cvt.u32.u16 %r5068, %rs422; + prmt.b32 %r5069, %r5068, %r5067, 28756; + cvt.u32.u16 %r5070, %rs424; + prmt.b32 %r5071, %r5070, %r5069, 1620; + cvt.u32.u16 %r5072, %rs425; + and.b32 %r5073, %r5072, 255; + cvt.u32.u16 %r5074, %rs427; + prmt.b32 %r5075, %r5074, %r5073, 30212; + cvt.u32.u16 %r5076, %rs426; + prmt.b32 %r5077, %r5076, %r5075, 28756; + cvt.u32.u16 
%r5078, %rs428; + prmt.b32 %r5079, %r5078, %r5077, 1620; + cvt.u32.u16 %r5080, %rs429; + and.b32 %r5081, %r5080, 255; + cvt.u32.u16 %r5082, %rs431; + prmt.b32 %r5083, %r5082, %r5081, 30212; + cvt.u32.u16 %r5084, %rs430; + prmt.b32 %r5085, %r5084, %r5083, 28756; + cvt.u32.u16 %r5086, %rs432; + prmt.b32 %r5087, %r5086, %r5085, 1620; + cvt.u32.u16 %r5088, %rs433; + and.b32 %r5089, %r5088, 255; + cvt.u32.u16 %r5090, %rs435; + prmt.b32 %r5091, %r5090, %r5089, 30212; + cvt.u32.u16 %r5092, %rs434; + prmt.b32 %r5093, %r5092, %r5091, 28756; + cvt.u32.u16 %r5094, %rs436; + prmt.b32 %r5095, %r5094, %r5093, 1620; + ld.local.v4.u32 {%r5096, %r5097, %r5098, %r5099}, [%rd4+16]; + mov.b32 {%rs437, %rs438}, %r5096; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5097; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5098; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5099; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5104, %rs437; + and.b32 %r5105, %r5104, 255; + cvt.u32.u16 %r5106, %rs439; + prmt.b32 %r5107, %r5106, %r5105, 30212; + cvt.u32.u16 %r5108, %rs438; + prmt.b32 %r5109, %r5108, %r5107, 28756; + cvt.u32.u16 %r5110, %rs440; + prmt.b32 %r5111, %r5110, %r5109, 1620; + cvt.u32.u16 %r5112, %rs441; + and.b32 %r5113, %r5112, 255; + cvt.u32.u16 %r5114, %rs443; + prmt.b32 %r5115, %r5114, %r5113, 30212; + cvt.u32.u16 %r5116, %rs442; + prmt.b32 %r5117, %r5116, %r5115, 28756; + cvt.u32.u16 %r5118, %rs444; + prmt.b32 %r5119, %r5118, %r5117, 1620; + cvt.u32.u16 %r5120, %rs445; + and.b32 %r5121, %r5120, 255; + cvt.u32.u16 %r5122, %rs447; + prmt.b32 %r5123, %r5122, %r5121, 30212; + cvt.u32.u16 %r5124, %rs446; + prmt.b32 %r5125, %r5124, %r5123, 28756; + cvt.u32.u16 %r5126, %rs448; + prmt.b32 %r5127, %r5126, %r5125, 1620; + cvt.u32.u16 %r5128, %rs449; + and.b32 %r5129, %r5128, 255; + cvt.u32.u16 %r5130, %rs451; + prmt.b32 %r5131, %r5130, %r5129, 30212; + cvt.u32.u16 %r5132, %rs450; + prmt.b32 %r5133, %r5132, %r5131, 28756; + cvt.u32.u16 %r5134, %rs452; + prmt.b32 %r5135, %r5134, %r5133, 1620; + ld.local.v4.u32 {%r5136, %r5137, %r5138, %r5139}, [%rd4+32]; + mov.b32 {%rs453, %rs454}, %r5136; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5137; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5138; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5139; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5144, %rs453; + and.b32 %r5145, %r5144, 255; + cvt.u32.u16 %r5146, %rs455; + prmt.b32 %r5147, %r5146, %r5145, 30212; + cvt.u32.u16 %r5148, %rs454; + prmt.b32 %r5149, %r5148, %r5147, 28756; + cvt.u32.u16 %r5150, %rs456; + prmt.b32 %r5151, %r5150, %r5149, 1620; + cvt.u32.u16 %r5152, %rs457; + and.b32 %r5153, %r5152, 255; + cvt.u32.u16 %r5154, %rs459; + prmt.b32 %r5155, %r5154, %r5153, 30212; + cvt.u32.u16 %r5156, %rs458; + prmt.b32 %r5157, %r5156, %r5155, 28756; + cvt.u32.u16 %r5158, %rs460; + prmt.b32 %r5159, %r5158, %r5157, 1620; + cvt.u32.u16 %r5160, %rs461; + and.b32 %r5161, %r5160, 255; + cvt.u32.u16 %r5162, %rs463; + prmt.b32 %r5163, %r5162, %r5161, 30212; + cvt.u32.u16 %r5164, %rs462; + prmt.b32 %r5165, %r5164, %r5163, 28756; + cvt.u32.u16 %r5166, %rs464; + prmt.b32 %r5167, %r5166, %r5165, 1620; + cvt.u32.u16 %r5168, %rs465; + and.b32 %r5169, %r5168, 255; + cvt.u32.u16 %r5170, %rs467; + prmt.b32 %r5171, %r5170, %r5169, 30212; + 
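+ // (annotation, not compiler output) The code above appears to: derive the per-thread
+ // nonce (an xoshiro256**-style state update when random nonces are enabled, otherwise
+ // seed XOR thread id), mask it into the allowed range, copy the 72-byte hash_header
+ // plus the 8-byte nonce into local memory, and load the BLAKE3 IV (0x6A09E667, ...).
+ // The prmt byte swizzles around this point pack the input bytes into little-endian
+ // 32-bit message words for the compression rounds below.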
cvt.u32.u16 %r5172, %rs466; + prmt.b32 %r5173, %r5172, %r5171, 28756; + cvt.u32.u16 %r5174, %rs468; + prmt.b32 %r5175, %r5174, %r5173, 1620; + ld.local.v4.u32 {%r5176, %r5177, %r5178, %r5179}, [%rd4+48]; + mov.b32 {%rs469, %rs470}, %r5176; + shr.u16 %rs471, %rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5177; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5178; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5179; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5184, %rs469; + and.b32 %r5185, %r5184, 255; + cvt.u32.u16 %r5186, %rs471; + prmt.b32 %r5187, %r5186, %r5185, 30212; + cvt.u32.u16 %r5188, %rs470; + prmt.b32 %r5189, %r5188, %r5187, 28756; + cvt.u32.u16 %r5190, %rs472; + prmt.b32 %r5191, %r5190, %r5189, 1620; + cvt.u32.u16 %r5192, %rs473; + and.b32 %r5193, %r5192, 255; + cvt.u32.u16 %r5194, %rs475; + prmt.b32 %r5195, %r5194, %r5193, 30212; + cvt.u32.u16 %r5196, %rs474; + prmt.b32 %r5197, %r5196, %r5195, 28756; + cvt.u32.u16 %r5198, %rs476; + prmt.b32 %r5199, %r5198, %r5197, 1620; + cvt.u32.u16 %r5200, %rs477; + and.b32 %r5201, %r5200, 255; + cvt.u32.u16 %r5202, %rs479; + prmt.b32 %r5203, %r5202, %r5201, 30212; + cvt.u32.u16 %r5204, %rs478; + prmt.b32 %r5205, %r5204, %r5203, 28756; + cvt.u32.u16 %r5206, %rs480; + prmt.b32 %r5207, %r5206, %r5205, 1620; + cvt.u32.u16 %r5208, %rs481; + and.b32 %r5209, %r5208, 255; + cvt.u32.u16 %r5210, %rs483; + prmt.b32 %r5211, %r5210, %r5209, 30212; + cvt.u32.u16 %r5212, %rs482; + prmt.b32 %r5213, %r5212, %r5211, 28756; + cvt.u32.u16 %r5214, %rs484; + prmt.b32 %r5215, %r5214, %r5213, 1620; + cvt.u32.u16 %r5216, %rs420; + and.b32 %r5217, %r5216, 255; + add.s32 %r5218, %r5071, -1156040474; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 16; + add.s32 %r5220, %r5219, 1779033703; + xor.b32 %r5221, %r5220, 1359893119; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 20; + add.s32 %r5223, %r5079, %r5218; + add.s32 %r5224, %r5223, %r5222; + xor.b32 %r5225, %r5224, %r5219; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 24; + add.s32 %r5227, %r5226, %r5220; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 25; + add.s32 %r5230, %r5087, 1449989905; + shf.l.wrap.b32 %r5231, %r5230, %r5230, 16; + add.s32 %r5232, %r5231, -1150833019; + xor.b32 %r5233, %r5232, -1694144372; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 20; + add.s32 %r5235, %r5095, %r5230; + add.s32 %r5236, %r5235, %r5234; + xor.b32 %r5237, %r5236, %r5231; + shf.l.wrap.b32 %r5238, %r5237, %r5237, 24; + add.s32 %r5239, %r5238, %r5232; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 25; + add.s32 %r5242, %r5111, 1542638877; + shr.u32 %r5243, %r5242, 16; + shl.b32 %r5244, %r5242, 16; + xor.b32 %r5245, %r5244, 4194304; + or.b32 %r5246, %r5245, %r5243; + add.s32 %r5247, %r5246, 1013904242; + xor.b32 %r5248, %r5247, 528734635; + shf.l.wrap.b32 %r5249, %r5248, %r5248, 20; + add.s32 %r5250, %r5119, %r5242; + add.s32 %r5251, %r5250, %r5249; + xor.b32 %r5252, %r5251, %r5246; + shf.l.wrap.b32 %r5253, %r5252, %r5252, 24; + add.s32 %r5254, %r5253, %r5247; + xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 25; + add.s32 %r5257, %r5127, 19972691; + xor.b32 %r5258, %r5257, %r5217; + shr.u32 %r5259, %r5257, 16; + shl.b32 %r5260, %r5258, 16; + or.b32 %r5261, %r5260, %r5259; + add.s32 %r5262, %r5261, -1521486534; + xor.b32 %r5263, %r5262, 1541459225; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 20; + add.s32 %r5265, %r5135, %r5257; + add.s32 
%r5266, %r5265, %r5264; + xor.b32 %r5267, %r5266, %r5261; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 24; + add.s32 %r5269, %r5268, %r5262; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 25; + add.s32 %r5272, %r5241, %r5224; + add.s32 %r5273, %r5272, %r5151; + xor.b32 %r5274, %r5268, %r5273; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 16; + add.s32 %r5276, %r5275, %r5254; + xor.b32 %r5277, %r5276, %r5241; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 20; + add.s32 %r5279, %r5159, %r5273; + add.s32 %r5280, %r5279, %r5278; + xor.b32 %r5281, %r5280, %r5275; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 24; + add.s32 %r5283, %r5282, %r5276; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 25; + add.s32 %r5286, %r5256, %r5236; + add.s32 %r5287, %r5286, %r5167; + xor.b32 %r5288, %r5287, %r5226; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 16; + add.s32 %r5290, %r5289, %r5269; + xor.b32 %r5291, %r5290, %r5256; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 20; + add.s32 %r5293, %r5175, %r5287; + add.s32 %r5294, %r5293, %r5292; + xor.b32 %r5295, %r5294, %r5289; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 24; + add.s32 %r5297, %r5296, %r5290; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 25; + add.s32 %r5300, %r5271, %r5251; + add.s32 %r5301, %r5300, %r5191; + xor.b32 %r5302, %r5301, %r5238; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 16; + add.s32 %r5304, %r5303, %r5227; + xor.b32 %r5305, %r5304, %r5271; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 20; + add.s32 %r5307, %r5199, %r5301; + add.s32 %r5308, %r5307, %r5306; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 24; + add.s32 %r5311, %r5310, %r5304; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 25; + add.s32 %r5314, %r5266, %r5229; + add.s32 %r5315, %r5314, %r5207; + xor.b32 %r5316, %r5315, %r5253; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 16; + add.s32 %r5318, %r5317, %r5239; + xor.b32 %r5319, %r5318, %r5229; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 20; + add.s32 %r5321, %r5215, %r5315; + add.s32 %r5322, %r5321, %r5320; + xor.b32 %r5323, %r5322, %r5317; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 24; + add.s32 %r5325, %r5324, %r5318; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 25; + add.s32 %r5328, %r5280, %r5087; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5296; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 16; + add.s32 %r5332, %r5331, %r5311; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 20; + add.s32 %r5335, %r5329, %r5127; + add.s32 %r5336, %r5335, %r5334; + xor.b32 %r5337, %r5336, %r5331; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 24; + add.s32 %r5339, %r5338, %r5332; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 25; + add.s32 %r5342, %r5294, %r5095; + add.s32 %r5343, %r5342, %r5285; + xor.b32 %r5344, %r5310, %r5343; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 16; + add.s32 %r5346, %r5325, %r5345; + xor.b32 %r5347, %r5346, %r5285; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 20; + add.s32 %r5349, %r5343, %r5167; + add.s32 %r5350, %r5349, %r5348; + xor.b32 %r5351, %r5350, %r5345; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 24; + add.s32 %r5353, %r5352, %r5346; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 25; + add.s32 %r5356, %r5299, %r5135; + add.s32 %r5357, %r5356, %r5308; + xor.b32 %r5358, %r5324, %r5357; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 16; + add.s32 %r5360, %r5359, %r5283; + xor.b32 %r5361, %r5360, %r5299; + 
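+ // (annotation, not compiler output) The repeating add/xor/shf.l.wrap pattern with
+ // rotation amounts 16, 20, 24, 25 (i.e. rotr 16, 12, 8, 7) is a BLAKE3-style G
+ // quarter-round; the compiler has fully unrolled the compression rounds that follow.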
shf.l.wrap.b32 %r5362, %r5361, %r5361, 20; + add.s32 %r5363, %r5357, %r5071; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5359; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 24; + add.s32 %r5367, %r5366, %r5360; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 25; + add.s32 %r5370, %r5313, %r5111; + add.s32 %r5371, %r5370, %r5322; + xor.b32 %r5372, %r5371, %r5282; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 16; + add.s32 %r5374, %r5373, %r5297; + xor.b32 %r5375, %r5374, %r5313; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 20; + add.s32 %r5377, %r5371, %r5199; + add.s32 %r5378, %r5377, %r5376; + xor.b32 %r5379, %r5378, %r5373; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 24; + add.s32 %r5381, %r5380, %r5374; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 25; + add.s32 %r5384, %r5336, %r5079; + add.s32 %r5385, %r5384, %r5355; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 16; + add.s32 %r5388, %r5387, %r5367; + xor.b32 %r5389, %r5388, %r5355; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 20; + add.s32 %r5391, %r5385, %r5175; + add.s32 %r5392, %r5391, %r5390; + xor.b32 %r5393, %r5392, %r5387; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 24; + add.s32 %r5395, %r5394, %r5388; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 25; + add.s32 %r5398, %r5350, %r5191; + add.s32 %r5399, %r5398, %r5369; + xor.b32 %r5400, %r5399, %r5338; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 16; + add.s32 %r5402, %r5401, %r5381; + xor.b32 %r5403, %r5402, %r5369; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 20; + add.s32 %r5405, %r5399, %r5119; + add.s32 %r5406, %r5405, %r5404; + xor.b32 %r5407, %r5406, %r5401; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 24; + add.s32 %r5409, %r5408, %r5402; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 25; + add.s32 %r5412, %r5364, %r5159; + add.s32 %r5413, %r5412, %r5383; + xor.b32 %r5414, %r5413, %r5352; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 16; + add.s32 %r5416, %r5415, %r5339; + xor.b32 %r5417, %r5416, %r5383; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 20; + add.s32 %r5419, %r5413, %r5207; + add.s32 %r5420, %r5419, %r5418; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 24; + add.s32 %r5423, %r5422, %r5416; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 25; + add.s32 %r5426, %r5378, %r5215; + add.s32 %r5427, %r5426, %r5341; + xor.b32 %r5428, %r5427, %r5366; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 16; + add.s32 %r5430, %r5429, %r5353; + xor.b32 %r5431, %r5430, %r5341; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 20; + add.s32 %r5433, %r5427, %r5151; + add.s32 %r5434, %r5433, %r5432; + xor.b32 %r5435, %r5434, %r5429; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 24; + add.s32 %r5437, %r5436, %r5430; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 25; + add.s32 %r5440, %r5392, %r5095; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5408; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 16; + add.s32 %r5444, %r5443, %r5423; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 20; + add.s32 %r5447, %r5441, %r5111; + add.s32 %r5448, %r5447, %r5446; + xor.b32 %r5449, %r5448, %r5443; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 24; + add.s32 %r5451, %r5450, %r5444; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 25; + add.s32 %r5454, %r5406, %r5167; + add.s32 %r5455, %r5454, %r5397; + xor.b32 %r5456, %r5455, %r5422; + shf.l.wrap.b32 %r5457, 
%r5456, %r5456, 16; + add.s32 %r5458, %r5457, %r5437; + xor.b32 %r5459, %r5458, %r5397; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 20; + add.s32 %r5461, %r5455, %r5191; + add.s32 %r5462, %r5461, %r5460; + xor.b32 %r5463, %r5462, %r5457; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 24; + add.s32 %r5465, %r5464, %r5458; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 25; + add.s32 %r5468, %r5420, %r5199; + add.s32 %r5469, %r5468, %r5411; + xor.b32 %r5470, %r5469, %r5436; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 16; + add.s32 %r5472, %r5471, %r5395; + xor.b32 %r5473, %r5472, %r5411; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 20; + add.s32 %r5475, %r5469, %r5087; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5471; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 24; + add.s32 %r5479, %r5478, %r5472; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 25; + add.s32 %r5482, %r5434, %r5135; + add.s32 %r5483, %r5482, %r5425; + xor.b32 %r5484, %r5483, %r5394; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 16; + add.s32 %r5486, %r5485, %r5409; + xor.b32 %r5487, %r5486, %r5425; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 20; + add.s32 %r5489, %r5483, %r5207; + add.s32 %r5490, %r5489, %r5488; + xor.b32 %r5491, %r5490, %r5485; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 24; + add.s32 %r5493, %r5492, %r5486; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 25; + add.s32 %r5496, %r5448, %r5127; + add.s32 %r5497, %r5496, %r5467; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 16; + add.s32 %r5500, %r5499, %r5479; + xor.b32 %r5501, %r5500, %r5467; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 20; + add.s32 %r5503, %r5497, %r5119; + add.s32 %r5504, %r5503, %r5502; + xor.b32 %r5505, %r5504, %r5499; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 24; + add.s32 %r5507, %r5506, %r5500; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 25; + add.s32 %r5510, %r5462, %r5159; + add.s32 %r5511, %r5510, %r5481; + xor.b32 %r5512, %r5511, %r5450; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 16; + add.s32 %r5514, %r5513, %r5493; + xor.b32 %r5515, %r5514, %r5481; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 20; + add.s32 %r5517, %r5511, %r5071; + add.s32 %r5518, %r5517, %r5516; + xor.b32 %r5519, %r5518, %r5513; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 24; + add.s32 %r5521, %r5520, %r5514; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 25; + add.s32 %r5524, %r5476, %r5175; + add.s32 %r5525, %r5524, %r5495; + xor.b32 %r5526, %r5525, %r5464; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 16; + add.s32 %r5528, %r5527, %r5451; + xor.b32 %r5529, %r5528, %r5495; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 20; + add.s32 %r5531, %r5525, %r5215; + add.s32 %r5532, %r5531, %r5530; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 24; + add.s32 %r5535, %r5534, %r5528; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 25; + add.s32 %r5538, %r5490, %r5151; + add.s32 %r5539, %r5538, %r5453; + xor.b32 %r5540, %r5539, %r5478; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 16; + add.s32 %r5542, %r5541, %r5465; + xor.b32 %r5543, %r5542, %r5453; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 20; + add.s32 %r5545, %r5539, %r5079; + add.s32 %r5546, %r5545, %r5544; + xor.b32 %r5547, %r5546, %r5541; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 24; + add.s32 %r5549, %r5548, %r5542; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 25; + add.s32 %r5552, %r5504, %r5167; + 
add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5520; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 16; + add.s32 %r5556, %r5555, %r5535; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 20; + add.s32 %r5559, %r5553, %r5135; + add.s32 %r5560, %r5559, %r5558; + xor.b32 %r5561, %r5560, %r5555; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 24; + add.s32 %r5563, %r5562, %r5556; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 25; + add.s32 %r5566, %r5518, %r5191; + add.s32 %r5567, %r5566, %r5509; + xor.b32 %r5568, %r5567, %r5534; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 16; + add.s32 %r5570, %r5569, %r5549; + xor.b32 %r5571, %r5570, %r5509; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 20; + add.s32 %r5573, %r5567, %r5159; + add.s32 %r5574, %r5573, %r5572; + xor.b32 %r5575, %r5574, %r5569; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 24; + add.s32 %r5577, %r5576, %r5570; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 25; + add.s32 %r5580, %r5532, %r5207; + add.s32 %r5581, %r5580, %r5523; + xor.b32 %r5582, %r5581, %r5548; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 16; + add.s32 %r5584, %r5583, %r5507; + xor.b32 %r5585, %r5584, %r5523; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 20; + add.s32 %r5587, %r5581, %r5095; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5583; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 24; + add.s32 %r5591, %r5590, %r5584; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 25; + add.s32 %r5594, %r5546, %r5199; + add.s32 %r5595, %r5594, %r5537; + xor.b32 %r5596, %r5595, %r5506; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 16; + add.s32 %r5598, %r5597, %r5521; + xor.b32 %r5599, %r5598, %r5537; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 20; + add.s32 %r5601, %r5595, %r5215; + add.s32 %r5602, %r5601, %r5600; + xor.b32 %r5603, %r5602, %r5597; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 24; + add.s32 %r5605, %r5604, %r5598; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 25; + add.s32 %r5608, %r5560, %r5111; + add.s32 %r5609, %r5608, %r5579; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 16; + add.s32 %r5612, %r5611, %r5591; + xor.b32 %r5613, %r5612, %r5579; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 20; + add.s32 %r5615, %r5609, %r5071; + add.s32 %r5616, %r5615, %r5614; + xor.b32 %r5617, %r5616, %r5611; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 24; + add.s32 %r5619, %r5618, %r5612; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 25; + add.s32 %r5622, %r5574, %r5175; + add.s32 %r5623, %r5622, %r5593; + xor.b32 %r5624, %r5623, %r5562; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 16; + add.s32 %r5626, %r5625, %r5605; + xor.b32 %r5627, %r5626, %r5593; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 20; + add.s32 %r5629, %r5623, %r5087; + add.s32 %r5630, %r5629, %r5628; + xor.b32 %r5631, %r5630, %r5625; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 24; + add.s32 %r5633, %r5632, %r5626; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 25; + add.s32 %r5636, %r5588, %r5119; + add.s32 %r5637, %r5636, %r5607; + xor.b32 %r5638, %r5637, %r5576; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 16; + add.s32 %r5640, %r5639, %r5563; + xor.b32 %r5641, %r5640, %r5607; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 20; + add.s32 %r5643, %r5637, %r5151; + add.s32 %r5644, %r5643, %r5642; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 24; + add.s32 %r5647, %r5646, %r5640; + xor.b32 %r5648, %r5647, 
%r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 25; + add.s32 %r5650, %r5602, %r5079; + add.s32 %r5651, %r5650, %r5565; + xor.b32 %r5652, %r5651, %r5590; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 16; + add.s32 %r5654, %r5653, %r5577; + xor.b32 %r5655, %r5654, %r5565; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 20; + add.s32 %r5657, %r5651, %r5127; + add.s32 %r5658, %r5657, %r5656; + xor.b32 %r5659, %r5658, %r5653; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 24; + add.s32 %r5661, %r5660, %r5654; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 25; + add.s32 %r5664, %r5616, %r5191; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5632; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 16; + add.s32 %r5668, %r5667, %r5647; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 20; + add.s32 %r5671, %r5665, %r5199; + add.s32 %r5672, %r5671, %r5670; + xor.b32 %r5673, %r5672, %r5667; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 24; + add.s32 %r5675, %r5674, %r5668; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 25; + add.s32 %r5678, %r5630, %r5159; + add.s32 %r5679, %r5678, %r5621; + xor.b32 %r5680, %r5679, %r5646; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 16; + add.s32 %r5682, %r5681, %r5661; + xor.b32 %r5683, %r5682, %r5621; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 20; + add.s32 %r5685, %r5679, %r5175; + add.s32 %r5686, %r5685, %r5684; + xor.b32 %r5687, %r5686, %r5681; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 24; + add.s32 %r5689, %r5688, %r5682; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 25; + add.s32 %r5692, %r5644, %r5215; + add.s32 %r5693, %r5692, %r5635; + xor.b32 %r5694, %r5693, %r5660; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 16; + add.s32 %r5696, %r5695, %r5619; + xor.b32 %r5697, %r5696, %r5635; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 20; + add.s32 %r5699, %r5693, %r5167; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5695; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 24; + add.s32 %r5703, %r5702, %r5696; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 25; + add.s32 %r5706, %r5658, %r5207; + add.s32 %r5707, %r5706, %r5649; + xor.b32 %r5708, %r5707, %r5618; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 16; + add.s32 %r5710, %r5709, %r5633; + xor.b32 %r5711, %r5710, %r5649; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 20; + add.s32 %r5713, %r5707, %r5151; + add.s32 %r5714, %r5713, %r5712; + xor.b32 %r5715, %r5714, %r5709; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 24; + add.s32 %r5717, %r5716, %r5710; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 25; + add.s32 %r5720, %r5672, %r5135; + add.s32 %r5721, %r5720, %r5691; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 16; + add.s32 %r5724, %r5723, %r5703; + xor.b32 %r5725, %r5724, %r5691; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 20; + add.s32 %r5727, %r5721, %r5087; + add.s32 %r5728, %r5727, %r5726; + xor.b32 %r5729, %r5728, %r5723; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 24; + add.s32 %r5731, %r5730, %r5724; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 25; + add.s32 %r5734, %r5686, %r5119; + add.s32 %r5735, %r5734, %r5705; + xor.b32 %r5736, %r5735, %r5674; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 16; + add.s32 %r5738, %r5737, %r5717; + xor.b32 %r5739, %r5738, %r5705; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 20; + add.s32 %r5741, %r5735, %r5095; + add.s32 %r5742, %r5741, %r5740; + xor.b32 %r5743, %r5742, %r5737; + shf.l.wrap.b32 
%r5744, %r5743, %r5743, 24; + add.s32 %r5745, %r5744, %r5738; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 25; + add.s32 %r5748, %r5700, %r5071; + add.s32 %r5749, %r5748, %r5719; + xor.b32 %r5750, %r5749, %r5688; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 16; + add.s32 %r5752, %r5751, %r5675; + xor.b32 %r5753, %r5752, %r5719; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 20; + add.s32 %r5755, %r5749, %r5079; + add.s32 %r5756, %r5755, %r5754; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 24; + add.s32 %r5759, %r5758, %r5752; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 25; + add.s32 %r5762, %r5714, %r5127; + add.s32 %r5763, %r5762, %r5677; + xor.b32 %r5764, %r5763, %r5702; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 16; + add.s32 %r5766, %r5765, %r5689; + xor.b32 %r5767, %r5766, %r5677; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 20; + add.s32 %r5769, %r5763, %r5111; + add.s32 %r5770, %r5769, %r5768; + xor.b32 %r5771, %r5770, %r5765; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 24; + add.s32 %r5773, %r5772, %r5766; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 25; + add.s32 %r5776, %r5728, %r5159; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5744; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 16; + add.s32 %r5780, %r5779, %r5759; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 20; + add.s32 %r5783, %r5777, %r5207; + add.s32 %r5784, %r5783, %r5782; + xor.b32 %r5785, %r5784, %r5779; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 24; + add.s32 %r5787, %r5786, %r5780; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 25; + add.s32 %r5790, %r5742, %r5175; + add.s32 %r5791, %r5790, %r5733; + xor.b32 %r5792, %r5791, %r5758; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 16; + add.s32 %r5794, %r5793, %r5773; + xor.b32 %r5795, %r5794, %r5733; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 20; + add.s32 %r5797, %r5791, %r5119; + add.s32 %r5798, %r5797, %r5796; + xor.b32 %r5799, %r5798, %r5793; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 24; + add.s32 %r5801, %r5800, %r5794; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 25; + add.s32 %r5804, %r5756, %r5151; + add.s32 %r5805, %r5804, %r5747; + xor.b32 %r5806, %r5805, %r5772; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 16; + add.s32 %r5808, %r5807, %r5731; + xor.b32 %r5809, %r5808, %r5747; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 20; + add.s32 %r5811, %r5805, %r5191; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5807; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 24; + add.s32 %r5815, %r5814, %r5808; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 25; + add.s32 %r5818, %r5770, %r5215; + add.s32 %r5819, %r5818, %r5761; + xor.b32 %r5820, %r5819, %r5730; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 16; + add.s32 %r5822, %r5821, %r5745; + xor.b32 %r5823, %r5822, %r5761; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 20; + add.s32 %r5825, %r5819, %r5079; + add.s32 %r5826, %r5825, %r5824; + xor.b32 %r5827, %r5826, %r5821; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 24; + add.s32 %r5829, %r5828, %r5822; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 25; + add.s32 %r5832, %r5784, %r5199; + add.s32 %r5833, %r5832, %r5803; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 16; + add.s32 %r5836, %r5835, %r5815; + xor.b32 %r5837, %r5836, %r5803; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 20; + add.s32 %r5839, %r5833, %r5095; 
+ add.s32 %r5840, %r5839, %r5838; + xor.b32 %r5841, %r5840, %r5835; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 24; + add.s32 %r5843, %r5842, %r5836; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 25; + add.s32 %r5846, %r5798, %r5071; + add.s32 %r5847, %r5846, %r5817; + xor.b32 %r5848, %r5847, %r5786; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 16; + add.s32 %r5850, %r5849, %r5829; + xor.b32 %r5851, %r5850, %r5817; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 20; + add.s32 %r5853, %r5847, %r5167; + add.s32 %r5854, %r5853, %r5852; + xor.b32 %r5855, %r5854, %r5849; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 24; + add.s32 %r5857, %r5856, %r5850; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 25; + add.s32 %r5860, %r5812, %r5087; + add.s32 %r5861, %r5860, %r5831; + xor.b32 %r5862, %r5861, %r5800; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 16; + add.s32 %r5864, %r5863, %r5787; + xor.b32 %r5865, %r5864, %r5831; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 20; + add.s32 %r5867, %r5861, %r5127; + add.s32 %r5868, %r5867, %r5866; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 24; + add.s32 %r5871, %r5870, %r5864; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 25; + add.s32 %r5874, %r5826, %r5111; + add.s32 %r5875, %r5874, %r5789; + xor.b32 %r5876, %r5875, %r5814; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 16; + add.s32 %r5878, %r5877, %r5801; + xor.b32 %r5879, %r5878, %r5789; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 20; + add.s32 %r5881, %r5875, %r5135; + add.s32 %r5882, %r5881, %r5880; + xor.b32 %r5883, %r5882, %r5877; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 24; + add.s32 %r5885, %r5884, %r5878; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 25; + add.s32 %r5888, %r5840, %r5175; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5856; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 16; + add.s32 %r5892, %r5891, %r5871; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 20; + add.s32 %r5895, %r5889, %r5215; + add.s32 %r5896, %r5895, %r5894; + xor.b32 %r5897, %r5896, %r5891; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 24; + add.s32 %r5899, %r5898, %r5892; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 25; + add.s32 %r5902, %r5854, %r5119; + add.s32 %r5903, %r5902, %r5845; + xor.b32 %r5904, %r5903, %r5870; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 16; + add.s32 %r5906, %r5905, %r5885; + xor.b32 %r5907, %r5906, %r5845; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 20; + add.s32 %r5909, %r5903, %r5071; + add.s32 %r5910, %r5909, %r5908; + xor.b32 %r5911, %r5910, %r5905; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 24; + add.s32 %r5913, %r5912, %r5906; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 25; + add.s32 %r5916, %r5868, %r5079; + add.s32 %r5917, %r5916, %r5859; + xor.b32 %r5918, %r5917, %r5884; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 16; + add.s32 %r5920, %r5919, %r5843; + xor.b32 %r5921, %r5920, %r5859; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 20; + add.s32 %r5923, %r5917, %r5159; + add.s32 %r5924, %r5923, %r5922; + xor.b32 %r5925, %r5924, %r5919; + shf.l.wrap.b32 %r5926, %r5925, %r5925, 24; + add.s32 %r5927, %r5926, %r5920; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 25; + add.s32 %r5930, %r5882, %r5151; + add.s32 %r5931, %r5930, %r5873; + xor.b32 %r5932, %r5931, %r5842; + shf.l.wrap.b32 %r5933, %r5932, %r5932, 16; + add.s32 %r5934, %r5933, %r5857; + xor.b32 %r5935, %r5934, 
%r5873; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 20; + add.s32 %r5937, %r5931, %r5127; + add.s32 %r5938, %r5937, %r5936; + xor.b32 %r5939, %r5938, %r5933; + shf.l.wrap.b32 %r5940, %r5939, %r5939, 24; + add.s32 %r5941, %r5940, %r5934; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 25; + add.s32 %r5944, %r5896, %r5207; + add.s32 %r5945, %r5944, %r5915; + xor.b32 %r5946, %r5945, %r5940; + shf.l.wrap.b32 %r5947, %r5946, %r5946, 16; + add.s32 %r5948, %r5947, %r5927; + xor.b32 %r5949, %r5948, %r5915; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 20; + add.s32 %r5951, %r5945, %r5167; + add.s32 %r5952, %r5951, %r5950; + xor.b32 %r5953, %r5952, %r5947; + shf.l.wrap.b32 %r5954, %r5953, %r5953, 24; + add.s32 %r5955, %r5954, %r5948; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 25; + add.s32 %r5958, %r5910, %r5087; + add.s32 %r5959, %r5958, %r5929; + xor.b32 %r5960, %r5959, %r5898; + shf.l.wrap.b32 %r5961, %r5960, %r5960, 16; + add.s32 %r5962, %r5961, %r5941; + xor.b32 %r5963, %r5962, %r5929; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 20; + add.s32 %r5965, %r5959, %r5191; + add.s32 %r5966, %r5965, %r5964; + xor.b32 %r5967, %r5966, %r5961; + shf.l.wrap.b32 %r5968, %r5967, %r5967, 24; + add.s32 %r5969, %r5968, %r5962; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 25; + add.s32 %r5972, %r5924, %r5095; + add.s32 %r5973, %r5972, %r5943; + xor.b32 %r5974, %r5973, %r5912; + shf.l.wrap.b32 %r5975, %r5974, %r5974, 16; + add.s32 %r5976, %r5975, %r5899; + xor.b32 %r5977, %r5976, %r5943; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 20; + add.s32 %r5979, %r5973, %r5111; + add.s32 %r5980, %r5979, %r5978; + xor.b32 %r5981, %r5980, %r5975; + shf.l.wrap.b32 %r5982, %r5981, %r5981, 24; + add.s32 %r5983, %r5982, %r5976; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 25; + add.s32 %r5986, %r5938, %r5135; + add.s32 %r5987, %r5986, %r5901; + xor.b32 %r5988, %r5987, %r5926; + shf.l.wrap.b32 %r5989, %r5988, %r5988, 16; + add.s32 %r5990, %r5989, %r5913; + xor.b32 %r5991, %r5990, %r5901; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 20; + add.s32 %r5993, %r5987, %r5199; + add.s32 %r5994, %r5993, %r5992; + xor.b32 %r5995, %r5994, %r5989; + shf.l.wrap.b32 %r5996, %r5995, %r5995, 24; + add.s32 %r5997, %r5996, %r5990; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 25; + xor.b32 %r9, %r5983, %r5952; + xor.b32 %r10, %r5997, %r5966; + st.local.v2.u32 [%rd3+32], {%r9, %r10}; + xor.b32 %r11, %r5955, %r5980; + xor.b32 %r12, %r5994, %r5969; + st.local.v2.u32 [%rd3+40], {%r11, %r12}; + xor.b32 %r13, %r5999, %r5968; + xor.b32 %r14, %r5957, %r5982; + st.local.v2.u32 [%rd3+48], {%r13, %r14}; + xor.b32 %r15, %r5996, %r5971; + xor.b32 %r16, %r5985, %r5954; + st.local.v2.u32 [%rd3+56], {%r15, %r16}; + ld.local.v4.u32 {%r6000, %r6001, %r6002, %r6003}, [%rd4+64]; + st.local.v2.u32 [%rd3+72], {%r6000, %r6001}; + st.local.v2.u32 [%rd3+80], {%r6002, %r6003}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 %rs486, %rs413, 1; + st.local.v2.u8 [%rd3+136], {%rs1, %rs486}; + cvt.u32.u16 %r6008, %rs486; + cvt.u32.u16 %r6009, %rs485; + prmt.b32 %r6010, %r6008, %r6009, 30212; + cvt.u16.u32 %rs487, %r6010; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6001; + mov.b32 {%rs3, %rs4}, %r6000; + mov.b32 {%rs9, %rs10}, %r6003; + mov.b32 {%rs7, %rs8}, %r6002; + setp.eq.s16 %p15, %rs2, 0; + selp.u16 %rs488, 1, 0, %p15; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, 
%rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, %rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6011, %rs3; + and.b32 %r6012, %r6011, 255; + cvt.u32.u16 %r6013, %rs489; + prmt.b32 %r6014, %r6013, %r6012, 30212; + cvt.u32.u16 %r6015, %rs4; + prmt.b32 %r6016, %r6015, %r6014, 28756; + cvt.u32.u16 %r6017, %rs490; + prmt.b32 %r6018, %r6017, %r6016, 1620; + cvt.u32.u16 %r6019, %rs5; + and.b32 %r6020, %r6019, 255; + cvt.u32.u16 %r6021, %rs491; + prmt.b32 %r6022, %r6021, %r6020, 30212; + cvt.u32.u16 %r6023, %rs6; + prmt.b32 %r6024, %r6023, %r6022, 28756; + cvt.u32.u16 %r6025, %rs492; + prmt.b32 %r6026, %r6025, %r6024, 1620; + cvt.u32.u16 %r6027, %rs7; + and.b32 %r6028, %r6027, 255; + cvt.u32.u16 %r6029, %rs493; + prmt.b32 %r6030, %r6029, %r6028, 30212; + cvt.u32.u16 %r6031, %rs8; + prmt.b32 %r6032, %r6031, %r6030, 28756; + cvt.u32.u16 %r6033, %rs494; + prmt.b32 %r6034, %r6033, %r6032, 1620; + cvt.u32.u16 %r6035, %rs9; + and.b32 %r6036, %r6035, 255; + cvt.u32.u16 %r6037, %rs495; + prmt.b32 %r6038, %r6037, %r6036, 30212; + cvt.u32.u16 %r6039, %rs10; + prmt.b32 %r6040, %r6039, %r6038, 28756; + cvt.u32.u16 %r6041, %rs496; + prmt.b32 %r6042, %r6041, %r6040, 1620; + cvt.u32.u16 %r6043, %rs497; + add.s32 %r6044, %r13, %r9; + add.s32 %r6045, %r6044, %r6018; + add.s32 %r6046, %r6026, %r6045; + add.s32 %r6047, %r14, %r10; + add.s32 %r6048, %r6047, %r6034; + add.s32 %r6049, %r6042, %r6048; + add.s32 %r6050, %r15, %r11; + cvt.u32.u16 %r6051, %rs1; + and.b32 %r6052, %r6051, 255; + xor.b32 %r6053, %r6050, %r6052; + shr.u32 %r6054, %r6050, 16; + shl.b32 %r6055, %r6053, 16; + or.b32 %r6056, %r6055, %r6054; + add.s32 %r6057, %r6056, 1013904242; + xor.b32 %r6058, %r6057, %r15; + shf.l.wrap.b32 %r6059, %r6058, %r6058, 20; + add.s32 %r6060, %r6050, %r6059; + xor.b32 %r6061, %r6060, %r6056; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 24; + add.s32 %r6063, %r6062, %r6057; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 25; + add.s32 %r6066, %r16, %r12; + xor.b32 %r6067, %r6066, %r6043; + shr.u32 %r6068, %r6066, 16; + shl.b32 %r6069, %r6067, 16; + or.b32 %r6070, %r6069, %r6068; + add.s32 %r6071, %r6070, -1521486534; + xor.b32 %r6072, %r6071, %r16; + shf.l.wrap.b32 %r6073, %r6072, %r6072, 20; + add.s32 %r6074, %r6066, %r6073; + xor.b32 %r6075, %r6074, %r6070; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 24; + add.s32 %r6077, %r6076, %r6071; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 25; + add.s32 %r6080, %r6079, %r6060; + shf.l.wrap.b32 %r6081, %r6045, %r6045, 16; + add.s32 %r6082, %r6081, 1779033703; + xor.b32 %r6083, %r6082, %r13; + shf.l.wrap.b32 %r6084, %r6083, %r6083, 20; + add.s32 %r6085, %r6046, %r6084; + xor.b32 %r6086, %r6085, %r6081; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 24; + add.s32 %r6088, %r6087, %r6082; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 25; + shf.l.wrap.b32 %r6091, %r6048, %r6048, 16; + add.s32 %r6092, %r6091, -1150833019; + xor.b32 %r6093, %r6092, %r14; + shf.l.wrap.b32 %r6094, %r6093, %r6093, 20; + add.s32 %r6095, %r6049, %r6094; + xor.b32 %r6096, %r6095, %r6091; + shf.l.wrap.b32 %r6097, %r6096, %r6096, 24; + add.s32 %r6098, %r6097, %r6092; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 25; + add.s32 %r6101, %r6085, %r6100; + xor.b32 %r6102, %r6101, %r6076; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 16; + add.s32 %r6104, %r6103, %r6063; + xor.b32 %r6105, %r6104, %r6100; + shf.l.wrap.b32 %r6106, %r6105, %r6105, 20; + 
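+ // (annotation, not compiler output) Here the second 16-byte block of the 80-byte
+ // input (72-byte header + 8-byte nonce) appears to be compressed; the counter bump
+ // by 16 and the `or.b16 ..., 10` above are consistent with a BLAKE3 block length of
+ // 16 and the CHUNK_END|ROOT (2|8) finalization flags.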
add.s32 %r6107, %r6101, %r6106; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 24; + add.s32 %r6110, %r6109, %r6104; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 25; + add.s32 %r6113, %r6065, %r6095; + xor.b32 %r6114, %r6087, %r6113; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 16; + add.s32 %r6116, %r6115, %r6077; + xor.b32 %r6117, %r6116, %r6065; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 20; + add.s32 %r6119, %r6113, %r6118; + xor.b32 %r6120, %r6119, %r6115; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 24; + add.s32 %r6122, %r6121, %r6116; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 25; + xor.b32 %r6125, %r6097, %r6080; + shf.l.wrap.b32 %r6126, %r6125, %r6125, 16; + add.s32 %r6127, %r6126, %r6088; + xor.b32 %r6128, %r6127, %r6079; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 20; + add.s32 %r6130, %r6080, %r6129; + xor.b32 %r6131, %r6130, %r6126; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 24; + add.s32 %r6133, %r6132, %r6127; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 25; + add.s32 %r6136, %r6074, %r6090; + xor.b32 %r6137, %r6136, %r6062; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 16; + add.s32 %r6139, %r6138, %r6098; + xor.b32 %r6140, %r6139, %r6090; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 20; + add.s32 %r6142, %r6136, %r6141; + xor.b32 %r6143, %r6142, %r6138; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 24; + add.s32 %r6145, %r6144, %r6139; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 25; + add.s32 %r6148, %r6107, %r6034; + add.s32 %r6149, %r6148, %r6147; + xor.b32 %r6150, %r6149, %r6121; + shf.l.wrap.b32 %r6151, %r6150, %r6150, 16; + add.s32 %r6152, %r6151, %r6133; + xor.b32 %r6153, %r6152, %r6147; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6149, %r6154; + xor.b32 %r6156, %r6155, %r6151; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 24; + add.s32 %r6158, %r6157, %r6152; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 25; + add.s32 %r6161, %r6119, %r6042; + add.s32 %r6162, %r6161, %r6112; + xor.b32 %r6163, %r6162, %r6132; + shf.l.wrap.b32 %r6164, %r6163, %r6163, 16; + add.s32 %r6165, %r6164, %r6145; + xor.b32 %r6166, %r6165, %r6112; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 20; + add.s32 %r6168, %r6162, %r6167; + xor.b32 %r6169, %r6168, %r6164; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 24; + add.s32 %r6171, %r6170, %r6165; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 25; + add.s32 %r6174, %r6130, %r6124; + xor.b32 %r6175, %r6144, %r6174; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 16; + add.s32 %r6177, %r6176, %r6110; + xor.b32 %r6178, %r6177, %r6124; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 20; + add.s32 %r6180, %r6174, %r6018; + add.s32 %r6181, %r6180, %r6179; + xor.b32 %r6182, %r6181, %r6176; + shf.l.wrap.b32 %r6183, %r6182, %r6182, 24; + add.s32 %r6184, %r6183, %r6177; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 25; + add.s32 %r6187, %r6142, %r6135; + xor.b32 %r6188, %r6109, %r6187; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 16; + add.s32 %r6190, %r6189, %r6122; + xor.b32 %r6191, %r6190, %r6135; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 20; + add.s32 %r6193, %r6187, %r6192; + xor.b32 %r6194, %r6193, %r6189; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 24; + add.s32 %r6196, %r6195, %r6190; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 25; + add.s32 %r6199, %r6155, %r6026; + add.s32 %r6200, %r6199, %r6173; + xor.b32 %r6201, %r6200, 
%r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 16; + add.s32 %r6203, %r6202, %r6184; + xor.b32 %r6204, %r6203, %r6173; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 20; + add.s32 %r6206, %r6200, %r6205; + xor.b32 %r6207, %r6206, %r6202; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 24; + add.s32 %r6209, %r6208, %r6203; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 25; + add.s32 %r6212, %r6186, %r6168; + xor.b32 %r6213, %r6157, %r6212; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 16; + add.s32 %r6215, %r6214, %r6196; + xor.b32 %r6216, %r6215, %r6186; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 20; + add.s32 %r6218, %r6212, %r6217; + xor.b32 %r6219, %r6218, %r6214; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 24; + add.s32 %r6221, %r6220, %r6215; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 25; + add.s32 %r6224, %r6181, %r6198; + xor.b32 %r6225, %r6170, %r6224; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 16; + add.s32 %r6227, %r6226, %r6158; + xor.b32 %r6228, %r6227, %r6198; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 20; + add.s32 %r6230, %r6224, %r6229; + xor.b32 %r6231, %r6230, %r6226; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 24; + add.s32 %r6233, %r6232, %r6227; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 25; + add.s32 %r6236, %r6193, %r6160; + xor.b32 %r6237, %r6236, %r6183; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 16; + add.s32 %r6239, %r6238, %r6171; + xor.b32 %r6240, %r6239, %r6160; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 20; + add.s32 %r6242, %r6236, %r6241; + xor.b32 %r6243, %r6242, %r6238; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6239; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6206, %r6042; + add.s32 %r6249, %r6248, %r6247; + xor.b32 %r6250, %r6249, %r6220; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6251, %r6233; + xor.b32 %r6253, %r6252, %r6247; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6254; + xor.b32 %r6256, %r6255, %r6251; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 24; + add.s32 %r6258, %r6257, %r6252; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 25; + add.s32 %r6261, %r6218, %r6211; + xor.b32 %r6262, %r6261, %r6232; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 16; + add.s32 %r6264, %r6263, %r6245; + xor.b32 %r6265, %r6264, %r6211; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 20; + add.s32 %r6267, %r6261, %r6266; + xor.b32 %r6268, %r6267, %r6263; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 24; + add.s32 %r6270, %r6269, %r6264; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 25; + add.s32 %r6273, %r6230, %r6223; + xor.b32 %r6274, %r6244, %r6273; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 16; + add.s32 %r6276, %r6275, %r6209; + xor.b32 %r6277, %r6276, %r6223; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 20; + add.s32 %r6279, %r6273, %r6034; + add.s32 %r6280, %r6279, %r6278; + xor.b32 %r6281, %r6280, %r6275; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 24; + add.s32 %r6283, %r6282, %r6276; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 25; + add.s32 %r6286, %r6242, %r6235; + xor.b32 %r6287, %r6208, %r6286; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 16; + add.s32 %r6289, %r6288, %r6221; + xor.b32 %r6290, %r6289, %r6235; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 20; + add.s32 %r6292, %r6286, %r6291; + xor.b32 %r6293, %r6292, %r6288; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 24; + add.s32 %r6295, %r6294, %r6289; + xor.b32 
%r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 25; + add.s32 %r6298, %r6255, %r6272; + xor.b32 %r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 16; + add.s32 %r6301, %r6300, %r6283; + xor.b32 %r6302, %r6301, %r6272; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 20; + add.s32 %r6304, %r6298, %r6303; + xor.b32 %r6305, %r6304, %r6300; + shf.l.wrap.b32 %r6306, %r6305, %r6305, 24; + add.s32 %r6307, %r6306, %r6301; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 25; + add.s32 %r6310, %r6285, %r6267; + xor.b32 %r6311, %r6257, %r6310; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 16; + add.s32 %r6313, %r6312, %r6295; + xor.b32 %r6314, %r6313, %r6285; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 20; + add.s32 %r6316, %r6310, %r6018; + add.s32 %r6317, %r6316, %r6315; + xor.b32 %r6318, %r6317, %r6312; + shf.l.wrap.b32 %r6319, %r6318, %r6318, 24; + add.s32 %r6320, %r6319, %r6313; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 25; + add.s32 %r6323, %r6280, %r6297; + xor.b32 %r6324, %r6269, %r6323; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 16; + add.s32 %r6326, %r6325, %r6258; + xor.b32 %r6327, %r6326, %r6297; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 20; + add.s32 %r6329, %r6323, %r6328; + xor.b32 %r6330, %r6329, %r6325; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 24; + add.s32 %r6332, %r6331, %r6326; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 25; + add.s32 %r6335, %r6292, %r6260; + xor.b32 %r6336, %r6335, %r6282; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 16; + add.s32 %r6338, %r6337, %r6270; + xor.b32 %r6339, %r6338, %r6260; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 20; + add.s32 %r6341, %r6335, %r6026; + add.s32 %r6342, %r6341, %r6340; + xor.b32 %r6343, %r6342, %r6337; + shf.l.wrap.b32 %r6344, %r6343, %r6343, 24; + add.s32 %r6345, %r6344, %r6338; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 25; + add.s32 %r6348, %r6304, %r6347; + xor.b32 %r6349, %r6348, %r6319; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 16; + add.s32 %r6351, %r6350, %r6332; + xor.b32 %r6352, %r6351, %r6347; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 20; + add.s32 %r6354, %r6348, %r6353; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6351; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6317, %r6309; + xor.b32 %r6361, %r6360, %r6331; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 16; + add.s32 %r6363, %r6362, %r6345; + xor.b32 %r6364, %r6363, %r6309; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 20; + add.s32 %r6366, %r6360, %r6365; + xor.b32 %r6367, %r6366, %r6362; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 24; + add.s32 %r6369, %r6368, %r6363; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 25; + add.s32 %r6372, %r6329, %r6322; + xor.b32 %r6373, %r6344, %r6372; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 16; + add.s32 %r6375, %r6374, %r6307; + xor.b32 %r6376, %r6375, %r6322; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 20; + add.s32 %r6378, %r6372, %r6042; + add.s32 %r6379, %r6378, %r6377; + xor.b32 %r6380, %r6379, %r6374; + shf.l.wrap.b32 %r6381, %r6380, %r6380, 24; + add.s32 %r6382, %r6381, %r6375; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 25; + add.s32 %r6385, %r6342, %r6334; + xor.b32 %r6386, %r6306, %r6385; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 16; + add.s32 %r6388, %r6387, %r6320; + xor.b32 %r6389, %r6388, %r6334; + shf.l.wrap.b32 %r6390, %r6389, 
%r6389, 20; + add.s32 %r6391, %r6385, %r6390; + xor.b32 %r6392, %r6391, %r6387; + shf.l.wrap.b32 %r6393, %r6392, %r6392, 24; + add.s32 %r6394, %r6393, %r6388; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 25; + add.s32 %r6397, %r6354, %r6371; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 16; + add.s32 %r6400, %r6399, %r6382; + xor.b32 %r6401, %r6400, %r6371; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 20; + add.s32 %r6403, %r6397, %r6018; + add.s32 %r6404, %r6403, %r6402; + xor.b32 %r6405, %r6404, %r6399; + shf.l.wrap.b32 %r6406, %r6405, %r6405, 24; + add.s32 %r6407, %r6406, %r6400; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 25; + add.s32 %r6410, %r6384, %r6366; + xor.b32 %r6411, %r6356, %r6410; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 16; + add.s32 %r6413, %r6412, %r6394; + xor.b32 %r6414, %r6413, %r6384; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 20; + add.s32 %r6416, %r6410, %r6034; + add.s32 %r6417, %r6416, %r6415; + xor.b32 %r6418, %r6417, %r6412; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 24; + add.s32 %r6420, %r6419, %r6413; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 25; + add.s32 %r6423, %r6379, %r6396; + xor.b32 %r6424, %r6368, %r6423; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 16; + add.s32 %r6426, %r6425, %r6357; + xor.b32 %r6427, %r6426, %r6396; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 20; + add.s32 %r6429, %r6423, %r6428; + xor.b32 %r6430, %r6429, %r6425; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 24; + add.s32 %r6432, %r6431, %r6426; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 25; + add.s32 %r6435, %r6391, %r6026; + add.s32 %r6436, %r6435, %r6359; + xor.b32 %r6437, %r6436, %r6381; + shf.l.wrap.b32 %r6438, %r6437, %r6437, 16; + add.s32 %r6439, %r6438, %r6369; + xor.b32 %r6440, %r6439, %r6359; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 20; + add.s32 %r6442, %r6436, %r6441; + xor.b32 %r6443, %r6442, %r6438; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 24; + add.s32 %r6445, %r6444, %r6439; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 25; + add.s32 %r6448, %r6404, %r6447; + xor.b32 %r6449, %r6448, %r6419; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 16; + add.s32 %r6451, %r6450, %r6432; + xor.b32 %r6452, %r6451, %r6447; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 20; + add.s32 %r6454, %r6448, %r6453; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 24; + add.s32 %r6457, %r6456, %r6451; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 25; + add.s32 %r6460, %r6417, %r6409; + xor.b32 %r6461, %r6460, %r6431; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 16; + add.s32 %r6463, %r6462, %r6445; + xor.b32 %r6464, %r6463, %r6409; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 20; + add.s32 %r6466, %r6460, %r6465; + xor.b32 %r6467, %r6466, %r6462; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6463; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6429, %r6422; + xor.b32 %r6473, %r6444, %r6472; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 16; + add.s32 %r6475, %r6474, %r6407; + xor.b32 %r6476, %r6475, %r6422; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 20; + add.s32 %r6478, %r6472, %r6477; + xor.b32 %r6479, %r6478, %r6474; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 24; + add.s32 %r6481, %r6480, %r6475; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 25; + add.s32 %r6484, %r6442, %r6434; + xor.b32 
%r6485, %r6406, %r6484;
+	shf.l.wrap.b32 	%r6486, %r6485, %r6485, 16;
+	add.s32 	%r6487, %r6486, %r6420;
+	xor.b32 	%r6488, %r6487, %r6434;
+	shf.l.wrap.b32 	%r6489, %r6488, %r6488, 20;
+	add.s32 	%r6490, %r6484, %r6489;
+	xor.b32 	%r6491, %r6490, %r6486;
+	shf.l.wrap.b32 	%r6492, %r6491, %r6491, 24;
+	add.s32 	%r6493, %r6492, %r6487;
+	xor.b32 	%r6494, %r6493, %r6489;
+	shf.l.wrap.b32 	%r6495, %r6494, %r6494, 25;
[... roughly 300 further generated-PTX lines in the same pattern elided: the remaining unrolled rounds of this add/xor/rotate mix over %r6496-%r6747, with message words such as %r6018, %r6026, %r6034 and %r6042 folded into some of the adds, then the xor-folding of the two state halves (xor.b32 %r6748, %r6704, %r6733; ...) and the start of the byte-wise packing of the 256-bit result into %rd22-%rd25 ...]
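A note on what this generated code computes: the add/xor/shf.l.wrap cadence above (left-rotates by 16, 20, 24 and 25, i.e. right-rotates by 16, 12, 8 and 7) matches a BLAKE3-style quarter-round, with the extra operands in some adds being message words. A minimal CUDA sketch of the round the compiler unrolled, assuming the kernel source follows the reference G function (rotr32 and g are illustrative names, not taken from this patch):

#include <cstdint>

// Right rotation; the PTX expresses rotr(x, n) as shf.l.wrap by 32 - n.
__device__ __forceinline__ uint32_t rotr32(uint32_t x, uint32_t n) {
    return (x >> n) | (x << (32u - n));
}

// One BLAKE3/ChaCha-style quarter-round over four state words and two
// message words -- the pattern each PTX block above spells out.
__device__ void g(uint32_t v[16], int a, int b, int c, int d,
                  uint32_t mx, uint32_t my) {
    v[a] = v[a] + v[b] + mx;
    v[d] = rotr32(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];
    v[b] = rotr32(v[b] ^ v[c], 12);
    v[a] = v[a] + v[b] + my;
    v[d] = rotr32(v[d] ^ v[a], 8);
    v[c] = v[c] + v[d];
    v[b] = rotr32(v[b] ^ v[c], 7);
}

Seven such rounds over a 16-word state, followed by out[i] = v[i] ^ v[i + 8], is the standard BLAKE3 compression shape, which is what the xor-folding after the last round suggests.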
+	and.b64 	%rd434, %rd433, 280375465082880;
[... packing continues: the bfi.b64/and.b64/or.b64 runs (masks 0x0000FF0000000000 and 0x00FF000000000000) assemble each pair of 32-bit words into one 64-bit hash word, yielding %rd22-%rd25, which are then stored into a zeroed 80-byte scratch buffer at %SPL+2016 ...]
+	and.b64 	%rd1260, %rd22, 255;
+
+$L__BB2_19:
+	mov.b64 	{%r20, %r19}, %rd1260;
+	mul.wide.u32 	%rd463, %r20, 1908875315;
+	shr.u64 	%rd464, %rd463, 56;
+	cvt.u32.u64 	%r6765, %rd464;
+	mul.lo.s32 	%r6766, %r6765, 37748717;
+	sub.s32 	%r21, %r20, %r6766;
[... elided: two more identical mod-37748717 reductions for %r24 and %r28, two mod-1179641 reductions (magic -954391867 with the add/shift fixup) that turn %r30 = 2*%r21 and %r31 = %r30|1 into 64-byte cache offsets %rd34 and %rd35, a null test on the dataset pointer (setp.eq.s64 %p16, %rd353, 0; @%p16 bra $L__BB2_33;), a per-item zero test branching to $L__BB2_22, and, on the fast path, the 128-byte dataset item loads from %rd36 ...]
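The mul.wide.u32 / shr / mul.lo / sub sequences in $L__BB2_19 are strength-reduced remainders for two fixed table sizes, 37748717 and 1179641, which are the FishHash full-dataset and light-cache item counts. A sketch of the first reduction with the constants taken from the PTX (mod_full_items is an illustrative name):

#include <cstdint>

// x % 37748717 without a divide, as the PTX above computes it.
// 1908875315 = ceil(2^56 / 37748717); the rounding error stays below
// one quotient step for every 32-bit x, so q is the exact quotient.
__device__ uint32_t mod_full_items(uint32_t x) {
    uint32_t q = (uint32_t)(((uint64_t)x * 1908875315ull) >> 56);
    return x - q * 37748717u;
}

The mod-1179641 reductions that produce the cache offsets %rd34/%rd35 appear to use the same idea with the classic add-and-shift fixup (magic -954391867), the variant compilers emit when the multiplier does not fit in 32 bits.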
+	bra.uni 	$L__BB2_44;
+
+$L__BB2_33:
+	st.local.u64 	[%rd3], %rd354;
+	mov.u64 	%rd579, 1179641;
+	st.local.u64 	[%rd3+8], %rd579;
+	st.local.u32 	[%rd3+16], %r30;
[... elided: the 64-byte light-cache entry at %rd34 is copied into the local block, its first 32-bit word is xored with the item index (xor.b32 %r10111, %r30, %r10110), the 25-lane sponge state is cleared across the %r29776-%r29826 register bank, and the keccak round loop is entered:]
+
+$L__BB2_34:
+	// begin inline asm
+	// xor5
+	lop3.b32 %r10114, %r29812, %r29810, %r29808, 0x96;
+	lop3.b32 %r10114, %r10114, %r29806, %r29804, 0x96;
+	lop3.b32 %r10115, %r29813, %r29811, %r29809, 0x96;
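The null test on %rd353 and this $L__BB2_33 block follow the usual FishHash item lookup: take the precomputed 1024-bit dataset entry when a full dataset was built, otherwise derive the item from the light cache with the keccak sponge below. A hedged reconstruction of that control flow; fishhash_context and hash512 appear in the file's mangled symbols, while hash1024, the field names and calculate_dataset_item_1024 are assumptions:

#include <cstdint>

struct hash512  { uint64_t word64s[8]; };   // 64-byte light-cache item
struct hash1024 { hash512 h[2]; };          // 128-byte dataset item
struct fishhash_context {                   // field names assumed
    int            light_cache_num_items;   // 1179641 in the PTX above
    const hash512* light_cache;
    hash1024*      full_dataset;            // may be null
};

__device__ hash1024 calculate_dataset_item_1024(const fishhash_context*,
                                                uint32_t index);

// Shape of the branch compiled above (a sketch, not the committed source).
__device__ hash1024 lookup_item(const fishhash_context* ctx, uint32_t index) {
    if (ctx->full_dataset != nullptr)
        return ctx->full_dataset[index];             // fast path: %rd353 != 0
    return calculate_dataset_item_1024(ctx, index);  // slow path: $L__BB2_33
}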
+	lop3.b32 %r10115, %r10115, %r29807, %r29805, 0x96;
+	// end inline asm
[... from here the hunk interleaves removals of the old heavyhash PTX with the new keccak additions, in this shape:]
 	// begin inline asm
-	dp4a.u32.u32 %r1712, %r1713, %r5870, %r6249;
+	// xor5
+	lop3.b32 %r10126, %r29824, %r29822, %r29802, 0x96;
+	lop3.b32 %r10126, %r10126, %r29800, %r29798, 0x96;
+	lop3.b32 %r10127, %r29825, %r29823, %r29803, 0x96;
+	lop3.b32 %r10127, %r10127, %r29801, %r29799, 0x96;
 	// end inline asm
-	ld.const.u32 	%r1717, [matrix+4];
 	// begin inline asm
-	dp4a.u32.u32 %r1716, %r1717, %r5874, %r1712;
+	shf.l.wrap.b32 	%r10174, %r10127, %r10126, %r10083;
 	// end inline asm
[... roughly 1,500 interleaved lines elided. Removed: the heavyhash inner product, ld.const.u32 reads from matrix+4 through matrix+508 feeding chained dp4a.u32.u32 steps, each finished row folded to output nibbles via shr.u32 / and.b32 240 / lop3 ... 0x56. Added: one full theta/rho-pi/chi/iota keccak-f[1600] round per iteration, i.e. five-way xors via lop3 ... 0x96, half-lane rotations by 1, 44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6 and 3 via paired shf.l.wrap.b32, chi via lop3 ... 0xD2, and the round constant read from keccak_round_constants. The loop closes with:]
+	add.s32 	%r29826, %r29826, 1;
+	setp.lt.u32 	%p23, %r29826, 23;
+	@%p23 bra 	$L__BB2_34;
+
+	add.u64 	%rd84, %SPL, 1912;
[... elided: the lane pairs are stored back to [%rd3+24..216] and the final round is finished in straight-line code ...]
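On the removal side, every deleted ld.const.u32-from-matrix plus chained dp4a.u32.u32 was one step of the retired heavyhash matrix-vector product. A sketch of what one output element computed, under the usual heavyhash layout of a 4 KiB constant matrix walked four packed bytes per DP4A (heavyhash_row is an illustrative name):

#include <cstdint>

// One element of the old heavyhash product: sixteen DP4A steps accumulate
// a 64-element dot product, four packed bytes at a time. __dp4a needs
// sm_61 or newer, matching the sm_61 target in build_fishlibs.sh.
__device__ uint32_t heavyhash_row(const uint32_t row[16],
                                  const uint32_t vec[16]) {
    uint32_t acc = 0;
    for (int k = 0; k < 16; ++k)
        acc = __dp4a(row[k], vec[k], acc);  // acc += 4 byte-wise products
    return acc;
}

The deleted shr.u32 / and.b32 240 / lop3 ... 0x56 triples that followed each pair of rows folded the two 32-bit sums back into a single output byte.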
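On the addition side, each $L__BB2_34 (and later $L__BB2_36) iteration is one keccak-f[1600] round carried out on 32-bit register halves: lop3 ... 0x96 is a three-input XOR (theta's column parity), the shf.l.wrap.b32 pairs are the rho rotations split across lane halves, lop3 ... 0xD2 computes a ^ (~b & c) (chi), and the ld.global.nc from keccak_round_constants is iota. The same round on whole 64-bit lanes, as a sketch; RHO and PI are the standard tables, which also appear as the rho/pi globals in this PTX:

#include <cstdint>

__device__ __forceinline__ uint64_t rotl64(uint64_t x, int n) {
    return (x << n) | (x >> (64 - n));
}

__constant__ int RHO[24] = {  1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
                             27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44 };
__constant__ int PI[24]  = { 10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
                             15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1 };

// One keccak-f[1600] round over the 5x5 lane state.
__device__ void keccak_round(uint64_t s[25], uint64_t rc) {
    uint64_t bc[5], t;
    // theta -- the lop3 ... 0x96 five-way xors plus two rotate-by-1 shf's
    for (int i = 0; i < 5; ++i)
        bc[i] = s[i] ^ s[i + 5] ^ s[i + 10] ^ s[i + 15] ^ s[i + 20];
    for (int i = 0; i < 5; ++i) {
        t = bc[(i + 4) % 5] ^ rotl64(bc[(i + 1) % 5], 1);
        for (int j = 0; j < 25; j += 5) s[j + i] ^= t;
    }
    // rho + pi -- the shf.l.wrap rotations by 44, 20, 61, 39, ... above
    t = s[1];
    for (int i = 0; i < 24; ++i) {
        uint64_t tmp = s[PI[i]];
        s[PI[i]] = rotl64(t, RHO[i]);
        t = tmp;
    }
    // chi -- the lop3 ... 0xD2 triples
    for (int j = 0; j < 25; j += 5) {
        uint64_t row[5];
        for (int i = 0; i < 5; ++i) row[i] = s[j + i];
        for (int i = 0; i < 5; ++i)
            s[j + i] = row[i] ^ (~row[(i + 1) % 5] & row[(i + 2) % 5]);
    }
    s[0] ^= rc;  // iota -- the ld.global.nc from keccak_round_constants
}

The compiled loop body covers rounds 0 through 22 (setp.lt.u32 %p23, %r29826, 23); round 23 is unrolled after it, which is why a separate constant load from keccak_round_constants+184, the last table entry, appears outside the loop.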
[... the hunk continues in the same interleaved fashion, with the dp4a/matrix removals running on toward matrix+824: the first sponge's last round is completed and xored with the last round constant, a second sponge for item %r31 is initialised at %SPL+1912 from the cache entry at %rd35 (again 1179641 items, first word xored with %r31), its register bank %r29827-%r29877 is reset, and a second round loop with the identical xor5 / shf.l.wrap / chi structure runs at $L__BB2_36 ...]
+	// chi
+	lop3.b32 %r29843, %r11035, %r11091, %r11115, 0xD2;
+	lop3.b32
%r29844, %r11039, %r11095, %r11119, 0xD2; // end inline asm - ld.const.u32 %r2565, [matrix+828]; + st.local.v2.u32 [%rd84+200], {%r29843, %r29844}; // begin inline asm - dp4a.u32.u32 %r2564, %r2565, %r5930, %r2560; + // chi + lop3.b32 %r29835, %r11091, %r11115, %r11051, 0xD2; + lop3.b32 %r29836, %r11095, %r11119, %r11055, 0xD2; // end inline asm - ld.const.u32 %r2569, [matrix+832]; + st.local.v2.u32 [%rd84+208], {%r29835, %r29836}; // begin inline asm - dp4a.u32.u32 %r2568, %r2569, %r5870, %r6249; + // chi + lop3.b32 %r29827, %r11115, %r11051, %r11123, 0xD2; + lop3.b32 %r29828, %r11119, %r11055, %r11127, 0xD2; // end inline asm - ld.const.u32 %r2573, [matrix+836]; + st.local.v2.u32 [%rd84+216], {%r29827, %r29828}; + mul.wide.s32 %rd618, %r29877, 8; + add.s64 %rd617, %rd597, %rd618; // begin inline asm - dp4a.u32.u32 %r2572, %r2573, %r5874, %r2568; + ld.global.nc.v2.u32 {%r11403,%r11404}, [%rd617]; // end inline asm - ld.const.u32 %r2577, [matrix+840]; + xor.b32 %r29863, %r11203, %r11403; + xor.b32 %r29864, %r11204, %r11404; + add.s32 %r29877, %r29877, 1; + setp.lt.u32 %p24, %r29877, 23; + @%p24 bra $L__BB2_36; + + mov.u32 %r29910, 0; + mov.u32 %r11514, 1; + st.local.v2.u32 [%rd84+32], {%r29875, %r29876}; + st.local.v2.u32 [%rd84+72], {%r29873, %r29874}; + st.local.v2.u32 [%rd84+40], {%r29871, %r29872}; + st.local.v2.u32 [%rd84+80], {%r29869, %r29870}; + st.local.v2.u32 [%rd84+48], {%r29867, %r29868}; + st.local.v2.u32 [%rd84+56], {%r29865, %r29866}; + st.local.v2.u32 [%rd84+24], {%r29863, %r29864}; // begin inline asm - dp4a.u32.u32 %r2576, %r2577, %r5878, %r2572; + // xor5 + lop3.b32 %r11415, %r29863, %r29861, %r29859, 0x96; + lop3.b32 %r11415, %r11415, %r29857, %r29855, 0x96; + lop3.b32 %r11416, %r29864, %r29862, %r29860, 0x96; + lop3.b32 %r11416, %r11416, %r29858, %r29856, 0x96; // end inline asm - ld.const.u32 %r2581, [matrix+844]; // begin inline asm - dp4a.u32.u32 %r2580, %r2581, %r5882, %r2576; + // xor5 + lop3.b32 %r11427, %r29875, %r29873, %r29853, 0x96; + lop3.b32 %r11427, %r11427, %r29851, %r29849, 0x96; + lop3.b32 %r11428, %r29876, %r29874, %r29854, 0x96; + lop3.b32 %r11428, %r11428, %r29852, %r29850, 0x96; // end inline asm - ld.const.u32 %r2585, [matrix+848]; // begin inline asm - dp4a.u32.u32 %r2584, %r2585, %r5886, %r2580; + // xor5 + lop3.b32 %r11439, %r29871, %r29869, %r29847, 0x96; + lop3.b32 %r11439, %r11439, %r29845, %r29843, 0x96; + lop3.b32 %r11440, %r29872, %r29870, %r29848, 0x96; + lop3.b32 %r11440, %r11440, %r29846, %r29844, 0x96; // end inline asm - ld.const.u32 %r2589, [matrix+852]; // begin inline asm - dp4a.u32.u32 %r2588, %r2589, %r5890, %r2584; + // xor5 + lop3.b32 %r11451, %r29867, %r29841, %r29839, 0x96; + lop3.b32 %r11451, %r11451, %r29837, %r29835, 0x96; + lop3.b32 %r11452, %r29868, %r29842, %r29840, 0x96; + lop3.b32 %r11452, %r11452, %r29838, %r29836, 0x96; // end inline asm - ld.const.u32 %r2593, [matrix+856]; // begin inline asm - dp4a.u32.u32 %r2592, %r2593, %r5894, %r2588; + // xor5 + lop3.b32 %r11463, %r29865, %r29833, %r29831, 0x96; + lop3.b32 %r11463, %r11463, %r29829, %r29827, 0x96; + lop3.b32 %r11464, %r29866, %r29834, %r29832, 0x96; + lop3.b32 %r11464, %r11464, %r29830, %r29828, 0x96; // end inline asm - ld.const.u32 %r2597, [matrix+860]; // begin inline asm - dp4a.u32.u32 %r2596, %r2597, %r5898, %r2592; + shf.l.wrap.b32 %r11475, %r11428, %r11427, %r11514; // end inline asm - ld.const.u32 %r2601, [matrix+864]; // begin inline asm - dp4a.u32.u32 %r2600, %r2601, %r5902, %r2596; + shf.l.wrap.b32 %r11479, %r11427, %r11428, %r11514; // end inline 
asm - ld.const.u32 %r2605, [matrix+868]; + xor.b32 %r11654, %r11475, %r11463; + xor.b32 %r11655, %r11479, %r11464; + xor.b32 %r11622, %r29863, %r11654; + xor.b32 %r11625, %r29864, %r11655; + xor.b32 %r11585, %r29860, %r11655; + xor.b32 %r11584, %r29859, %r11654; + st.local.v2.u32 [%rd84+104], {%r11584, %r11585}; // begin inline asm - dp4a.u32.u32 %r2604, %r2605, %r5906, %r2600; + shf.l.wrap.b32 %r11483, %r11440, %r11439, %r11514; // end inline asm - ld.const.u32 %r2609, [matrix+872]; // begin inline asm - dp4a.u32.u32 %r2608, %r2609, %r5910, %r2604; + shf.l.wrap.b32 %r11487, %r11439, %r11440, %r11514; // end inline asm - ld.const.u32 %r2613, [matrix+876]; + xor.b32 %r11656, %r11483, %r11415; + xor.b32 %r11657, %r11487, %r11416; + xor.b32 %r11521, %r29873, %r11656; + xor.b32 %r11520, %r29874, %r11657; + xor.b32 %r11560, %r29852, %r11657; + xor.b32 %r11561, %r29851, %r11656; + st.local.v2.u32 [%rd84+152], {%r11561, %r11560}; // begin inline asm - dp4a.u32.u32 %r2612, %r2613, %r5914, %r2608; + shf.l.wrap.b32 %r11491, %r11452, %r11451, %r11514; // end inline asm - ld.const.u32 %r2617, [matrix+880]; // begin inline asm - dp4a.u32.u32 %r2616, %r2617, %r5918, %r2612; + shf.l.wrap.b32 %r11495, %r11451, %r11452, %r11514; // end inline asm - ld.const.u32 %r2621, [matrix+884]; + xor.b32 %r11658, %r11491, %r11427; + xor.b32 %r11659, %r11495, %r11428; + xor.b32 %r11544, %r29848, %r11659; + xor.b32 %r11545, %r29847, %r11658; + st.local.v2.u32 [%rd84+120], {%r11545, %r11544}; + xor.b32 %r11536, %r29844, %r11659; + xor.b32 %r11537, %r29843, %r11658; + st.local.v2.u32 [%rd84+200], {%r11537, %r11536}; // begin inline asm - dp4a.u32.u32 %r2620, %r2621, %r5922, %r2616; + shf.l.wrap.b32 %r11499, %r11464, %r11463, %r11514; // end inline asm - ld.const.u32 %r2625, [matrix+888]; // begin inline asm - dp4a.u32.u32 %r2624, %r2625, %r5926, %r2620; + shf.l.wrap.b32 %r11503, %r11463, %r11464, %r11514; // end inline asm - ld.const.u32 %r2629, [matrix+892]; + xor.b32 %r11660, %r11499, %r11439; + xor.b32 %r11661, %r11503, %r11440; + xor.b32 %r11568, %r29867, %r11660; + xor.b32 %r11569, %r29868, %r11661; + xor.b32 %r11577, %r29838, %r11661; + xor.b32 %r11576, %r29837, %r11660; + st.local.v2.u32 [%rd84+168], {%r11576, %r11577}; // begin inline asm - dp4a.u32.u32 %r2628, %r2629, %r5930, %r2624; + shf.l.wrap.b32 %r11507, %r11416, %r11415, %r11514; // end inline asm - shr.u32 %r6083, %r2564, 6; - and.b32 %r2633, %r6083, 240; - shr.u32 %r2634, %r2628, 10; - and.b32 %r2635, %r5962, 255; // begin inline asm - lop3.b32 %r2632, %r2633, %r2634, %r2635, 0x56; + shf.l.wrap.b32 %r11511, %r11415, %r11416, %r11514; // end inline asm - ld.const.u32 %r2637, [matrix+896]; + xor.b32 %r11662, %r11507, %r11451; + xor.b32 %r11663, %r11511, %r11452; + xor.b32 %r11528, %r29833, %r11662; + xor.b32 %r11529, %r29834, %r11663; + xor.b32 %r11553, %r29828, %r11663; + xor.b32 %r11552, %r29827, %r11662; + st.local.v2.u32 [%rd84+216], {%r11552, %r11553}; // begin inline asm - dp4a.u32.u32 %r2636, %r2637, %r5870, %r6249; + shf.l.wrap.b32 %r11515, %r11521, %r11520, %r11018; // end inline asm - ld.const.u32 %r2641, [matrix+900]; // begin inline asm - dp4a.u32.u32 %r2640, %r2641, %r5874, %r2636; + shf.l.wrap.b32 %r11519, %r11520, %r11521, %r11018; // end inline asm - ld.const.u32 %r2645, [matrix+904]; // begin inline asm - dp4a.u32.u32 %r2644, %r2645, %r5878, %r2640; + shf.l.wrap.b32 %r11523, %r11529, %r11528, %r11026; // end inline asm - ld.const.u32 %r2649, [matrix+908]; // begin inline asm - dp4a.u32.u32 %r2648, %r2649, %r5882, %r2644; + shf.l.wrap.b32 
%r11527, %r11528, %r11529, %r11026; // end inline asm - ld.const.u32 %r2653, [matrix+912]; // begin inline asm - dp4a.u32.u32 %r2652, %r2653, %r5886, %r2648; + shf.l.wrap.b32 %r11535, %r11536, %r11537, %r11034; // end inline asm - ld.const.u32 %r2657, [matrix+916]; // begin inline asm - dp4a.u32.u32 %r2656, %r2657, %r5890, %r2652; + shf.l.wrap.b32 %r11531, %r11537, %r11536, %r11034; // end inline asm - ld.const.u32 %r2661, [matrix+920]; + st.local.v2.u32 [%rd84+96], {%r11531, %r11535}; // begin inline asm - dp4a.u32.u32 %r2660, %r2661, %r5894, %r2656; + shf.l.wrap.b32 %r11539, %r11545, %r11544, %r11066; // end inline asm - ld.const.u32 %r2665, [matrix+924]; // begin inline asm - dp4a.u32.u32 %r2664, %r2665, %r5898, %r2660; + shf.l.wrap.b32 %r11543, %r11544, %r11545, %r11066; // end inline asm - ld.const.u32 %r2669, [matrix+928]; // begin inline asm - dp4a.u32.u32 %r2668, %r2669, %r5902, %r2664; + shf.l.wrap.b32 %r11547, %r11553, %r11552, %r11114; // end inline asm - ld.const.u32 %r2673, [matrix+932]; // begin inline asm - dp4a.u32.u32 %r2672, %r2673, %r5906, %r2668; + shf.l.wrap.b32 %r11551, %r11552, %r11553, %r11114; // end inline asm - ld.const.u32 %r2677, [matrix+936]; // begin inline asm - dp4a.u32.u32 %r2676, %r2677, %r5910, %r2672; + shf.l.wrap.b32 %r11559, %r11560, %r11561, %r11138; // end inline asm - ld.const.u32 %r2681, [matrix+940]; // begin inline asm - dp4a.u32.u32 %r2680, %r2681, %r5914, %r2676; + shf.l.wrap.b32 %r11555, %r11561, %r11560, %r11138; // end inline asm - ld.const.u32 %r2685, [matrix+944]; + st.local.v2.u32 [%rd84+88], {%r11555, %r11559}; // begin inline asm - dp4a.u32.u32 %r2684, %r2685, %r5918, %r2680; + shf.l.wrap.b32 %r11563, %r11569, %r11568, %r11154; // end inline asm - ld.const.u32 %r2689, [matrix+948]; // begin inline asm - dp4a.u32.u32 %r2688, %r2689, %r5922, %r2684; + shf.l.wrap.b32 %r11567, %r11568, %r11569, %r11154; // end inline asm - ld.const.u32 %r2693, [matrix+952]; // begin inline asm - dp4a.u32.u32 %r2692, %r2693, %r5926, %r2688; + shf.l.wrap.b32 %r11571, %r11577, %r11576, %r11162; // end inline asm - ld.const.u32 %r2697, [matrix+956]; // begin inline asm - dp4a.u32.u32 %r2696, %r2697, %r5930, %r2692; + shf.l.wrap.b32 %r11575, %r11576, %r11577, %r11162; // end inline asm - ld.const.u32 %r2701, [matrix+960]; // begin inline asm - dp4a.u32.u32 %r2700, %r2701, %r5870, %r6249; + shf.l.wrap.b32 %r11579, %r11585, %r11584, %r11194; // end inline asm - ld.const.u32 %r2705, [matrix+964]; // begin inline asm - dp4a.u32.u32 %r2704, %r2705, %r5874, %r2700; + shf.l.wrap.b32 %r11583, %r11584, %r11585, %r11194; // end inline asm - ld.const.u32 %r2709, [matrix+968]; // begin inline asm - dp4a.u32.u32 %r2708, %r2709, %r5878, %r2704; + // chi + lop3.b32 %r11587, %r11622, %r11515, %r11539, 0xD2; + lop3.b32 %r11588, %r11625, %r11519, %r11543, 0xD2; // end inline asm - ld.const.u32 %r2713, [matrix+972]; // begin inline asm - dp4a.u32.u32 %r2712, %r2713, %r5882, %r2708; + // chi + lop3.b32 %r30010, %r11515, %r11539, %r11571, 0xD2; + lop3.b32 %r30011, %r11519, %r11543, %r11575, 0xD2; // end inline asm - ld.const.u32 %r2717, [matrix+976]; + st.local.v2.u32 [%rd84+32], {%r30010, %r30011}; // begin inline asm - dp4a.u32.u32 %r2716, %r2717, %r5886, %r2712; + // chi + lop3.b32 %r30006, %r11539, %r11571, %r11547, 0xD2; + lop3.b32 %r30007, %r11543, %r11575, %r11551, 0xD2; // end inline asm - ld.const.u32 %r2721, [matrix+980]; + st.local.v2.u32 [%rd84+40], {%r30006, %r30007}; // begin inline asm - dp4a.u32.u32 %r2720, %r2721, %r5890, %r2716; + // chi + lop3.b32 %r30002, 
%r11571, %r11547, %r11622, 0xD2; + lop3.b32 %r30003, %r11575, %r11551, %r11625, 0xD2; // end inline asm - ld.const.u32 %r2725, [matrix+984]; + st.local.v2.u32 [%rd84+48], {%r30002, %r30003}; // begin inline asm - dp4a.u32.u32 %r2724, %r2725, %r5894, %r2720; + // chi + lop3.b32 %r30000, %r11547, %r11622, %r11515, 0xD2; + lop3.b32 %r30001, %r11551, %r11625, %r11519, 0xD2; // end inline asm - ld.const.u32 %r2729, [matrix+988]; + st.local.v2.u32 [%rd84+56], {%r30000, %r30001}; // begin inline asm - dp4a.u32.u32 %r2728, %r2729, %r5898, %r2724; + // chi + lop3.b32 %r29996, %r11563, %r11523, %r11579, 0xD2; + lop3.b32 %r29997, %r11567, %r11527, %r11583, 0xD2; // end inline asm - ld.const.u32 %r2733, [matrix+992]; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; // begin inline asm - dp4a.u32.u32 %r2732, %r2733, %r5902, %r2728; + // chi + lop3.b32 %r30008, %r11523, %r11579, %r11555, 0xD2; + lop3.b32 %r30009, %r11527, %r11583, %r11559, 0xD2; // end inline asm - ld.const.u32 %r2737, [matrix+996]; + st.local.v2.u32 [%rd84+72], {%r30008, %r30009}; // begin inline asm - dp4a.u32.u32 %r2736, %r2737, %r5906, %r2732; + // chi + lop3.b32 %r30004, %r11579, %r11555, %r11531, 0xD2; + lop3.b32 %r30005, %r11583, %r11559, %r11535, 0xD2; // end inline asm - ld.const.u32 %r2741, [matrix+1000]; + st.local.v2.u32 [%rd84+80], {%r30004, %r30005}; // begin inline asm - dp4a.u32.u32 %r2740, %r2741, %r5910, %r2736; + ld.global.nc.v2.u32 {%r11651,%r11652}, [%rd598]; + // end inline asm + xor.b32 %r29998, %r11587, %r11651; + xor.b32 %r29999, %r11588, %r11652; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + add.s64 %rd86, %rd84, 24; + add.s64 %rd87, %rd3, 24; + +$L__BB2_38: + shl.b32 %r11664, %r29910, 2; + cvt.u64.u32 %rd628, %r11664; + and.b64 %rd629, %rd628, 60; + add.s64 %rd630, %rd87, %rd629; + xor.b32 %r11665, %r30, %r29910; + mul.lo.s32 %r11666, %r11665, 16777619; + ld.local.u32 %r11667, [%rd630]; + xor.b32 %r11668, %r11666, %r11667; + mul.wide.u32 %rd631, %r11668, -954391867; + shr.u64 %rd632, %rd631, 32; + cvt.u32.u64 %r11669, %rd632; + sub.s32 %r11670, %r11668, %r11669; + shr.u32 %r11671, %r11670, 1; + add.s32 %r11672, %r11671, %r11669; + shr.u32 %r11673, %r11672, 20; + mul.lo.s32 %r11674, %r11673, 1179641; + sub.s32 %r11675, %r11668, %r11674; + mul.wide.u32 %rd633, %r11675, 64; + add.s64 %rd634, %rd471, %rd633; + mul.lo.s32 %r11676, %r29947, 16777619; + ld.global.u32 %r11677, [%rd634]; + xor.b32 %r29947, %r11676, %r11677; + mul.lo.s32 %r11678, %r29948, 16777619; + ld.global.u32 %r11679, [%rd634+4]; + xor.b32 %r29948, %r11678, %r11679; + mul.lo.s32 %r11680, %r29959, 16777619; + ld.global.u32 %r11681, [%rd634+8]; + mul.lo.s32 %r11682, %r29960, 16777619; + ld.global.u32 %r11683, [%rd634+12]; + xor.b32 %r11684, %r11682, %r11683; + xor.b32 %r29959, %r11680, %r11681; + mov.b64 %rd635, {%r29959, %r11684}; + mul.lo.s32 %r11685, %r29955, 16777619; + ld.global.u32 %r11686, [%rd634+16]; + mul.lo.s32 %r11687, %r29956, 16777619; + ld.global.u32 %r11688, [%rd634+20]; + xor.b32 %r11689, %r11687, %r11688; + xor.b32 %r29955, %r11685, %r11686; + mov.b64 %rd636, {%r29955, %r11689}; + mul.lo.s32 %r11690, %r29951, 16777619; + ld.global.u32 %r11691, [%rd634+24]; + mul.lo.s32 %r11692, %r29952, 16777619; + ld.global.u32 %r11693, [%rd634+28]; + xor.b32 %r11694, %r11692, %r11693; + xor.b32 %r29951, %r11690, %r11691; + mov.b64 %rd637, {%r29951, %r11694}; + mul.lo.s32 %r11695, %r29949, 16777619; + ld.global.u32 %r11696, [%rd634+32]; + mul.lo.s32 %r11697, %r29950, 16777619; + ld.global.u32 %r11698, [%rd634+36]; + xor.b32 %r11699, %r11697, 
%r11698; + xor.b32 %r29949, %r11695, %r11696; + mov.b64 %rd638, {%r29949, %r11699}; + mul.lo.s32 %r11700, %r29945, 16777619; + ld.global.u32 %r11701, [%rd634+40]; + xor.b32 %r29945, %r11700, %r11701; + mul.lo.s32 %r11702, %r29946, 16777619; + ld.global.u32 %r11703, [%rd634+44]; + xor.b32 %r29946, %r11702, %r11703; + mul.lo.s32 %r11704, %r29957, 16777619; + ld.global.u32 %r11705, [%rd634+48]; + mul.lo.s32 %r11706, %r29958, 16777619; + ld.global.u32 %r11707, [%rd634+52]; + xor.b32 %r11708, %r11706, %r11707; + xor.b32 %r29957, %r11704, %r11705; + mov.b64 %rd639, {%r29957, %r11708}; + mul.lo.s32 %r11709, %r29953, 16777619; + ld.global.u32 %r11710, [%rd634+56]; + mul.lo.s32 %r11711, %r29954, 16777619; + ld.global.u32 %r11712, [%rd634+60]; + xor.b32 %r11713, %r11711, %r11712; + xor.b32 %r29953, %r11709, %r11710; + mov.b64 %rd640, {%r29953, %r11713}; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; + st.local.v2.u32 [%rd3+32], {%r29959, %r11684}; + st.local.v2.u32 [%rd3+40], {%r29955, %r11689}; + st.local.v2.u32 [%rd3+48], {%r29951, %r11694}; + st.local.v2.u32 [%rd3+56], {%r29949, %r11699}; + st.local.v2.u32 [%rd3+64], {%r29945, %r29946}; + st.local.v2.u32 [%rd3+72], {%r29957, %r11708}; + st.local.v2.u32 [%rd3+80], {%r29953, %r11713}; + add.s64 %rd641, %rd86, %rd629; + xor.b32 %r11714, %r31, %r29910; + mul.lo.s32 %r11715, %r11714, 16777619; + ld.local.u32 %r11716, [%rd641]; + xor.b32 %r11717, %r11715, %r11716; + mul.wide.u32 %rd642, %r11717, -954391867; + shr.u64 %rd643, %rd642, 32; + cvt.u32.u64 %r11718, %rd643; + sub.s32 %r11719, %r11717, %r11718; + shr.u32 %r11720, %r11719, 1; + add.s32 %r11721, %r11720, %r11718; + shr.u32 %r11722, %r11721, 20; + mul.lo.s32 %r11723, %r11722, 1179641; + sub.s32 %r11724, %r11717, %r11723; + mul.wide.u32 %rd644, %r11724, 64; + add.s64 %rd645, %rd471, %rd644; + mul.lo.s32 %r11725, %r29998, 16777619; + ld.global.u32 %r11726, [%rd645]; + xor.b32 %r29998, %r11725, %r11726; + mul.lo.s32 %r11727, %r29999, 16777619; + ld.global.u32 %r11728, [%rd645+4]; + xor.b32 %r29999, %r11727, %r11728; + mul.lo.s32 %r11729, %r30010, 16777619; + ld.global.u32 %r11730, [%rd645+8]; + mul.lo.s32 %r11731, %r30011, 16777619; + ld.global.u32 %r11732, [%rd645+12]; + xor.b32 %r11733, %r11731, %r11732; + xor.b32 %r30010, %r11729, %r11730; + mov.b64 %rd646, {%r30010, %r11733}; + mul.lo.s32 %r11734, %r30006, 16777619; + ld.global.u32 %r11735, [%rd645+16]; + mul.lo.s32 %r11736, %r30007, 16777619; + ld.global.u32 %r11737, [%rd645+20]; + xor.b32 %r11738, %r11736, %r11737; + xor.b32 %r30006, %r11734, %r11735; + mov.b64 %rd647, {%r30006, %r11738}; + mul.lo.s32 %r11739, %r30002, 16777619; + ld.global.u32 %r11740, [%rd645+24]; + mul.lo.s32 %r11741, %r30003, 16777619; + ld.global.u32 %r11742, [%rd645+28]; + xor.b32 %r11743, %r11741, %r11742; + xor.b32 %r30002, %r11739, %r11740; + mov.b64 %rd648, {%r30002, %r11743}; + mul.lo.s32 %r11744, %r30000, 16777619; + ld.global.u32 %r11745, [%rd645+32]; + mul.lo.s32 %r11746, %r30001, 16777619; + ld.global.u32 %r11747, [%rd645+36]; + xor.b32 %r11748, %r11746, %r11747; + xor.b32 %r30000, %r11744, %r11745; + mov.b64 %rd649, {%r30000, %r11748}; + mul.lo.s32 %r11749, %r29996, 16777619; + ld.global.u32 %r11750, [%rd645+40]; + xor.b32 %r29996, %r11749, %r11750; + mul.lo.s32 %r11751, %r29997, 16777619; + ld.global.u32 %r11752, [%rd645+44]; + xor.b32 %r29997, %r11751, %r11752; + mul.lo.s32 %r11753, %r30008, 16777619; + ld.global.u32 %r11754, [%rd645+48]; + mul.lo.s32 %r11755, %r30009, 16777619; + ld.global.u32 %r11756, [%rd645+52]; + xor.b32 %r11757, %r11755, %r11756; + 
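The loop at $L__BB2_38 (512 iterations per state, per the `setp.lt.u32 %p25, %r29910, 512` test below) is an FNV-1-style dataset mix: the item index is (seed ^ i) * 16777619 ^ mix[i % 16] (the shl/and pair selects word i mod 16), reduced modulo 1179641 through the compiler's multiply-high strength reduction (the -954391867 magic followed by shifts), then scaled by 64 to address a 64-byte item; each of the 16 mix words is multiplied by the FNV prime and xored with the loaded word. A hedged CUDA sketch of one access, where fnv1, mix_one_access, dataset, num_items, seed and mix are illustrative names:

#define FNV_PRIME 0x01000193u                          // 16777619
__device__ __forceinline__ unsigned fnv1(unsigned u, unsigned v) {
    return (u * FNV_PRIME) ^ v;                        // the mul.lo.s32 / xor.b32 pair
}

__device__ void mix_one_access(unsigned mix[16], const unsigned *dataset,
                               unsigned num_items, unsigned seed, unsigned i) {
    unsigned idx = fnv1(seed ^ i, mix[i % 16]) % num_items;  // num_items = 1179641 in this build
    const unsigned *item = dataset + (size_t)idx * 16;       // 64-byte items
    for (int w = 0; w < 16; ++w)
        mix[w] = fnv1(mix[w], item[w]);
}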
xor.b32 %r30008, %r11753, %r11754; + mov.b64 %rd650, {%r30008, %r11757}; + mul.lo.s32 %r11758, %r30004, 16777619; + ld.global.u32 %r11759, [%rd645+56]; + mul.lo.s32 %r11760, %r30005, 16777619; + ld.global.u32 %r11761, [%rd645+60]; + xor.b32 %r11762, %r11760, %r11761; + xor.b32 %r30004, %r11758, %r11759; + mov.b64 %rd651, {%r30004, %r11762}; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + st.local.v2.u32 [%rd84+32], {%r30010, %r11733}; + st.local.v2.u32 [%rd84+40], {%r30006, %r11738}; + st.local.v2.u32 [%rd84+48], {%r30002, %r11743}; + st.local.v2.u32 [%rd84+56], {%r30000, %r11748}; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; + st.local.v2.u32 [%rd84+72], {%r30008, %r11757}; + st.local.v2.u32 [%rd84+80], {%r30004, %r11762}; + add.s32 %r29910, %r29910, 1; + setp.lt.u32 %p25, %r29910, 512; + shr.u64 %rd652, %rd635, 32; + cvt.u32.u64 %r29960, %rd652; + shr.u64 %rd653, %rd636, 32; + cvt.u32.u64 %r29956, %rd653; + shr.u64 %rd654, %rd637, 32; + cvt.u32.u64 %r29952, %rd654; + shr.u64 %rd655, %rd638, 32; + cvt.u32.u64 %r29950, %rd655; + shr.u64 %rd656, %rd639, 32; + cvt.u32.u64 %r29958, %rd656; + shr.u64 %rd657, %rd640, 32; + cvt.u32.u64 %r29954, %rd657; + shr.u64 %rd658, %rd646, 32; + cvt.u32.u64 %r30011, %rd658; + shr.u64 %rd659, %rd647, 32; + cvt.u32.u64 %r30007, %rd659; + shr.u64 %rd660, %rd648, 32; + cvt.u32.u64 %r30003, %rd660; + shr.u64 %rd661, %rd649, 32; + cvt.u32.u64 %r30001, %rd661; + shr.u64 %rd662, %rd650, 32; + cvt.u32.u64 %r30009, %rd662; + shr.u64 %rd663, %rd651, 32; + cvt.u32.u64 %r30005, %rd663; + @%p25 bra $L__BB2_38; + + mov.u32 %r29911, 0; + st.local.v2.u32 [%rd3+96], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+104], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+112], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+120], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+128], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+136], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+144], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+152], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+160], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+168], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+176], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+184], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+192], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+200], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+208], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+216], {%r29911, %r29911}; + mov.u32 %r29926, -2147483648; + mov.u32 %r11777, 1; + st.local.v2.u32 [%rd3+88], {%r11777, %r29926}; + mov.u32 %r29912, %r29911; + mov.u32 %r29913, %r29911; + mov.u32 %r29914, %r29911; + mov.u32 %r29915, %r29911; + mov.u32 %r29916, %r29911; + mov.u32 %r29917, %r29911; + mov.u32 %r29918, %r29911; + mov.u32 %r29919, %r29911; + mov.u32 %r29920, %r29911; + mov.u32 %r29921, %r29911; + mov.u32 %r29922, %r29911; + mov.u32 %r29923, %r29911; + mov.u32 %r29924, %r29911; + mov.u32 %r29925, %r11777; + mov.u32 %r29927, %r29911; + mov.u32 %r29928, %r29911; + mov.u32 %r29929, %r29911; + mov.u32 %r29930, %r29911; + mov.u32 %r29931, %r29911; + mov.u32 %r29932, %r29911; + mov.u32 %r29933, %r29911; + mov.u32 %r29934, %r29911; + mov.u32 %r29935, %r29911; + mov.u32 %r29936, %r29911; + mov.u32 %r29937, %r29911; + mov.u32 %r29938, %r29911; + mov.u32 %r29939, %r29911; + mov.u32 %r29940, %r29911; + mov.u32 %r29941, %r29911; + mov.u32 %r29942, %r29911; + mov.u32 %r29943, %r29911; + mov.u32 %r29944, %r29911; + mov.u32 %r29961, %r29911; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r11804, %r29947, %r29945, %r29943, 0x96; + lop3.b32 %r11804, %r11804, %r29941, 
%r29939, 0x96; + lop3.b32 %r11805, %r29948, %r29946, %r29944, 0x96; + lop3.b32 %r11805, %r11805, %r29942, %r29940, 0x96; // end inline asm - ld.const.u32 %r2745, [matrix+1004]; // begin inline asm - dp4a.u32.u32 %r2744, %r2745, %r5914, %r2740; + // xor5 + lop3.b32 %r11816, %r29959, %r29957, %r29937, 0x96; + lop3.b32 %r11816, %r11816, %r29935, %r29933, 0x96; + lop3.b32 %r11817, %r29960, %r29958, %r29938, 0x96; + lop3.b32 %r11817, %r11817, %r29936, %r29934, 0x96; // end inline asm - ld.const.u32 %r2749, [matrix+1008]; // begin inline asm - dp4a.u32.u32 %r2748, %r2749, %r5918, %r2744; + // xor5 + lop3.b32 %r11828, %r29955, %r29953, %r29931, 0x96; + lop3.b32 %r11828, %r11828, %r29929, %r29927, 0x96; + lop3.b32 %r11829, %r29956, %r29954, %r29932, 0x96; + lop3.b32 %r11829, %r11829, %r29930, %r29928, 0x96; // end inline asm - ld.const.u32 %r2753, [matrix+1012]; // begin inline asm - dp4a.u32.u32 %r2752, %r2753, %r5922, %r2748; + // xor5 + lop3.b32 %r11840, %r29951, %r29925, %r29923, 0x96; + lop3.b32 %r11840, %r11840, %r29921, %r29919, 0x96; + lop3.b32 %r11841, %r29952, %r29926, %r29924, 0x96; + lop3.b32 %r11841, %r11841, %r29922, %r29920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11852, %r29949, %r29917, %r29915, 0x96; + lop3.b32 %r11852, %r11852, %r29913, %r29911, 0x96; + lop3.b32 %r11853, %r29950, %r29918, %r29916, 0x96; + lop3.b32 %r11853, %r11853, %r29914, %r29912, 0x96; // end inline asm - ld.const.u32 %r2757, [matrix+1016]; // begin inline asm - dp4a.u32.u32 %r2756, %r2757, %r5926, %r2752; + shf.l.wrap.b32 %r11864, %r11817, %r11816, %r11777; // end inline asm - ld.const.u32 %r2761, [matrix+1020]; // begin inline asm - dp4a.u32.u32 %r2760, %r2761, %r5930, %r2756; + shf.l.wrap.b32 %r11868, %r11816, %r11817, %r11777; // end inline asm - shr.u32 %r6084, %r2696, 6; - and.b32 %r2765, %r6084, 240; - shr.u32 %r2766, %r2760, 10; + xor.b32 %r12298, %r11864, %r11852; + xor.b32 %r12299, %r11868, %r11853; + xor.b32 %r12131, %r29947, %r12298; + xor.b32 %r12134, %r29948, %r12299; + xor.b32 %r12038, %r29945, %r12298; + xor.b32 %r12037, %r29946, %r12299; + xor.b32 %r12085, %r29943, %r12298; + xor.b32 %r12086, %r29944, %r12299; + xor.b32 %r11990, %r29941, %r12298; + xor.b32 %r11989, %r29942, %r12299; + xor.b32 %r11941, %r29939, %r12298; + xor.b32 %r11942, %r29940, %r12299; // begin inline asm - lop3.b32 %r2764, %r2765, %r2766, %r10, 0x56; + shf.l.wrap.b32 %r11872, %r11829, %r11828, %r11777; // end inline asm - ld.const.u32 %r2769, [matrix+1024]; // begin inline asm - dp4a.u32.u32 %r2768, %r2769, %r5870, %r6249; + shf.l.wrap.b32 %r11876, %r11828, %r11829, %r11777; // end inline asm - ld.const.u32 %r2773, [matrix+1028]; + xor.b32 %r12300, %r11872, %r11804; + xor.b32 %r12301, %r11876, %r11805; + xor.b32 %r12093, %r29959, %r12300; + xor.b32 %r12094, %r29960, %r12301; + xor.b32 %r11910, %r29957, %r12300; + xor.b32 %r11909, %r29958, %r12301; + xor.b32 %r12069, %r29937, %r12300; + xor.b32 %r12070, %r29938, %r12301; + xor.b32 %r12030, %r29935, %r12300; + xor.b32 %r12029, %r29936, %r12301; + xor.b32 %r12013, %r29933, %r12300; + xor.b32 %r12014, %r29934, %r12301; // begin inline asm - dp4a.u32.u32 %r2772, %r2773, %r5874, %r2768; + shf.l.wrap.b32 %r11880, %r11841, %r11840, %r11777; // end inline asm - ld.const.u32 %r2777, [matrix+1032]; // begin inline asm - dp4a.u32.u32 %r2776, %r2777, %r5878, %r2772; + shf.l.wrap.b32 %r11884, %r11840, %r11841, %r11777; // end inline asm - ld.const.u32 %r2781, [matrix+1036]; + xor.b32 %r12302, %r11880, %r11816; + xor.b32 %r12303, %r11884, %r11817; + 
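These `// xor5` blocks are Keccak theta's column parities C[0..4]; the shf/xor pairs that follow build D[x] = C[x-1] ^ rotl64(C[x+1], 1) (register %r11777 holds the shift count 1) and fan each D into five lanes. The equivalent CUDA, reusing the illustrative rotl64 sketched earlier:

__device__ void keccak_theta(unsigned long long s[25]) {
    unsigned long long C[5];
    for (int x = 0; x < 5; ++x)                 // each C[x] is one 'xor5' lop3 0x96 pair
        C[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
    for (int x = 0; x < 5; ++x) {
        unsigned long long D = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);
        for (int y = 0; y < 25; y += 5)
            s[x + y] ^= D;                      // the long xor.b32 fan-out above
    }
}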
xor.b32 %r11950, %r29955, %r12302; + xor.b32 %r11949, %r29956, %r12303; + xor.b32 %r12077, %r29953, %r12302; + xor.b32 %r12078, %r29954, %r12303; + xor.b32 %r11958, %r29931, %r12302; + xor.b32 %r11957, %r29932, %r12303; + xor.b32 %r12061, %r29929, %r12302; + xor.b32 %r12062, %r29930, %r12303; + xor.b32 %r11926, %r29927, %r12302; + xor.b32 %r11925, %r29928, %r12303; // begin inline asm - dp4a.u32.u32 %r2780, %r2781, %r5882, %r2776; + shf.l.wrap.b32 %r11888, %r11853, %r11852, %r11777; // end inline asm - ld.const.u32 %r2785, [matrix+1040]; // begin inline asm - dp4a.u32.u32 %r2784, %r2785, %r5886, %r2780; + shf.l.wrap.b32 %r11892, %r11852, %r11853, %r11777; // end inline asm - ld.const.u32 %r2789, [matrix+1044]; + xor.b32 %r12304, %r11888, %r11828; + xor.b32 %r12305, %r11892, %r11829; + xor.b32 %r12045, %r29951, %r12304; + xor.b32 %r12046, %r29952, %r12305; + xor.b32 %r12022, %r29925, %r12304; + xor.b32 %r12021, %r29926, %r12305; + xor.b32 %r11965, %r29923, %r12304; + xor.b32 %r11966, %r29924, %r12305; + xor.b32 %r12053, %r29921, %r12304; + xor.b32 %r12054, %r29922, %r12305; + xor.b32 %r11982, %r29919, %r12304; + xor.b32 %r11981, %r29920, %r12305; // begin inline asm - dp4a.u32.u32 %r2788, %r2789, %r5890, %r2784; + shf.l.wrap.b32 %r11896, %r11805, %r11804, %r11777; // end inline asm - ld.const.u32 %r2793, [matrix+1048]; // begin inline asm - dp4a.u32.u32 %r2792, %r2793, %r5894, %r2788; + shf.l.wrap.b32 %r11900, %r11804, %r11805, %r11777; // end inline asm - ld.const.u32 %r2797, [matrix+1052]; + xor.b32 %r12306, %r11896, %r11840; + xor.b32 %r12307, %r11900, %r11841; + xor.b32 %r11997, %r29949, %r12306; + xor.b32 %r11998, %r29950, %r12307; + xor.b32 %r11917, %r29917, %r12306; + xor.b32 %r11918, %r29918, %r12307; + xor.b32 %r11934, %r29915, %r12306; + xor.b32 %r11933, %r29916, %r12307; + xor.b32 %r11973, %r29913, %r12306; + xor.b32 %r11974, %r29914, %r12307; + xor.b32 %r12005, %r29911, %r12306; + xor.b32 %r12006, %r29912, %r12307; + mov.u32 %r11911, 44; // begin inline asm - dp4a.u32.u32 %r2796, %r2797, %r5898, %r2792; + shf.l.wrap.b32 %r11904, %r11910, %r11909, %r11911; // end inline asm - ld.const.u32 %r2801, [matrix+1056]; // begin inline asm - dp4a.u32.u32 %r2800, %r2801, %r5902, %r2796; + shf.l.wrap.b32 %r11908, %r11909, %r11910, %r11911; // end inline asm - ld.const.u32 %r2805, [matrix+1060]; + mov.u32 %r11919, 20; // begin inline asm - dp4a.u32.u32 %r2804, %r2805, %r5906, %r2800; + shf.l.wrap.b32 %r11912, %r11918, %r11917, %r11919; // end inline asm - ld.const.u32 %r2809, [matrix+1064]; // begin inline asm - dp4a.u32.u32 %r2808, %r2809, %r5910, %r2804; + shf.l.wrap.b32 %r11916, %r11917, %r11918, %r11919; // end inline asm - ld.const.u32 %r2813, [matrix+1068]; + mov.u32 %r11927, 61; // begin inline asm - dp4a.u32.u32 %r2812, %r2813, %r5914, %r2808; + shf.l.wrap.b32 %r11920, %r11926, %r11925, %r11927; // end inline asm - ld.const.u32 %r2817, [matrix+1072]; // begin inline asm - dp4a.u32.u32 %r2816, %r2817, %r5918, %r2812; + shf.l.wrap.b32 %r11924, %r11925, %r11926, %r11927; // end inline asm - ld.const.u32 %r2821, [matrix+1076]; + mov.u32 %r11935, 39; // begin inline asm - dp4a.u32.u32 %r2820, %r2821, %r5922, %r2816; + shf.l.wrap.b32 %r11928, %r11934, %r11933, %r11935; // end inline asm - ld.const.u32 %r2825, [matrix+1080]; // begin inline asm - dp4a.u32.u32 %r2824, %r2825, %r5926, %r2820; + shf.l.wrap.b32 %r11932, %r11933, %r11934, %r11935; // end inline asm - ld.const.u32 %r2829, [matrix+1084]; + mov.u32 %r11943, 18; // begin inline asm - dp4a.u32.u32 %r2828, %r2829, %r5930, %r2824; + 
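After theta, the `mov.u32` immediates (44, 20, 61, 39, ...) reload rho offsets for the fused rho/pi rotations, chi follows as the lop3 0xD2 pairs, and each round closes with iota: an `ld.global.nc.v2.u32` load of a round constant xored into lane 0. The counter loops while below 23 with the 24th round peeled after the branch, giving Keccak-f[1600]'s 24 rounds; the zeroed state with 0x8000000000000001 written at lane 8 ([%rd3+88], since lane 0 sits at +24) matches Keccak pad10*1 for a 64-byte input at rate 72. A sketch of one full round under those observations; keccak_rc, rho_off and pi_dst are illustrative names, with the tables mirroring the rho[24]/pi[24] globals emitted at the top of this file:

__device__ unsigned long long keccak_rc[24];    // round constants, read via ld.global.nc

__device__ const int rho_off[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
                                    27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44};
__device__ const int pi_dst[24]  = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
                                    15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1};

__device__ void keccak_round(unsigned long long s[25], int r) {
    keccak_theta(s);                            // sketched above
    unsigned long long t = s[1];
    for (int i = 0; i < 24; ++i) {              // rho + pi: the shf.l.wrap rotations
        unsigned long long u = s[pi_dst[i]];
        s[pi_dst[i]] = rotl64(t, rho_off[i]);
        t = u;
    }
    for (int y = 0; y < 25; y += 5) {           // chi: the lop3 0xD2 pairs
        unsigned long long b[5];
        for (int x = 0; x < 5; ++x) b[x] = s[y + x];
        for (int x = 0; x < 5; ++x)
            s[y + x] = b[x] ^ (~b[(x + 1) % 5] & b[(x + 2) % 5]);
    }
    s[0] ^= keccak_rc[r];                       // iota: the ld.global.nc + xor pair
}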
shf.l.wrap.b32 %r11936, %r11942, %r11941, %r11943; // end inline asm - ld.const.u32 %r2833, [matrix+1088]; // begin inline asm - dp4a.u32.u32 %r2832, %r2833, %r5870, %r6249; + shf.l.wrap.b32 %r11940, %r11941, %r11942, %r11943; // end inline asm - ld.const.u32 %r2837, [matrix+1092]; + mov.u32 %r11951, 62; // begin inline asm - dp4a.u32.u32 %r2836, %r2837, %r5874, %r2832; + shf.l.wrap.b32 %r11944, %r11950, %r11949, %r11951; // end inline asm - ld.const.u32 %r2841, [matrix+1096]; // begin inline asm - dp4a.u32.u32 %r2840, %r2841, %r5878, %r2836; + shf.l.wrap.b32 %r11948, %r11949, %r11950, %r11951; // end inline asm - ld.const.u32 %r2845, [matrix+1100]; + mov.u32 %r11959, 43; // begin inline asm - dp4a.u32.u32 %r2844, %r2845, %r5882, %r2840; + shf.l.wrap.b32 %r11952, %r11958, %r11957, %r11959; // end inline asm - ld.const.u32 %r2849, [matrix+1104]; // begin inline asm - dp4a.u32.u32 %r2848, %r2849, %r5886, %r2844; + shf.l.wrap.b32 %r11956, %r11957, %r11958, %r11959; // end inline asm - ld.const.u32 %r2853, [matrix+1108]; + mov.u32 %r11967, 25; // begin inline asm - dp4a.u32.u32 %r2852, %r2853, %r5890, %r2848; + shf.l.wrap.b32 %r11960, %r11966, %r11965, %r11967; // end inline asm - ld.const.u32 %r2857, [matrix+1112]; // begin inline asm - dp4a.u32.u32 %r2856, %r2857, %r5894, %r2852; + shf.l.wrap.b32 %r11964, %r11965, %r11966, %r11967; // end inline asm - ld.const.u32 %r2861, [matrix+1116]; + mov.u32 %r11975, 8; // begin inline asm - dp4a.u32.u32 %r2860, %r2861, %r5898, %r2856; + shf.l.wrap.b32 %r11968, %r11974, %r11973, %r11975; // end inline asm - ld.const.u32 %r2865, [matrix+1120]; // begin inline asm - dp4a.u32.u32 %r2864, %r2865, %r5902, %r2860; + shf.l.wrap.b32 %r11972, %r11973, %r11974, %r11975; // end inline asm - ld.const.u32 %r2869, [matrix+1124]; + mov.u32 %r11983, 56; // begin inline asm - dp4a.u32.u32 %r2868, %r2869, %r5906, %r2864; + shf.l.wrap.b32 %r11976, %r11982, %r11981, %r11983; // end inline asm - ld.const.u32 %r2873, [matrix+1128]; // begin inline asm - dp4a.u32.u32 %r2872, %r2873, %r5910, %r2868; + shf.l.wrap.b32 %r11980, %r11981, %r11982, %r11983; // end inline asm - ld.const.u32 %r2877, [matrix+1132]; + mov.u32 %r11991, 41; // begin inline asm - dp4a.u32.u32 %r2876, %r2877, %r5914, %r2872; + shf.l.wrap.b32 %r11984, %r11990, %r11989, %r11991; // end inline asm - ld.const.u32 %r2881, [matrix+1136]; // begin inline asm - dp4a.u32.u32 %r2880, %r2881, %r5918, %r2876; + shf.l.wrap.b32 %r11988, %r11989, %r11990, %r11991; // end inline asm - ld.const.u32 %r2885, [matrix+1140]; + mov.u32 %r11999, 27; // begin inline asm - dp4a.u32.u32 %r2884, %r2885, %r5922, %r2880; + shf.l.wrap.b32 %r11992, %r11998, %r11997, %r11999; // end inline asm - ld.const.u32 %r2889, [matrix+1144]; // begin inline asm - dp4a.u32.u32 %r2888, %r2889, %r5926, %r2884; + shf.l.wrap.b32 %r11996, %r11997, %r11998, %r11999; // end inline asm - ld.const.u32 %r2893, [matrix+1148]; + mov.u32 %r12007, 14; // begin inline asm - dp4a.u32.u32 %r2892, %r2893, %r5930, %r2888; + shf.l.wrap.b32 %r12000, %r12006, %r12005, %r12007; // end inline asm - shr.u32 %r6085, %r2828, 6; - and.b32 %r2897, %r6085, 240; - shr.u32 %r2898, %r2892, 10; - and.b32 %r2899, %r11, 255; // begin inline asm - lop3.b32 %r2896, %r2897, %r2898, %r2899, 0x56; + shf.l.wrap.b32 %r12004, %r12005, %r12006, %r12007; // end inline asm - ld.const.u32 %r2901, [matrix+1152]; + mov.u32 %r12015, 2; // begin inline asm - dp4a.u32.u32 %r2900, %r2901, %r5870, %r6249; + shf.l.wrap.b32 %r12008, %r12014, %r12013, %r12015; // end inline asm - ld.const.u32 %r2905, 
[matrix+1156]; // begin inline asm - dp4a.u32.u32 %r2904, %r2905, %r5874, %r2900; + shf.l.wrap.b32 %r12012, %r12013, %r12014, %r12015; // end inline asm - ld.const.u32 %r2909, [matrix+1160]; + mov.u32 %r12023, 55; // begin inline asm - dp4a.u32.u32 %r2908, %r2909, %r5878, %r2904; + shf.l.wrap.b32 %r12016, %r12022, %r12021, %r12023; // end inline asm - ld.const.u32 %r2913, [matrix+1164]; // begin inline asm - dp4a.u32.u32 %r2912, %r2913, %r5882, %r2908; + shf.l.wrap.b32 %r12020, %r12021, %r12022, %r12023; // end inline asm - ld.const.u32 %r2917, [matrix+1168]; + mov.u32 %r12031, 45; // begin inline asm - dp4a.u32.u32 %r2916, %r2917, %r5886, %r2912; + shf.l.wrap.b32 %r12024, %r12030, %r12029, %r12031; // end inline asm - ld.const.u32 %r2921, [matrix+1172]; // begin inline asm - dp4a.u32.u32 %r2920, %r2921, %r5890, %r2916; + shf.l.wrap.b32 %r12028, %r12029, %r12030, %r12031; // end inline asm - ld.const.u32 %r2925, [matrix+1176]; + mov.u32 %r12039, 36; // begin inline asm - dp4a.u32.u32 %r2924, %r2925, %r5894, %r2920; + shf.l.wrap.b32 %r12032, %r12038, %r12037, %r12039; // end inline asm - ld.const.u32 %r2929, [matrix+1180]; // begin inline asm - dp4a.u32.u32 %r2928, %r2929, %r5898, %r2924; + shf.l.wrap.b32 %r12036, %r12037, %r12038, %r12039; // end inline asm - ld.const.u32 %r2933, [matrix+1184]; + mov.u32 %r12047, 28; // begin inline asm - dp4a.u32.u32 %r2932, %r2933, %r5902, %r2928; + shf.l.wrap.b32 %r12040, %r12046, %r12045, %r12047; // end inline asm - ld.const.u32 %r2937, [matrix+1188]; // begin inline asm - dp4a.u32.u32 %r2936, %r2937, %r5906, %r2932; + shf.l.wrap.b32 %r12044, %r12045, %r12046, %r12047; // end inline asm - ld.const.u32 %r2941, [matrix+1192]; + mov.u32 %r12055, 21; // begin inline asm - dp4a.u32.u32 %r2940, %r2941, %r5910, %r2936; + shf.l.wrap.b32 %r12048, %r12054, %r12053, %r12055; // end inline asm - ld.const.u32 %r2945, [matrix+1196]; // begin inline asm - dp4a.u32.u32 %r2944, %r2945, %r5914, %r2940; + shf.l.wrap.b32 %r12052, %r12053, %r12054, %r12055; // end inline asm - ld.const.u32 %r2949, [matrix+1200]; + mov.u32 %r12063, 15; // begin inline asm - dp4a.u32.u32 %r2948, %r2949, %r5918, %r2944; + shf.l.wrap.b32 %r12056, %r12062, %r12061, %r12063; // end inline asm - ld.const.u32 %r2953, [matrix+1204]; // begin inline asm - dp4a.u32.u32 %r2952, %r2953, %r5922, %r2948; + shf.l.wrap.b32 %r12060, %r12061, %r12062, %r12063; // end inline asm - ld.const.u32 %r2957, [matrix+1208]; + mov.u32 %r12071, 10; // begin inline asm - dp4a.u32.u32 %r2956, %r2957, %r5926, %r2952; + shf.l.wrap.b32 %r12064, %r12070, %r12069, %r12071; // end inline asm - ld.const.u32 %r2961, [matrix+1212]; // begin inline asm - dp4a.u32.u32 %r2960, %r2961, %r5930, %r2956; + shf.l.wrap.b32 %r12068, %r12069, %r12070, %r12071; // end inline asm - ld.const.u32 %r2965, [matrix+1216]; + mov.u32 %r12079, 6; // begin inline asm - dp4a.u32.u32 %r2964, %r2965, %r5870, %r6249; + shf.l.wrap.b32 %r12072, %r12078, %r12077, %r12079; // end inline asm - ld.const.u32 %r2969, [matrix+1220]; // begin inline asm - dp4a.u32.u32 %r2968, %r2969, %r5874, %r2964; + shf.l.wrap.b32 %r12076, %r12077, %r12078, %r12079; // end inline asm - ld.const.u32 %r2973, [matrix+1224]; + mov.u32 %r12087, 3; // begin inline asm - dp4a.u32.u32 %r2972, %r2973, %r5878, %r2968; + shf.l.wrap.b32 %r12080, %r12086, %r12085, %r12087; // end inline asm - ld.const.u32 %r2977, [matrix+1228]; // begin inline asm - dp4a.u32.u32 %r2976, %r2977, %r5882, %r2972; + shf.l.wrap.b32 %r12084, %r12085, %r12086, %r12087; // end inline asm - ld.const.u32 %r2981, 
[matrix+1232]; // begin inline asm - dp4a.u32.u32 %r2980, %r2981, %r5886, %r2976; + shf.l.wrap.b32 %r12088, %r12094, %r12093, %r11777; // end inline asm - ld.const.u32 %r2985, [matrix+1236]; // begin inline asm - dp4a.u32.u32 %r2984, %r2985, %r5890, %r2980; + shf.l.wrap.b32 %r12092, %r12093, %r12094, %r11777; // end inline asm - ld.const.u32 %r2989, [matrix+1240]; // begin inline asm - dp4a.u32.u32 %r2988, %r2989, %r5894, %r2984; + // chi + lop3.b32 %r12096, %r12131, %r11904, %r11952, 0xD2; + lop3.b32 %r12097, %r12134, %r11908, %r11956, 0xD2; // end inline asm - ld.const.u32 %r2993, [matrix+1244]; // begin inline asm - dp4a.u32.u32 %r2992, %r2993, %r5898, %r2988; + // chi + lop3.b32 %r29959, %r11904, %r11952, %r12048, 0xD2; + lop3.b32 %r29960, %r11908, %r11956, %r12052, 0xD2; // end inline asm - ld.const.u32 %r2997, [matrix+1248]; // begin inline asm - dp4a.u32.u32 %r2996, %r2997, %r5902, %r2992; + // chi + lop3.b32 %r29955, %r11952, %r12048, %r12000, 0xD2; + lop3.b32 %r29956, %r11956, %r12052, %r12004, 0xD2; // end inline asm - ld.const.u32 %r3001, [matrix+1252]; // begin inline asm - dp4a.u32.u32 %r3000, %r3001, %r5906, %r2996; + // chi + lop3.b32 %r29951, %r12048, %r12000, %r12131, 0xD2; + lop3.b32 %r29952, %r12052, %r12004, %r12134, 0xD2; // end inline asm - ld.const.u32 %r3005, [matrix+1256]; // begin inline asm - dp4a.u32.u32 %r3004, %r3005, %r5910, %r3000; + // chi + lop3.b32 %r29949, %r12000, %r12131, %r11904, 0xD2; + lop3.b32 %r29950, %r12004, %r12134, %r11908, 0xD2; // end inline asm - ld.const.u32 %r3009, [matrix+1260]; // begin inline asm - dp4a.u32.u32 %r3008, %r3009, %r5914, %r3004; + // chi + lop3.b32 %r29945, %r12040, %r11912, %r12080, 0xD2; + lop3.b32 %r29946, %r12044, %r11916, %r12084, 0xD2; // end inline asm - ld.const.u32 %r3013, [matrix+1264]; // begin inline asm - dp4a.u32.u32 %r3012, %r3013, %r5918, %r3008; + // chi + lop3.b32 %r29957, %r11912, %r12080, %r12024, 0xD2; + lop3.b32 %r29958, %r11916, %r12084, %r12028, 0xD2; // end inline asm - ld.const.u32 %r3017, [matrix+1268]; // begin inline asm - dp4a.u32.u32 %r3016, %r3017, %r5922, %r3012; + // chi + lop3.b32 %r29953, %r12080, %r12024, %r11920, 0xD2; + lop3.b32 %r29954, %r12084, %r12028, %r11924, 0xD2; // end inline asm - ld.const.u32 %r3021, [matrix+1272]; // begin inline asm - dp4a.u32.u32 %r3020, %r3021, %r5926, %r3016; + // chi + lop3.b32 %r29925, %r12024, %r11920, %r12040, 0xD2; + lop3.b32 %r29926, %r12028, %r11924, %r12044, 0xD2; // end inline asm - ld.const.u32 %r3025, [matrix+1276]; + st.local.v2.u32 [%rd3+88], {%r29925, %r29926}; // begin inline asm - dp4a.u32.u32 %r3024, %r3025, %r5930, %r3020; + // chi + lop3.b32 %r29917, %r11920, %r12040, %r11912, 0xD2; + lop3.b32 %r29918, %r11924, %r12044, %r11916, 0xD2; // end inline asm - shr.u32 %r6086, %r2960, 6; - and.b32 %r3029, %r6086, 240; - shr.u32 %r3030, %r3024, 10; - bfe.u32 %r3031, %r11, 8, 8; + st.local.v2.u32 [%rd3+96], {%r29917, %r29918}; // begin inline asm - lop3.b32 %r3028, %r3029, %r3030, %r3031, 0x56; + // chi + lop3.b32 %r29943, %r12088, %r12072, %r11960, 0xD2; + lop3.b32 %r29944, %r12092, %r12076, %r11964, 0xD2; // end inline asm - ld.const.u32 %r3033, [matrix+1280]; + st.local.v2.u32 [%rd3+104], {%r29943, %r29944}; // begin inline asm - dp4a.u32.u32 %r3032, %r3033, %r5870, %r6249; + // chi + lop3.b32 %r29937, %r12072, %r11960, %r11968, 0xD2; + lop3.b32 %r29938, %r12076, %r11964, %r11972, 0xD2; // end inline asm - ld.const.u32 %r3037, [matrix+1284]; + st.local.v2.u32 [%rd3+112], {%r29937, %r29938}; // begin inline asm - dp4a.u32.u32 %r3036, 
%r3037, %r5874, %r3032; + // chi + lop3.b32 %r29931, %r11960, %r11968, %r11936, 0xD2; + lop3.b32 %r29932, %r11964, %r11972, %r11940, 0xD2; // end inline asm - ld.const.u32 %r3041, [matrix+1288]; + st.local.v2.u32 [%rd3+120], {%r29931, %r29932}; // begin inline asm - dp4a.u32.u32 %r3040, %r3041, %r5878, %r3036; + // chi + lop3.b32 %r29923, %r11968, %r11936, %r12088, 0xD2; + lop3.b32 %r29924, %r11972, %r11940, %r12092, 0xD2; // end inline asm - ld.const.u32 %r3045, [matrix+1292]; + st.local.v2.u32 [%rd3+128], {%r29923, %r29924}; // begin inline asm - dp4a.u32.u32 %r3044, %r3045, %r5882, %r3040; + // chi + lop3.b32 %r29915, %r11936, %r12088, %r12072, 0xD2; + lop3.b32 %r29916, %r11940, %r12092, %r12076, 0xD2; // end inline asm - ld.const.u32 %r3049, [matrix+1296]; + st.local.v2.u32 [%rd3+136], {%r29915, %r29916}; // begin inline asm - dp4a.u32.u32 %r3048, %r3049, %r5886, %r3044; + // chi + lop3.b32 %r29941, %r11992, %r12032, %r12064, 0xD2; + lop3.b32 %r29942, %r11996, %r12036, %r12068, 0xD2; // end inline asm - ld.const.u32 %r3053, [matrix+1300]; + st.local.v2.u32 [%rd3+144], {%r29941, %r29942}; // begin inline asm - dp4a.u32.u32 %r3052, %r3053, %r5890, %r3048; + // chi + lop3.b32 %r29935, %r12032, %r12064, %r12056, 0xD2; + lop3.b32 %r29936, %r12036, %r12068, %r12060, 0xD2; // end inline asm - ld.const.u32 %r3057, [matrix+1304]; + st.local.v2.u32 [%rd3+152], {%r29935, %r29936}; // begin inline asm - dp4a.u32.u32 %r3056, %r3057, %r5894, %r3052; + // chi + lop3.b32 %r29929, %r12064, %r12056, %r11976, 0xD2; + lop3.b32 %r29930, %r12068, %r12060, %r11980, 0xD2; // end inline asm - ld.const.u32 %r3061, [matrix+1308]; + st.local.v2.u32 [%rd3+160], {%r29929, %r29930}; // begin inline asm - dp4a.u32.u32 %r3060, %r3061, %r5898, %r3056; + // chi + lop3.b32 %r29921, %r12056, %r11976, %r11992, 0xD2; + lop3.b32 %r29922, %r12060, %r11980, %r11996, 0xD2; // end inline asm - ld.const.u32 %r3065, [matrix+1312]; + st.local.v2.u32 [%rd3+168], {%r29921, %r29922}; // begin inline asm - dp4a.u32.u32 %r3064, %r3065, %r5902, %r3060; + // chi + lop3.b32 %r29913, %r11976, %r11992, %r12032, 0xD2; + lop3.b32 %r29914, %r11980, %r11996, %r12036, 0xD2; // end inline asm - ld.const.u32 %r3069, [matrix+1316]; + st.local.v2.u32 [%rd3+176], {%r29913, %r29914}; // begin inline asm - dp4a.u32.u32 %r3068, %r3069, %r5906, %r3064; + // chi + lop3.b32 %r29939, %r11944, %r12016, %r11928, 0xD2; + lop3.b32 %r29940, %r11948, %r12020, %r11932, 0xD2; // end inline asm - ld.const.u32 %r3073, [matrix+1320]; + st.local.v2.u32 [%rd3+184], {%r29939, %r29940}; // begin inline asm - dp4a.u32.u32 %r3072, %r3073, %r5910, %r3068; + // chi + lop3.b32 %r29933, %r12016, %r11928, %r11984, 0xD2; + lop3.b32 %r29934, %r12020, %r11932, %r11988, 0xD2; // end inline asm - ld.const.u32 %r3077, [matrix+1324]; + st.local.v2.u32 [%rd3+192], {%r29933, %r29934}; // begin inline asm - dp4a.u32.u32 %r3076, %r3077, %r5914, %r3072; + // chi + lop3.b32 %r29927, %r11928, %r11984, %r12008, 0xD2; + lop3.b32 %r29928, %r11932, %r11988, %r12012, 0xD2; // end inline asm - ld.const.u32 %r3081, [matrix+1328]; + st.local.v2.u32 [%rd3+200], {%r29927, %r29928}; // begin inline asm - dp4a.u32.u32 %r3080, %r3081, %r5918, %r3076; + // chi + lop3.b32 %r29919, %r11984, %r12008, %r11944, 0xD2; + lop3.b32 %r29920, %r11988, %r12012, %r11948, 0xD2; // end inline asm - ld.const.u32 %r3085, [matrix+1332]; + st.local.v2.u32 [%rd3+208], {%r29919, %r29920}; // begin inline asm - dp4a.u32.u32 %r3084, %r3085, %r5922, %r3080; + // chi + lop3.b32 %r29911, %r12008, %r11944, %r12016, 0xD2; + lop3.b32 
%r29912, %r12012, %r11948, %r12020, 0xD2; // end inline asm - ld.const.u32 %r3089, [matrix+1336]; + st.local.v2.u32 [%rd3+216], {%r29911, %r29912}; + mul.wide.s32 %rd665, %r29961, 8; + add.s64 %rd664, %rd597, %rd665; // begin inline asm - dp4a.u32.u32 %r3088, %r3089, %r5926, %r3084; + ld.global.nc.v2.u32 {%r12296,%r12297}, [%rd664]; // end inline asm - ld.const.u32 %r3093, [matrix+1340]; + xor.b32 %r29947, %r12096, %r12296; + xor.b32 %r29948, %r12097, %r12297; + add.s32 %r29961, %r29961, 1; + setp.lt.u32 %p26, %r29961, 23; + @%p26 bra $L__BB2_40; + + st.local.v2.u32 [%rd3+32], {%r29959, %r29960}; + st.local.v2.u32 [%rd3+72], {%r29957, %r29958}; + st.local.v2.u32 [%rd3+40], {%r29955, %r29956}; + st.local.v2.u32 [%rd3+80], {%r29953, %r29954}; + st.local.v2.u32 [%rd3+48], {%r29951, %r29952}; + st.local.v2.u32 [%rd3+56], {%r29949, %r29950}; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; // begin inline asm - dp4a.u32.u32 %r3092, %r3093, %r5930, %r3088; + // xor5 + lop3.b32 %r12308, %r29947, %r29945, %r29943, 0x96; + lop3.b32 %r12308, %r12308, %r29941, %r29939, 0x96; + lop3.b32 %r12309, %r29948, %r29946, %r29944, 0x96; + lop3.b32 %r12309, %r12309, %r29942, %r29940, 0x96; // end inline asm - ld.const.u32 %r3097, [matrix+1344]; // begin inline asm - dp4a.u32.u32 %r3096, %r3097, %r5870, %r6249; + // xor5 + lop3.b32 %r12320, %r29959, %r29957, %r29937, 0x96; + lop3.b32 %r12320, %r12320, %r29935, %r29933, 0x96; + lop3.b32 %r12321, %r29960, %r29958, %r29938, 0x96; + lop3.b32 %r12321, %r12321, %r29936, %r29934, 0x96; // end inline asm - ld.const.u32 %r3101, [matrix+1348]; // begin inline asm - dp4a.u32.u32 %r3100, %r3101, %r5874, %r3096; + // xor5 + lop3.b32 %r12332, %r29955, %r29953, %r29931, 0x96; + lop3.b32 %r12332, %r12332, %r29929, %r29927, 0x96; + lop3.b32 %r12333, %r29956, %r29954, %r29932, 0x96; + lop3.b32 %r12333, %r12333, %r29930, %r29928, 0x96; // end inline asm - ld.const.u32 %r3105, [matrix+1352]; // begin inline asm - dp4a.u32.u32 %r3104, %r3105, %r5878, %r3100; + // xor5 + lop3.b32 %r12344, %r29951, %r29925, %r29923, 0x96; + lop3.b32 %r12344, %r12344, %r29921, %r29919, 0x96; + lop3.b32 %r12345, %r29952, %r29926, %r29924, 0x96; + lop3.b32 %r12345, %r12345, %r29922, %r29920, 0x96; // end inline asm - ld.const.u32 %r3109, [matrix+1356]; // begin inline asm - dp4a.u32.u32 %r3108, %r3109, %r5882, %r3104; + // xor5 + lop3.b32 %r12356, %r29949, %r29917, %r29915, 0x96; + lop3.b32 %r12356, %r12356, %r29913, %r29911, 0x96; + lop3.b32 %r12357, %r29950, %r29918, %r29916, 0x96; + lop3.b32 %r12357, %r12357, %r29914, %r29912, 0x96; // end inline asm - ld.const.u32 %r3113, [matrix+1360]; + mov.u32 %r12560, 1; // begin inline asm - dp4a.u32.u32 %r3112, %r3113, %r5886, %r3108; + shf.l.wrap.b32 %r12368, %r12321, %r12320, %r12560; // end inline asm - ld.const.u32 %r3117, [matrix+1364]; // begin inline asm - dp4a.u32.u32 %r3116, %r3117, %r5890, %r3112; + shf.l.wrap.b32 %r12372, %r12320, %r12321, %r12560; // end inline asm - ld.const.u32 %r3121, [matrix+1368]; + xor.b32 %r12587, %r12368, %r12356; + xor.b32 %r12588, %r12372, %r12357; + xor.b32 %r12515, %r29947, %r12587; + xor.b32 %r12518, %r29948, %r12588; + xor.b32 %r12478, %r29944, %r12588; + xor.b32 %r12477, %r29943, %r12587; + st.local.v2.u32 [%rd3+104], {%r12477, %r12478}; // begin inline asm - dp4a.u32.u32 %r3120, %r3121, %r5894, %r3116; + shf.l.wrap.b32 %r12376, %r12333, %r12332, %r12560; // end inline asm - ld.const.u32 %r3125, [matrix+1372]; // begin inline asm - dp4a.u32.u32 %r3124, %r3125, %r5898, %r3120; + shf.l.wrap.b32 %r12380, %r12332, 
%r12333, %r12560; // end inline asm - ld.const.u32 %r3129, [matrix+1376]; + xor.b32 %r12589, %r12376, %r12308; + xor.b32 %r12590, %r12380, %r12309; + xor.b32 %r12414, %r29957, %r12589; + xor.b32 %r12413, %r29958, %r12590; + xor.b32 %r12453, %r29936, %r12590; + xor.b32 %r12454, %r29935, %r12589; + st.local.v2.u32 [%rd3+152], {%r12454, %r12453}; // begin inline asm - dp4a.u32.u32 %r3128, %r3129, %r5902, %r3124; + shf.l.wrap.b32 %r12384, %r12345, %r12344, %r12560; // end inline asm - ld.const.u32 %r3133, [matrix+1380]; // begin inline asm - dp4a.u32.u32 %r3132, %r3133, %r5906, %r3128; + shf.l.wrap.b32 %r12388, %r12344, %r12345, %r12560; // end inline asm - ld.const.u32 %r3137, [matrix+1384]; + xor.b32 %r12591, %r12384, %r12320; + xor.b32 %r12592, %r12388, %r12321; + xor.b32 %r12437, %r29932, %r12592; + xor.b32 %r12438, %r29931, %r12591; + st.local.v2.u32 [%rd3+120], {%r12438, %r12437}; + xor.b32 %r12429, %r29928, %r12592; + xor.b32 %r12430, %r29927, %r12591; + st.local.v2.u32 [%rd3+200], {%r12430, %r12429}; // begin inline asm - dp4a.u32.u32 %r3136, %r3137, %r5910, %r3132; + shf.l.wrap.b32 %r12392, %r12357, %r12356, %r12560; // end inline asm - ld.const.u32 %r3141, [matrix+1388]; // begin inline asm - dp4a.u32.u32 %r3140, %r3141, %r5914, %r3136; + shf.l.wrap.b32 %r12396, %r12356, %r12357, %r12560; // end inline asm - ld.const.u32 %r3145, [matrix+1392]; + xor.b32 %r12593, %r12392, %r12332; + xor.b32 %r12594, %r12396, %r12333; + xor.b32 %r12461, %r29951, %r12593; + xor.b32 %r12462, %r29952, %r12594; + xor.b32 %r12470, %r29922, %r12594; + xor.b32 %r12469, %r29921, %r12593; + st.local.v2.u32 [%rd3+168], {%r12469, %r12470}; // begin inline asm - dp4a.u32.u32 %r3144, %r3145, %r5918, %r3140; + shf.l.wrap.b32 %r12400, %r12309, %r12308, %r12560; // end inline asm - ld.const.u32 %r3149, [matrix+1396]; // begin inline asm - dp4a.u32.u32 %r3148, %r3149, %r5922, %r3144; + shf.l.wrap.b32 %r12404, %r12308, %r12309, %r12560; // end inline asm - ld.const.u32 %r3153, [matrix+1400]; + xor.b32 %r12595, %r12400, %r12344; + xor.b32 %r12596, %r12404, %r12345; + xor.b32 %r12421, %r29917, %r12595; + xor.b32 %r12422, %r29918, %r12596; + xor.b32 %r12446, %r29912, %r12596; + xor.b32 %r12445, %r29911, %r12595; + st.local.v2.u32 [%rd3+216], {%r12445, %r12446}; // begin inline asm - dp4a.u32.u32 %r3152, %r3153, %r5926, %r3148; + shf.l.wrap.b32 %r12408, %r12414, %r12413, %r11911; // end inline asm - ld.const.u32 %r3157, [matrix+1404]; // begin inline asm - dp4a.u32.u32 %r3156, %r3157, %r5930, %r3152; + shf.l.wrap.b32 %r12412, %r12413, %r12414, %r11911; // end inline asm - shr.u32 %r6087, %r3092, 6; - and.b32 %r3161, %r6087, 240; - shr.u32 %r3162, %r3156, 10; - bfe.u32 %r3163, %r11, 16, 8; // begin inline asm - lop3.b32 %r3160, %r3161, %r3162, %r3163, 0x56; + shf.l.wrap.b32 %r12416, %r12422, %r12421, %r11919; // end inline asm - ld.const.u32 %r3165, [matrix+1408]; // begin inline asm - dp4a.u32.u32 %r3164, %r3165, %r5870, %r6249; + shf.l.wrap.b32 %r12420, %r12421, %r12422, %r11919; // end inline asm - ld.const.u32 %r3169, [matrix+1412]; // begin inline asm - dp4a.u32.u32 %r3168, %r3169, %r5874, %r3164; + shf.l.wrap.b32 %r12428, %r12429, %r12430, %r11927; // end inline asm - ld.const.u32 %r3173, [matrix+1416]; // begin inline asm - dp4a.u32.u32 %r3172, %r3173, %r5878, %r3168; + shf.l.wrap.b32 %r12424, %r12430, %r12429, %r11927; // end inline asm - ld.const.u32 %r3177, [matrix+1420]; + st.local.v2.u32 [%rd3+96], {%r12424, %r12428}; // begin inline asm - dp4a.u32.u32 %r3176, %r3177, %r5882, %r3172; + shf.l.wrap.b32 %r12432, 
%r12438, %r12437, %r11959; // end inline asm - ld.const.u32 %r3181, [matrix+1424]; // begin inline asm - dp4a.u32.u32 %r3180, %r3181, %r5886, %r3176; + shf.l.wrap.b32 %r12436, %r12437, %r12438, %r11959; // end inline asm - ld.const.u32 %r3185, [matrix+1428]; // begin inline asm - dp4a.u32.u32 %r3184, %r3185, %r5890, %r3180; + shf.l.wrap.b32 %r12440, %r12446, %r12445, %r12007; // end inline asm - ld.const.u32 %r3189, [matrix+1432]; // begin inline asm - dp4a.u32.u32 %r3188, %r3189, %r5894, %r3184; + shf.l.wrap.b32 %r12444, %r12445, %r12446, %r12007; // end inline asm - ld.const.u32 %r3193, [matrix+1436]; // begin inline asm - dp4a.u32.u32 %r3192, %r3193, %r5898, %r3188; + shf.l.wrap.b32 %r12452, %r12453, %r12454, %r12031; // end inline asm - ld.const.u32 %r3197, [matrix+1440]; // begin inline asm - dp4a.u32.u32 %r3196, %r3197, %r5902, %r3192; + shf.l.wrap.b32 %r12448, %r12454, %r12453, %r12031; // end inline asm - ld.const.u32 %r3201, [matrix+1444]; + st.local.v2.u32 [%rd3+88], {%r12448, %r12452}; // begin inline asm - dp4a.u32.u32 %r3200, %r3201, %r5906, %r3196; + shf.l.wrap.b32 %r12456, %r12462, %r12461, %r12047; // end inline asm - ld.const.u32 %r3205, [matrix+1448]; // begin inline asm - dp4a.u32.u32 %r3204, %r3205, %r5910, %r3200; + shf.l.wrap.b32 %r12460, %r12461, %r12462, %r12047; // end inline asm - ld.const.u32 %r3209, [matrix+1452]; // begin inline asm - dp4a.u32.u32 %r3208, %r3209, %r5914, %r3204; + shf.l.wrap.b32 %r12464, %r12470, %r12469, %r12055; // end inline asm - ld.const.u32 %r3213, [matrix+1456]; // begin inline asm - dp4a.u32.u32 %r3212, %r3213, %r5918, %r3208; + shf.l.wrap.b32 %r12468, %r12469, %r12470, %r12055; // end inline asm - ld.const.u32 %r3217, [matrix+1460]; // begin inline asm - dp4a.u32.u32 %r3216, %r3217, %r5922, %r3212; + shf.l.wrap.b32 %r12472, %r12478, %r12477, %r12087; // end inline asm - ld.const.u32 %r3221, [matrix+1464]; // begin inline asm - dp4a.u32.u32 %r3220, %r3221, %r5926, %r3216; + shf.l.wrap.b32 %r12476, %r12477, %r12478, %r12087; // end inline asm - ld.const.u32 %r3225, [matrix+1468]; // begin inline asm - dp4a.u32.u32 %r3224, %r3225, %r5930, %r3220; + // chi + lop3.b32 %r12480, %r12515, %r12408, %r12432, 0xD2; + lop3.b32 %r12481, %r12518, %r12412, %r12436, 0xD2; // end inline asm - ld.const.u32 %r3229, [matrix+1472]; // begin inline asm - dp4a.u32.u32 %r3228, %r3229, %r5870, %r6249; + // chi + lop3.b32 %r12488, %r12408, %r12432, %r12464, 0xD2; + lop3.b32 %r12489, %r12412, %r12436, %r12468, 0xD2; // end inline asm - ld.const.u32 %r3233, [matrix+1476]; + st.local.v2.u32 [%rd3+32], {%r12488, %r12489}; // begin inline asm - dp4a.u32.u32 %r3232, %r3233, %r5874, %r3228; + // chi + lop3.b32 %r12496, %r12432, %r12464, %r12440, 0xD2; + lop3.b32 %r12497, %r12436, %r12468, %r12444, 0xD2; // end inline asm - ld.const.u32 %r3237, [matrix+1480]; + st.local.v2.u32 [%rd3+40], {%r12496, %r12497}; // begin inline asm - dp4a.u32.u32 %r3236, %r3237, %r5878, %r3232; + // chi + lop3.b32 %r12504, %r12464, %r12440, %r12515, 0xD2; + lop3.b32 %r12505, %r12468, %r12444, %r12518, 0xD2; // end inline asm - ld.const.u32 %r3241, [matrix+1484]; + st.local.v2.u32 [%rd3+48], {%r12504, %r12505}; // begin inline asm - dp4a.u32.u32 %r3240, %r3241, %r5882, %r3236; + // chi + lop3.b32 %r12512, %r12440, %r12515, %r12408, 0xD2; + lop3.b32 %r12513, %r12444, %r12518, %r12412, 0xD2; // end inline asm - ld.const.u32 %r3245, [matrix+1488]; + st.local.v2.u32 [%rd3+56], {%r12512, %r12513}; // begin inline asm - dp4a.u32.u32 %r3244, %r3245, %r5886, %r3240; + // chi + lop3.b32 %r12520, 
%r12456, %r12416, %r12472, 0xD2; + lop3.b32 %r12521, %r12460, %r12420, %r12476, 0xD2; // end inline asm - ld.const.u32 %r3249, [matrix+1492]; + st.local.v2.u32 [%rd3+64], {%r12520, %r12521}; // begin inline asm - dp4a.u32.u32 %r3248, %r3249, %r5890, %r3244; + // chi + lop3.b32 %r12528, %r12416, %r12472, %r12448, 0xD2; + lop3.b32 %r12529, %r12420, %r12476, %r12452, 0xD2; // end inline asm - ld.const.u32 %r3253, [matrix+1496]; + st.local.v2.u32 [%rd3+72], {%r12528, %r12529}; // begin inline asm - dp4a.u32.u32 %r3252, %r3253, %r5894, %r3248; + // chi + lop3.b32 %r12536, %r12472, %r12448, %r12424, 0xD2; + lop3.b32 %r12537, %r12476, %r12452, %r12428, 0xD2; // end inline asm - ld.const.u32 %r3257, [matrix+1500]; + st.local.v2.u32 [%rd3+80], {%r12536, %r12537}; // begin inline asm - dp4a.u32.u32 %r3256, %r3257, %r5898, %r3252; + ld.global.nc.v2.u32 {%r12544,%r12545}, [%rd598]; + // end inline asm + xor.b32 %r12597, %r12481, %r12545; + xor.b32 %r12598, %r12480, %r12544; + mov.b64 %rd1261, {%r12598, %r12597}; + mov.b64 %rd1262, {%r12488, %r12489}; + mov.b64 %rd1263, {%r12496, %r12497}; + mov.b64 %rd1264, {%r12512, %r12513}; + mov.u32 %r29962, 0; + st.local.v2.u32 [%rd3+24], {%r12598, %r12597}; + st.local.v2.u32 [%rd84+96], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+104], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+112], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+120], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+128], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+136], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+144], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+152], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+160], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+168], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+176], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+184], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+192], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+200], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+208], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+216], {%r29962, %r29962}; + mov.u32 %r29977, -2147483648; + st.local.v2.u32 [%rd84+88], {%r12560, %r29977}; + mov.u32 %r29963, %r29962; + mov.u32 %r29964, %r29962; + mov.u32 %r29965, %r29962; + mov.u32 %r29966, %r29962; + mov.u32 %r29967, %r29962; + mov.u32 %r29968, %r29962; + mov.u32 %r29969, %r29962; + mov.u32 %r29970, %r29962; + mov.u32 %r29971, %r29962; + mov.u32 %r29972, %r29962; + mov.u32 %r29973, %r29962; + mov.u32 %r29974, %r29962; + mov.u32 %r29975, %r29962; + mov.u32 %r29976, %r12560; + mov.u32 %r29978, %r29962; + mov.u32 %r29979, %r29962; + mov.u32 %r29980, %r29962; + mov.u32 %r29981, %r29962; + mov.u32 %r29982, %r29962; + mov.u32 %r29983, %r29962; + mov.u32 %r29984, %r29962; + mov.u32 %r29985, %r29962; + mov.u32 %r29986, %r29962; + mov.u32 %r29987, %r29962; + mov.u32 %r29988, %r29962; + mov.u32 %r29989, %r29962; + mov.u32 %r29990, %r29962; + mov.u32 %r29991, %r29962; + mov.u32 %r29992, %r29962; + mov.u32 %r29993, %r29962; + mov.u32 %r29994, %r29962; + mov.u32 %r29995, %r29962; + mov.u32 %r30012, %r29962; + +$L__BB2_42: + // begin inline asm + // xor5 + lop3.b32 %r12599, %r29998, %r29996, %r29994, 0x96; + lop3.b32 %r12599, %r12599, %r29992, %r29990, 0x96; + lop3.b32 %r12600, %r29999, %r29997, %r29995, 0x96; + lop3.b32 %r12600, %r12600, %r29993, %r29991, 0x96; // end inline asm - ld.const.u32 %r3261, [matrix+1504]; // begin inline asm - dp4a.u32.u32 %r3260, %r3261, %r5902, %r3256; + // xor5 + lop3.b32 %r12611, %r30010, %r30008, %r29988, 0x96; + lop3.b32 %r12611, %r12611, %r29986, %r29984, 0x96; + lop3.b32 %r12612, %r30011, 
%r30009, %r29989, 0x96; + lop3.b32 %r12612, %r12612, %r29987, %r29985, 0x96; // end inline asm - ld.const.u32 %r3265, [matrix+1508]; // begin inline asm - dp4a.u32.u32 %r3264, %r3265, %r5906, %r3260; + // xor5 + lop3.b32 %r12623, %r30006, %r30004, %r29982, 0x96; + lop3.b32 %r12623, %r12623, %r29980, %r29978, 0x96; + lop3.b32 %r12624, %r30007, %r30005, %r29983, 0x96; + lop3.b32 %r12624, %r12624, %r29981, %r29979, 0x96; // end inline asm - ld.const.u32 %r3269, [matrix+1512]; // begin inline asm - dp4a.u32.u32 %r3268, %r3269, %r5910, %r3264; + // xor5 + lop3.b32 %r12635, %r30002, %r29976, %r29974, 0x96; + lop3.b32 %r12635, %r12635, %r29972, %r29970, 0x96; + lop3.b32 %r12636, %r30003, %r29977, %r29975, 0x96; + lop3.b32 %r12636, %r12636, %r29973, %r29971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12647, %r30000, %r29968, %r29966, 0x96; + lop3.b32 %r12647, %r12647, %r29964, %r29962, 0x96; + lop3.b32 %r12648, %r30001, %r29969, %r29967, 0x96; + lop3.b32 %r12648, %r12648, %r29965, %r29963, 0x96; // end inline asm - ld.const.u32 %r3273, [matrix+1516]; // begin inline asm - dp4a.u32.u32 %r3272, %r3273, %r5914, %r3268; + shf.l.wrap.b32 %r12659, %r12612, %r12611, %r12560; // end inline asm - ld.const.u32 %r3277, [matrix+1520]; // begin inline asm - dp4a.u32.u32 %r3276, %r3277, %r5918, %r3272; + shf.l.wrap.b32 %r12663, %r12611, %r12612, %r12560; // end inline asm - ld.const.u32 %r3281, [matrix+1524]; + xor.b32 %r13093, %r12659, %r12647; + xor.b32 %r13094, %r12663, %r12648; + xor.b32 %r12926, %r29998, %r13093; + xor.b32 %r12929, %r29999, %r13094; + xor.b32 %r12833, %r29996, %r13093; + xor.b32 %r12832, %r29997, %r13094; + xor.b32 %r12880, %r29994, %r13093; + xor.b32 %r12881, %r29995, %r13094; + xor.b32 %r12785, %r29992, %r13093; + xor.b32 %r12784, %r29993, %r13094; + xor.b32 %r12736, %r29990, %r13093; + xor.b32 %r12737, %r29991, %r13094; // begin inline asm - dp4a.u32.u32 %r3280, %r3281, %r5922, %r3276; + shf.l.wrap.b32 %r12667, %r12624, %r12623, %r12560; // end inline asm - ld.const.u32 %r3285, [matrix+1528]; // begin inline asm - dp4a.u32.u32 %r3284, %r3285, %r5926, %r3280; + shf.l.wrap.b32 %r12671, %r12623, %r12624, %r12560; // end inline asm - ld.const.u32 %r3289, [matrix+1532]; + xor.b32 %r13095, %r12667, %r12599; + xor.b32 %r13096, %r12671, %r12600; + xor.b32 %r12888, %r30010, %r13095; + xor.b32 %r12889, %r30011, %r13096; + xor.b32 %r12705, %r30008, %r13095; + xor.b32 %r12704, %r30009, %r13096; + xor.b32 %r12864, %r29988, %r13095; + xor.b32 %r12865, %r29989, %r13096; + xor.b32 %r12825, %r29986, %r13095; + xor.b32 %r12824, %r29987, %r13096; + xor.b32 %r12808, %r29984, %r13095; + xor.b32 %r12809, %r29985, %r13096; // begin inline asm - dp4a.u32.u32 %r3288, %r3289, %r5930, %r3284; + shf.l.wrap.b32 %r12675, %r12636, %r12635, %r12560; // end inline asm - shr.u32 %r6088, %r3224, 6; - and.b32 %r3293, %r6088, 240; - shr.u32 %r3294, %r3288, 10; // begin inline asm - lop3.b32 %r3292, %r3293, %r3294, %r3295, 0x56; + shf.l.wrap.b32 %r12679, %r12635, %r12636, %r12560; // end inline asm - ld.const.u32 %r3297, [matrix+1536]; + xor.b32 %r13097, %r12675, %r12611; + xor.b32 %r13098, %r12679, %r12612; + xor.b32 %r12745, %r30006, %r13097; + xor.b32 %r12744, %r30007, %r13098; + xor.b32 %r12872, %r30004, %r13097; + xor.b32 %r12873, %r30005, %r13098; + xor.b32 %r12753, %r29982, %r13097; + xor.b32 %r12752, %r29983, %r13098; + xor.b32 %r12856, %r29980, %r13097; + xor.b32 %r12857, %r29981, %r13098; + xor.b32 %r12721, %r29978, %r13097; + xor.b32 %r12720, %r29979, %r13098; // begin inline asm - 
dp4a.u32.u32 %r3296, %r3297, %r5870, %r6249; + shf.l.wrap.b32 %r12683, %r12648, %r12647, %r12560; // end inline asm - ld.const.u32 %r3301, [matrix+1540]; // begin inline asm - dp4a.u32.u32 %r3300, %r3301, %r5874, %r3296; + shf.l.wrap.b32 %r12687, %r12647, %r12648, %r12560; // end inline asm - ld.const.u32 %r3305, [matrix+1544]; + xor.b32 %r13099, %r12683, %r12623; + xor.b32 %r13100, %r12687, %r12624; + xor.b32 %r12840, %r30002, %r13099; + xor.b32 %r12841, %r30003, %r13100; + xor.b32 %r12817, %r29976, %r13099; + xor.b32 %r12816, %r29977, %r13100; + xor.b32 %r12760, %r29974, %r13099; + xor.b32 %r12761, %r29975, %r13100; + xor.b32 %r12848, %r29972, %r13099; + xor.b32 %r12849, %r29973, %r13100; + xor.b32 %r12777, %r29970, %r13099; + xor.b32 %r12776, %r29971, %r13100; // begin inline asm - dp4a.u32.u32 %r3304, %r3305, %r5878, %r3300; + shf.l.wrap.b32 %r12691, %r12600, %r12599, %r12560; // end inline asm - ld.const.u32 %r3309, [matrix+1548]; // begin inline asm - dp4a.u32.u32 %r3308, %r3309, %r5882, %r3304; + shf.l.wrap.b32 %r12695, %r12599, %r12600, %r12560; // end inline asm - ld.const.u32 %r3313, [matrix+1552]; + xor.b32 %r13101, %r12691, %r12635; + xor.b32 %r13102, %r12695, %r12636; + xor.b32 %r12792, %r30000, %r13101; + xor.b32 %r12793, %r30001, %r13102; + xor.b32 %r12712, %r29968, %r13101; + xor.b32 %r12713, %r29969, %r13102; + xor.b32 %r12729, %r29966, %r13101; + xor.b32 %r12728, %r29967, %r13102; + xor.b32 %r12768, %r29964, %r13101; + xor.b32 %r12769, %r29965, %r13102; + xor.b32 %r12800, %r29962, %r13101; + xor.b32 %r12801, %r29963, %r13102; + mov.u32 %r12706, 44; // begin inline asm - dp4a.u32.u32 %r3312, %r3313, %r5886, %r3308; + shf.l.wrap.b32 %r12699, %r12705, %r12704, %r12706; // end inline asm - ld.const.u32 %r3317, [matrix+1556]; // begin inline asm - dp4a.u32.u32 %r3316, %r3317, %r5890, %r3312; + shf.l.wrap.b32 %r12703, %r12704, %r12705, %r12706; // end inline asm - ld.const.u32 %r3321, [matrix+1560]; + mov.u32 %r12714, 20; // begin inline asm - dp4a.u32.u32 %r3320, %r3321, %r5894, %r3316; + shf.l.wrap.b32 %r12707, %r12713, %r12712, %r12714; // end inline asm - ld.const.u32 %r3325, [matrix+1564]; // begin inline asm - dp4a.u32.u32 %r3324, %r3325, %r5898, %r3320; + shf.l.wrap.b32 %r12711, %r12712, %r12713, %r12714; // end inline asm - ld.const.u32 %r3329, [matrix+1568]; + mov.u32 %r12722, 61; // begin inline asm - dp4a.u32.u32 %r3328, %r3329, %r5902, %r3324; + shf.l.wrap.b32 %r12715, %r12721, %r12720, %r12722; // end inline asm - ld.const.u32 %r3333, [matrix+1572]; // begin inline asm - dp4a.u32.u32 %r3332, %r3333, %r5906, %r3328; + shf.l.wrap.b32 %r12719, %r12720, %r12721, %r12722; // end inline asm - ld.const.u32 %r3337, [matrix+1576]; + mov.u32 %r12730, 39; // begin inline asm - dp4a.u32.u32 %r3336, %r3337, %r5910, %r3332; + shf.l.wrap.b32 %r12723, %r12729, %r12728, %r12730; // end inline asm - ld.const.u32 %r3341, [matrix+1580]; // begin inline asm - dp4a.u32.u32 %r3340, %r3341, %r5914, %r3336; + shf.l.wrap.b32 %r12727, %r12728, %r12729, %r12730; // end inline asm - ld.const.u32 %r3345, [matrix+1584]; + mov.u32 %r12738, 18; // begin inline asm - dp4a.u32.u32 %r3344, %r3345, %r5918, %r3340; + shf.l.wrap.b32 %r12731, %r12737, %r12736, %r12738; // end inline asm - ld.const.u32 %r3349, [matrix+1588]; // begin inline asm - dp4a.u32.u32 %r3348, %r3349, %r5922, %r3344; + shf.l.wrap.b32 %r12735, %r12736, %r12737, %r12738; // end inline asm - ld.const.u32 %r3353, [matrix+1592]; + mov.u32 %r12746, 62; // begin inline asm - dp4a.u32.u32 %r3352, %r3353, %r5926, %r3348; + shf.l.wrap.b32 
%r12739, %r12745, %r12744, %r12746; // end inline asm - ld.const.u32 %r3357, [matrix+1596]; // begin inline asm - dp4a.u32.u32 %r3356, %r3357, %r5930, %r3352; + shf.l.wrap.b32 %r12743, %r12744, %r12745, %r12746; // end inline asm - ld.const.u32 %r3361, [matrix+1600]; + mov.u32 %r12754, 43; // begin inline asm - dp4a.u32.u32 %r3360, %r3361, %r5870, %r6249; + shf.l.wrap.b32 %r12747, %r12753, %r12752, %r12754; // end inline asm - ld.const.u32 %r3365, [matrix+1604]; // begin inline asm - dp4a.u32.u32 %r3364, %r3365, %r5874, %r3360; + shf.l.wrap.b32 %r12751, %r12752, %r12753, %r12754; // end inline asm - ld.const.u32 %r3369, [matrix+1608]; + mov.u32 %r12762, 25; // begin inline asm - dp4a.u32.u32 %r3368, %r3369, %r5878, %r3364; + shf.l.wrap.b32 %r12755, %r12761, %r12760, %r12762; // end inline asm - ld.const.u32 %r3373, [matrix+1612]; // begin inline asm - dp4a.u32.u32 %r3372, %r3373, %r5882, %r3368; + shf.l.wrap.b32 %r12759, %r12760, %r12761, %r12762; // end inline asm - ld.const.u32 %r3377, [matrix+1616]; + mov.u32 %r12770, 8; // begin inline asm - dp4a.u32.u32 %r3376, %r3377, %r5886, %r3372; + shf.l.wrap.b32 %r12763, %r12769, %r12768, %r12770; // end inline asm - ld.const.u32 %r3381, [matrix+1620]; // begin inline asm - dp4a.u32.u32 %r3380, %r3381, %r5890, %r3376; + shf.l.wrap.b32 %r12767, %r12768, %r12769, %r12770; // end inline asm - ld.const.u32 %r3385, [matrix+1624]; + mov.u32 %r12778, 56; // begin inline asm - dp4a.u32.u32 %r3384, %r3385, %r5894, %r3380; + shf.l.wrap.b32 %r12771, %r12777, %r12776, %r12778; // end inline asm - ld.const.u32 %r3389, [matrix+1628]; // begin inline asm - dp4a.u32.u32 %r3388, %r3389, %r5898, %r3384; + shf.l.wrap.b32 %r12775, %r12776, %r12777, %r12778; // end inline asm - ld.const.u32 %r3393, [matrix+1632]; + mov.u32 %r12786, 41; // begin inline asm - dp4a.u32.u32 %r3392, %r3393, %r5902, %r3388; + shf.l.wrap.b32 %r12779, %r12785, %r12784, %r12786; // end inline asm - ld.const.u32 %r3397, [matrix+1636]; // begin inline asm - dp4a.u32.u32 %r3396, %r3397, %r5906, %r3392; + shf.l.wrap.b32 %r12783, %r12784, %r12785, %r12786; // end inline asm - ld.const.u32 %r3401, [matrix+1640]; + mov.u32 %r12794, 27; // begin inline asm - dp4a.u32.u32 %r3400, %r3401, %r5910, %r3396; + shf.l.wrap.b32 %r12787, %r12793, %r12792, %r12794; // end inline asm - ld.const.u32 %r3405, [matrix+1644]; // begin inline asm - dp4a.u32.u32 %r3404, %r3405, %r5914, %r3400; + shf.l.wrap.b32 %r12791, %r12792, %r12793, %r12794; // end inline asm - ld.const.u32 %r3409, [matrix+1648]; + mov.u32 %r12802, 14; // begin inline asm - dp4a.u32.u32 %r3408, %r3409, %r5918, %r3404; + shf.l.wrap.b32 %r12795, %r12801, %r12800, %r12802; // end inline asm - ld.const.u32 %r3413, [matrix+1652]; // begin inline asm - dp4a.u32.u32 %r3412, %r3413, %r5922, %r3408; + shf.l.wrap.b32 %r12799, %r12800, %r12801, %r12802; // end inline asm - ld.const.u32 %r3417, [matrix+1656]; + mov.u32 %r12810, 2; // begin inline asm - dp4a.u32.u32 %r3416, %r3417, %r5926, %r3412; + shf.l.wrap.b32 %r12803, %r12809, %r12808, %r12810; // end inline asm - ld.const.u32 %r3421, [matrix+1660]; // begin inline asm - dp4a.u32.u32 %r3420, %r3421, %r5930, %r3416; + shf.l.wrap.b32 %r12807, %r12808, %r12809, %r12810; // end inline asm - shr.u32 %r6089, %r3356, 6; - and.b32 %r3425, %r6089, 240; - shr.u32 %r3426, %r3420, 10; - and.b32 %r3427, %r5986, 255; + mov.u32 %r12818, 55; // begin inline asm - lop3.b32 %r3424, %r3425, %r3426, %r3427, 0x56; + shf.l.wrap.b32 %r12811, %r12817, %r12816, %r12818; // end inline asm - ld.const.u32 %r3429, [matrix+1664]; // 
begin inline asm - dp4a.u32.u32 %r3428, %r3429, %r5870, %r6249; + shf.l.wrap.b32 %r12815, %r12816, %r12817, %r12818; // end inline asm - ld.const.u32 %r3433, [matrix+1668]; + mov.u32 %r12826, 45; // begin inline asm - dp4a.u32.u32 %r3432, %r3433, %r5874, %r3428; + shf.l.wrap.b32 %r12819, %r12825, %r12824, %r12826; // end inline asm - ld.const.u32 %r3437, [matrix+1672]; // begin inline asm - dp4a.u32.u32 %r3436, %r3437, %r5878, %r3432; + shf.l.wrap.b32 %r12823, %r12824, %r12825, %r12826; // end inline asm - ld.const.u32 %r3441, [matrix+1676]; + mov.u32 %r12834, 36; // begin inline asm - dp4a.u32.u32 %r3440, %r3441, %r5882, %r3436; + shf.l.wrap.b32 %r12827, %r12833, %r12832, %r12834; // end inline asm - ld.const.u32 %r3445, [matrix+1680]; // begin inline asm - dp4a.u32.u32 %r3444, %r3445, %r5886, %r3440; + shf.l.wrap.b32 %r12831, %r12832, %r12833, %r12834; // end inline asm - ld.const.u32 %r3449, [matrix+1684]; + mov.u32 %r12842, 28; // begin inline asm - dp4a.u32.u32 %r3448, %r3449, %r5890, %r3444; + shf.l.wrap.b32 %r12835, %r12841, %r12840, %r12842; // end inline asm - ld.const.u32 %r3453, [matrix+1688]; // begin inline asm - dp4a.u32.u32 %r3452, %r3453, %r5894, %r3448; + shf.l.wrap.b32 %r12839, %r12840, %r12841, %r12842; // end inline asm - ld.const.u32 %r3457, [matrix+1692]; + mov.u32 %r12850, 21; // begin inline asm - dp4a.u32.u32 %r3456, %r3457, %r5898, %r3452; + shf.l.wrap.b32 %r12843, %r12849, %r12848, %r12850; // end inline asm - ld.const.u32 %r3461, [matrix+1696]; // begin inline asm - dp4a.u32.u32 %r3460, %r3461, %r5902, %r3456; + shf.l.wrap.b32 %r12847, %r12848, %r12849, %r12850; // end inline asm - ld.const.u32 %r3465, [matrix+1700]; + mov.u32 %r12858, 15; // begin inline asm - dp4a.u32.u32 %r3464, %r3465, %r5906, %r3460; + shf.l.wrap.b32 %r12851, %r12857, %r12856, %r12858; // end inline asm - ld.const.u32 %r3469, [matrix+1704]; // begin inline asm - dp4a.u32.u32 %r3468, %r3469, %r5910, %r3464; + shf.l.wrap.b32 %r12855, %r12856, %r12857, %r12858; // end inline asm - ld.const.u32 %r3473, [matrix+1708]; + mov.u32 %r12866, 10; // begin inline asm - dp4a.u32.u32 %r3472, %r3473, %r5914, %r3468; + shf.l.wrap.b32 %r12859, %r12865, %r12864, %r12866; // end inline asm - ld.const.u32 %r3477, [matrix+1712]; // begin inline asm - dp4a.u32.u32 %r3476, %r3477, %r5918, %r3472; + shf.l.wrap.b32 %r12863, %r12864, %r12865, %r12866; // end inline asm - ld.const.u32 %r3481, [matrix+1716]; + mov.u32 %r12874, 6; // begin inline asm - dp4a.u32.u32 %r3480, %r3481, %r5922, %r3476; + shf.l.wrap.b32 %r12867, %r12873, %r12872, %r12874; // end inline asm - ld.const.u32 %r3485, [matrix+1720]; // begin inline asm - dp4a.u32.u32 %r3484, %r3485, %r5926, %r3480; + shf.l.wrap.b32 %r12871, %r12872, %r12873, %r12874; // end inline asm - ld.const.u32 %r3489, [matrix+1724]; + mov.u32 %r12882, 3; // begin inline asm - dp4a.u32.u32 %r3488, %r3489, %r5930, %r3484; + shf.l.wrap.b32 %r12875, %r12881, %r12880, %r12882; // end inline asm - ld.const.u32 %r3493, [matrix+1728]; // begin inline asm - dp4a.u32.u32 %r3492, %r3493, %r5870, %r6249; + shf.l.wrap.b32 %r12879, %r12880, %r12881, %r12882; // end inline asm - ld.const.u32 %r3497, [matrix+1732]; // begin inline asm - dp4a.u32.u32 %r3496, %r3497, %r5874, %r3492; + shf.l.wrap.b32 %r12883, %r12889, %r12888, %r12560; // end inline asm - ld.const.u32 %r3501, [matrix+1736]; // begin inline asm - dp4a.u32.u32 %r3500, %r3501, %r5878, %r3496; + shf.l.wrap.b32 %r12887, %r12888, %r12889, %r12560; // end inline asm - ld.const.u32 %r3505, [matrix+1740]; // begin inline asm - 
dp4a.u32.u32 %r3504, %r3505, %r5882, %r3500; + // chi + lop3.b32 %r12891, %r12926, %r12699, %r12747, 0xD2; + lop3.b32 %r12892, %r12929, %r12703, %r12751, 0xD2; // end inline asm - ld.const.u32 %r3509, [matrix+1744]; // begin inline asm - dp4a.u32.u32 %r3508, %r3509, %r5886, %r3504; + // chi + lop3.b32 %r30010, %r12699, %r12747, %r12843, 0xD2; + lop3.b32 %r30011, %r12703, %r12751, %r12847, 0xD2; // end inline asm - ld.const.u32 %r3513, [matrix+1748]; // begin inline asm - dp4a.u32.u32 %r3512, %r3513, %r5890, %r3508; + // chi + lop3.b32 %r30006, %r12747, %r12843, %r12795, 0xD2; + lop3.b32 %r30007, %r12751, %r12847, %r12799, 0xD2; // end inline asm - ld.const.u32 %r3517, [matrix+1752]; // begin inline asm - dp4a.u32.u32 %r3516, %r3517, %r5894, %r3512; + // chi + lop3.b32 %r30002, %r12843, %r12795, %r12926, 0xD2; + lop3.b32 %r30003, %r12847, %r12799, %r12929, 0xD2; // end inline asm - ld.const.u32 %r3521, [matrix+1756]; // begin inline asm - dp4a.u32.u32 %r3520, %r3521, %r5898, %r3516; + // chi + lop3.b32 %r30000, %r12795, %r12926, %r12699, 0xD2; + lop3.b32 %r30001, %r12799, %r12929, %r12703, 0xD2; // end inline asm - ld.const.u32 %r3525, [matrix+1760]; // begin inline asm - dp4a.u32.u32 %r3524, %r3525, %r5902, %r3520; + // chi + lop3.b32 %r29996, %r12835, %r12707, %r12875, 0xD2; + lop3.b32 %r29997, %r12839, %r12711, %r12879, 0xD2; // end inline asm - ld.const.u32 %r3529, [matrix+1764]; // begin inline asm - dp4a.u32.u32 %r3528, %r3529, %r5906, %r3524; + // chi + lop3.b32 %r30008, %r12707, %r12875, %r12819, 0xD2; + lop3.b32 %r30009, %r12711, %r12879, %r12823, 0xD2; // end inline asm - ld.const.u32 %r3533, [matrix+1768]; // begin inline asm - dp4a.u32.u32 %r3532, %r3533, %r5910, %r3528; + // chi + lop3.b32 %r30004, %r12875, %r12819, %r12715, 0xD2; + lop3.b32 %r30005, %r12879, %r12823, %r12719, 0xD2; // end inline asm - ld.const.u32 %r3537, [matrix+1772]; // begin inline asm - dp4a.u32.u32 %r3536, %r3537, %r5914, %r3532; + // chi + lop3.b32 %r29976, %r12819, %r12715, %r12835, 0xD2; + lop3.b32 %r29977, %r12823, %r12719, %r12839, 0xD2; // end inline asm - ld.const.u32 %r3541, [matrix+1776]; + st.local.v2.u32 [%rd84+88], {%r29976, %r29977}; // begin inline asm - dp4a.u32.u32 %r3540, %r3541, %r5918, %r3536; + // chi + lop3.b32 %r29968, %r12715, %r12835, %r12707, 0xD2; + lop3.b32 %r29969, %r12719, %r12839, %r12711, 0xD2; // end inline asm - ld.const.u32 %r3545, [matrix+1780]; + st.local.v2.u32 [%rd84+96], {%r29968, %r29969}; // begin inline asm - dp4a.u32.u32 %r3544, %r3545, %r5922, %r3540; + // chi + lop3.b32 %r29994, %r12883, %r12867, %r12755, 0xD2; + lop3.b32 %r29995, %r12887, %r12871, %r12759, 0xD2; // end inline asm - ld.const.u32 %r3549, [matrix+1784]; + st.local.v2.u32 [%rd84+104], {%r29994, %r29995}; // begin inline asm - dp4a.u32.u32 %r3548, %r3549, %r5926, %r3544; + // chi + lop3.b32 %r29988, %r12867, %r12755, %r12763, 0xD2; + lop3.b32 %r29989, %r12871, %r12759, %r12767, 0xD2; // end inline asm - ld.const.u32 %r3553, [matrix+1788]; + st.local.v2.u32 [%rd84+112], {%r29988, %r29989}; // begin inline asm - dp4a.u32.u32 %r3552, %r3553, %r5930, %r3548; + // chi + lop3.b32 %r29982, %r12755, %r12763, %r12731, 0xD2; + lop3.b32 %r29983, %r12759, %r12767, %r12735, 0xD2; // end inline asm - shr.u32 %r6090, %r3488, 6; - and.b32 %r3557, %r6090, 240; - shr.u32 %r3558, %r3552, 10; - and.b32 %r3559, %r5991, 255; + st.local.v2.u32 [%rd84+120], {%r29982, %r29983}; // begin inline asm - lop3.b32 %r3556, %r3557, %r3558, %r3559, 0x56; + // chi + lop3.b32 %r29974, %r12763, %r12731, %r12883, 0xD2; + lop3.b32 
%r29975, %r12767, %r12735, %r12887, 0xD2; // end inline asm - ld.const.u32 %r3561, [matrix+1792]; + st.local.v2.u32 [%rd84+128], {%r29974, %r29975}; // begin inline asm - dp4a.u32.u32 %r3560, %r3561, %r5870, %r6249; + // chi + lop3.b32 %r29966, %r12731, %r12883, %r12867, 0xD2; + lop3.b32 %r29967, %r12735, %r12887, %r12871, 0xD2; // end inline asm - ld.const.u32 %r3565, [matrix+1796]; + st.local.v2.u32 [%rd84+136], {%r29966, %r29967}; // begin inline asm - dp4a.u32.u32 %r3564, %r3565, %r5874, %r3560; + // chi + lop3.b32 %r29992, %r12787, %r12827, %r12859, 0xD2; + lop3.b32 %r29993, %r12791, %r12831, %r12863, 0xD2; // end inline asm - ld.const.u32 %r3569, [matrix+1800]; + st.local.v2.u32 [%rd84+144], {%r29992, %r29993}; // begin inline asm - dp4a.u32.u32 %r3568, %r3569, %r5878, %r3564; + // chi + lop3.b32 %r29986, %r12827, %r12859, %r12851, 0xD2; + lop3.b32 %r29987, %r12831, %r12863, %r12855, 0xD2; // end inline asm - ld.const.u32 %r3573, [matrix+1804]; + st.local.v2.u32 [%rd84+152], {%r29986, %r29987}; // begin inline asm - dp4a.u32.u32 %r3572, %r3573, %r5882, %r3568; + // chi + lop3.b32 %r29980, %r12859, %r12851, %r12771, 0xD2; + lop3.b32 %r29981, %r12863, %r12855, %r12775, 0xD2; // end inline asm - ld.const.u32 %r3577, [matrix+1808]; + st.local.v2.u32 [%rd84+160], {%r29980, %r29981}; // begin inline asm - dp4a.u32.u32 %r3576, %r3577, %r5886, %r3572; + // chi + lop3.b32 %r29972, %r12851, %r12771, %r12787, 0xD2; + lop3.b32 %r29973, %r12855, %r12775, %r12791, 0xD2; // end inline asm - ld.const.u32 %r3581, [matrix+1812]; + st.local.v2.u32 [%rd84+168], {%r29972, %r29973}; // begin inline asm - dp4a.u32.u32 %r3580, %r3581, %r5890, %r3576; + // chi + lop3.b32 %r29964, %r12771, %r12787, %r12827, 0xD2; + lop3.b32 %r29965, %r12775, %r12791, %r12831, 0xD2; // end inline asm - ld.const.u32 %r3585, [matrix+1816]; + st.local.v2.u32 [%rd84+176], {%r29964, %r29965}; // begin inline asm - dp4a.u32.u32 %r3584, %r3585, %r5894, %r3580; + // chi + lop3.b32 %r29990, %r12739, %r12811, %r12723, 0xD2; + lop3.b32 %r29991, %r12743, %r12815, %r12727, 0xD2; // end inline asm - ld.const.u32 %r3589, [matrix+1820]; + st.local.v2.u32 [%rd84+184], {%r29990, %r29991}; // begin inline asm - dp4a.u32.u32 %r3588, %r3589, %r5898, %r3584; + // chi + lop3.b32 %r29984, %r12811, %r12723, %r12779, 0xD2; + lop3.b32 %r29985, %r12815, %r12727, %r12783, 0xD2; // end inline asm - ld.const.u32 %r3593, [matrix+1824]; + st.local.v2.u32 [%rd84+192], {%r29984, %r29985}; // begin inline asm - dp4a.u32.u32 %r3592, %r3593, %r5902, %r3588; + // chi + lop3.b32 %r29978, %r12723, %r12779, %r12803, 0xD2; + lop3.b32 %r29979, %r12727, %r12783, %r12807, 0xD2; // end inline asm - ld.const.u32 %r3597, [matrix+1828]; + st.local.v2.u32 [%rd84+200], {%r29978, %r29979}; // begin inline asm - dp4a.u32.u32 %r3596, %r3597, %r5906, %r3592; + // chi + lop3.b32 %r29970, %r12779, %r12803, %r12739, 0xD2; + lop3.b32 %r29971, %r12783, %r12807, %r12743, 0xD2; // end inline asm - ld.const.u32 %r3601, [matrix+1832]; + st.local.v2.u32 [%rd84+208], {%r29970, %r29971}; // begin inline asm - dp4a.u32.u32 %r3600, %r3601, %r5910, %r3596; + // chi + lop3.b32 %r29962, %r12803, %r12739, %r12811, 0xD2; + lop3.b32 %r29963, %r12807, %r12743, %r12815, 0xD2; // end inline asm - ld.const.u32 %r3605, [matrix+1836]; + st.local.v2.u32 [%rd84+216], {%r29962, %r29963}; + mul.wide.s32 %rd672, %r30012, 8; + add.s64 %rd671, %rd597, %rd672; // begin inline asm - dp4a.u32.u32 %r3604, %r3605, %r5914, %r3600; + ld.global.nc.v2.u32 {%r13091,%r13092}, [%rd671]; // end inline asm - ld.const.u32 
%r3609, [matrix+1840]; + xor.b32 %r29998, %r12891, %r13091; + xor.b32 %r29999, %r12892, %r13092; + add.s32 %r30012, %r30012, 1; + setp.lt.u32 %p27, %r30012, 23; + @%p27 bra $L__BB2_42; + + mov.u32 %r13202, 1; + st.local.v2.u32 [%rd84+32], {%r30010, %r30011}; + st.local.v2.u32 [%rd84+72], {%r30008, %r30009}; + st.local.v2.u32 [%rd84+40], {%r30006, %r30007}; + st.local.v2.u32 [%rd84+80], {%r30004, %r30005}; + st.local.v2.u32 [%rd84+48], {%r30002, %r30003}; + st.local.v2.u32 [%rd84+56], {%r30000, %r30001}; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; // begin inline asm - dp4a.u32.u32 %r3608, %r3609, %r5918, %r3604; + // xor5 + lop3.b32 %r13103, %r29998, %r29996, %r29994, 0x96; + lop3.b32 %r13103, %r13103, %r29992, %r29990, 0x96; + lop3.b32 %r13104, %r29999, %r29997, %r29995, 0x96; + lop3.b32 %r13104, %r13104, %r29993, %r29991, 0x96; // end inline asm - ld.const.u32 %r3613, [matrix+1844]; // begin inline asm - dp4a.u32.u32 %r3612, %r3613, %r5922, %r3608; + // xor5 + lop3.b32 %r13115, %r30010, %r30008, %r29988, 0x96; + lop3.b32 %r13115, %r13115, %r29986, %r29984, 0x96; + lop3.b32 %r13116, %r30011, %r30009, %r29989, 0x96; + lop3.b32 %r13116, %r13116, %r29987, %r29985, 0x96; // end inline asm - ld.const.u32 %r3617, [matrix+1848]; // begin inline asm - dp4a.u32.u32 %r3616, %r3617, %r5926, %r3612; + // xor5 + lop3.b32 %r13127, %r30006, %r30004, %r29982, 0x96; + lop3.b32 %r13127, %r13127, %r29980, %r29978, 0x96; + lop3.b32 %r13128, %r30007, %r30005, %r29983, 0x96; + lop3.b32 %r13128, %r13128, %r29981, %r29979, 0x96; // end inline asm - ld.const.u32 %r3621, [matrix+1852]; // begin inline asm - dp4a.u32.u32 %r3620, %r3621, %r5930, %r3616; + // xor5 + lop3.b32 %r13139, %r30002, %r29976, %r29974, 0x96; + lop3.b32 %r13139, %r13139, %r29972, %r29970, 0x96; + lop3.b32 %r13140, %r30003, %r29977, %r29975, 0x96; + lop3.b32 %r13140, %r13140, %r29973, %r29971, 0x96; // end inline asm - ld.const.u32 %r3625, [matrix+1856]; // begin inline asm - dp4a.u32.u32 %r3624, %r3625, %r5870, %r6249; + // xor5 + lop3.b32 %r13151, %r30000, %r29968, %r29966, 0x96; + lop3.b32 %r13151, %r13151, %r29964, %r29962, 0x96; + lop3.b32 %r13152, %r30001, %r29969, %r29967, 0x96; + lop3.b32 %r13152, %r13152, %r29965, %r29963, 0x96; // end inline asm - ld.const.u32 %r3629, [matrix+1860]; // begin inline asm - dp4a.u32.u32 %r3628, %r3629, %r5874, %r3624; + shf.l.wrap.b32 %r13163, %r13116, %r13115, %r13202; // end inline asm - ld.const.u32 %r3633, [matrix+1864]; // begin inline asm - dp4a.u32.u32 %r3632, %r3633, %r5878, %r3628; + shf.l.wrap.b32 %r13167, %r13115, %r13116, %r13202; // end inline asm - ld.const.u32 %r3637, [matrix+1868]; + xor.b32 %r13341, %r13163, %r13151; + xor.b32 %r13342, %r13167, %r13152; + xor.b32 %r13310, %r29998, %r13341; + xor.b32 %r13313, %r29999, %r13342; + xor.b32 %r13273, %r29995, %r13342; + xor.b32 %r13272, %r29994, %r13341; + st.local.v2.u32 [%rd84+104], {%r13272, %r13273}; // begin inline asm - dp4a.u32.u32 %r3636, %r3637, %r5882, %r3632; + shf.l.wrap.b32 %r13171, %r13128, %r13127, %r13202; // end inline asm - ld.const.u32 %r3641, [matrix+1872]; // begin inline asm - dp4a.u32.u32 %r3640, %r3641, %r5886, %r3636; + shf.l.wrap.b32 %r13175, %r13127, %r13128, %r13202; // end inline asm - ld.const.u32 %r3645, [matrix+1876]; + xor.b32 %r13343, %r13171, %r13103; + xor.b32 %r13344, %r13175, %r13104; + xor.b32 %r13209, %r30008, %r13343; + xor.b32 %r13208, %r30009, %r13344; + xor.b32 %r13248, %r29987, %r13344; + xor.b32 %r13249, %r29986, %r13343; + st.local.v2.u32 [%rd84+152], {%r13249, %r13248}; // begin inline asm 
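
The `lop3.b32 ..., 0x96` and `lop3.b32 ..., 0xD2` pairs above are the theta and chi steps of Keccak-f[1600], one instruction per 32-bit half of each 64-bit lane, and the `ld.global.nc.v2.u32` load plus `xor.b32` pair just before the `@%p27 bra $L__BB2_42;` branch is the iota step: the loop body runs 23 rounds and the 24th round is unrolled after it. A minimal CUDA sketch of what NVVM lowered here (helper names are illustrative, not from the source tree):

#include <stdint.h>

// theta column parity: a five-way XOR folded into two LOP3s per
// 32-bit half; 0x96 is the 3-input truth table of a ^ b ^ c.
__device__ __forceinline__ uint32_t xor5_32(uint32_t a, uint32_t b, uint32_t c,
                                            uint32_t d, uint32_t e) {
    uint32_t t;
    asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(t) : "r"(a), "r"(b), "r"(c));
    asm("lop3.b32 %0, %0, %1, %2, 0x96;" : "+r"(t) : "r"(d), "r"(e));
    return t;
}

// chi: a ^ (~b & c); 0xD2 is the truth table of that function,
// matching the lop3.b32 ..., 0xD2 pairs in the hunk above.
__device__ __forceinline__ uint32_t chi32(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t d;
    asm("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
    return d;
}

// iota: XOR lane (0,0) with the per-round constant; the compiler
// emits this as the ld.global.nc.v2.u32 read-only load from the
// keccak_round_constants table declared earlier in this file.
__device__ __forceinline__ void iota(unsigned long long& lane00,
                                     const unsigned long long* rc, int round) {
    lane00 ^= rc[round];
}
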
- dp4a.u32.u32 %r3644, %r3645, %r5890, %r3640; + shf.l.wrap.b32 %r13179, %r13140, %r13139, %r13202; // end inline asm - ld.const.u32 %r3649, [matrix+1880]; // begin inline asm - dp4a.u32.u32 %r3648, %r3649, %r5894, %r3644; + shf.l.wrap.b32 %r13183, %r13139, %r13140, %r13202; // end inline asm - ld.const.u32 %r3653, [matrix+1884]; + xor.b32 %r13345, %r13179, %r13115; + xor.b32 %r13346, %r13183, %r13116; + xor.b32 %r13232, %r29983, %r13346; + xor.b32 %r13233, %r29982, %r13345; + st.local.v2.u32 [%rd84+120], {%r13233, %r13232}; + xor.b32 %r13224, %r29979, %r13346; + xor.b32 %r13225, %r29978, %r13345; + st.local.v2.u32 [%rd84+200], {%r13225, %r13224}; // begin inline asm - dp4a.u32.u32 %r3652, %r3653, %r5898, %r3648; + shf.l.wrap.b32 %r13187, %r13152, %r13151, %r13202; // end inline asm - ld.const.u32 %r3657, [matrix+1888]; // begin inline asm - dp4a.u32.u32 %r3656, %r3657, %r5902, %r3652; + shf.l.wrap.b32 %r13191, %r13151, %r13152, %r13202; // end inline asm - ld.const.u32 %r3661, [matrix+1892]; + xor.b32 %r13347, %r13187, %r13127; + xor.b32 %r13348, %r13191, %r13128; + xor.b32 %r13256, %r30002, %r13347; + xor.b32 %r13257, %r30003, %r13348; + xor.b32 %r13265, %r29973, %r13348; + xor.b32 %r13264, %r29972, %r13347; + st.local.v2.u32 [%rd84+168], {%r13264, %r13265}; // begin inline asm - dp4a.u32.u32 %r3660, %r3661, %r5906, %r3656; + shf.l.wrap.b32 %r13195, %r13104, %r13103, %r13202; // end inline asm - ld.const.u32 %r3665, [matrix+1896]; // begin inline asm - dp4a.u32.u32 %r3664, %r3665, %r5910, %r3660; + shf.l.wrap.b32 %r13199, %r13103, %r13104, %r13202; // end inline asm - ld.const.u32 %r3669, [matrix+1900]; + xor.b32 %r13349, %r13195, %r13139; + xor.b32 %r13350, %r13199, %r13140; + xor.b32 %r13216, %r29968, %r13349; + xor.b32 %r13217, %r29969, %r13350; + xor.b32 %r13241, %r29963, %r13350; + xor.b32 %r13240, %r29962, %r13349; + st.local.v2.u32 [%rd84+216], {%r13240, %r13241}; // begin inline asm - dp4a.u32.u32 %r3668, %r3669, %r5914, %r3664; + shf.l.wrap.b32 %r13203, %r13209, %r13208, %r12706; // end inline asm - ld.const.u32 %r3673, [matrix+1904]; // begin inline asm - dp4a.u32.u32 %r3672, %r3673, %r5918, %r3668; + shf.l.wrap.b32 %r13207, %r13208, %r13209, %r12706; // end inline asm - ld.const.u32 %r3677, [matrix+1908]; // begin inline asm - dp4a.u32.u32 %r3676, %r3677, %r5922, %r3672; + shf.l.wrap.b32 %r13211, %r13217, %r13216, %r12714; // end inline asm - ld.const.u32 %r3681, [matrix+1912]; // begin inline asm - dp4a.u32.u32 %r3680, %r3681, %r5926, %r3676; + shf.l.wrap.b32 %r13215, %r13216, %r13217, %r12714; // end inline asm - ld.const.u32 %r3685, [matrix+1916]; // begin inline asm - dp4a.u32.u32 %r3684, %r3685, %r5930, %r3680; + shf.l.wrap.b32 %r13223, %r13224, %r13225, %r12722; // end inline asm - shr.u32 %r6091, %r3620, 6; - and.b32 %r3689, %r6091, 240; - shr.u32 %r3690, %r3684, 10; - and.b32 %r3691, %r5995, 255; // begin inline asm - lop3.b32 %r3688, %r3689, %r3690, %r3691, 0x56; + shf.l.wrap.b32 %r13219, %r13225, %r13224, %r12722; // end inline asm - ld.const.u32 %r3693, [matrix+1920]; + st.local.v2.u32 [%rd84+96], {%r13219, %r13223}; // begin inline asm - dp4a.u32.u32 %r3692, %r3693, %r5870, %r6249; + shf.l.wrap.b32 %r13227, %r13233, %r13232, %r12754; // end inline asm - ld.const.u32 %r3697, [matrix+1924]; // begin inline asm - dp4a.u32.u32 %r3696, %r3697, %r5874, %r3692; + shf.l.wrap.b32 %r13231, %r13232, %r13233, %r12754; // end inline asm - ld.const.u32 %r3701, [matrix+1928]; // begin inline asm - dp4a.u32.u32 %r3700, %r3701, %r5878, %r3696; + shf.l.wrap.b32 %r13235, %r13241, 
%r13240, %r12802; // end inline asm - ld.const.u32 %r3705, [matrix+1932]; // begin inline asm - dp4a.u32.u32 %r3704, %r3705, %r5882, %r3700; + shf.l.wrap.b32 %r13239, %r13240, %r13241, %r12802; // end inline asm - ld.const.u32 %r3709, [matrix+1936]; // begin inline asm - dp4a.u32.u32 %r3708, %r3709, %r5886, %r3704; + shf.l.wrap.b32 %r13247, %r13248, %r13249, %r12826; // end inline asm - ld.const.u32 %r3713, [matrix+1940]; // begin inline asm - dp4a.u32.u32 %r3712, %r3713, %r5890, %r3708; + shf.l.wrap.b32 %r13243, %r13249, %r13248, %r12826; // end inline asm - ld.const.u32 %r3717, [matrix+1944]; + st.local.v2.u32 [%rd84+88], {%r13243, %r13247}; // begin inline asm - dp4a.u32.u32 %r3716, %r3717, %r5894, %r3712; + shf.l.wrap.b32 %r13251, %r13257, %r13256, %r12842; // end inline asm - ld.const.u32 %r3721, [matrix+1948]; // begin inline asm - dp4a.u32.u32 %r3720, %r3721, %r5898, %r3716; + shf.l.wrap.b32 %r13255, %r13256, %r13257, %r12842; // end inline asm - ld.const.u32 %r3725, [matrix+1952]; // begin inline asm - dp4a.u32.u32 %r3724, %r3725, %r5902, %r3720; + shf.l.wrap.b32 %r13259, %r13265, %r13264, %r12850; // end inline asm - ld.const.u32 %r3729, [matrix+1956]; // begin inline asm - dp4a.u32.u32 %r3728, %r3729, %r5906, %r3724; + shf.l.wrap.b32 %r13263, %r13264, %r13265, %r12850; // end inline asm - ld.const.u32 %r3733, [matrix+1960]; // begin inline asm - dp4a.u32.u32 %r3732, %r3733, %r5910, %r3728; + shf.l.wrap.b32 %r13267, %r13273, %r13272, %r12882; // end inline asm - ld.const.u32 %r3737, [matrix+1964]; // begin inline asm - dp4a.u32.u32 %r3736, %r3737, %r5914, %r3732; + shf.l.wrap.b32 %r13271, %r13272, %r13273, %r12882; // end inline asm - ld.const.u32 %r3741, [matrix+1968]; // begin inline asm - dp4a.u32.u32 %r3740, %r3741, %r5918, %r3736; + // chi + lop3.b32 %r13275, %r13310, %r13203, %r13227, 0xD2; + lop3.b32 %r13276, %r13313, %r13207, %r13231, 0xD2; // end inline asm - ld.const.u32 %r3745, [matrix+1972]; // begin inline asm - dp4a.u32.u32 %r3744, %r3745, %r5922, %r3740; + // chi + lop3.b32 %r13283, %r13203, %r13227, %r13259, 0xD2; + lop3.b32 %r13284, %r13207, %r13231, %r13263, 0xD2; // end inline asm - ld.const.u32 %r3749, [matrix+1976]; + st.local.v2.u32 [%rd84+32], {%r13283, %r13284}; // begin inline asm - dp4a.u32.u32 %r3748, %r3749, %r5926, %r3744; + // chi + lop3.b32 %r13291, %r13227, %r13259, %r13235, 0xD2; + lop3.b32 %r13292, %r13231, %r13263, %r13239, 0xD2; // end inline asm - ld.const.u32 %r3753, [matrix+1980]; + st.local.v2.u32 [%rd84+40], {%r13291, %r13292}; // begin inline asm - dp4a.u32.u32 %r3752, %r3753, %r5930, %r3748; + // chi + lop3.b32 %r13299, %r13259, %r13235, %r13310, 0xD2; + lop3.b32 %r13300, %r13263, %r13239, %r13313, 0xD2; // end inline asm - ld.const.u32 %r3757, [matrix+1984]; + st.local.v2.u32 [%rd84+48], {%r13299, %r13300}; // begin inline asm - dp4a.u32.u32 %r3756, %r3757, %r5870, %r6249; + // chi + lop3.b32 %r13307, %r13235, %r13310, %r13203, 0xD2; + lop3.b32 %r13308, %r13239, %r13313, %r13207, 0xD2; // end inline asm - ld.const.u32 %r3761, [matrix+1988]; + st.local.v2.u32 [%rd84+56], {%r13307, %r13308}; // begin inline asm - dp4a.u32.u32 %r3760, %r3761, %r5874, %r3756; + // chi + lop3.b32 %r13315, %r13251, %r13211, %r13267, 0xD2; + lop3.b32 %r13316, %r13255, %r13215, %r13271, 0xD2; // end inline asm - ld.const.u32 %r3765, [matrix+1992]; + st.local.v2.u32 [%rd84+64], {%r13315, %r13316}; // begin inline asm - dp4a.u32.u32 %r3764, %r3765, %r5878, %r3760; + // chi + lop3.b32 %r13323, %r13211, %r13267, %r13243, 0xD2; + lop3.b32 %r13324, %r13215, %r13271, 
%r13247, 0xD2; // end inline asm - ld.const.u32 %r3769, [matrix+1996]; + st.local.v2.u32 [%rd84+72], {%r13323, %r13324}; // begin inline asm - dp4a.u32.u32 %r3768, %r3769, %r5882, %r3764; + // chi + lop3.b32 %r13331, %r13267, %r13243, %r13219, 0xD2; + lop3.b32 %r13332, %r13271, %r13247, %r13223, 0xD2; // end inline asm - ld.const.u32 %r3773, [matrix+2000]; + st.local.v2.u32 [%rd84+80], {%r13331, %r13332}; // begin inline asm - dp4a.u32.u32 %r3772, %r3773, %r5886, %r3768; + ld.global.nc.v2.u32 {%r13339,%r13340}, [%rd598]; // end inline asm - ld.const.u32 %r3777, [matrix+2004]; + xor.b32 %r13351, %r13276, %r13340; + xor.b32 %r13352, %r13275, %r13339; + st.local.v2.u32 [%rd84+24], {%r13352, %r13351}; + bra.uni $L__BB2_44; + +$L__BB2_22: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd479, 1179641; + st.local.u64 [%rd3+8], %rd479; + st.local.u32 [%rd3+16], %r30; + ld.global.u64 %rd480, [%rd34]; + ld.global.u64 %rd481, [%rd34+8]; + ld.global.u64 %rd482, [%rd34+16]; + ld.global.u64 %rd483, [%rd34+24]; + ld.global.u64 %rd484, [%rd34+32]; + ld.global.u64 %rd485, [%rd34+40]; + ld.global.u64 %rd486, [%rd34+48]; + ld.global.u64 %rd487, [%rd34+56]; + st.local.u64 [%rd3+24], %rd480; + st.local.u64 [%rd3+32], %rd481; + st.local.u64 [%rd3+40], %rd482; + st.local.u64 [%rd3+48], %rd483; + st.local.u64 [%rd3+56], %rd484; + st.local.u64 [%rd3+64], %rd485; + st.local.u64 [%rd3+72], %rd486; + st.local.u64 [%rd3+80], %rd487; + cvt.u32.u64 %r6826, %rd480; + xor.b32 %r6827, %r30, %r6826; + st.local.u32 [%rd3+24], %r6827; + mov.u32 %r29539, 0; + st.local.v2.u32 [%rd3+96], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+104], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+112], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+120], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+128], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+136], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+144], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+152], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+160], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+168], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+176], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+184], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+192], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+200], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+208], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+216], {%r29539, %r29539}; + mov.u32 %r29554, -2147483648; + mov.u32 %r6799, 1; + st.local.v2.u32 [%rd3+88], {%r6799, %r29554}; + ld.local.v2.u32 {%r29575, %r29576}, [%rd3+24]; + mov.b64 {%r29573, %r29574}, %rd485; + shr.u64 %rd488, %rd481, 32; + cvt.u32.u64 %r29587, %rd481; + cvt.u32.u64 %r29588, %rd488; + shr.u64 %rd489, %rd486, 32; + cvt.u32.u64 %r29585, %rd486; + cvt.u32.u64 %r29586, %rd489; + shr.u64 %rd490, %rd482, 32; + cvt.u32.u64 %r29583, %rd482; + cvt.u32.u64 %r29584, %rd490; + shr.u64 %rd491, %rd487, 32; + cvt.u32.u64 %r29581, %rd487; + cvt.u32.u64 %r29582, %rd491; + shr.u64 %rd492, %rd483, 32; + cvt.u32.u64 %r29579, %rd483; + cvt.u32.u64 %r29580, %rd492; + shr.u64 %rd493, %rd484, 32; + cvt.u32.u64 %r29577, %rd484; + cvt.u32.u64 %r29578, %rd493; + mov.u32 %r29540, %r29539; + mov.u32 %r29541, %r29539; + mov.u32 %r29542, %r29539; + mov.u32 %r29543, %r29539; + mov.u32 %r29544, %r29539; + mov.u32 %r29545, %r29539; + mov.u32 %r29546, %r29539; + mov.u32 %r29547, %r29539; + mov.u32 %r29548, %r29539; + mov.u32 %r29549, %r29539; + mov.u32 %r29550, %r29539; + mov.u32 %r29551, %r29539; + mov.u32 %r29552, %r29539; + mov.u32 %r29553, %r6799; + mov.u32 %r29555, %r29539; + mov.u32 %r29556, %r29539; + mov.u32 %r29557, %r29539; 
+ mov.u32 %r29558, %r29539; + mov.u32 %r29559, %r29539; + mov.u32 %r29560, %r29539; + mov.u32 %r29561, %r29539; + mov.u32 %r29562, %r29539; + mov.u32 %r29563, %r29539; + mov.u32 %r29564, %r29539; + mov.u32 %r29565, %r29539; + mov.u32 %r29566, %r29539; + mov.u32 %r29567, %r29539; + mov.u32 %r29568, %r29539; + mov.u32 %r29569, %r29539; + mov.u32 %r29570, %r29539; + mov.u32 %r29571, %r29539; + mov.u32 %r29572, %r29539; + mov.u32 %r29589, %r29539; + +$L__BB2_23: // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5890, %r3772; + // xor5 + lop3.b32 %r6830, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r6830, %r6830, %r29569, %r29567, 0x96; + lop3.b32 %r6831, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r6831, %r6831, %r29570, %r29568, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6842, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r6842, %r6842, %r29563, %r29561, 0x96; + lop3.b32 %r6843, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r6843, %r6843, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r3781, [matrix+2008]; // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5894, %r3776; + // xor5 + lop3.b32 %r6854, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r6854, %r6854, %r29557, %r29555, 0x96; + lop3.b32 %r6855, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r6855, %r6855, %r29558, %r29556, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6866, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r6866, %r6866, %r29549, %r29547, 0x96; + lop3.b32 %r6867, %r29580, %r29554, %r29552, 0x96; + lop3.b32 %r6867, %r6867, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r3785, [matrix+2012]; // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5898, %r3780; + // xor5 + lop3.b32 %r6878, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r6878, %r6878, %r29541, %r29539, 0x96; + lop3.b32 %r6879, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r6879, %r6879, %r29542, %r29540, 0x96; // end inline asm - ld.const.u32 %r3789, [matrix+2016]; // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5902, %r3784; + shf.l.wrap.b32 %r6890, %r6843, %r6842, %r6799; // end inline asm - ld.const.u32 %r3793, [matrix+2020]; // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5906, %r3788; + shf.l.wrap.b32 %r6894, %r6842, %r6843, %r6799; // end inline asm - ld.const.u32 %r3797, [matrix+2024]; + xor.b32 %r7324, %r6890, %r6878; + xor.b32 %r7325, %r6894, %r6879; + xor.b32 %r7157, %r29575, %r7324; + xor.b32 %r7160, %r29576, %r7325; + xor.b32 %r7064, %r29573, %r7324; + xor.b32 %r7063, %r29574, %r7325; + xor.b32 %r7111, %r29571, %r7324; + xor.b32 %r7112, %r29572, %r7325; + xor.b32 %r7016, %r29569, %r7324; + xor.b32 %r7015, %r29570, %r7325; + xor.b32 %r6967, %r29567, %r7324; + xor.b32 %r6968, %r29568, %r7325; // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5910, %r3792; + shf.l.wrap.b32 %r6898, %r6855, %r6854, %r6799; // end inline asm - ld.const.u32 %r3801, [matrix+2028]; // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5914, %r3796; + shf.l.wrap.b32 %r6902, %r6854, %r6855, %r6799; // end inline asm - ld.const.u32 %r3805, [matrix+2032]; + xor.b32 %r7326, %r6898, %r6830; + xor.b32 %r7327, %r6902, %r6831; + xor.b32 %r7119, %r29587, %r7326; + xor.b32 %r7120, %r29588, %r7327; + xor.b32 %r6936, %r29585, %r7326; + xor.b32 %r6935, %r29586, %r7327; + xor.b32 %r7095, %r29565, %r7326; + xor.b32 %r7096, %r29566, %r7327; + xor.b32 %r7056, %r29563, %r7326; + xor.b32 %r7055, %r29564, %r7327; + xor.b32 %r7039, %r29561, %r7326; + xor.b32 %r7040, %r29562, %r7327; // begin inline asm - dp4a.u32.u32 
%r3804, %r3805, %r5918, %r3800; + shf.l.wrap.b32 %r6906, %r6867, %r6866, %r6799; // end inline asm - ld.const.u32 %r3809, [matrix+2036]; // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5922, %r3804; + shf.l.wrap.b32 %r6910, %r6866, %r6867, %r6799; // end inline asm - ld.const.u32 %r3813, [matrix+2040]; + xor.b32 %r7328, %r6906, %r6842; + xor.b32 %r7329, %r6910, %r6843; + xor.b32 %r6976, %r29583, %r7328; + xor.b32 %r6975, %r29584, %r7329; + xor.b32 %r7103, %r29581, %r7328; + xor.b32 %r7104, %r29582, %r7329; + xor.b32 %r6984, %r29559, %r7328; + xor.b32 %r6983, %r29560, %r7329; + xor.b32 %r7087, %r29557, %r7328; + xor.b32 %r7088, %r29558, %r7329; + xor.b32 %r6952, %r29555, %r7328; + xor.b32 %r6951, %r29556, %r7329; // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5926, %r3808; + shf.l.wrap.b32 %r6914, %r6879, %r6878, %r6799; // end inline asm - ld.const.u32 %r3817, [matrix+2044]; // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5930, %r3812; + shf.l.wrap.b32 %r6918, %r6878, %r6879, %r6799; // end inline asm - shr.u32 %r6092, %r3752, 6; - and.b32 %r3821, %r6092, 240; - shr.u32 %r3822, %r3816, 10; + xor.b32 %r7330, %r6914, %r6854; + xor.b32 %r7331, %r6918, %r6855; + xor.b32 %r7071, %r29579, %r7330; + xor.b32 %r7072, %r29580, %r7331; + xor.b32 %r7048, %r29553, %r7330; + xor.b32 %r7047, %r29554, %r7331; + xor.b32 %r6991, %r29551, %r7330; + xor.b32 %r6992, %r29552, %r7331; + xor.b32 %r7079, %r29549, %r7330; + xor.b32 %r7080, %r29550, %r7331; + xor.b32 %r7008, %r29547, %r7330; + xor.b32 %r7007, %r29548, %r7331; // begin inline asm - lop3.b32 %r3820, %r3821, %r3822, %r12, 0x56; + shf.l.wrap.b32 %r6922, %r6831, %r6830, %r6799; // end inline asm - ld.const.u32 %r3825, [matrix+2048]; // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5870, %r6249; + shf.l.wrap.b32 %r6926, %r6830, %r6831, %r6799; // end inline asm - ld.const.u32 %r3829, [matrix+2052]; + xor.b32 %r7332, %r6922, %r6866; + xor.b32 %r7333, %r6926, %r6867; + xor.b32 %r7023, %r29577, %r7332; + xor.b32 %r7024, %r29578, %r7333; + xor.b32 %r6943, %r29545, %r7332; + xor.b32 %r6944, %r29546, %r7333; + xor.b32 %r6960, %r29543, %r7332; + xor.b32 %r6959, %r29544, %r7333; + xor.b32 %r6999, %r29541, %r7332; + xor.b32 %r7000, %r29542, %r7333; + xor.b32 %r7031, %r29539, %r7332; + xor.b32 %r7032, %r29540, %r7333; + mov.u32 %r6937, 44; // begin inline asm - dp4a.u32.u32 %r3828, %r3829, %r5874, %r3824; + shf.l.wrap.b32 %r6930, %r6936, %r6935, %r6937; // end inline asm - ld.const.u32 %r3833, [matrix+2056]; // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5878, %r3828; + shf.l.wrap.b32 %r6934, %r6935, %r6936, %r6937; // end inline asm - ld.const.u32 %r3837, [matrix+2060]; + mov.u32 %r6945, 20; // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5882, %r3832; + shf.l.wrap.b32 %r6938, %r6944, %r6943, %r6945; // end inline asm - ld.const.u32 %r3841, [matrix+2064]; // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5886, %r3836; + shf.l.wrap.b32 %r6942, %r6943, %r6944, %r6945; // end inline asm - ld.const.u32 %r3845, [matrix+2068]; + mov.u32 %r6953, 61; // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5890, %r3840; + shf.l.wrap.b32 %r6946, %r6952, %r6951, %r6953; // end inline asm - ld.const.u32 %r3849, [matrix+2072]; // begin inline asm - dp4a.u32.u32 %r3848, %r3849, %r5894, %r3844; + shf.l.wrap.b32 %r6950, %r6951, %r6952, %r6953; // end inline asm - ld.const.u32 %r3853, [matrix+2076]; + mov.u32 %r6961, 39; // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5898, %r3848; + shf.l.wrap.b32 %r6954, %r6960, %r6959, %r6961; // end inline 
asm - ld.const.u32 %r3857, [matrix+2080]; // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5902, %r3852; + shf.l.wrap.b32 %r6958, %r6959, %r6960, %r6961; // end inline asm - ld.const.u32 %r3861, [matrix+2084]; + mov.u32 %r6969, 18; // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5906, %r3856; + shf.l.wrap.b32 %r6962, %r6968, %r6967, %r6969; // end inline asm - ld.const.u32 %r3865, [matrix+2088]; // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5910, %r3860; + shf.l.wrap.b32 %r6966, %r6967, %r6968, %r6969; // end inline asm - ld.const.u32 %r3869, [matrix+2092]; + mov.u32 %r6977, 62; // begin inline asm - dp4a.u32.u32 %r3868, %r3869, %r5914, %r3864; + shf.l.wrap.b32 %r6970, %r6976, %r6975, %r6977; // end inline asm - ld.const.u32 %r3873, [matrix+2096]; // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5918, %r3868; + shf.l.wrap.b32 %r6974, %r6975, %r6976, %r6977; // end inline asm - ld.const.u32 %r3877, [matrix+2100]; + mov.u32 %r6985, 43; // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5922, %r3872; + shf.l.wrap.b32 %r6978, %r6984, %r6983, %r6985; // end inline asm - ld.const.u32 %r3881, [matrix+2104]; // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5926, %r3876; + shf.l.wrap.b32 %r6982, %r6983, %r6984, %r6985; // end inline asm - ld.const.u32 %r3885, [matrix+2108]; + mov.u32 %r6993, 25; // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5930, %r3880; + shf.l.wrap.b32 %r6986, %r6992, %r6991, %r6993; // end inline asm - ld.const.u32 %r3889, [matrix+2112]; // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5870, %r6249; + shf.l.wrap.b32 %r6990, %r6991, %r6992, %r6993; // end inline asm - ld.const.u32 %r3893, [matrix+2116]; + mov.u32 %r7001, 8; // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5874, %r3888; + shf.l.wrap.b32 %r6994, %r7000, %r6999, %r7001; // end inline asm - ld.const.u32 %r3897, [matrix+2120]; // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5878, %r3892; + shf.l.wrap.b32 %r6998, %r6999, %r7000, %r7001; // end inline asm - ld.const.u32 %r3901, [matrix+2124]; + mov.u32 %r7009, 56; // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5882, %r3896; + shf.l.wrap.b32 %r7002, %r7008, %r7007, %r7009; // end inline asm - ld.const.u32 %r3905, [matrix+2128]; // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5886, %r3900; + shf.l.wrap.b32 %r7006, %r7007, %r7008, %r7009; // end inline asm - ld.const.u32 %r3909, [matrix+2132]; + mov.u32 %r7017, 41; // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5890, %r3904; + shf.l.wrap.b32 %r7010, %r7016, %r7015, %r7017; // end inline asm - ld.const.u32 %r3913, [matrix+2136]; // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5894, %r3908; + shf.l.wrap.b32 %r7014, %r7015, %r7016, %r7017; // end inline asm - ld.const.u32 %r3917, [matrix+2140]; + mov.u32 %r7025, 27; // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5898, %r3912; + shf.l.wrap.b32 %r7018, %r7024, %r7023, %r7025; // end inline asm - ld.const.u32 %r3921, [matrix+2144]; // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5902, %r3916; + shf.l.wrap.b32 %r7022, %r7023, %r7024, %r7025; // end inline asm - ld.const.u32 %r3925, [matrix+2148]; + mov.u32 %r7033, 14; // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5906, %r3920; + shf.l.wrap.b32 %r7026, %r7032, %r7031, %r7033; // end inline asm - ld.const.u32 %r3929, [matrix+2152]; // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5910, %r3924; + shf.l.wrap.b32 %r7030, %r7031, %r7032, %r7033; // end inline asm - ld.const.u32 %r3933, [matrix+2156]; + mov.u32 %r7041, 2; // begin inline asm - 
dp4a.u32.u32 %r3932, %r3933, %r5914, %r3928; + shf.l.wrap.b32 %r7034, %r7040, %r7039, %r7041; // end inline asm - ld.const.u32 %r3937, [matrix+2160]; // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5918, %r3932; + shf.l.wrap.b32 %r7038, %r7039, %r7040, %r7041; // end inline asm - ld.const.u32 %r3941, [matrix+2164]; + mov.u32 %r7049, 55; // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5922, %r3936; + shf.l.wrap.b32 %r7042, %r7048, %r7047, %r7049; // end inline asm - ld.const.u32 %r3945, [matrix+2168]; // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5926, %r3940; + shf.l.wrap.b32 %r7046, %r7047, %r7048, %r7049; // end inline asm - ld.const.u32 %r3949, [matrix+2172]; + mov.u32 %r7057, 45; // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5930, %r3944; + shf.l.wrap.b32 %r7050, %r7056, %r7055, %r7057; // end inline asm - shr.u32 %r6093, %r3884, 6; - and.b32 %r3953, %r6093, 240; - shr.u32 %r3954, %r3948, 10; - and.b32 %r3955, %r13, 255; // begin inline asm - lop3.b32 %r3952, %r3953, %r3954, %r3955, 0x56; + shf.l.wrap.b32 %r7054, %r7055, %r7056, %r7057; // end inline asm - ld.const.u32 %r3957, [matrix+2176]; + mov.u32 %r7065, 36; // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5870, %r6249; + shf.l.wrap.b32 %r7058, %r7064, %r7063, %r7065; // end inline asm - ld.const.u32 %r3961, [matrix+2180]; // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5874, %r3956; + shf.l.wrap.b32 %r7062, %r7063, %r7064, %r7065; // end inline asm - ld.const.u32 %r3965, [matrix+2184]; + mov.u32 %r7073, 28; // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5878, %r3960; + shf.l.wrap.b32 %r7066, %r7072, %r7071, %r7073; // end inline asm - ld.const.u32 %r3969, [matrix+2188]; // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5882, %r3964; + shf.l.wrap.b32 %r7070, %r7071, %r7072, %r7073; // end inline asm - ld.const.u32 %r3973, [matrix+2192]; + mov.u32 %r7081, 21; // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5886, %r3968; + shf.l.wrap.b32 %r7074, %r7080, %r7079, %r7081; // end inline asm - ld.const.u32 %r3977, [matrix+2196]; // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5890, %r3972; + shf.l.wrap.b32 %r7078, %r7079, %r7080, %r7081; // end inline asm - ld.const.u32 %r3981, [matrix+2200]; + mov.u32 %r7089, 15; // begin inline asm - dp4a.u32.u32 %r3980, %r3981, %r5894, %r3976; + shf.l.wrap.b32 %r7082, %r7088, %r7087, %r7089; // end inline asm - ld.const.u32 %r3985, [matrix+2204]; // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5898, %r3980; + shf.l.wrap.b32 %r7086, %r7087, %r7088, %r7089; // end inline asm - ld.const.u32 %r3989, [matrix+2208]; + mov.u32 %r7097, 10; // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5902, %r3984; + shf.l.wrap.b32 %r7090, %r7096, %r7095, %r7097; // end inline asm - ld.const.u32 %r3993, [matrix+2212]; // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5906, %r3988; + shf.l.wrap.b32 %r7094, %r7095, %r7096, %r7097; // end inline asm - ld.const.u32 %r3997, [matrix+2216]; + mov.u32 %r7105, 6; // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5910, %r3992; + shf.l.wrap.b32 %r7098, %r7104, %r7103, %r7105; // end inline asm - ld.const.u32 %r4001, [matrix+2220]; // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5914, %r3996; + shf.l.wrap.b32 %r7102, %r7103, %r7104, %r7105; // end inline asm - ld.const.u32 %r4005, [matrix+2224]; + mov.u32 %r7113, 3; // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5918, %r4000; + shf.l.wrap.b32 %r7106, %r7112, %r7111, %r7113; // end inline asm - ld.const.u32 %r4009, [matrix+2228]; // begin inline asm - 
dp4a.u32.u32 %r4008, %r4009, %r5922, %r4004; + shf.l.wrap.b32 %r7110, %r7111, %r7112, %r7113; // end inline asm - ld.const.u32 %r4013, [matrix+2232]; // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5926, %r4008; + shf.l.wrap.b32 %r7114, %r7120, %r7119, %r6799; // end inline asm - ld.const.u32 %r4017, [matrix+2236]; // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5930, %r4012; + shf.l.wrap.b32 %r7118, %r7119, %r7120, %r6799; // end inline asm - ld.const.u32 %r4021, [matrix+2240]; // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5870, %r6249; + // chi + lop3.b32 %r7122, %r7157, %r6930, %r6978, 0xD2; + lop3.b32 %r7123, %r7160, %r6934, %r6982, 0xD2; // end inline asm - ld.const.u32 %r4025, [matrix+2244]; // begin inline asm - dp4a.u32.u32 %r4024, %r4025, %r5874, %r4020; + // chi + lop3.b32 %r29587, %r6930, %r6978, %r7074, 0xD2; + lop3.b32 %r29588, %r6934, %r6982, %r7078, 0xD2; // end inline asm - ld.const.u32 %r4029, [matrix+2248]; // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5878, %r4024; + // chi + lop3.b32 %r29583, %r6978, %r7074, %r7026, 0xD2; + lop3.b32 %r29584, %r6982, %r7078, %r7030, 0xD2; // end inline asm - ld.const.u32 %r4033, [matrix+2252]; // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5882, %r4028; + // chi + lop3.b32 %r29579, %r7074, %r7026, %r7157, 0xD2; + lop3.b32 %r29580, %r7078, %r7030, %r7160, 0xD2; // end inline asm - ld.const.u32 %r4037, [matrix+2256]; // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5886, %r4032; + // chi + lop3.b32 %r29577, %r7026, %r7157, %r6930, 0xD2; + lop3.b32 %r29578, %r7030, %r7160, %r6934, 0xD2; // end inline asm - ld.const.u32 %r4041, [matrix+2260]; // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5890, %r4036; + // chi + lop3.b32 %r29573, %r7066, %r6938, %r7106, 0xD2; + lop3.b32 %r29574, %r7070, %r6942, %r7110, 0xD2; // end inline asm - ld.const.u32 %r4045, [matrix+2264]; // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5894, %r4040; + // chi + lop3.b32 %r29585, %r6938, %r7106, %r7050, 0xD2; + lop3.b32 %r29586, %r6942, %r7110, %r7054, 0xD2; // end inline asm - ld.const.u32 %r4049, [matrix+2268]; // begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5898, %r4044; + // chi + lop3.b32 %r29581, %r7106, %r7050, %r6946, 0xD2; + lop3.b32 %r29582, %r7110, %r7054, %r6950, 0xD2; // end inline asm - ld.const.u32 %r4053, [matrix+2272]; // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5902, %r4048; + // chi + lop3.b32 %r29553, %r7050, %r6946, %r7066, 0xD2; + lop3.b32 %r29554, %r7054, %r6950, %r7070, 0xD2; // end inline asm - ld.const.u32 %r4057, [matrix+2276]; + st.local.v2.u32 [%rd3+88], {%r29553, %r29554}; // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5906, %r4052; + // chi + lop3.b32 %r29545, %r6946, %r7066, %r6938, 0xD2; + lop3.b32 %r29546, %r6950, %r7070, %r6942, 0xD2; // end inline asm - ld.const.u32 %r4061, [matrix+2280]; + st.local.v2.u32 [%rd3+96], {%r29545, %r29546}; // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5910, %r4056; + // chi + lop3.b32 %r29571, %r7114, %r7098, %r6986, 0xD2; + lop3.b32 %r29572, %r7118, %r7102, %r6990, 0xD2; // end inline asm - ld.const.u32 %r4065, [matrix+2284]; + st.local.v2.u32 [%rd3+104], {%r29571, %r29572}; // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5914, %r4060; + // chi + lop3.b32 %r29565, %r7098, %r6986, %r6994, 0xD2; + lop3.b32 %r29566, %r7102, %r6990, %r6998, 0xD2; // end inline asm - ld.const.u32 %r4069, [matrix+2288]; + st.local.v2.u32 [%rd3+112], {%r29565, %r29566}; // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5918, %r4064; + // chi + 
lop3.b32 %r29559, %r6986, %r6994, %r6962, 0xD2; + lop3.b32 %r29560, %r6990, %r6998, %r6966, 0xD2; // end inline asm - ld.const.u32 %r4073, [matrix+2292]; + st.local.v2.u32 [%rd3+120], {%r29559, %r29560}; // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5922, %r4068; + // chi + lop3.b32 %r29551, %r6994, %r6962, %r7114, 0xD2; + lop3.b32 %r29552, %r6998, %r6966, %r7118, 0xD2; // end inline asm - ld.const.u32 %r4077, [matrix+2296]; + st.local.v2.u32 [%rd3+128], {%r29551, %r29552}; // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5926, %r4072; + // chi + lop3.b32 %r29543, %r6962, %r7114, %r7098, 0xD2; + lop3.b32 %r29544, %r6966, %r7118, %r7102, 0xD2; // end inline asm - ld.const.u32 %r4081, [matrix+2300]; + st.local.v2.u32 [%rd3+136], {%r29543, %r29544}; // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5930, %r4076; + // chi + lop3.b32 %r29569, %r7018, %r7058, %r7090, 0xD2; + lop3.b32 %r29570, %r7022, %r7062, %r7094, 0xD2; // end inline asm - shr.u32 %r6094, %r4016, 6; - and.b32 %r4085, %r6094, 240; - shr.u32 %r4086, %r4080, 10; - bfe.u32 %r4087, %r13, 8, 8; + st.local.v2.u32 [%rd3+144], {%r29569, %r29570}; // begin inline asm - lop3.b32 %r4084, %r4085, %r4086, %r4087, 0x56; + // chi + lop3.b32 %r29563, %r7058, %r7090, %r7082, 0xD2; + lop3.b32 %r29564, %r7062, %r7094, %r7086, 0xD2; // end inline asm - ld.const.u32 %r4089, [matrix+2304]; + st.local.v2.u32 [%rd3+152], {%r29563, %r29564}; // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5870, %r6249; + // chi + lop3.b32 %r29557, %r7090, %r7082, %r7002, 0xD2; + lop3.b32 %r29558, %r7094, %r7086, %r7006, 0xD2; // end inline asm - ld.const.u32 %r4093, [matrix+2308]; + st.local.v2.u32 [%rd3+160], {%r29557, %r29558}; // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5874, %r4088; + // chi + lop3.b32 %r29549, %r7082, %r7002, %r7018, 0xD2; + lop3.b32 %r29550, %r7086, %r7006, %r7022, 0xD2; // end inline asm - ld.const.u32 %r4097, [matrix+2312]; + st.local.v2.u32 [%rd3+168], {%r29549, %r29550}; // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5878, %r4092; + // chi + lop3.b32 %r29541, %r7002, %r7018, %r7058, 0xD2; + lop3.b32 %r29542, %r7006, %r7022, %r7062, 0xD2; // end inline asm - ld.const.u32 %r4101, [matrix+2316]; + st.local.v2.u32 [%rd3+176], {%r29541, %r29542}; // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5882, %r4096; + // chi + lop3.b32 %r29567, %r6970, %r7042, %r6954, 0xD2; + lop3.b32 %r29568, %r6974, %r7046, %r6958, 0xD2; // end inline asm - ld.const.u32 %r4105, [matrix+2320]; + st.local.v2.u32 [%rd3+184], {%r29567, %r29568}; // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5886, %r4100; + // chi + lop3.b32 %r29561, %r7042, %r6954, %r7010, 0xD2; + lop3.b32 %r29562, %r7046, %r6958, %r7014, 0xD2; // end inline asm - ld.const.u32 %r4109, [matrix+2324]; + st.local.v2.u32 [%rd3+192], {%r29561, %r29562}; // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5890, %r4104; + // chi + lop3.b32 %r29555, %r6954, %r7010, %r7034, 0xD2; + lop3.b32 %r29556, %r6958, %r7014, %r7038, 0xD2; // end inline asm - ld.const.u32 %r4113, [matrix+2328]; + st.local.v2.u32 [%rd3+200], {%r29555, %r29556}; // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5894, %r4108; + // chi + lop3.b32 %r29547, %r7010, %r7034, %r6970, 0xD2; + lop3.b32 %r29548, %r7014, %r7038, %r6974, 0xD2; // end inline asm - ld.const.u32 %r4117, [matrix+2332]; + st.local.v2.u32 [%rd3+208], {%r29547, %r29548}; // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5898, %r4112; + // chi + lop3.b32 %r29539, %r7034, %r6970, %r7042, 0xD2; + lop3.b32 %r29540, %r7038, %r6974, 
%r7046, 0xD2; // end inline asm - ld.const.u32 %r4121, [matrix+2336]; + st.local.v2.u32 [%rd3+216], {%r29539, %r29540}; + mul.wide.s32 %rd495, %r29589, 8; + mov.u64 %rd496, keccak_round_constants; + cvta.const.u64 %rd497, %rd496; + add.s64 %rd494, %rd497, %rd495; // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5902, %r4116; + ld.global.nc.v2.u32 {%r7322,%r7323}, [%rd494]; // end inline asm - ld.const.u32 %r4125, [matrix+2340]; + xor.b32 %r29575, %r7122, %r7322; + xor.b32 %r29576, %r7123, %r7323; + add.s32 %r29589, %r29589, 1; + setp.lt.u32 %p18, %r29589, 23; + @%p18 bra $L__BB2_23; + + add.u64 %rd55, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r29587, %r29588}; + st.local.v2.u32 [%rd3+72], {%r29585, %r29586}; + st.local.v2.u32 [%rd3+40], {%r29583, %r29584}; + st.local.v2.u32 [%rd3+80], {%r29581, %r29582}; + st.local.v2.u32 [%rd3+48], {%r29579, %r29580}; + st.local.v2.u32 [%rd3+56], {%r29577, %r29578}; + st.local.v2.u32 [%rd3+24], {%r29575, %r29576}; // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5906, %r4120; + // xor5 + lop3.b32 %r7334, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r7334, %r7334, %r29569, %r29567, 0x96; + lop3.b32 %r7335, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r7335, %r7335, %r29570, %r29568, 0x96; // end inline asm - ld.const.u32 %r4129, [matrix+2344]; // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5910, %r4124; + // xor5 + lop3.b32 %r7346, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r7346, %r7346, %r29563, %r29561, 0x96; + lop3.b32 %r7347, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r7347, %r7347, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r4133, [matrix+2348]; // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5914, %r4128; + // xor5 + lop3.b32 %r7358, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r7358, %r7358, %r29557, %r29555, 0x96; + lop3.b32 %r7359, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r7359, %r7359, %r29558, %r29556, 0x96; // end inline asm - ld.const.u32 %r4137, [matrix+2352]; // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5918, %r4132; + // xor5 + lop3.b32 %r7370, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r7370, %r7370, %r29549, %r29547, 0x96; + lop3.b32 %r7371, %r29580, %r29554, %r29552, 0x96; + lop3.b32 %r7371, %r7371, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r4141, [matrix+2356]; // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5922, %r4136; + // xor5 + lop3.b32 %r7382, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r7382, %r7382, %r29541, %r29539, 0x96; + lop3.b32 %r7383, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r7383, %r7383, %r29542, %r29540, 0x96; // end inline asm - ld.const.u32 %r4145, [matrix+2360]; + mov.u32 %r7586, 1; // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5926, %r4140; + shf.l.wrap.b32 %r7394, %r7347, %r7346, %r7586; // end inline asm - ld.const.u32 %r4149, [matrix+2364]; // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5930, %r4144; + shf.l.wrap.b32 %r7398, %r7346, %r7347, %r7586; // end inline asm - ld.const.u32 %r4153, [matrix+2368]; + xor.b32 %r7613, %r7394, %r7382; + xor.b32 %r7614, %r7398, %r7383; + xor.b32 %r7541, %r29575, %r7613; + xor.b32 %r7544, %r29576, %r7614; + xor.b32 %r7504, %r29572, %r7614; + xor.b32 %r7503, %r29571, %r7613; + st.local.v2.u32 [%rd3+104], {%r7503, %r7504}; // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5870, %r6249; + shf.l.wrap.b32 %r7402, %r7359, %r7358, %r7586; // end inline asm - ld.const.u32 %r4157, [matrix+2372]; // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5874, %r4152; + shf.l.wrap.b32 %r7406, 
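[Note on the hunks above] The lop3.b32 instructions tagged "// chi" replace the old dp4a matrix rows with the chi step of the Keccak-f[1600] permutation. LOP3 evaluates an arbitrary three-input boolean function given as an 8-bit truth table, and 0xD2 is the table of f(a,b,c) = a ^ (~b & c); each lop3 pair applies chi to the low and high 32-bit halves of one 64-bit lane. A minimal CUDA sketch of the same operation — keccak_chi_row is a hypothetical helper, not a function from this codebase:

#include <cstdint>

// Chi step of Keccak-f[1600] on one 5-lane row: a[x] ^= ~a[x+1] & a[x+2]
// (indices mod 5). nvcc fuses each ^/~/& triple below into a single
// LOP3 with immediate 0xD2, which is exactly what the PTX above shows
// per 32-bit half.
__device__ __forceinline__ void keccak_chi_row(uint64_t a[5]) {
    uint64_t t0 = a[0], t1 = a[1];
    a[0] ^= ~a[1] & a[2];
    a[1] ^= ~a[2] & a[3];
    a[2] ^= ~a[3] & a[4];
    a[3] ^= ~a[4] & t0;
    a[4] ^= ~t0  & t1;
}

The st.local.v2.u32 stores interleaved with the chi pairs spill the updated state lanes back to the local Keccak state buffer between rounds.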
%r7358, %r7359, %r7586; // end inline asm - ld.const.u32 %r4161, [matrix+2376]; + xor.b32 %r7615, %r7402, %r7334; + xor.b32 %r7616, %r7406, %r7335; + xor.b32 %r7440, %r29585, %r7615; + xor.b32 %r7439, %r29586, %r7616; + xor.b32 %r7479, %r29564, %r7616; + xor.b32 %r7480, %r29563, %r7615; + st.local.v2.u32 [%rd3+152], {%r7480, %r7479}; // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5878, %r4156; + shf.l.wrap.b32 %r7410, %r7371, %r7370, %r7586; // end inline asm - ld.const.u32 %r4165, [matrix+2380]; // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5882, %r4160; + shf.l.wrap.b32 %r7414, %r7370, %r7371, %r7586; // end inline asm - ld.const.u32 %r4169, [matrix+2384]; + xor.b32 %r7617, %r7410, %r7346; + xor.b32 %r7618, %r7414, %r7347; + xor.b32 %r7463, %r29560, %r7618; + xor.b32 %r7464, %r29559, %r7617; + st.local.v2.u32 [%rd3+120], {%r7464, %r7463}; + xor.b32 %r7455, %r29556, %r7618; + xor.b32 %r7456, %r29555, %r7617; + st.local.v2.u32 [%rd3+200], {%r7456, %r7455}; // begin inline asm - dp4a.u32.u32 %r4168, %r4169, %r5886, %r4164; + shf.l.wrap.b32 %r7418, %r7383, %r7382, %r7586; // end inline asm - ld.const.u32 %r4173, [matrix+2388]; // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5890, %r4168; + shf.l.wrap.b32 %r7422, %r7382, %r7383, %r7586; // end inline asm - ld.const.u32 %r4177, [matrix+2392]; + xor.b32 %r7619, %r7418, %r7358; + xor.b32 %r7620, %r7422, %r7359; + xor.b32 %r7487, %r29579, %r7619; + xor.b32 %r7488, %r29580, %r7620; + xor.b32 %r7496, %r29550, %r7620; + xor.b32 %r7495, %r29549, %r7619; + st.local.v2.u32 [%rd3+168], {%r7495, %r7496}; // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5894, %r4172; + shf.l.wrap.b32 %r7426, %r7335, %r7334, %r7586; // end inline asm - ld.const.u32 %r4181, [matrix+2396]; // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5898, %r4176; + shf.l.wrap.b32 %r7430, %r7334, %r7335, %r7586; // end inline asm - ld.const.u32 %r4185, [matrix+2400]; + xor.b32 %r7621, %r7426, %r7370; + xor.b32 %r7622, %r7430, %r7371; + xor.b32 %r7447, %r29545, %r7621; + xor.b32 %r7448, %r29546, %r7622; + xor.b32 %r7472, %r29540, %r7622; + xor.b32 %r7471, %r29539, %r7621; + st.local.v2.u32 [%rd3+216], {%r7471, %r7472}; // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5902, %r4180; + shf.l.wrap.b32 %r7434, %r7440, %r7439, %r6937; // end inline asm - ld.const.u32 %r4189, [matrix+2404]; // begin inline asm - dp4a.u32.u32 %r4188, %r4189, %r5906, %r4184; + shf.l.wrap.b32 %r7438, %r7439, %r7440, %r6937; // end inline asm - ld.const.u32 %r4193, [matrix+2408]; // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5910, %r4188; + shf.l.wrap.b32 %r7442, %r7448, %r7447, %r6945; // end inline asm - ld.const.u32 %r4197, [matrix+2412]; // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5914, %r4192; + shf.l.wrap.b32 %r7446, %r7447, %r7448, %r6945; // end inline asm - ld.const.u32 %r4201, [matrix+2416]; // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5918, %r4196; + shf.l.wrap.b32 %r7454, %r7455, %r7456, %r6953; // end inline asm - ld.const.u32 %r4205, [matrix+2420]; // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5922, %r4200; + shf.l.wrap.b32 %r7450, %r7456, %r7455, %r6953; // end inline asm - ld.const.u32 %r4209, [matrix+2424]; + st.local.v2.u32 [%rd3+96], {%r7450, %r7454}; // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5926, %r4204; + shf.l.wrap.b32 %r7458, %r7464, %r7463, %r6985; // end inline asm - ld.const.u32 %r4213, [matrix+2428]; // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5930, %r4208; + shf.l.wrap.b32 %r7462, %r7463, %r7464, %r6985; // 
end inline asm - shr.u32 %r6095, %r4148, 6; - and.b32 %r4217, %r6095, 240; - shr.u32 %r4218, %r4212, 10; - bfe.u32 %r4219, %r13, 16, 8; // begin inline asm - lop3.b32 %r4216, %r4217, %r4218, %r4219, 0x56; + shf.l.wrap.b32 %r7466, %r7472, %r7471, %r7033; // end inline asm - ld.const.u32 %r4221, [matrix+2432]; // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5870, %r6249; + shf.l.wrap.b32 %r7470, %r7471, %r7472, %r7033; // end inline asm - ld.const.u32 %r4225, [matrix+2436]; // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5874, %r4220; + shf.l.wrap.b32 %r7478, %r7479, %r7480, %r7057; // end inline asm - ld.const.u32 %r4229, [matrix+2440]; // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5878, %r4224; + shf.l.wrap.b32 %r7474, %r7480, %r7479, %r7057; // end inline asm - ld.const.u32 %r4233, [matrix+2444]; + st.local.v2.u32 [%rd3+88], {%r7474, %r7478}; // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5882, %r4228; + shf.l.wrap.b32 %r7482, %r7488, %r7487, %r7073; // end inline asm - ld.const.u32 %r4237, [matrix+2448]; // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5886, %r4232; + shf.l.wrap.b32 %r7486, %r7487, %r7488, %r7073; // end inline asm - ld.const.u32 %r4241, [matrix+2452]; // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5890, %r4236; + shf.l.wrap.b32 %r7490, %r7496, %r7495, %r7081; // end inline asm - ld.const.u32 %r4245, [matrix+2456]; // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5894, %r4240; + shf.l.wrap.b32 %r7494, %r7495, %r7496, %r7081; // end inline asm - ld.const.u32 %r4249, [matrix+2460]; // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5898, %r4244; + shf.l.wrap.b32 %r7498, %r7504, %r7503, %r7113; // end inline asm - ld.const.u32 %r4253, [matrix+2464]; // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5902, %r4248; + shf.l.wrap.b32 %r7502, %r7503, %r7504, %r7113; // end inline asm - ld.const.u32 %r4257, [matrix+2468]; // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5906, %r4252; + // chi + lop3.b32 %r7506, %r7541, %r7434, %r7458, 0xD2; + lop3.b32 %r7507, %r7544, %r7438, %r7462, 0xD2; // end inline asm - ld.const.u32 %r4261, [matrix+2472]; // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5910, %r4256; + // chi + lop3.b32 %r29722, %r7434, %r7458, %r7490, 0xD2; + lop3.b32 %r29723, %r7438, %r7462, %r7494, 0xD2; // end inline asm - ld.const.u32 %r4265, [matrix+2476]; + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5914, %r4260; + // chi + lop3.b32 %r29718, %r7458, %r7490, %r7466, 0xD2; + lop3.b32 %r29719, %r7462, %r7494, %r7470, 0xD2; // end inline asm - ld.const.u32 %r4269, [matrix+2480]; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5918, %r4264; + // chi + lop3.b32 %r29714, %r7490, %r7466, %r7541, 0xD2; + lop3.b32 %r29715, %r7494, %r7470, %r7544, 0xD2; // end inline asm - ld.const.u32 %r4273, [matrix+2484]; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5922, %r4268; + // chi + lop3.b32 %r29712, %r7466, %r7541, %r7434, 0xD2; + lop3.b32 %r29713, %r7470, %r7544, %r7438, 0xD2; // end inline asm - ld.const.u32 %r4277, [matrix+2488]; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; // begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5926, %r4272; + // chi + lop3.b32 %r29708, %r7482, %r7442, %r7498, 0xD2; + lop3.b32 %r29709, %r7486, %r7446, %r7502, 0xD2; // end inline asm - ld.const.u32 %r4281, [matrix+2492]; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; // begin 
inline asm - dp4a.u32.u32 %r4280, %r4281, %r5930, %r4276; + // chi + lop3.b32 %r29720, %r7442, %r7498, %r7474, 0xD2; + lop3.b32 %r29721, %r7446, %r7502, %r7478, 0xD2; // end inline asm - ld.const.u32 %r4285, [matrix+2496]; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5870, %r6249; + // chi + lop3.b32 %r29716, %r7498, %r7474, %r7450, 0xD2; + lop3.b32 %r29717, %r7502, %r7478, %r7454, 0xD2; // end inline asm - ld.const.u32 %r4289, [matrix+2500]; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + add.s64 %rd498, %rd497, 184; // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5874, %r4284; + ld.global.nc.v2.u32 {%r7570,%r7571}, [%rd498]; // end inline asm - ld.const.u32 %r4293, [matrix+2504]; + xor.b32 %r29710, %r7506, %r7570; + xor.b32 %r29711, %r7507, %r7571; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.u64 [%rd55], %rd354; + mov.u64 %rd502, 1179641; + st.local.u64 [%rd55+8], %rd502; + add.s32 %r226, %r30, 1; + st.local.u32 [%rd55+16], %r226; + ld.global.u64 %rd503, [%rd35]; + ld.global.u64 %rd504, [%rd35+8]; + ld.global.u64 %rd505, [%rd35+16]; + ld.global.u64 %rd506, [%rd35+24]; + ld.global.u64 %rd507, [%rd35+32]; + ld.global.u64 %rd508, [%rd35+40]; + ld.global.u64 %rd509, [%rd35+48]; + ld.global.u64 %rd510, [%rd35+56]; + st.local.u64 [%rd55+32], %rd504; + st.local.u64 [%rd55+40], %rd505; + st.local.u64 [%rd55+48], %rd506; + st.local.u64 [%rd55+56], %rd507; + st.local.u64 [%rd55+64], %rd508; + st.local.u64 [%rd55+72], %rd509; + st.local.u64 [%rd55+80], %rd510; + cvt.u32.u64 %r7623, %rd503; + xor.b32 %r7624, %r226, %r7623; + st.local.u64 [%rd55+24], %rd503; + st.local.u32 [%rd55+24], %r7624; + mov.u32 %r29590, 0; + st.local.v2.u32 [%rd55+96], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+104], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+112], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+120], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+128], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+136], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+144], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+152], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+160], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+168], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+176], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+184], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+192], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+200], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+208], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+216], {%r29590, %r29590}; + mov.u32 %r29605, -2147483648; + st.local.v2.u32 [%rd55+88], {%r7586, %r29605}; + ld.local.v2.u32 {%r29626, %r29627}, [%rd55+24]; + mov.b64 {%r29624, %r29625}, %rd508; + shr.u64 %rd511, %rd504, 32; + cvt.u32.u64 %r29638, %rd504; + cvt.u32.u64 %r29639, %rd511; + shr.u64 %rd512, %rd509, 32; + cvt.u32.u64 %r29636, %rd509; + cvt.u32.u64 %r29637, %rd512; + shr.u64 %rd513, %rd505, 32; + cvt.u32.u64 %r29634, %rd505; + cvt.u32.u64 %r29635, %rd513; + shr.u64 %rd514, %rd510, 32; + cvt.u32.u64 %r29632, %rd510; + cvt.u32.u64 %r29633, %rd514; + shr.u64 %rd515, %rd506, 32; + cvt.u32.u64 %r29630, %rd506; + cvt.u32.u64 %r29631, %rd515; + shr.u64 %rd516, %rd507, 32; + cvt.u32.u64 %r29628, %rd507; + cvt.u32.u64 %r29629, %rd516; + mov.u32 %r29591, %r29590; + mov.u32 %r29592, %r29590; + mov.u32 %r29593, %r29590; + mov.u32 %r29594, %r29590; + mov.u32 %r29595, %r29590; + mov.u32 %r29596, %r29590; + mov.u32 %r29597, %r29590; + mov.u32 %r29598, %r29590; + mov.u32 %r29599, %r29590; + mov.u32 %r29600, %r29590; + mov.u32 %r29601, 
%r29590; + mov.u32 %r29602, %r29590; + mov.u32 %r29603, %r29590; + mov.u32 %r29604, %r7586; + mov.u32 %r29606, %r29590; + mov.u32 %r29607, %r29590; + mov.u32 %r29608, %r29590; + mov.u32 %r29609, %r29590; + mov.u32 %r29610, %r29590; + mov.u32 %r29611, %r29590; + mov.u32 %r29612, %r29590; + mov.u32 %r29613, %r29590; + mov.u32 %r29614, %r29590; + mov.u32 %r29615, %r29590; + mov.u32 %r29616, %r29590; + mov.u32 %r29617, %r29590; + mov.u32 %r29618, %r29590; + mov.u32 %r29619, %r29590; + mov.u32 %r29620, %r29590; + mov.u32 %r29621, %r29590; + mov.u32 %r29622, %r29590; + mov.u32 %r29623, %r29590; + mov.u32 %r29640, %r29590; + +$L__BB2_25: // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5878, %r4288; + // xor5 + lop3.b32 %r7627, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r7627, %r7627, %r29620, %r29618, 0x96; + lop3.b32 %r7628, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r7628, %r7628, %r29621, %r29619, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7639, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r7639, %r7639, %r29614, %r29612, 0x96; + lop3.b32 %r7640, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r7640, %r7640, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4297, [matrix+2508]; // begin inline asm - dp4a.u32.u32 %r4296, %r4297, %r5882, %r4292; + // xor5 + lop3.b32 %r7651, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r7651, %r7651, %r29608, %r29606, 0x96; + lop3.b32 %r7652, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r7652, %r7652, %r29609, %r29607, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7663, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r7663, %r7663, %r29600, %r29598, 0x96; + lop3.b32 %r7664, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r7664, %r7664, %r29601, %r29599, 0x96; // end inline asm - ld.const.u32 %r4301, [matrix+2512]; // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5886, %r4296; + // xor5 + lop3.b32 %r7675, %r29628, %r29596, %r29594, 0x96; + lop3.b32 %r7675, %r7675, %r29592, %r29590, 0x96; + lop3.b32 %r7676, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r7676, %r7676, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4305, [matrix+2516]; // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5890, %r4300; + shf.l.wrap.b32 %r7687, %r7640, %r7639, %r7586; // end inline asm - ld.const.u32 %r4309, [matrix+2520]; // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5894, %r4304; + shf.l.wrap.b32 %r7691, %r7639, %r7640, %r7586; // end inline asm - ld.const.u32 %r4313, [matrix+2524]; + xor.b32 %r8121, %r7687, %r7675; + xor.b32 %r8122, %r7691, %r7676; + xor.b32 %r7954, %r29626, %r8121; + xor.b32 %r7957, %r29627, %r8122; + xor.b32 %r7861, %r29624, %r8121; + xor.b32 %r7860, %r29625, %r8122; + xor.b32 %r7908, %r29622, %r8121; + xor.b32 %r7909, %r29623, %r8122; + xor.b32 %r7813, %r29620, %r8121; + xor.b32 %r7812, %r29621, %r8122; + xor.b32 %r7764, %r29618, %r8121; + xor.b32 %r7765, %r29619, %r8122; // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5898, %r4308; + shf.l.wrap.b32 %r7695, %r7652, %r7651, %r7586; // end inline asm - ld.const.u32 %r4317, [matrix+2528]; // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5902, %r4312; + shf.l.wrap.b32 %r7699, %r7651, %r7652, %r7586; // end inline asm - ld.const.u32 %r4321, [matrix+2532]; + xor.b32 %r8123, %r7695, %r7627; + xor.b32 %r8124, %r7699, %r7628; + xor.b32 %r7916, %r29638, %r8123; + xor.b32 %r7917, %r29639, %r8124; + xor.b32 %r7733, %r29636, %r8123; + xor.b32 %r7732, %r29637, %r8124; + xor.b32 %r7892, %r29616, %r8123; + xor.b32 %r7893, %r29617, 
%r8124; + xor.b32 %r7853, %r29614, %r8123; + xor.b32 %r7852, %r29615, %r8124; + xor.b32 %r7836, %r29612, %r8123; + xor.b32 %r7837, %r29613, %r8124; // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5906, %r4316; + shf.l.wrap.b32 %r7703, %r7664, %r7663, %r7586; // end inline asm - ld.const.u32 %r4325, [matrix+2536]; // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5910, %r4320; + shf.l.wrap.b32 %r7707, %r7663, %r7664, %r7586; // end inline asm - ld.const.u32 %r4329, [matrix+2540]; + xor.b32 %r8125, %r7703, %r7639; + xor.b32 %r8126, %r7707, %r7640; + xor.b32 %r7773, %r29634, %r8125; + xor.b32 %r7772, %r29635, %r8126; + xor.b32 %r7900, %r29632, %r8125; + xor.b32 %r7901, %r29633, %r8126; + xor.b32 %r7781, %r29610, %r8125; + xor.b32 %r7780, %r29611, %r8126; + xor.b32 %r7884, %r29608, %r8125; + xor.b32 %r7885, %r29609, %r8126; + xor.b32 %r7749, %r29606, %r8125; + xor.b32 %r7748, %r29607, %r8126; // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5914, %r4324; + shf.l.wrap.b32 %r7711, %r7676, %r7675, %r7586; // end inline asm - ld.const.u32 %r4333, [matrix+2544]; // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5918, %r4328; + shf.l.wrap.b32 %r7715, %r7675, %r7676, %r7586; // end inline asm - ld.const.u32 %r4337, [matrix+2548]; + xor.b32 %r8127, %r7711, %r7651; + xor.b32 %r8128, %r7715, %r7652; + xor.b32 %r7868, %r29630, %r8127; + xor.b32 %r7869, %r29631, %r8128; + xor.b32 %r7845, %r29604, %r8127; + xor.b32 %r7844, %r29605, %r8128; + xor.b32 %r7788, %r29602, %r8127; + xor.b32 %r7789, %r29603, %r8128; + xor.b32 %r7876, %r29600, %r8127; + xor.b32 %r7877, %r29601, %r8128; + xor.b32 %r7805, %r29598, %r8127; + xor.b32 %r7804, %r29599, %r8128; // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5922, %r4332; + shf.l.wrap.b32 %r7719, %r7628, %r7627, %r7586; // end inline asm - ld.const.u32 %r4341, [matrix+2552]; // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5926, %r4336; + shf.l.wrap.b32 %r7723, %r7627, %r7628, %r7586; // end inline asm - ld.const.u32 %r4345, [matrix+2556]; + xor.b32 %r8129, %r7719, %r7663; + xor.b32 %r8130, %r7723, %r7664; + xor.b32 %r7820, %r29628, %r8129; + xor.b32 %r7821, %r29629, %r8130; + xor.b32 %r7740, %r29596, %r8129; + xor.b32 %r7741, %r29597, %r8130; + xor.b32 %r7757, %r29594, %r8129; + xor.b32 %r7756, %r29595, %r8130; + xor.b32 %r7796, %r29592, %r8129; + xor.b32 %r7797, %r29593, %r8130; + xor.b32 %r7828, %r29590, %r8129; + xor.b32 %r7829, %r29591, %r8130; + mov.u32 %r7734, 44; // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5930, %r4340; + shf.l.wrap.b32 %r7727, %r7733, %r7732, %r7734; // end inline asm - shr.u32 %r6096, %r4280, 6; - and.b32 %r4349, %r6096, 240; - shr.u32 %r4350, %r4344, 10; // begin inline asm - lop3.b32 %r4348, %r4349, %r4350, %r4351, 0x56; + shf.l.wrap.b32 %r7731, %r7732, %r7733, %r7734; // end inline asm - ld.const.u32 %r4353, [matrix+2560]; + mov.u32 %r7742, 20; // begin inline asm - dp4a.u32.u32 %r4352, %r4353, %r5870, %r6249; + shf.l.wrap.b32 %r7735, %r7741, %r7740, %r7742; // end inline asm - ld.const.u32 %r4357, [matrix+2564]; // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5874, %r4352; + shf.l.wrap.b32 %r7739, %r7740, %r7741, %r7742; // end inline asm - ld.const.u32 %r4361, [matrix+2568]; + mov.u32 %r7750, 61; // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5878, %r4356; + shf.l.wrap.b32 %r7743, %r7749, %r7748, %r7750; // end inline asm - ld.const.u32 %r4365, [matrix+2572]; // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5882, %r4360; + shf.l.wrap.b32 %r7747, %r7748, %r7749, %r7750; // end inline asm - 
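[Note on the hunks above] The blocks tagged "// xor5" at the top of the $L__BB2_25 round loop compute the theta column parities C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]. LOP3 with truth table 0x96 is a three-way XOR, so two chained lop3.b32 fold five lanes per 32-bit half, and the xor.b32 fan-out that follows applies D[x] = C[x-1] ^ rotl(C[x+1], 1) to every lane of the column. A hedged CUDA equivalent of the parity fold, with the helper name invented for illustration:

#include <cstdint>

// Theta column parity: five-input XOR. nvcc lowers this to two chained
// LOP3 instructions with immediate 0x96 (= a ^ b ^ c) per 32-bit half,
// matching the "// xor5" blocks in the PTX above.
__device__ __forceinline__ uint64_t xor5(uint64_t a, uint64_t b,
                                         uint64_t c, uint64_t d,
                                         uint64_t e) {
    return a ^ b ^ c ^ d ^ e;
}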
ld.const.u32 %r4369, [matrix+2576]; + mov.u32 %r7758, 39; // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5886, %r4364; + shf.l.wrap.b32 %r7751, %r7757, %r7756, %r7758; // end inline asm - ld.const.u32 %r4373, [matrix+2580]; // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5890, %r4368; + shf.l.wrap.b32 %r7755, %r7756, %r7757, %r7758; // end inline asm - ld.const.u32 %r4377, [matrix+2584]; + mov.u32 %r7766, 18; // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5894, %r4372; + shf.l.wrap.b32 %r7759, %r7765, %r7764, %r7766; // end inline asm - ld.const.u32 %r4381, [matrix+2588]; // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5898, %r4376; + shf.l.wrap.b32 %r7763, %r7764, %r7765, %r7766; // end inline asm - ld.const.u32 %r4385, [matrix+2592]; + mov.u32 %r7774, 62; // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5902, %r4380; + shf.l.wrap.b32 %r7767, %r7773, %r7772, %r7774; // end inline asm - ld.const.u32 %r4389, [matrix+2596]; // begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5906, %r4384; + shf.l.wrap.b32 %r7771, %r7772, %r7773, %r7774; // end inline asm - ld.const.u32 %r4393, [matrix+2600]; + mov.u32 %r7782, 43; // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5910, %r4388; + shf.l.wrap.b32 %r7775, %r7781, %r7780, %r7782; // end inline asm - ld.const.u32 %r4397, [matrix+2604]; // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5914, %r4392; + shf.l.wrap.b32 %r7779, %r7780, %r7781, %r7782; // end inline asm - ld.const.u32 %r4401, [matrix+2608]; + mov.u32 %r7790, 25; // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5918, %r4396; + shf.l.wrap.b32 %r7783, %r7789, %r7788, %r7790; // end inline asm - ld.const.u32 %r4405, [matrix+2612]; // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5922, %r4400; + shf.l.wrap.b32 %r7787, %r7788, %r7789, %r7790; // end inline asm - ld.const.u32 %r4409, [matrix+2616]; + mov.u32 %r7798, 8; // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5926, %r4404; + shf.l.wrap.b32 %r7791, %r7797, %r7796, %r7798; // end inline asm - ld.const.u32 %r4413, [matrix+2620]; // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5930, %r4408; + shf.l.wrap.b32 %r7795, %r7796, %r7797, %r7798; // end inline asm - ld.const.u32 %r4417, [matrix+2624]; + mov.u32 %r7806, 56; // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5870, %r6249; + shf.l.wrap.b32 %r7799, %r7805, %r7804, %r7806; // end inline asm - ld.const.u32 %r4421, [matrix+2628]; // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5874, %r4416; + shf.l.wrap.b32 %r7803, %r7804, %r7805, %r7806; // end inline asm - ld.const.u32 %r4425, [matrix+2632]; + mov.u32 %r7814, 41; // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5878, %r4420; + shf.l.wrap.b32 %r7807, %r7813, %r7812, %r7814; // end inline asm - ld.const.u32 %r4429, [matrix+2636]; // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5882, %r4424; + shf.l.wrap.b32 %r7811, %r7812, %r7813, %r7814; // end inline asm - ld.const.u32 %r4433, [matrix+2640]; + mov.u32 %r7822, 27; // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5886, %r4428; + shf.l.wrap.b32 %r7815, %r7821, %r7820, %r7822; // end inline asm - ld.const.u32 %r4437, [matrix+2644]; // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5890, %r4432; + shf.l.wrap.b32 %r7819, %r7820, %r7821, %r7822; // end inline asm - ld.const.u32 %r4441, [matrix+2648]; + mov.u32 %r7830, 14; // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5894, %r4436; + shf.l.wrap.b32 %r7823, %r7829, %r7828, %r7830; // end inline asm - ld.const.u32 %r4445, [matrix+2652]; // begin inline asm - 
dp4a.u32.u32 %r4444, %r4445, %r5898, %r4440; + shf.l.wrap.b32 %r7827, %r7828, %r7829, %r7830; // end inline asm - ld.const.u32 %r4449, [matrix+2656]; + mov.u32 %r7838, 2; // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5902, %r4444; + shf.l.wrap.b32 %r7831, %r7837, %r7836, %r7838; // end inline asm - ld.const.u32 %r4453, [matrix+2660]; // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5906, %r4448; + shf.l.wrap.b32 %r7835, %r7836, %r7837, %r7838; // end inline asm - ld.const.u32 %r4457, [matrix+2664]; + mov.u32 %r7846, 55; // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5910, %r4452; + shf.l.wrap.b32 %r7839, %r7845, %r7844, %r7846; // end inline asm - ld.const.u32 %r4461, [matrix+2668]; // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5914, %r4456; + shf.l.wrap.b32 %r7843, %r7844, %r7845, %r7846; // end inline asm - ld.const.u32 %r4465, [matrix+2672]; + mov.u32 %r7854, 45; // begin inline asm - dp4a.u32.u32 %r4464, %r4465, %r5918, %r4460; + shf.l.wrap.b32 %r7847, %r7853, %r7852, %r7854; // end inline asm - ld.const.u32 %r4469, [matrix+2676]; // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5922, %r4464; + shf.l.wrap.b32 %r7851, %r7852, %r7853, %r7854; // end inline asm - ld.const.u32 %r4473, [matrix+2680]; + mov.u32 %r7862, 36; // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5926, %r4468; + shf.l.wrap.b32 %r7855, %r7861, %r7860, %r7862; // end inline asm - ld.const.u32 %r4477, [matrix+2684]; // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5930, %r4472; + shf.l.wrap.b32 %r7859, %r7860, %r7861, %r7862; // end inline asm - shr.u32 %r6097, %r4412, 6; - and.b32 %r4481, %r6097, 240; - shr.u32 %r4482, %r4476, 10; - and.b32 %r4483, %r6019, 255; + mov.u32 %r7870, 28; // begin inline asm - lop3.b32 %r4480, %r4481, %r4482, %r4483, 0x56; + shf.l.wrap.b32 %r7863, %r7869, %r7868, %r7870; // end inline asm - ld.const.u32 %r4485, [matrix+2688]; // begin inline asm - dp4a.u32.u32 %r4484, %r4485, %r5870, %r6249; + shf.l.wrap.b32 %r7867, %r7868, %r7869, %r7870; // end inline asm - ld.const.u32 %r4489, [matrix+2692]; + mov.u32 %r7878, 21; // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5874, %r4484; + shf.l.wrap.b32 %r7871, %r7877, %r7876, %r7878; // end inline asm - ld.const.u32 %r4493, [matrix+2696]; // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5878, %r4488; + shf.l.wrap.b32 %r7875, %r7876, %r7877, %r7878; // end inline asm - ld.const.u32 %r4497, [matrix+2700]; + mov.u32 %r7886, 15; // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5882, %r4492; + shf.l.wrap.b32 %r7879, %r7885, %r7884, %r7886; // end inline asm - ld.const.u32 %r4501, [matrix+2704]; // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5886, %r4496; + shf.l.wrap.b32 %r7883, %r7884, %r7885, %r7886; // end inline asm - ld.const.u32 %r4505, [matrix+2708]; + mov.u32 %r7894, 10; // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5890, %r4500; + shf.l.wrap.b32 %r7887, %r7893, %r7892, %r7894; // end inline asm - ld.const.u32 %r4509, [matrix+2712]; // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5894, %r4504; + shf.l.wrap.b32 %r7891, %r7892, %r7893, %r7894; // end inline asm - ld.const.u32 %r4513, [matrix+2716]; + mov.u32 %r7902, 6; // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5898, %r4508; + shf.l.wrap.b32 %r7895, %r7901, %r7900, %r7902; // end inline asm - ld.const.u32 %r4517, [matrix+2720]; // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5902, %r4512; + shf.l.wrap.b32 %r7899, %r7900, %r7901, %r7902; // end inline asm - ld.const.u32 %r4521, [matrix+2724]; + mov.u32 %r7910, 3; // 
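[Note on the hunks above] The mov.u32 / shf.l.wrap.b32 pairs are the rho rotations: the immediates 44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3 are the per-lane Keccak rotation offsets (the rho table declared at the top of this PTX file, walked in pi order). With the state held as 32-bit register pairs, one 64-bit rotate-left costs two funnel shifts; since shf wraps the shift count mod 32, offsets of 32 or more are handled by swapping the operand halves. A sketch using the documented CUDA intrinsic __funnelshift_l — rotl64_pair itself is a hypothetical helper:

#include <cstdint>

// 64-bit rotate-left on a (hi, lo) register pair via funnel shifts.
// __funnelshift_l(lo, hi, n) returns the upper 32 bits of {hi:lo}
// shifted left by (n mod 32); it compiles to the shf.l.wrap.b32
// instruction seen throughout the PTX above.
__device__ __forceinline__ void rotl64_pair(uint32_t &hi, uint32_t &lo,
                                            uint32_t n) {
    if (n >= 32) {            // shf wraps the count mod 32, so offsets
        uint32_t t = hi;      // >= 32 are realized by swapping which
        hi = lo; lo = t;      // half feeds which operand, as the
        n -= 32;              // compiler-swapped operands above show
    }
    uint32_t new_hi = __funnelshift_l(lo, hi, n);
    uint32_t new_lo = __funnelshift_l(hi, lo, n);
    hi = new_hi; lo = new_lo;
}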
begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5906, %r4516; + shf.l.wrap.b32 %r7903, %r7909, %r7908, %r7910; // end inline asm - ld.const.u32 %r4525, [matrix+2728]; // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5910, %r4520; + shf.l.wrap.b32 %r7907, %r7908, %r7909, %r7910; // end inline asm - ld.const.u32 %r4529, [matrix+2732]; // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5914, %r4524; + shf.l.wrap.b32 %r7911, %r7917, %r7916, %r7586; // end inline asm - ld.const.u32 %r4533, [matrix+2736]; // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5918, %r4528; + shf.l.wrap.b32 %r7915, %r7916, %r7917, %r7586; // end inline asm - ld.const.u32 %r4537, [matrix+2740]; // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5922, %r4532; + // chi + lop3.b32 %r7919, %r7954, %r7727, %r7775, 0xD2; + lop3.b32 %r7920, %r7957, %r7731, %r7779, 0xD2; // end inline asm - ld.const.u32 %r4541, [matrix+2744]; // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5926, %r4536; + // chi + lop3.b32 %r29638, %r7727, %r7775, %r7871, 0xD2; + lop3.b32 %r29639, %r7731, %r7779, %r7875, 0xD2; // end inline asm - ld.const.u32 %r4545, [matrix+2748]; // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5930, %r4540; + // chi + lop3.b32 %r29634, %r7775, %r7871, %r7823, 0xD2; + lop3.b32 %r29635, %r7779, %r7875, %r7827, 0xD2; // end inline asm - ld.const.u32 %r4549, [matrix+2752]; // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5870, %r6249; + // chi + lop3.b32 %r29630, %r7871, %r7823, %r7954, 0xD2; + lop3.b32 %r29631, %r7875, %r7827, %r7957, 0xD2; // end inline asm - ld.const.u32 %r4553, [matrix+2756]; // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5874, %r4548; + // chi + lop3.b32 %r29628, %r7823, %r7954, %r7727, 0xD2; + lop3.b32 %r29629, %r7827, %r7957, %r7731, 0xD2; // end inline asm - ld.const.u32 %r4557, [matrix+2760]; // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5878, %r4552; + // chi + lop3.b32 %r29624, %r7863, %r7735, %r7903, 0xD2; + lop3.b32 %r29625, %r7867, %r7739, %r7907, 0xD2; // end inline asm - ld.const.u32 %r4561, [matrix+2764]; // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5882, %r4556; + // chi + lop3.b32 %r29636, %r7735, %r7903, %r7847, 0xD2; + lop3.b32 %r29637, %r7739, %r7907, %r7851, 0xD2; // end inline asm - ld.const.u32 %r4565, [matrix+2768]; // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5886, %r4560; + // chi + lop3.b32 %r29632, %r7903, %r7847, %r7743, 0xD2; + lop3.b32 %r29633, %r7907, %r7851, %r7747, 0xD2; // end inline asm - ld.const.u32 %r4569, [matrix+2772]; // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5890, %r4564; + // chi + lop3.b32 %r29604, %r7847, %r7743, %r7863, 0xD2; + lop3.b32 %r29605, %r7851, %r7747, %r7867, 0xD2; // end inline asm - ld.const.u32 %r4573, [matrix+2776]; + st.local.v2.u32 [%rd55+88], {%r29604, %r29605}; // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5894, %r4568; + // chi + lop3.b32 %r29596, %r7743, %r7863, %r7735, 0xD2; + lop3.b32 %r29597, %r7747, %r7867, %r7739, 0xD2; // end inline asm - ld.const.u32 %r4577, [matrix+2780]; + st.local.v2.u32 [%rd55+96], {%r29596, %r29597}; // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5898, %r4572; + // chi + lop3.b32 %r29622, %r7911, %r7895, %r7783, 0xD2; + lop3.b32 %r29623, %r7915, %r7899, %r7787, 0xD2; // end inline asm - ld.const.u32 %r4581, [matrix+2784]; + st.local.v2.u32 [%rd55+104], {%r29622, %r29623}; // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5902, %r4576; + // chi + lop3.b32 %r29616, %r7895, %r7783, %r7791, 0xD2; + lop3.b32 %r29617, %r7899, %r7787, 
%r7795, 0xD2; // end inline asm - ld.const.u32 %r4585, [matrix+2788]; + st.local.v2.u32 [%rd55+112], {%r29616, %r29617}; // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5906, %r4580; + // chi + lop3.b32 %r29610, %r7783, %r7791, %r7759, 0xD2; + lop3.b32 %r29611, %r7787, %r7795, %r7763, 0xD2; // end inline asm - ld.const.u32 %r4589, [matrix+2792]; + st.local.v2.u32 [%rd55+120], {%r29610, %r29611}; // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5910, %r4584; + // chi + lop3.b32 %r29602, %r7791, %r7759, %r7911, 0xD2; + lop3.b32 %r29603, %r7795, %r7763, %r7915, 0xD2; // end inline asm - ld.const.u32 %r4593, [matrix+2796]; + st.local.v2.u32 [%rd55+128], {%r29602, %r29603}; // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5914, %r4588; + // chi + lop3.b32 %r29594, %r7759, %r7911, %r7895, 0xD2; + lop3.b32 %r29595, %r7763, %r7915, %r7899, 0xD2; // end inline asm - ld.const.u32 %r4597, [matrix+2800]; + st.local.v2.u32 [%rd55+136], {%r29594, %r29595}; // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5918, %r4592; + // chi + lop3.b32 %r29620, %r7815, %r7855, %r7887, 0xD2; + lop3.b32 %r29621, %r7819, %r7859, %r7891, 0xD2; // end inline asm - ld.const.u32 %r4601, [matrix+2804]; + st.local.v2.u32 [%rd55+144], {%r29620, %r29621}; // begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5922, %r4596; + // chi + lop3.b32 %r29614, %r7855, %r7887, %r7879, 0xD2; + lop3.b32 %r29615, %r7859, %r7891, %r7883, 0xD2; // end inline asm - ld.const.u32 %r4605, [matrix+2808]; + st.local.v2.u32 [%rd55+152], {%r29614, %r29615}; // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5926, %r4600; + // chi + lop3.b32 %r29608, %r7887, %r7879, %r7799, 0xD2; + lop3.b32 %r29609, %r7891, %r7883, %r7803, 0xD2; // end inline asm - ld.const.u32 %r4609, [matrix+2812]; + st.local.v2.u32 [%rd55+160], {%r29608, %r29609}; // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5930, %r4604; + // chi + lop3.b32 %r29600, %r7879, %r7799, %r7815, 0xD2; + lop3.b32 %r29601, %r7883, %r7803, %r7819, 0xD2; // end inline asm - shr.u32 %r6098, %r4544, 6; - and.b32 %r4613, %r6098, 240; - shr.u32 %r4614, %r4608, 10; - and.b32 %r4615, %r6024, 255; + st.local.v2.u32 [%rd55+168], {%r29600, %r29601}; // begin inline asm - lop3.b32 %r4612, %r4613, %r4614, %r4615, 0x56; + // chi + lop3.b32 %r29592, %r7799, %r7815, %r7855, 0xD2; + lop3.b32 %r29593, %r7803, %r7819, %r7859, 0xD2; // end inline asm - ld.const.u32 %r4617, [matrix+2816]; + st.local.v2.u32 [%rd55+176], {%r29592, %r29593}; // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5870, %r6249; + // chi + lop3.b32 %r29618, %r7767, %r7839, %r7751, 0xD2; + lop3.b32 %r29619, %r7771, %r7843, %r7755, 0xD2; // end inline asm - ld.const.u32 %r4621, [matrix+2820]; + st.local.v2.u32 [%rd55+184], {%r29618, %r29619}; // begin inline asm - dp4a.u32.u32 %r4620, %r4621, %r5874, %r4616; + // chi + lop3.b32 %r29612, %r7839, %r7751, %r7807, 0xD2; + lop3.b32 %r29613, %r7843, %r7755, %r7811, 0xD2; // end inline asm - ld.const.u32 %r4625, [matrix+2824]; + st.local.v2.u32 [%rd55+192], {%r29612, %r29613}; // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5878, %r4620; + // chi + lop3.b32 %r29606, %r7751, %r7807, %r7831, 0xD2; + lop3.b32 %r29607, %r7755, %r7811, %r7835, 0xD2; // end inline asm - ld.const.u32 %r4629, [matrix+2828]; + st.local.v2.u32 [%rd55+200], {%r29606, %r29607}; // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5882, %r4624; + // chi + lop3.b32 %r29598, %r7807, %r7831, %r7767, 0xD2; + lop3.b32 %r29599, %r7811, %r7835, %r7771, 0xD2; // end inline asm - ld.const.u32 %r4633, [matrix+2832]; + 
st.local.v2.u32 [%rd55+208], {%r29598, %r29599}; // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5886, %r4628; + // chi + lop3.b32 %r29590, %r7831, %r7767, %r7839, 0xD2; + lop3.b32 %r29591, %r7835, %r7771, %r7843, 0xD2; // end inline asm - ld.const.u32 %r4637, [matrix+2836]; + st.local.v2.u32 [%rd55+216], {%r29590, %r29591}; + mul.wide.s32 %rd518, %r29640, 8; + add.s64 %rd517, %rd497, %rd518; // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5890, %r4632; + ld.global.nc.v2.u32 {%r8119,%r8120}, [%rd517]; // end inline asm - ld.const.u32 %r4641, [matrix+2840]; + xor.b32 %r29626, %r7919, %r8119; + xor.b32 %r29627, %r7920, %r8120; + add.s32 %r29640, %r29640, 1; + setp.lt.u32 %p19, %r29640, 23; + @%p19 bra $L__BB2_25; + + mov.u32 %r29673, 0; + mov.u32 %r8230, 1; + st.local.v2.u32 [%rd55+32], {%r29638, %r29639}; + st.local.v2.u32 [%rd55+72], {%r29636, %r29637}; + st.local.v2.u32 [%rd55+40], {%r29634, %r29635}; + st.local.v2.u32 [%rd55+80], {%r29632, %r29633}; + st.local.v2.u32 [%rd55+48], {%r29630, %r29631}; + st.local.v2.u32 [%rd55+56], {%r29628, %r29629}; + st.local.v2.u32 [%rd55+24], {%r29626, %r29627}; // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5894, %r4636; + // xor5 + lop3.b32 %r8131, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r8131, %r8131, %r29620, %r29618, 0x96; + lop3.b32 %r8132, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r8132, %r8132, %r29621, %r29619, 0x96; // end inline asm - ld.const.u32 %r4645, [matrix+2844]; // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5898, %r4640; + // xor5 + lop3.b32 %r8143, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r8143, %r8143, %r29614, %r29612, 0x96; + lop3.b32 %r8144, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r8144, %r8144, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4649, [matrix+2848]; // begin inline asm - dp4a.u32.u32 %r4648, %r4649, %r5902, %r4644; + // xor5 + lop3.b32 %r8155, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r8155, %r8155, %r29608, %r29606, 0x96; + lop3.b32 %r8156, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r8156, %r8156, %r29609, %r29607, 0x96; // end inline asm - ld.const.u32 %r4653, [matrix+2852]; // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5906, %r4648; + // xor5 + lop3.b32 %r8167, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r8167, %r8167, %r29600, %r29598, 0x96; + lop3.b32 %r8168, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r8168, %r8168, %r29601, %r29599, 0x96; // end inline asm - ld.const.u32 %r4657, [matrix+2856]; // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5910, %r4652; + // xor5 + lop3.b32 %r8179, %r29628, %r29596, %r29594, 0x96; + lop3.b32 %r8179, %r8179, %r29592, %r29590, 0x96; + lop3.b32 %r8180, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r8180, %r8180, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4661, [matrix+2860]; // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5914, %r4656; + shf.l.wrap.b32 %r8191, %r8144, %r8143, %r8230; // end inline asm - ld.const.u32 %r4665, [matrix+2864]; // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5918, %r4660; + shf.l.wrap.b32 %r8195, %r8143, %r8144, %r8230; // end inline asm - ld.const.u32 %r4669, [matrix+2868]; + xor.b32 %r8370, %r8191, %r8179; + xor.b32 %r8371, %r8195, %r8180; + xor.b32 %r8338, %r29626, %r8370; + xor.b32 %r8341, %r29627, %r8371; + xor.b32 %r8301, %r29623, %r8371; + xor.b32 %r8300, %r29622, %r8370; + st.local.v2.u32 [%rd55+104], {%r8300, %r8301}; // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5922, %r4664; + shf.l.wrap.b32 %r8199, %r8156, %r8155, %r8230; // end inline 
asm - ld.const.u32 %r4673, [matrix+2872]; // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5926, %r4668; + shf.l.wrap.b32 %r8203, %r8155, %r8156, %r8230; // end inline asm - ld.const.u32 %r4677, [matrix+2876]; + xor.b32 %r8372, %r8199, %r8131; + xor.b32 %r8373, %r8203, %r8132; + xor.b32 %r8237, %r29636, %r8372; + xor.b32 %r8236, %r29637, %r8373; + xor.b32 %r8276, %r29615, %r8373; + xor.b32 %r8277, %r29614, %r8372; + st.local.v2.u32 [%rd55+152], {%r8277, %r8276}; // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5930, %r4672; + shf.l.wrap.b32 %r8207, %r8168, %r8167, %r8230; // end inline asm - ld.const.u32 %r4681, [matrix+2880]; // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5870, %r6249; + shf.l.wrap.b32 %r8211, %r8167, %r8168, %r8230; // end inline asm - ld.const.u32 %r4685, [matrix+2884]; + xor.b32 %r8374, %r8207, %r8143; + xor.b32 %r8375, %r8211, %r8144; + xor.b32 %r8260, %r29611, %r8375; + xor.b32 %r8261, %r29610, %r8374; + st.local.v2.u32 [%rd55+120], {%r8261, %r8260}; + xor.b32 %r8252, %r29607, %r8375; + xor.b32 %r8253, %r29606, %r8374; + st.local.v2.u32 [%rd55+200], {%r8253, %r8252}; // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5874, %r4680; + shf.l.wrap.b32 %r8215, %r8180, %r8179, %r8230; // end inline asm - ld.const.u32 %r4689, [matrix+2888]; // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5878, %r4684; + shf.l.wrap.b32 %r8219, %r8179, %r8180, %r8230; // end inline asm - ld.const.u32 %r4693, [matrix+2892]; + xor.b32 %r8376, %r8215, %r8155; + xor.b32 %r8377, %r8219, %r8156; + xor.b32 %r8284, %r29630, %r8376; + xor.b32 %r8285, %r29631, %r8377; + xor.b32 %r8293, %r29601, %r8377; + xor.b32 %r8292, %r29600, %r8376; + st.local.v2.u32 [%rd55+168], {%r8292, %r8293}; // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5882, %r4688; + shf.l.wrap.b32 %r8223, %r8132, %r8131, %r8230; // end inline asm - ld.const.u32 %r4697, [matrix+2896]; // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5886, %r4692; + shf.l.wrap.b32 %r8227, %r8131, %r8132, %r8230; // end inline asm - ld.const.u32 %r4701, [matrix+2900]; + xor.b32 %r8378, %r8223, %r8167; + xor.b32 %r8379, %r8227, %r8168; + xor.b32 %r8244, %r29596, %r8378; + xor.b32 %r8245, %r29597, %r8379; + xor.b32 %r8269, %r29591, %r8379; + xor.b32 %r8268, %r29590, %r8378; + st.local.v2.u32 [%rd55+216], {%r8268, %r8269}; // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5890, %r4696; + shf.l.wrap.b32 %r8231, %r8237, %r8236, %r7734; // end inline asm - ld.const.u32 %r4705, [matrix+2904]; // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5894, %r4700; + shf.l.wrap.b32 %r8235, %r8236, %r8237, %r7734; // end inline asm - ld.const.u32 %r4709, [matrix+2908]; // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5898, %r4704; + shf.l.wrap.b32 %r8239, %r8245, %r8244, %r7742; // end inline asm - ld.const.u32 %r4713, [matrix+2912]; // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5902, %r4708; + shf.l.wrap.b32 %r8243, %r8244, %r8245, %r7742; // end inline asm - ld.const.u32 %r4717, [matrix+2916]; // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5906, %r4712; + shf.l.wrap.b32 %r8251, %r8252, %r8253, %r7750; // end inline asm - ld.const.u32 %r4721, [matrix+2920]; // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5910, %r4716; + shf.l.wrap.b32 %r8247, %r8253, %r8252, %r7750; // end inline asm - ld.const.u32 %r4725, [matrix+2924]; + st.local.v2.u32 [%rd55+96], {%r8247, %r8251}; // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5914, %r4720; + shf.l.wrap.b32 %r8255, %r8261, %r8260, %r7782; // end inline asm - ld.const.u32 
%r4729, [matrix+2928]; // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5918, %r4724; + shf.l.wrap.b32 %r8259, %r8260, %r8261, %r7782; // end inline asm - ld.const.u32 %r4733, [matrix+2932]; // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5922, %r4728; + shf.l.wrap.b32 %r8263, %r8269, %r8268, %r7830; // end inline asm - ld.const.u32 %r4737, [matrix+2936]; // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5926, %r4732; + shf.l.wrap.b32 %r8267, %r8268, %r8269, %r7830; // end inline asm - ld.const.u32 %r4741, [matrix+2940]; // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5930, %r4736; + shf.l.wrap.b32 %r8275, %r8276, %r8277, %r7854; // end inline asm - shr.u32 %r6099, %r4676, 6; - and.b32 %r4745, %r6099, 240; - shr.u32 %r4746, %r4740, 10; - and.b32 %r4747, %r6028, 255; // begin inline asm - lop3.b32 %r4744, %r4745, %r4746, %r4747, 0x56; + shf.l.wrap.b32 %r8271, %r8277, %r8276, %r7854; // end inline asm - ld.const.u32 %r4749, [matrix+2944]; + st.local.v2.u32 [%rd55+88], {%r8271, %r8275}; // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5870, %r6249; + shf.l.wrap.b32 %r8279, %r8285, %r8284, %r7870; // end inline asm - ld.const.u32 %r4753, [matrix+2948]; // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5874, %r4748; + shf.l.wrap.b32 %r8283, %r8284, %r8285, %r7870; // end inline asm - ld.const.u32 %r4757, [matrix+2952]; // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5878, %r4752; + shf.l.wrap.b32 %r8287, %r8293, %r8292, %r7878; // end inline asm - ld.const.u32 %r4761, [matrix+2956]; // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5882, %r4756; + shf.l.wrap.b32 %r8291, %r8292, %r8293, %r7878; // end inline asm - ld.const.u32 %r4765, [matrix+2960]; // begin inline asm - dp4a.u32.u32 %r4764, %r4765, %r5886, %r4760; + shf.l.wrap.b32 %r8295, %r8301, %r8300, %r7910; // end inline asm - ld.const.u32 %r4769, [matrix+2964]; // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5890, %r4764; + shf.l.wrap.b32 %r8299, %r8300, %r8301, %r7910; // end inline asm - ld.const.u32 %r4773, [matrix+2968]; // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5894, %r4768; + // chi + lop3.b32 %r8303, %r8338, %r8231, %r8255, 0xD2; + lop3.b32 %r8304, %r8341, %r8235, %r8259, 0xD2; // end inline asm - ld.const.u32 %r4777, [matrix+2972]; // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5898, %r4772; + // chi + lop3.b32 %r29773, %r8231, %r8255, %r8287, 0xD2; + lop3.b32 %r29774, %r8235, %r8259, %r8291, 0xD2; // end inline asm - ld.const.u32 %r4781, [matrix+2976]; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5902, %r4776; + // chi + lop3.b32 %r29769, %r8255, %r8287, %r8263, 0xD2; + lop3.b32 %r29770, %r8259, %r8291, %r8267, 0xD2; // end inline asm - ld.const.u32 %r4785, [matrix+2980]; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5906, %r4780; + // chi + lop3.b32 %r29765, %r8287, %r8263, %r8338, 0xD2; + lop3.b32 %r29766, %r8291, %r8267, %r8341, 0xD2; // end inline asm - ld.const.u32 %r4789, [matrix+2984]; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5910, %r4784; + // chi + lop3.b32 %r29763, %r8263, %r8338, %r8231, 0xD2; + lop3.b32 %r29764, %r8267, %r8341, %r8235, 0xD2; // end inline asm - ld.const.u32 %r4793, [matrix+2988]; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; // begin inline asm - dp4a.u32.u32 %r4792, %r4793, %r5914, %r4788; + // chi + lop3.b32 %r29759, %r8279, %r8239, %r8295, 0xD2; + lop3.b32 %r29760, 
%r8283, %r8243, %r8299, 0xD2; // end inline asm - ld.const.u32 %r4797, [matrix+2992]; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5918, %r4792; + // chi + lop3.b32 %r29771, %r8239, %r8295, %r8271, 0xD2; + lop3.b32 %r29772, %r8243, %r8299, %r8275, 0xD2; // end inline asm - ld.const.u32 %r4801, [matrix+2996]; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5922, %r4796; + // chi + lop3.b32 %r29767, %r8295, %r8271, %r8247, 0xD2; + lop3.b32 %r29768, %r8299, %r8275, %r8251, 0xD2; // end inline asm - ld.const.u32 %r4805, [matrix+3000]; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5926, %r4800; + ld.global.nc.v2.u32 {%r8367,%r8368}, [%rd498]; // end inline asm - ld.const.u32 %r4809, [matrix+3004]; + xor.b32 %r29761, %r8303, %r8367; + xor.b32 %r29762, %r8304, %r8368; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + add.s64 %rd57, %rd55, 24; + add.s64 %rd58, %rd3, 24; + +$L__BB2_27: + shl.b32 %r8380, %r29673, 2; + cvt.u64.u32 %rd528, %r8380; + and.b64 %rd529, %rd528, 60; + add.s64 %rd530, %rd58, %rd529; + xor.b32 %r8381, %r30, %r29673; + mul.lo.s32 %r8382, %r8381, 16777619; + ld.local.u32 %r8383, [%rd530]; + xor.b32 %r8384, %r8382, %r8383; + mul.wide.u32 %rd531, %r8384, -954391867; + shr.u64 %rd532, %rd531, 32; + cvt.u32.u64 %r8385, %rd532; + sub.s32 %r8386, %r8384, %r8385; + shr.u32 %r8387, %r8386, 1; + add.s32 %r8388, %r8387, %r8385; + shr.u32 %r8389, %r8388, 20; + mul.lo.s32 %r8390, %r8389, 1179641; + sub.s32 %r8391, %r8384, %r8390; + mul.wide.u32 %rd533, %r8391, 64; + add.s64 %rd534, %rd471, %rd533; + mul.lo.s32 %r8392, %r29710, 16777619; + ld.global.u32 %r8393, [%rd534]; + xor.b32 %r29710, %r8392, %r8393; + mul.lo.s32 %r8394, %r29711, 16777619; + ld.global.u32 %r8395, [%rd534+4]; + xor.b32 %r29711, %r8394, %r8395; + mul.lo.s32 %r8396, %r29722, 16777619; + ld.global.u32 %r8397, [%rd534+8]; + mul.lo.s32 %r8398, %r29723, 16777619; + ld.global.u32 %r8399, [%rd534+12]; + xor.b32 %r8400, %r8398, %r8399; + xor.b32 %r29722, %r8396, %r8397; + mov.b64 %rd535, {%r29722, %r8400}; + mul.lo.s32 %r8401, %r29718, 16777619; + ld.global.u32 %r8402, [%rd534+16]; + mul.lo.s32 %r8403, %r29719, 16777619; + ld.global.u32 %r8404, [%rd534+20]; + xor.b32 %r8405, %r8403, %r8404; + xor.b32 %r29718, %r8401, %r8402; + mov.b64 %rd536, {%r29718, %r8405}; + mul.lo.s32 %r8406, %r29714, 16777619; + ld.global.u32 %r8407, [%rd534+24]; + mul.lo.s32 %r8408, %r29715, 16777619; + ld.global.u32 %r8409, [%rd534+28]; + xor.b32 %r8410, %r8408, %r8409; + xor.b32 %r29714, %r8406, %r8407; + mov.b64 %rd537, {%r29714, %r8410}; + mul.lo.s32 %r8411, %r29712, 16777619; + ld.global.u32 %r8412, [%rd534+32]; + mul.lo.s32 %r8413, %r29713, 16777619; + ld.global.u32 %r8414, [%rd534+36]; + xor.b32 %r8415, %r8413, %r8414; + xor.b32 %r29712, %r8411, %r8412; + mov.b64 %rd538, {%r29712, %r8415}; + mul.lo.s32 %r8416, %r29708, 16777619; + ld.global.u32 %r8417, [%rd534+40]; + xor.b32 %r29708, %r8416, %r8417; + mul.lo.s32 %r8418, %r29709, 16777619; + ld.global.u32 %r8419, [%rd534+44]; + xor.b32 %r29709, %r8418, %r8419; + mul.lo.s32 %r8420, %r29720, 16777619; + ld.global.u32 %r8421, [%rd534+48]; + mul.lo.s32 %r8422, %r29721, 16777619; + ld.global.u32 %r8423, [%rd534+52]; + xor.b32 %r8424, %r8422, %r8423; + xor.b32 %r29720, %r8420, %r8421; + mov.b64 %rd539, {%r29720, %r8424}; + mul.lo.s32 %r8425, %r29716, 16777619; + ld.global.u32 %r8426, [%rd534+56]; + mul.lo.s32 %r8427, %r29717, 16777619; 
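[Note on the loop these lines sit inside] The $L__BB2_27 body is the dataset-access phase of the FishHash mix: each iteration derives an item index by multiply-then-xor mixing with 16777619 (the 32-bit FNV prime, 0x01000193), reduces it modulo 1179641 (the mul.wide.u32 by -954391867 plus shifts is that modulo strength-reduced to a reciprocal multiply by the compiler), and folds one 64-byte item into the 16-word mix state; the setp.lt.u32 against 512 bounds the loop, and the code runs two such mix lanes (%rd3 and %rd55) in parallel. A simplified single-lane CUDA sketch under those readings — fnv1, mix_item, and the parameter names are invented for illustration:

#include <cstdint>

#define FNV_PRIME 0x01000193u  // the 16777619 multiplier in the PTX

// ethash-style fnv mix: multiply, then xor.
__device__ __forceinline__ uint32_t fnv1(uint32_t u, uint32_t v) {
    return (u * FNV_PRIME) ^ v;
}

// One mix lane of the $L__BB2_27 loop, simplified: pick an item index
// from the evolving mix, then fold the 16-word (64-byte) item in.
__device__ void mix_item(uint32_t seed, uint32_t mix[16],
                         const uint32_t *dataset) {
    const uint32_t DATASET_ITEMS = 1179641; // divisor in the PTX above
    const uint32_t NUM_ACCESSES  = 512;     // loop bound in the PTX above
    for (uint32_t i = 0; i < NUM_ACCESSES; ++i) {
        uint32_t index = fnv1(seed ^ i, mix[i % 16]) % DATASET_ITEMS;
        const uint32_t *item = dataset + index * 16; // one 64-byte item
        for (uint32_t w = 0; w < 16; ++w)
            mix[w] = fnv1(mix[w], item[w]);
    }
}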
+ ld.global.u32 %r8428, [%rd534+60]; + xor.b32 %r8429, %r8427, %r8428; + xor.b32 %r29716, %r8425, %r8426; + mov.b64 %rd540, {%r29716, %r8429}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.v2.u32 [%rd3+32], {%r29722, %r8400}; + st.local.v2.u32 [%rd3+40], {%r29718, %r8405}; + st.local.v2.u32 [%rd3+48], {%r29714, %r8410}; + st.local.v2.u32 [%rd3+56], {%r29712, %r8415}; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; + st.local.v2.u32 [%rd3+72], {%r29720, %r8424}; + st.local.v2.u32 [%rd3+80], {%r29716, %r8429}; + add.s64 %rd541, %rd57, %rd529; + xor.b32 %r8430, %r226, %r29673; + mul.lo.s32 %r8431, %r8430, 16777619; + ld.local.u32 %r8432, [%rd541]; + xor.b32 %r8433, %r8431, %r8432; + mul.wide.u32 %rd542, %r8433, -954391867; + shr.u64 %rd543, %rd542, 32; + cvt.u32.u64 %r8434, %rd543; + sub.s32 %r8435, %r8433, %r8434; + shr.u32 %r8436, %r8435, 1; + add.s32 %r8437, %r8436, %r8434; + shr.u32 %r8438, %r8437, 20; + mul.lo.s32 %r8439, %r8438, 1179641; + sub.s32 %r8440, %r8433, %r8439; + mul.wide.u32 %rd544, %r8440, 64; + add.s64 %rd545, %rd471, %rd544; + mul.lo.s32 %r8441, %r29761, 16777619; + ld.global.u32 %r8442, [%rd545]; + xor.b32 %r29761, %r8441, %r8442; + mul.lo.s32 %r8443, %r29762, 16777619; + ld.global.u32 %r8444, [%rd545+4]; + xor.b32 %r29762, %r8443, %r8444; + mul.lo.s32 %r8445, %r29773, 16777619; + ld.global.u32 %r8446, [%rd545+8]; + mul.lo.s32 %r8447, %r29774, 16777619; + ld.global.u32 %r8448, [%rd545+12]; + xor.b32 %r8449, %r8447, %r8448; + xor.b32 %r29773, %r8445, %r8446; + mov.b64 %rd546, {%r29773, %r8449}; + mul.lo.s32 %r8450, %r29769, 16777619; + ld.global.u32 %r8451, [%rd545+16]; + mul.lo.s32 %r8452, %r29770, 16777619; + ld.global.u32 %r8453, [%rd545+20]; + xor.b32 %r8454, %r8452, %r8453; + xor.b32 %r29769, %r8450, %r8451; + mov.b64 %rd547, {%r29769, %r8454}; + mul.lo.s32 %r8455, %r29765, 16777619; + ld.global.u32 %r8456, [%rd545+24]; + mul.lo.s32 %r8457, %r29766, 16777619; + ld.global.u32 %r8458, [%rd545+28]; + xor.b32 %r8459, %r8457, %r8458; + xor.b32 %r29765, %r8455, %r8456; + mov.b64 %rd548, {%r29765, %r8459}; + mul.lo.s32 %r8460, %r29763, 16777619; + ld.global.u32 %r8461, [%rd545+32]; + mul.lo.s32 %r8462, %r29764, 16777619; + ld.global.u32 %r8463, [%rd545+36]; + xor.b32 %r8464, %r8462, %r8463; + xor.b32 %r29763, %r8460, %r8461; + mov.b64 %rd549, {%r29763, %r8464}; + mul.lo.s32 %r8465, %r29759, 16777619; + ld.global.u32 %r8466, [%rd545+40]; + xor.b32 %r29759, %r8465, %r8466; + mul.lo.s32 %r8467, %r29760, 16777619; + ld.global.u32 %r8468, [%rd545+44]; + xor.b32 %r29760, %r8467, %r8468; + mul.lo.s32 %r8469, %r29771, 16777619; + ld.global.u32 %r8470, [%rd545+48]; + mul.lo.s32 %r8471, %r29772, 16777619; + ld.global.u32 %r8472, [%rd545+52]; + xor.b32 %r8473, %r8471, %r8472; + xor.b32 %r29771, %r8469, %r8470; + mov.b64 %rd550, {%r29771, %r8473}; + mul.lo.s32 %r8474, %r29767, 16777619; + ld.global.u32 %r8475, [%rd545+56]; + mul.lo.s32 %r8476, %r29768, 16777619; + ld.global.u32 %r8477, [%rd545+60]; + xor.b32 %r8478, %r8476, %r8477; + xor.b32 %r29767, %r8474, %r8475; + mov.b64 %rd551, {%r29767, %r8478}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + st.local.v2.u32 [%rd55+32], {%r29773, %r8449}; + st.local.v2.u32 [%rd55+40], {%r29769, %r8454}; + st.local.v2.u32 [%rd55+48], {%r29765, %r8459}; + st.local.v2.u32 [%rd55+56], {%r29763, %r8464}; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; + st.local.v2.u32 [%rd55+72], {%r29771, %r8473}; + st.local.v2.u32 [%rd55+80], {%r29767, %r8478}; + add.s32 %r29673, %r29673, 1; + setp.lt.u32 %p20, %r29673, 512; + shr.u64 %rd552, 
%rd535, 32; + cvt.u32.u64 %r29723, %rd552; + shr.u64 %rd553, %rd536, 32; + cvt.u32.u64 %r29719, %rd553; + shr.u64 %rd554, %rd537, 32; + cvt.u32.u64 %r29715, %rd554; + shr.u64 %rd555, %rd538, 32; + cvt.u32.u64 %r29713, %rd555; + shr.u64 %rd556, %rd539, 32; + cvt.u32.u64 %r29721, %rd556; + shr.u64 %rd557, %rd540, 32; + cvt.u32.u64 %r29717, %rd557; + shr.u64 %rd558, %rd546, 32; + cvt.u32.u64 %r29774, %rd558; + shr.u64 %rd559, %rd547, 32; + cvt.u32.u64 %r29770, %rd559; + shr.u64 %rd560, %rd548, 32; + cvt.u32.u64 %r29766, %rd560; + shr.u64 %rd561, %rd549, 32; + cvt.u32.u64 %r29764, %rd561; + shr.u64 %rd562, %rd550, 32; + cvt.u32.u64 %r29772, %rd562; + shr.u64 %rd563, %rd551, 32; + cvt.u32.u64 %r29768, %rd563; + @%p20 bra $L__BB2_27; + + mov.u32 %r29674, 0; + st.local.v2.u32 [%rd3+96], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+104], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+112], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+120], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+128], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+136], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+144], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+152], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+160], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+168], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+176], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+184], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+192], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+200], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+208], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+216], {%r29674, %r29674}; + mov.u32 %r29689, -2147483648; + mov.u32 %r8493, 1; + st.local.v2.u32 [%rd3+88], {%r8493, %r29689}; + mov.u32 %r29675, %r29674; + mov.u32 %r29676, %r29674; + mov.u32 %r29677, %r29674; + mov.u32 %r29678, %r29674; + mov.u32 %r29679, %r29674; + mov.u32 %r29680, %r29674; + mov.u32 %r29681, %r29674; + mov.u32 %r29682, %r29674; + mov.u32 %r29683, %r29674; + mov.u32 %r29684, %r29674; + mov.u32 %r29685, %r29674; + mov.u32 %r29686, %r29674; + mov.u32 %r29687, %r29674; + mov.u32 %r29688, %r8493; + mov.u32 %r29690, %r29674; + mov.u32 %r29691, %r29674; + mov.u32 %r29692, %r29674; + mov.u32 %r29693, %r29674; + mov.u32 %r29694, %r29674; + mov.u32 %r29695, %r29674; + mov.u32 %r29696, %r29674; + mov.u32 %r29697, %r29674; + mov.u32 %r29698, %r29674; + mov.u32 %r29699, %r29674; + mov.u32 %r29700, %r29674; + mov.u32 %r29701, %r29674; + mov.u32 %r29702, %r29674; + mov.u32 %r29703, %r29674; + mov.u32 %r29704, %r29674; + mov.u32 %r29705, %r29674; + mov.u32 %r29706, %r29674; + mov.u32 %r29707, %r29674; + mov.u32 %r29724, %r29674; + +$L__BB2_29: // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5930, %r4804; + // xor5 + lop3.b32 %r8520, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r8520, %r8520, %r29704, %r29702, 0x96; + lop3.b32 %r8521, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r8521, %r8521, %r29705, %r29703, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8532, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r8532, %r8532, %r29698, %r29696, 0x96; + lop3.b32 %r8533, %r29723, %r29721, %r29701, 0x96; + lop3.b32 %r8533, %r8533, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r4813, [matrix+3008]; // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5870, %r6249; + // xor5 + lop3.b32 %r8544, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r8544, %r8544, %r29692, %r29690, 0x96; + lop3.b32 %r8545, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r8545, %r8545, %r29693, %r29691, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8556, 
%r29714, %r29688, %r29686, 0x96; + lop3.b32 %r8556, %r8556, %r29684, %r29682, 0x96; + lop3.b32 %r8557, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r8557, %r8557, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r4817, [matrix+3012]; // begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5874, %r4812; + // xor5 + lop3.b32 %r8568, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r8568, %r8568, %r29676, %r29674, 0x96; + lop3.b32 %r8569, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r8569, %r8569, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r4821, [matrix+3016]; // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5878, %r4816; + shf.l.wrap.b32 %r8580, %r8533, %r8532, %r8493; // end inline asm - ld.const.u32 %r4825, [matrix+3020]; // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5882, %r4820; + shf.l.wrap.b32 %r8584, %r8532, %r8533, %r8493; // end inline asm - ld.const.u32 %r4829, [matrix+3024]; + xor.b32 %r9014, %r8580, %r8568; + xor.b32 %r9015, %r8584, %r8569; + xor.b32 %r8847, %r29710, %r9014; + xor.b32 %r8850, %r29711, %r9015; + xor.b32 %r8754, %r29708, %r9014; + xor.b32 %r8753, %r29709, %r9015; + xor.b32 %r8801, %r29706, %r9014; + xor.b32 %r8802, %r29707, %r9015; + xor.b32 %r8706, %r29704, %r9014; + xor.b32 %r8705, %r29705, %r9015; + xor.b32 %r8657, %r29702, %r9014; + xor.b32 %r8658, %r29703, %r9015; // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5886, %r4824; + shf.l.wrap.b32 %r8588, %r8545, %r8544, %r8493; // end inline asm - ld.const.u32 %r4833, [matrix+3028]; // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5890, %r4828; + shf.l.wrap.b32 %r8592, %r8544, %r8545, %r8493; // end inline asm - ld.const.u32 %r4837, [matrix+3032]; + xor.b32 %r9016, %r8588, %r8520; + xor.b32 %r9017, %r8592, %r8521; + xor.b32 %r8809, %r29722, %r9016; + xor.b32 %r8810, %r29723, %r9017; + xor.b32 %r8626, %r29720, %r9016; + xor.b32 %r8625, %r29721, %r9017; + xor.b32 %r8785, %r29700, %r9016; + xor.b32 %r8786, %r29701, %r9017; + xor.b32 %r8746, %r29698, %r9016; + xor.b32 %r8745, %r29699, %r9017; + xor.b32 %r8729, %r29696, %r9016; + xor.b32 %r8730, %r29697, %r9017; // begin inline asm - dp4a.u32.u32 %r4836, %r4837, %r5894, %r4832; + shf.l.wrap.b32 %r8596, %r8557, %r8556, %r8493; // end inline asm - ld.const.u32 %r4841, [matrix+3036]; // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5898, %r4836; + shf.l.wrap.b32 %r8600, %r8556, %r8557, %r8493; // end inline asm - ld.const.u32 %r4845, [matrix+3040]; + xor.b32 %r9018, %r8596, %r8532; + xor.b32 %r9019, %r8600, %r8533; + xor.b32 %r8666, %r29718, %r9018; + xor.b32 %r8665, %r29719, %r9019; + xor.b32 %r8793, %r29716, %r9018; + xor.b32 %r8794, %r29717, %r9019; + xor.b32 %r8674, %r29694, %r9018; + xor.b32 %r8673, %r29695, %r9019; + xor.b32 %r8777, %r29692, %r9018; + xor.b32 %r8778, %r29693, %r9019; + xor.b32 %r8642, %r29690, %r9018; + xor.b32 %r8641, %r29691, %r9019; // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5902, %r4840; + shf.l.wrap.b32 %r8604, %r8569, %r8568, %r8493; // end inline asm - ld.const.u32 %r4849, [matrix+3044]; // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5906, %r4844; + shf.l.wrap.b32 %r8608, %r8568, %r8569, %r8493; // end inline asm - ld.const.u32 %r4853, [matrix+3048]; + xor.b32 %r9020, %r8604, %r8544; + xor.b32 %r9021, %r8608, %r8545; + xor.b32 %r8761, %r29714, %r9020; + xor.b32 %r8762, %r29715, %r9021; + xor.b32 %r8738, %r29688, %r9020; + xor.b32 %r8737, %r29689, %r9021; + xor.b32 %r8681, %r29686, %r9020; + xor.b32 %r8682, %r29687, %r9021; + xor.b32 %r8769, %r29684, %r9020; + xor.b32 %r8770, %r29685, 
%r9021; + xor.b32 %r8698, %r29682, %r9020; + xor.b32 %r8697, %r29683, %r9021; // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5910, %r4848; + shf.l.wrap.b32 %r8612, %r8521, %r8520, %r8493; // end inline asm - ld.const.u32 %r4857, [matrix+3052]; // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5914, %r4852; + shf.l.wrap.b32 %r8616, %r8520, %r8521, %r8493; // end inline asm - ld.const.u32 %r4861, [matrix+3056]; + xor.b32 %r9022, %r8612, %r8556; + xor.b32 %r9023, %r8616, %r8557; + xor.b32 %r8713, %r29712, %r9022; + xor.b32 %r8714, %r29713, %r9023; + xor.b32 %r8633, %r29680, %r9022; + xor.b32 %r8634, %r29681, %r9023; + xor.b32 %r8650, %r29678, %r9022; + xor.b32 %r8649, %r29679, %r9023; + xor.b32 %r8689, %r29676, %r9022; + xor.b32 %r8690, %r29677, %r9023; + xor.b32 %r8721, %r29674, %r9022; + xor.b32 %r8722, %r29675, %r9023; + mov.u32 %r8627, 44; // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5918, %r4856; + shf.l.wrap.b32 %r8620, %r8626, %r8625, %r8627; // end inline asm - ld.const.u32 %r4865, [matrix+3060]; // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5922, %r4860; + shf.l.wrap.b32 %r8624, %r8625, %r8626, %r8627; // end inline asm - ld.const.u32 %r4869, [matrix+3064]; + mov.u32 %r8635, 20; // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5926, %r4864; + shf.l.wrap.b32 %r8628, %r8634, %r8633, %r8635; // end inline asm - ld.const.u32 %r4873, [matrix+3068]; // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5930, %r4868; + shf.l.wrap.b32 %r8632, %r8633, %r8634, %r8635; // end inline asm - shr.u32 %r6100, %r4808, 6; - and.b32 %r4877, %r6100, 240; - shr.u32 %r4878, %r4872, 10; + mov.u32 %r8643, 61; // begin inline asm - lop3.b32 %r4876, %r4877, %r4878, %r14, 0x56; + shf.l.wrap.b32 %r8636, %r8642, %r8641, %r8643; // end inline asm - ld.const.u32 %r4881, [matrix+3072]; // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5870, %r6249; + shf.l.wrap.b32 %r8640, %r8641, %r8642, %r8643; // end inline asm - ld.const.u32 %r4885, [matrix+3076]; + mov.u32 %r8651, 39; // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5874, %r4880; + shf.l.wrap.b32 %r8644, %r8650, %r8649, %r8651; // end inline asm - ld.const.u32 %r4889, [matrix+3080]; // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5878, %r4884; + shf.l.wrap.b32 %r8648, %r8649, %r8650, %r8651; // end inline asm - ld.const.u32 %r4893, [matrix+3084]; + mov.u32 %r8659, 18; // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5882, %r4888; + shf.l.wrap.b32 %r8652, %r8658, %r8657, %r8659; // end inline asm - ld.const.u32 %r4897, [matrix+3088]; // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5886, %r4892; + shf.l.wrap.b32 %r8656, %r8657, %r8658, %r8659; // end inline asm - ld.const.u32 %r4901, [matrix+3092]; + mov.u32 %r8667, 62; // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5890, %r4896; + shf.l.wrap.b32 %r8660, %r8666, %r8665, %r8667; // end inline asm - ld.const.u32 %r4905, [matrix+3096]; // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5894, %r4900; + shf.l.wrap.b32 %r8664, %r8665, %r8666, %r8667; // end inline asm - ld.const.u32 %r4909, [matrix+3100]; + mov.u32 %r8675, 43; // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5898, %r4904; + shf.l.wrap.b32 %r8668, %r8674, %r8673, %r8675; // end inline asm - ld.const.u32 %r4913, [matrix+3104]; // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5902, %r4908; + shf.l.wrap.b32 %r8672, %r8673, %r8674, %r8675; // end inline asm - ld.const.u32 %r4917, [matrix+3108]; + mov.u32 %r8683, 25; // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5906, %r4912; + 
shf.l.wrap.b32 %r8676, %r8682, %r8681, %r8683; // end inline asm - ld.const.u32 %r4921, [matrix+3112]; // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5910, %r4916; + shf.l.wrap.b32 %r8680, %r8681, %r8682, %r8683; // end inline asm - ld.const.u32 %r4925, [matrix+3116]; + mov.u32 %r8691, 8; // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5914, %r4920; + shf.l.wrap.b32 %r8684, %r8690, %r8689, %r8691; // end inline asm - ld.const.u32 %r4929, [matrix+3120]; // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5918, %r4924; + shf.l.wrap.b32 %r8688, %r8689, %r8690, %r8691; // end inline asm - ld.const.u32 %r4933, [matrix+3124]; + mov.u32 %r8699, 56; // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5922, %r4928; + shf.l.wrap.b32 %r8692, %r8698, %r8697, %r8699; // end inline asm - ld.const.u32 %r4937, [matrix+3128]; // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5926, %r4932; + shf.l.wrap.b32 %r8696, %r8697, %r8698, %r8699; // end inline asm - ld.const.u32 %r4941, [matrix+3132]; + mov.u32 %r8707, 41; // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5930, %r4936; + shf.l.wrap.b32 %r8700, %r8706, %r8705, %r8707; // end inline asm - ld.const.u32 %r4945, [matrix+3136]; // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5870, %r6249; + shf.l.wrap.b32 %r8704, %r8705, %r8706, %r8707; // end inline asm - ld.const.u32 %r4949, [matrix+3140]; + mov.u32 %r8715, 27; // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5874, %r4944; + shf.l.wrap.b32 %r8708, %r8714, %r8713, %r8715; // end inline asm - ld.const.u32 %r4953, [matrix+3144]; // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5878, %r4948; + shf.l.wrap.b32 %r8712, %r8713, %r8714, %r8715; // end inline asm - ld.const.u32 %r4957, [matrix+3148]; + mov.u32 %r8723, 14; // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5882, %r4952; + shf.l.wrap.b32 %r8716, %r8722, %r8721, %r8723; // end inline asm - ld.const.u32 %r4961, [matrix+3152]; // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5886, %r4956; + shf.l.wrap.b32 %r8720, %r8721, %r8722, %r8723; // end inline asm - ld.const.u32 %r4965, [matrix+3156]; + mov.u32 %r8731, 2; // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5890, %r4960; + shf.l.wrap.b32 %r8724, %r8730, %r8729, %r8731; // end inline asm - ld.const.u32 %r4969, [matrix+3160]; // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5894, %r4964; + shf.l.wrap.b32 %r8728, %r8729, %r8730, %r8731; // end inline asm - ld.const.u32 %r4973, [matrix+3164]; + mov.u32 %r8739, 55; // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5898, %r4968; + shf.l.wrap.b32 %r8732, %r8738, %r8737, %r8739; // end inline asm - ld.const.u32 %r4977, [matrix+3168]; // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5902, %r4972; + shf.l.wrap.b32 %r8736, %r8737, %r8738, %r8739; // end inline asm - ld.const.u32 %r4981, [matrix+3172]; + mov.u32 %r8747, 45; // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5906, %r4976; + shf.l.wrap.b32 %r8740, %r8746, %r8745, %r8747; // end inline asm - ld.const.u32 %r4985, [matrix+3176]; // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5910, %r4980; + shf.l.wrap.b32 %r8744, %r8745, %r8746, %r8747; // end inline asm - ld.const.u32 %r4989, [matrix+3180]; + mov.u32 %r8755, 36; // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5914, %r4984; + shf.l.wrap.b32 %r8748, %r8754, %r8753, %r8755; // end inline asm - ld.const.u32 %r4993, [matrix+3184]; // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5918, %r4988; + shf.l.wrap.b32 %r8752, %r8753, %r8754, %r8755; // end inline asm - ld.const.u32 %r4997, 
[matrix+3188]; + mov.u32 %r8763, 28; // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5922, %r4992; + shf.l.wrap.b32 %r8756, %r8762, %r8761, %r8763; // end inline asm - ld.const.u32 %r5001, [matrix+3192]; // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5926, %r4996; + shf.l.wrap.b32 %r8760, %r8761, %r8762, %r8763; // end inline asm - ld.const.u32 %r5005, [matrix+3196]; + mov.u32 %r8771, 21; // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5930, %r5000; + shf.l.wrap.b32 %r8764, %r8770, %r8769, %r8771; // end inline asm - shr.u32 %r6101, %r4940, 6; - and.b32 %r5009, %r6101, 240; - shr.u32 %r5010, %r5004, 10; - and.b32 %r5011, %r6037, 255; // begin inline asm - lop3.b32 %r5008, %r5009, %r5010, %r5011, 0x56; + shf.l.wrap.b32 %r8768, %r8769, %r8770, %r8771; // end inline asm - ld.const.u32 %r5013, [matrix+3200]; + mov.u32 %r8779, 15; // begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5870, %r6249; + shf.l.wrap.b32 %r8772, %r8778, %r8777, %r8779; // end inline asm - ld.const.u32 %r5017, [matrix+3204]; // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5874, %r5012; + shf.l.wrap.b32 %r8776, %r8777, %r8778, %r8779; // end inline asm - ld.const.u32 %r5021, [matrix+3208]; + mov.u32 %r8787, 10; // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5878, %r5016; + shf.l.wrap.b32 %r8780, %r8786, %r8785, %r8787; // end inline asm - ld.const.u32 %r5025, [matrix+3212]; // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5882, %r5020; + shf.l.wrap.b32 %r8784, %r8785, %r8786, %r8787; // end inline asm - ld.const.u32 %r5029, [matrix+3216]; + mov.u32 %r8795, 6; // begin inline asm - dp4a.u32.u32 %r5028, %r5029, %r5886, %r5024; + shf.l.wrap.b32 %r8788, %r8794, %r8793, %r8795; // end inline asm - ld.const.u32 %r5033, [matrix+3220]; // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5890, %r5028; + shf.l.wrap.b32 %r8792, %r8793, %r8794, %r8795; // end inline asm - ld.const.u32 %r5037, [matrix+3224]; + mov.u32 %r8803, 3; // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5894, %r5032; + shf.l.wrap.b32 %r8796, %r8802, %r8801, %r8803; // end inline asm - ld.const.u32 %r5041, [matrix+3228]; // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5898, %r5036; + shf.l.wrap.b32 %r8800, %r8801, %r8802, %r8803; // end inline asm - ld.const.u32 %r5045, [matrix+3232]; // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5902, %r5040; + shf.l.wrap.b32 %r8804, %r8810, %r8809, %r8493; // end inline asm - ld.const.u32 %r5049, [matrix+3236]; // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5906, %r5044; + shf.l.wrap.b32 %r8808, %r8809, %r8810, %r8493; // end inline asm - ld.const.u32 %r5053, [matrix+3240]; // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5910, %r5048; + // chi + lop3.b32 %r8812, %r8847, %r8620, %r8668, 0xD2; + lop3.b32 %r8813, %r8850, %r8624, %r8672, 0xD2; // end inline asm - ld.const.u32 %r5057, [matrix+3244]; // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5914, %r5052; + // chi + lop3.b32 %r29722, %r8620, %r8668, %r8764, 0xD2; + lop3.b32 %r29723, %r8624, %r8672, %r8768, 0xD2; // end inline asm - ld.const.u32 %r5061, [matrix+3248]; // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5918, %r5056; + // chi + lop3.b32 %r29718, %r8668, %r8764, %r8716, 0xD2; + lop3.b32 %r29719, %r8672, %r8768, %r8720, 0xD2; // end inline asm - ld.const.u32 %r5065, [matrix+3252]; // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5922, %r5060; + // chi + lop3.b32 %r29714, %r8764, %r8716, %r8847, 0xD2; + lop3.b32 %r29715, %r8768, %r8720, %r8850, 0xD2; // end inline asm - ld.const.u32 %r5069, 
[matrix+3256]; // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5926, %r5064; + // chi + lop3.b32 %r29712, %r8716, %r8847, %r8620, 0xD2; + lop3.b32 %r29713, %r8720, %r8850, %r8624, 0xD2; // end inline asm - ld.const.u32 %r5073, [matrix+3260]; // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5930, %r5068; + // chi + lop3.b32 %r29708, %r8756, %r8628, %r8796, 0xD2; + lop3.b32 %r29709, %r8760, %r8632, %r8800, 0xD2; // end inline asm - ld.const.u32 %r5077, [matrix+3264]; // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5870, %r6249; + // chi + lop3.b32 %r29720, %r8628, %r8796, %r8740, 0xD2; + lop3.b32 %r29721, %r8632, %r8800, %r8744, 0xD2; // end inline asm - ld.const.u32 %r5081, [matrix+3268]; // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5874, %r5076; + // chi + lop3.b32 %r29716, %r8796, %r8740, %r8636, 0xD2; + lop3.b32 %r29717, %r8800, %r8744, %r8640, 0xD2; // end inline asm - ld.const.u32 %r5085, [matrix+3272]; // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5878, %r5080; + // chi + lop3.b32 %r29688, %r8740, %r8636, %r8756, 0xD2; + lop3.b32 %r29689, %r8744, %r8640, %r8760, 0xD2; // end inline asm - ld.const.u32 %r5089, [matrix+3276]; + st.local.v2.u32 [%rd3+88], {%r29688, %r29689}; // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5882, %r5084; + // chi + lop3.b32 %r29680, %r8636, %r8756, %r8628, 0xD2; + lop3.b32 %r29681, %r8640, %r8760, %r8632, 0xD2; // end inline asm - ld.const.u32 %r5093, [matrix+3280]; + st.local.v2.u32 [%rd3+96], {%r29680, %r29681}; // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5886, %r5088; + // chi + lop3.b32 %r29706, %r8804, %r8788, %r8676, 0xD2; + lop3.b32 %r29707, %r8808, %r8792, %r8680, 0xD2; // end inline asm - ld.const.u32 %r5097, [matrix+3284]; + st.local.v2.u32 [%rd3+104], {%r29706, %r29707}; // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5890, %r5092; + // chi + lop3.b32 %r29700, %r8788, %r8676, %r8684, 0xD2; + lop3.b32 %r29701, %r8792, %r8680, %r8688, 0xD2; // end inline asm - ld.const.u32 %r5101, [matrix+3288]; + st.local.v2.u32 [%rd3+112], {%r29700, %r29701}; // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5894, %r5096; + // chi + lop3.b32 %r29694, %r8676, %r8684, %r8652, 0xD2; + lop3.b32 %r29695, %r8680, %r8688, %r8656, 0xD2; // end inline asm - ld.const.u32 %r5105, [matrix+3292]; + st.local.v2.u32 [%rd3+120], {%r29694, %r29695}; // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5898, %r5100; + // chi + lop3.b32 %r29686, %r8684, %r8652, %r8804, 0xD2; + lop3.b32 %r29687, %r8688, %r8656, %r8808, 0xD2; // end inline asm - ld.const.u32 %r5109, [matrix+3296]; + st.local.v2.u32 [%rd3+128], {%r29686, %r29687}; // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5902, %r5104; + // chi + lop3.b32 %r29678, %r8652, %r8804, %r8788, 0xD2; + lop3.b32 %r29679, %r8656, %r8808, %r8792, 0xD2; // end inline asm - ld.const.u32 %r5113, [matrix+3300]; + st.local.v2.u32 [%rd3+136], {%r29678, %r29679}; // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5906, %r5108; + // chi + lop3.b32 %r29704, %r8708, %r8748, %r8780, 0xD2; + lop3.b32 %r29705, %r8712, %r8752, %r8784, 0xD2; // end inline asm - ld.const.u32 %r5117, [matrix+3304]; + st.local.v2.u32 [%rd3+144], {%r29704, %r29705}; // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5910, %r5112; + // chi + lop3.b32 %r29698, %r8748, %r8780, %r8772, 0xD2; + lop3.b32 %r29699, %r8752, %r8784, %r8776, 0xD2; // end inline asm - ld.const.u32 %r5121, [matrix+3308]; + st.local.v2.u32 [%rd3+152], {%r29698, %r29699}; // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5914, %r5116; + // chi 
+ lop3.b32 %r29692, %r8780, %r8772, %r8692, 0xD2; + lop3.b32 %r29693, %r8784, %r8776, %r8696, 0xD2; // end inline asm - ld.const.u32 %r5125, [matrix+3312]; + st.local.v2.u32 [%rd3+160], {%r29692, %r29693}; // begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5918, %r5120; + // chi + lop3.b32 %r29684, %r8772, %r8692, %r8708, 0xD2; + lop3.b32 %r29685, %r8776, %r8696, %r8712, 0xD2; // end inline asm - ld.const.u32 %r5129, [matrix+3316]; + st.local.v2.u32 [%rd3+168], {%r29684, %r29685}; // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5922, %r5124; + // chi + lop3.b32 %r29676, %r8692, %r8708, %r8748, 0xD2; + lop3.b32 %r29677, %r8696, %r8712, %r8752, 0xD2; // end inline asm - ld.const.u32 %r5133, [matrix+3320]; + st.local.v2.u32 [%rd3+176], {%r29676, %r29677}; // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5926, %r5128; + // chi + lop3.b32 %r29702, %r8660, %r8732, %r8644, 0xD2; + lop3.b32 %r29703, %r8664, %r8736, %r8648, 0xD2; // end inline asm - ld.const.u32 %r5137, [matrix+3324]; + st.local.v2.u32 [%rd3+184], {%r29702, %r29703}; // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5930, %r5132; + // chi + lop3.b32 %r29696, %r8732, %r8644, %r8700, 0xD2; + lop3.b32 %r29697, %r8736, %r8648, %r8704, 0xD2; // end inline asm - shr.u32 %r6102, %r5072, 6; - and.b32 %r5141, %r6102, 240; - shr.u32 %r5142, %r5136, 10; - and.b32 %r5143, %r6036, 255; + st.local.v2.u32 [%rd3+192], {%r29696, %r29697}; // begin inline asm - lop3.b32 %r5140, %r5141, %r5142, %r5143, 0x56; + // chi + lop3.b32 %r29690, %r8644, %r8700, %r8724, 0xD2; + lop3.b32 %r29691, %r8648, %r8704, %r8728, 0xD2; // end inline asm - ld.const.u32 %r5145, [matrix+3328]; + st.local.v2.u32 [%rd3+200], {%r29690, %r29691}; // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5870, %r6249; + // chi + lop3.b32 %r29682, %r8700, %r8724, %r8660, 0xD2; + lop3.b32 %r29683, %r8704, %r8728, %r8664, 0xD2; // end inline asm - ld.const.u32 %r5149, [matrix+3332]; + st.local.v2.u32 [%rd3+208], {%r29682, %r29683}; // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5874, %r5144; + // chi + lop3.b32 %r29674, %r8724, %r8660, %r8732, 0xD2; + lop3.b32 %r29675, %r8728, %r8664, %r8736, 0xD2; // end inline asm - ld.const.u32 %r5153, [matrix+3336]; + st.local.v2.u32 [%rd3+216], {%r29674, %r29675}; + mul.wide.s32 %rd565, %r29724, 8; + add.s64 %rd564, %rd497, %rd565; // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5878, %r5148; + ld.global.nc.v2.u32 {%r9012,%r9013}, [%rd564]; // end inline asm - ld.const.u32 %r5157, [matrix+3340]; + xor.b32 %r29710, %r8812, %r9012; + xor.b32 %r29711, %r8813, %r9013; + add.s32 %r29724, %r29724, 1; + setp.lt.u32 %p21, %r29724, 23; + @%p21 bra $L__BB2_29; + + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5882, %r5152; + // xor5 + lop3.b32 %r9024, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r9024, %r9024, %r29704, %r29702, 0x96; + lop3.b32 %r9025, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r9025, %r9025, %r29705, %r29703, 0x96; // end inline asm - ld.const.u32 %r5161, [matrix+3344]; // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5886, %r5156; + // xor5 + lop3.b32 %r9036, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r9036, %r9036, %r29698, %r29696, 0x96; + lop3.b32 %r9037, 
%r29723, %r29721, %r29701, 0x96; + lop3.b32 %r9037, %r9037, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r5165, [matrix+3348]; // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5890, %r5160; + // xor5 + lop3.b32 %r9048, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r9048, %r9048, %r29692, %r29690, 0x96; + lop3.b32 %r9049, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r9049, %r9049, %r29693, %r29691, 0x96; // end inline asm - ld.const.u32 %r5169, [matrix+3352]; // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5894, %r5164; + // xor5 + lop3.b32 %r9060, %r29714, %r29688, %r29686, 0x96; + lop3.b32 %r9060, %r9060, %r29684, %r29682, 0x96; + lop3.b32 %r9061, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r9061, %r9061, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r5173, [matrix+3356]; // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5898, %r5168; + // xor5 + lop3.b32 %r9072, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r9072, %r9072, %r29676, %r29674, 0x96; + lop3.b32 %r9073, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r9073, %r9073, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r5177, [matrix+3360]; + mov.u32 %r9276, 1; // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5902, %r5172; + shf.l.wrap.b32 %r9084, %r9037, %r9036, %r9276; // end inline asm - ld.const.u32 %r5181, [matrix+3364]; // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5906, %r5176; + shf.l.wrap.b32 %r9088, %r9036, %r9037, %r9276; // end inline asm - ld.const.u32 %r5185, [matrix+3368]; + xor.b32 %r9303, %r9084, %r9072; + xor.b32 %r9304, %r9088, %r9073; + xor.b32 %r9231, %r29710, %r9303; + xor.b32 %r9234, %r29711, %r9304; + xor.b32 %r9194, %r29707, %r9304; + xor.b32 %r9193, %r29706, %r9303; + st.local.v2.u32 [%rd3+104], {%r9193, %r9194}; // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5910, %r5180; + shf.l.wrap.b32 %r9092, %r9049, %r9048, %r9276; // end inline asm - ld.const.u32 %r5189, [matrix+3372]; // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5914, %r5184; + shf.l.wrap.b32 %r9096, %r9048, %r9049, %r9276; // end inline asm - ld.const.u32 %r5193, [matrix+3376]; + xor.b32 %r9305, %r9092, %r9024; + xor.b32 %r9306, %r9096, %r9025; + xor.b32 %r9130, %r29720, %r9305; + xor.b32 %r9129, %r29721, %r9306; + xor.b32 %r9169, %r29699, %r9306; + xor.b32 %r9170, %r29698, %r9305; + st.local.v2.u32 [%rd3+152], {%r9170, %r9169}; // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5918, %r5188; + shf.l.wrap.b32 %r9100, %r9061, %r9060, %r9276; // end inline asm - ld.const.u32 %r5197, [matrix+3380]; // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5922, %r5192; + shf.l.wrap.b32 %r9104, %r9060, %r9061, %r9276; // end inline asm - ld.const.u32 %r5201, [matrix+3384]; + xor.b32 %r9307, %r9100, %r9036; + xor.b32 %r9308, %r9104, %r9037; + xor.b32 %r9153, %r29695, %r9308; + xor.b32 %r9154, %r29694, %r9307; + st.local.v2.u32 [%rd3+120], {%r9154, %r9153}; + xor.b32 %r9145, %r29691, %r9308; + xor.b32 %r9146, %r29690, %r9307; + st.local.v2.u32 [%rd3+200], {%r9146, %r9145}; // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5926, %r5196; + shf.l.wrap.b32 %r9108, %r9073, %r9072, %r9276; // end inline asm - ld.const.u32 %r5205, [matrix+3388]; // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5930, %r5200; + shf.l.wrap.b32 %r9112, %r9072, %r9073, %r9276; // end inline asm - ld.const.u32 %r5209, [matrix+3392]; + xor.b32 %r9309, %r9108, %r9048; + xor.b32 %r9310, %r9112, %r9049; + xor.b32 %r9177, %r29714, %r9309; + xor.b32 %r9178, %r29715, %r9310; + xor.b32 %r9186, %r29685, %r9310; + xor.b32 
%r9185, %r29684, %r9309; + st.local.v2.u32 [%rd3+168], {%r9185, %r9186}; // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5870, %r6249; + shf.l.wrap.b32 %r9116, %r9025, %r9024, %r9276; // end inline asm - ld.const.u32 %r5213, [matrix+3396]; // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5874, %r5208; + shf.l.wrap.b32 %r9120, %r9024, %r9025, %r9276; // end inline asm - ld.const.u32 %r5217, [matrix+3400]; + xor.b32 %r9311, %r9116, %r9060; + xor.b32 %r9312, %r9120, %r9061; + xor.b32 %r9137, %r29680, %r9311; + xor.b32 %r9138, %r29681, %r9312; + xor.b32 %r9162, %r29675, %r9312; + xor.b32 %r9161, %r29674, %r9311; + st.local.v2.u32 [%rd3+216], {%r9161, %r9162}; // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5878, %r5212; + shf.l.wrap.b32 %r9124, %r9130, %r9129, %r8627; // end inline asm - ld.const.u32 %r5221, [matrix+3404]; // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5882, %r5216; + shf.l.wrap.b32 %r9128, %r9129, %r9130, %r8627; // end inline asm - ld.const.u32 %r5225, [matrix+3408]; // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5886, %r5220; + shf.l.wrap.b32 %r9132, %r9138, %r9137, %r8635; // end inline asm - ld.const.u32 %r5229, [matrix+3412]; // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5890, %r5224; + shf.l.wrap.b32 %r9136, %r9137, %r9138, %r8635; // end inline asm - ld.const.u32 %r5233, [matrix+3416]; // begin inline asm - dp4a.u32.u32 %r5232, %r5233, %r5894, %r5228; + shf.l.wrap.b32 %r9144, %r9145, %r9146, %r8643; // end inline asm - ld.const.u32 %r5237, [matrix+3420]; // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5898, %r5232; + shf.l.wrap.b32 %r9140, %r9146, %r9145, %r8643; // end inline asm - ld.const.u32 %r5241, [matrix+3424]; + st.local.v2.u32 [%rd3+96], {%r9140, %r9144}; // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5902, %r5236; + shf.l.wrap.b32 %r9148, %r9154, %r9153, %r8675; // end inline asm - ld.const.u32 %r5245, [matrix+3428]; // begin inline asm - dp4a.u32.u32 %r5244, %r5245, %r5906, %r5240; + shf.l.wrap.b32 %r9152, %r9153, %r9154, %r8675; // end inline asm - ld.const.u32 %r5249, [matrix+3432]; // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5910, %r5244; + shf.l.wrap.b32 %r9156, %r9162, %r9161, %r8723; // end inline asm - ld.const.u32 %r5253, [matrix+3436]; // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5914, %r5248; + shf.l.wrap.b32 %r9160, %r9161, %r9162, %r8723; // end inline asm - ld.const.u32 %r5257, [matrix+3440]; // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5918, %r5252; + shf.l.wrap.b32 %r9168, %r9169, %r9170, %r8747; // end inline asm - ld.const.u32 %r5261, [matrix+3444]; // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5922, %r5256; + shf.l.wrap.b32 %r9164, %r9170, %r9169, %r8747; // end inline asm - ld.const.u32 %r5265, [matrix+3448]; + st.local.v2.u32 [%rd3+88], {%r9164, %r9168}; // begin inline asm - dp4a.u32.u32 %r5264, %r5265, %r5926, %r5260; + shf.l.wrap.b32 %r9172, %r9178, %r9177, %r8763; // end inline asm - ld.const.u32 %r5269, [matrix+3452]; // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5930, %r5264; + shf.l.wrap.b32 %r9176, %r9177, %r9178, %r8763; // end inline asm - shr.u32 %r6103, %r5204, 6; - and.b32 %r5273, %r6103, 240; - shr.u32 %r5274, %r5268, 10; - and.b32 %r5275, %r6047, 255; // begin inline asm - lop3.b32 %r5272, %r5273, %r5274, %r5275, 0x56; + shf.l.wrap.b32 %r9180, %r9186, %r9185, %r8771; // end inline asm - shl.b32 %r6104, %r5272, 16; - and.b32 %r6105, %r6104, 16711680; - cvt.u64.u32 %rd211, %r6105; - ld.const.u32 %r5277, [matrix+3456]; // begin inline asm - 
dp4a.u32.u32 %r5276, %r5277, %r5870, %r6249; + shf.l.wrap.b32 %r9184, %r9185, %r9186, %r8771; // end inline asm - ld.const.u32 %r5281, [matrix+3460]; // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5874, %r5276; + shf.l.wrap.b32 %r9188, %r9194, %r9193, %r8803; // end inline asm - ld.const.u32 %r5285, [matrix+3464]; // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5878, %r5280; + shf.l.wrap.b32 %r9192, %r9193, %r9194, %r8803; // end inline asm - ld.const.u32 %r5289, [matrix+3468]; // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5882, %r5284; + // chi + lop3.b32 %r9196, %r9231, %r9124, %r9148, 0xD2; + lop3.b32 %r9197, %r9234, %r9128, %r9152, 0xD2; // end inline asm - ld.const.u32 %r5293, [matrix+3472]; // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5886, %r5288; + // chi + lop3.b32 %r9204, %r9124, %r9148, %r9180, 0xD2; + lop3.b32 %r9205, %r9128, %r9152, %r9184, 0xD2; // end inline asm - ld.const.u32 %r5297, [matrix+3476]; + st.local.v2.u32 [%rd3+32], {%r9204, %r9205}; // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5890, %r5292; + // chi + lop3.b32 %r9212, %r9148, %r9180, %r9156, 0xD2; + lop3.b32 %r9213, %r9152, %r9184, %r9160, 0xD2; // end inline asm - ld.const.u32 %r5301, [matrix+3480]; + st.local.v2.u32 [%rd3+40], {%r9212, %r9213}; // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5894, %r5296; + // chi + lop3.b32 %r9220, %r9180, %r9156, %r9231, 0xD2; + lop3.b32 %r9221, %r9184, %r9160, %r9234, 0xD2; // end inline asm - ld.const.u32 %r5305, [matrix+3484]; + st.local.v2.u32 [%rd3+48], {%r9220, %r9221}; // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5898, %r5300; + // chi + lop3.b32 %r9228, %r9156, %r9231, %r9124, 0xD2; + lop3.b32 %r9229, %r9160, %r9234, %r9128, 0xD2; // end inline asm - ld.const.u32 %r5309, [matrix+3488]; + st.local.v2.u32 [%rd3+56], {%r9228, %r9229}; // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5902, %r5304; + // chi + lop3.b32 %r9236, %r9172, %r9132, %r9188, 0xD2; + lop3.b32 %r9237, %r9176, %r9136, %r9192, 0xD2; // end inline asm - ld.const.u32 %r5313, [matrix+3492]; + st.local.v2.u32 [%rd3+64], {%r9236, %r9237}; // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5906, %r5308; + // chi + lop3.b32 %r9244, %r9132, %r9188, %r9164, 0xD2; + lop3.b32 %r9245, %r9136, %r9192, %r9168, 0xD2; // end inline asm - ld.const.u32 %r5317, [matrix+3496]; + st.local.v2.u32 [%rd3+72], {%r9244, %r9245}; // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5910, %r5312; + // chi + lop3.b32 %r9252, %r9188, %r9164, %r9140, 0xD2; + lop3.b32 %r9253, %r9192, %r9168, %r9144, 0xD2; // end inline asm - ld.const.u32 %r5321, [matrix+3500]; + st.local.v2.u32 [%rd3+80], {%r9252, %r9253}; // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5914, %r5316; + ld.global.nc.v2.u32 {%r9260,%r9261}, [%rd498]; // end inline asm - ld.const.u32 %r5325, [matrix+3504]; + xor.b32 %r9313, %r9197, %r9261; + xor.b32 %r9314, %r9196, %r9260; + mov.b64 %rd1261, {%r9314, %r9313}; + mov.b64 %rd1262, {%r9204, %r9205}; + mov.b64 %rd1263, {%r9212, %r9213}; + mov.b64 %rd62, {%r9220, %r9221}; + mov.b64 %rd1264, {%r9228, %r9229}; + mov.b64 %rd64, {%r9236, %r9237}; + mov.b64 %rd65, {%r9244, %r9245}; + mov.b64 %rd66, {%r9252, %r9253}; + mov.u32 %r29725, 0; + st.local.v2.u32 [%rd3+24], {%r9314, %r9313}; + st.local.v2.u32 [%rd55+96], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+104], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+112], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+120], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+128], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+136], {%r29725, 
%r29725}; + st.local.v2.u32 [%rd55+144], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+152], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+160], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+168], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+176], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+184], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+192], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+200], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+208], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+216], {%r29725, %r29725}; + mov.u32 %r29740, -2147483648; + st.local.v2.u32 [%rd55+88], {%r9276, %r29740}; + mov.u32 %r29726, %r29725; + mov.u32 %r29727, %r29725; + mov.u32 %r29728, %r29725; + mov.u32 %r29729, %r29725; + mov.u32 %r29730, %r29725; + mov.u32 %r29731, %r29725; + mov.u32 %r29732, %r29725; + mov.u32 %r29733, %r29725; + mov.u32 %r29734, %r29725; + mov.u32 %r29735, %r29725; + mov.u32 %r29736, %r29725; + mov.u32 %r29737, %r29725; + mov.u32 %r29738, %r29725; + mov.u32 %r29739, %r9276; + mov.u32 %r29741, %r29725; + mov.u32 %r29742, %r29725; + mov.u32 %r29743, %r29725; + mov.u32 %r29744, %r29725; + mov.u32 %r29745, %r29725; + mov.u32 %r29746, %r29725; + mov.u32 %r29747, %r29725; + mov.u32 %r29748, %r29725; + mov.u32 %r29749, %r29725; + mov.u32 %r29750, %r29725; + mov.u32 %r29751, %r29725; + mov.u32 %r29752, %r29725; + mov.u32 %r29753, %r29725; + mov.u32 %r29754, %r29725; + mov.u32 %r29755, %r29725; + mov.u32 %r29756, %r29725; + mov.u32 %r29757, %r29725; + mov.u32 %r29758, %r29725; + mov.u32 %r29775, %r29725; + +$L__BB2_31: // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5918, %r5320; + // xor5 + lop3.b32 %r9315, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9315, %r9315, %r29755, %r29753, 0x96; + lop3.b32 %r9316, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9316, %r9316, %r29756, %r29754, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9327, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9327, %r9327, %r29749, %r29747, 0x96; + lop3.b32 %r9328, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9328, %r9328, %r29750, %r29748, 0x96; // end inline asm - ld.const.u32 %r5329, [matrix+3508]; // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5922, %r5324; + // xor5 + lop3.b32 %r9339, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9339, %r9339, %r29743, %r29741, 0x96; + lop3.b32 %r9340, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9340, %r9340, %r29744, %r29742, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9351, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9351, %r9351, %r29735, %r29733, 0x96; + lop3.b32 %r9352, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9352, %r9352, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5333, [matrix+3512]; // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5926, %r5328; + // xor5 + lop3.b32 %r9363, %r29763, %r29731, %r29729, 0x96; + lop3.b32 %r9363, %r9363, %r29727, %r29725, 0x96; + lop3.b32 %r9364, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9364, %r9364, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5337, [matrix+3516]; // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5930, %r5332; + shf.l.wrap.b32 %r9375, %r9328, %r9327, %r9276; // end inline asm - ld.const.u32 %r5341, [matrix+3520]; // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5870, %r6249; + shf.l.wrap.b32 %r9379, %r9327, %r9328, %r9276; // end inline asm - ld.const.u32 %r5345, [matrix+3524]; + xor.b32 %r9809, %r9375, %r9363; + xor.b32 %r9810, %r9379, %r9364; + xor.b32 %r9642, %r29761, %r9809; + xor.b32 %r9645, %r29762, %r9810; 
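// ----------------------------------------------------------------------
// Hedged annotation (an inference from the instruction pattern, not text
// emitted by the NVVM compiler): the "+" lines in this hunk replace the
// old heavyhash dp4a matrix products with what appears to be the
// Keccak-f[1600] permutation used by the new FishHash path, each 64-bit
// lane held as a pair of 32-bit registers:
//   theta  - the "// xor5" lop3 ... 0x96 groups are 3-input XORs folding
//            the five lanes of each column into parities; the shf.l.wrap
//            by 1 across the half-words builds D[x] = C[x-1] ^
//            rotl64(C[x+1], 1), which the runs of xor.b32 apply to every
//            lane of the state;
//   rho/pi - the shf.l.wrap rotations by 44, 20, 61, 39, ... are the
//            per-lane rho offsets, consumed in pi order;
//   chi    - lop3 ... 0xD2 computes a ^ (~b & c) on each half-word;
//   iota   - an 8-byte round constant is fetched with ld.global.nc.v2.u32
//            and XORed into lane (0,0); the setp.lt.u32 ... 23 /
//            @%p22 bra $L__BB2_31 back-edge appears to run 23 rounds,
//            with the final round unrolled after the loop (24 in total).
// ----------------------------------------------------------------------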
+ xor.b32 %r9549, %r29759, %r9809; + xor.b32 %r9548, %r29760, %r9810; + xor.b32 %r9596, %r29757, %r9809; + xor.b32 %r9597, %r29758, %r9810; + xor.b32 %r9501, %r29755, %r9809; + xor.b32 %r9500, %r29756, %r9810; + xor.b32 %r9452, %r29753, %r9809; + xor.b32 %r9453, %r29754, %r9810; // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5874, %r5340; + shf.l.wrap.b32 %r9383, %r9340, %r9339, %r9276; // end inline asm - ld.const.u32 %r5349, [matrix+3528]; // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5878, %r5344; + shf.l.wrap.b32 %r9387, %r9339, %r9340, %r9276; // end inline asm - ld.const.u32 %r5353, [matrix+3532]; + xor.b32 %r9811, %r9383, %r9315; + xor.b32 %r9812, %r9387, %r9316; + xor.b32 %r9604, %r29773, %r9811; + xor.b32 %r9605, %r29774, %r9812; + xor.b32 %r9421, %r29771, %r9811; + xor.b32 %r9420, %r29772, %r9812; + xor.b32 %r9580, %r29751, %r9811; + xor.b32 %r9581, %r29752, %r9812; + xor.b32 %r9541, %r29749, %r9811; + xor.b32 %r9540, %r29750, %r9812; + xor.b32 %r9524, %r29747, %r9811; + xor.b32 %r9525, %r29748, %r9812; // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5882, %r5348; + shf.l.wrap.b32 %r9391, %r9352, %r9351, %r9276; // end inline asm - ld.const.u32 %r5357, [matrix+3536]; // begin inline asm - dp4a.u32.u32 %r5356, %r5357, %r5886, %r5352; + shf.l.wrap.b32 %r9395, %r9351, %r9352, %r9276; // end inline asm - ld.const.u32 %r5361, [matrix+3540]; + xor.b32 %r9813, %r9391, %r9327; + xor.b32 %r9814, %r9395, %r9328; + xor.b32 %r9461, %r29769, %r9813; + xor.b32 %r9460, %r29770, %r9814; + xor.b32 %r9588, %r29767, %r9813; + xor.b32 %r9589, %r29768, %r9814; + xor.b32 %r9469, %r29745, %r9813; + xor.b32 %r9468, %r29746, %r9814; + xor.b32 %r9572, %r29743, %r9813; + xor.b32 %r9573, %r29744, %r9814; + xor.b32 %r9437, %r29741, %r9813; + xor.b32 %r9436, %r29742, %r9814; // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5890, %r5356; + shf.l.wrap.b32 %r9399, %r9364, %r9363, %r9276; // end inline asm - ld.const.u32 %r5365, [matrix+3544]; // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5894, %r5360; + shf.l.wrap.b32 %r9403, %r9363, %r9364, %r9276; // end inline asm - ld.const.u32 %r5369, [matrix+3548]; + xor.b32 %r9815, %r9399, %r9339; + xor.b32 %r9816, %r9403, %r9340; + xor.b32 %r9556, %r29765, %r9815; + xor.b32 %r9557, %r29766, %r9816; + xor.b32 %r9533, %r29739, %r9815; + xor.b32 %r9532, %r29740, %r9816; + xor.b32 %r9476, %r29737, %r9815; + xor.b32 %r9477, %r29738, %r9816; + xor.b32 %r9564, %r29735, %r9815; + xor.b32 %r9565, %r29736, %r9816; + xor.b32 %r9493, %r29733, %r9815; + xor.b32 %r9492, %r29734, %r9816; // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5898, %r5364; + shf.l.wrap.b32 %r9407, %r9316, %r9315, %r9276; // end inline asm - ld.const.u32 %r5373, [matrix+3552]; // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5902, %r5368; + shf.l.wrap.b32 %r9411, %r9315, %r9316, %r9276; // end inline asm - ld.const.u32 %r5377, [matrix+3556]; + xor.b32 %r9817, %r9407, %r9351; + xor.b32 %r9818, %r9411, %r9352; + xor.b32 %r9508, %r29763, %r9817; + xor.b32 %r9509, %r29764, %r9818; + xor.b32 %r9428, %r29731, %r9817; + xor.b32 %r9429, %r29732, %r9818; + xor.b32 %r9445, %r29729, %r9817; + xor.b32 %r9444, %r29730, %r9818; + xor.b32 %r9484, %r29727, %r9817; + xor.b32 %r9485, %r29728, %r9818; + xor.b32 %r9516, %r29725, %r9817; + xor.b32 %r9517, %r29726, %r9818; + mov.u32 %r9422, 44; // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5906, %r5372; + shf.l.wrap.b32 %r9415, %r9421, %r9420, %r9422; // end inline asm - ld.const.u32 %r5381, [matrix+3560]; // begin inline asm - 
dp4a.u32.u32 %r5380, %r5381, %r5910, %r5376; + shf.l.wrap.b32 %r9419, %r9420, %r9421, %r9422; // end inline asm - ld.const.u32 %r5385, [matrix+3564]; + mov.u32 %r9430, 20; // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5914, %r5380; + shf.l.wrap.b32 %r9423, %r9429, %r9428, %r9430; // end inline asm - ld.const.u32 %r5389, [matrix+3568]; // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5918, %r5384; + shf.l.wrap.b32 %r9427, %r9428, %r9429, %r9430; // end inline asm - ld.const.u32 %r5393, [matrix+3572]; + mov.u32 %r9438, 61; // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5922, %r5388; + shf.l.wrap.b32 %r9431, %r9437, %r9436, %r9438; // end inline asm - ld.const.u32 %r5397, [matrix+3576]; // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5926, %r5392; + shf.l.wrap.b32 %r9435, %r9436, %r9437, %r9438; // end inline asm - ld.const.u32 %r5401, [matrix+3580]; + mov.u32 %r9446, 39; // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5930, %r5396; + shf.l.wrap.b32 %r9439, %r9445, %r9444, %r9446; // end inline asm - shr.u32 %r6106, %r5336, 6; - and.b32 %r5405, %r6106, 240; - shr.u32 %r5406, %r5400, 10; - and.b32 %r5407, %r6050, 255; // begin inline asm - lop3.b32 %r5404, %r5405, %r5406, %r5407, 0x56; + shf.l.wrap.b32 %r9443, %r9444, %r9445, %r9446; // end inline asm - shl.b32 %r6107, %r5404, 24; - cvt.u64.u32 %rd212, %r6107; - ld.const.u32 %r5409, [matrix+3584]; + mov.u32 %r9454, 18; // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5870, %r6249; + shf.l.wrap.b32 %r9447, %r9453, %r9452, %r9454; // end inline asm - ld.const.u32 %r5413, [matrix+3588]; // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5874, %r5408; + shf.l.wrap.b32 %r9451, %r9452, %r9453, %r9454; // end inline asm - ld.const.u32 %r5417, [matrix+3592]; + mov.u32 %r9462, 62; // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5878, %r5412; + shf.l.wrap.b32 %r9455, %r9461, %r9460, %r9462; // end inline asm - ld.const.u32 %r5421, [matrix+3596]; // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5882, %r5416; + shf.l.wrap.b32 %r9459, %r9460, %r9461, %r9462; // end inline asm - ld.const.u32 %r5425, [matrix+3600]; + mov.u32 %r9470, 43; // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5886, %r5420; + shf.l.wrap.b32 %r9463, %r9469, %r9468, %r9470; // end inline asm - ld.const.u32 %r5429, [matrix+3604]; // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5890, %r5424; + shf.l.wrap.b32 %r9467, %r9468, %r9469, %r9470; // end inline asm - ld.const.u32 %r5433, [matrix+3608]; + mov.u32 %r9478, 25; // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5894, %r5428; + shf.l.wrap.b32 %r9471, %r9477, %r9476, %r9478; // end inline asm - ld.const.u32 %r5437, [matrix+3612]; // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5898, %r5432; + shf.l.wrap.b32 %r9475, %r9476, %r9477, %r9478; // end inline asm - ld.const.u32 %r5441, [matrix+3616]; + mov.u32 %r9486, 8; // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5902, %r5436; + shf.l.wrap.b32 %r9479, %r9485, %r9484, %r9486; // end inline asm - ld.const.u32 %r5445, [matrix+3620]; // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5906, %r5440; + shf.l.wrap.b32 %r9483, %r9484, %r9485, %r9486; // end inline asm - ld.const.u32 %r5449, [matrix+3624]; + mov.u32 %r9494, 56; // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5910, %r5444; + shf.l.wrap.b32 %r9487, %r9493, %r9492, %r9494; // end inline asm - ld.const.u32 %r5453, [matrix+3628]; // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5914, %r5448; + shf.l.wrap.b32 %r9491, %r9492, %r9493, %r9494; // end inline asm - 
ld.const.u32 %r5457, [matrix+3632]; + mov.u32 %r9502, 41; // begin inline asm - dp4a.u32.u32 %r5456, %r5457, %r5918, %r5452; + shf.l.wrap.b32 %r9495, %r9501, %r9500, %r9502; // end inline asm - ld.const.u32 %r5461, [matrix+3636]; // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5922, %r5456; + shf.l.wrap.b32 %r9499, %r9500, %r9501, %r9502; // end inline asm - ld.const.u32 %r5465, [matrix+3640]; + mov.u32 %r9510, 27; // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5926, %r5460; + shf.l.wrap.b32 %r9503, %r9509, %r9508, %r9510; // end inline asm - ld.const.u32 %r5469, [matrix+3644]; // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5930, %r5464; + shf.l.wrap.b32 %r9507, %r9508, %r9509, %r9510; // end inline asm - ld.const.u32 %r5473, [matrix+3648]; + mov.u32 %r9518, 14; // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5870, %r6249; + shf.l.wrap.b32 %r9511, %r9517, %r9516, %r9518; // end inline asm - ld.const.u32 %r5477, [matrix+3652]; // begin inline asm - dp4a.u32.u32 %r5476, %r5477, %r5874, %r5472; + shf.l.wrap.b32 %r9515, %r9516, %r9517, %r9518; // end inline asm - ld.const.u32 %r5481, [matrix+3656]; + mov.u32 %r9526, 2; // begin inline asm - dp4a.u32.u32 %r5480, %r5481, %r5878, %r5476; + shf.l.wrap.b32 %r9519, %r9525, %r9524, %r9526; // end inline asm - ld.const.u32 %r5485, [matrix+3660]; // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5882, %r5480; + shf.l.wrap.b32 %r9523, %r9524, %r9525, %r9526; // end inline asm - ld.const.u32 %r5489, [matrix+3664]; + mov.u32 %r9534, 55; // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5886, %r5484; + shf.l.wrap.b32 %r9527, %r9533, %r9532, %r9534; // end inline asm - ld.const.u32 %r5493, [matrix+3668]; // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5890, %r5488; + shf.l.wrap.b32 %r9531, %r9532, %r9533, %r9534; // end inline asm - ld.const.u32 %r5497, [matrix+3672]; + mov.u32 %r9542, 45; // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5894, %r5492; + shf.l.wrap.b32 %r9535, %r9541, %r9540, %r9542; // end inline asm - ld.const.u32 %r5501, [matrix+3676]; // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5898, %r5496; + shf.l.wrap.b32 %r9539, %r9540, %r9541, %r9542; // end inline asm - ld.const.u32 %r5505, [matrix+3680]; + mov.u32 %r9550, 36; // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5902, %r5500; + shf.l.wrap.b32 %r9543, %r9549, %r9548, %r9550; // end inline asm - ld.const.u32 %r5509, [matrix+3684]; // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5906, %r5504; + shf.l.wrap.b32 %r9547, %r9548, %r9549, %r9550; // end inline asm - ld.const.u32 %r5513, [matrix+3688]; + mov.u32 %r9558, 28; // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5910, %r5508; + shf.l.wrap.b32 %r9551, %r9557, %r9556, %r9558; // end inline asm - ld.const.u32 %r5517, [matrix+3692]; // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5914, %r5512; + shf.l.wrap.b32 %r9555, %r9556, %r9557, %r9558; // end inline asm - ld.const.u32 %r5521, [matrix+3696]; + mov.u32 %r9566, 21; // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5918, %r5516; + shf.l.wrap.b32 %r9559, %r9565, %r9564, %r9566; // end inline asm - ld.const.u32 %r5525, [matrix+3700]; // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5922, %r5520; + shf.l.wrap.b32 %r9563, %r9564, %r9565, %r9566; // end inline asm - ld.const.u32 %r5529, [matrix+3704]; + mov.u32 %r9574, 15; // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5926, %r5524; + shf.l.wrap.b32 %r9567, %r9573, %r9572, %r9574; // end inline asm - ld.const.u32 %r5533, [matrix+3708]; // begin inline asm - 
dp4a.u32.u32 %r5532, %r5533, %r5930, %r5528; + shf.l.wrap.b32 %r9571, %r9572, %r9573, %r9574; // end inline asm - shr.u32 %r6108, %r5468, 6; - and.b32 %r5537, %r6108, 240; - shr.u32 %r5538, %r5532, 10; - and.b32 %r5539, %r6056, 255; + mov.u32 %r9582, 10; // begin inline asm - lop3.b32 %r5536, %r5537, %r5538, %r5539, 0x56; + shf.l.wrap.b32 %r9575, %r9581, %r9580, %r9582; // end inline asm - ld.const.u32 %r5541, [matrix+3712]; // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5870, %r6249; + shf.l.wrap.b32 %r9579, %r9580, %r9581, %r9582; // end inline asm - ld.const.u32 %r5545, [matrix+3716]; + mov.u32 %r9590, 6; // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5874, %r5540; + shf.l.wrap.b32 %r9583, %r9589, %r9588, %r9590; // end inline asm - ld.const.u32 %r5549, [matrix+3720]; // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5878, %r5544; + shf.l.wrap.b32 %r9587, %r9588, %r9589, %r9590; // end inline asm - ld.const.u32 %r5553, [matrix+3724]; + mov.u32 %r9598, 3; // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5882, %r5548; + shf.l.wrap.b32 %r9591, %r9597, %r9596, %r9598; // end inline asm - ld.const.u32 %r5557, [matrix+3728]; // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5886, %r5552; + shf.l.wrap.b32 %r9595, %r9596, %r9597, %r9598; // end inline asm - ld.const.u32 %r5561, [matrix+3732]; // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5890, %r5556; + shf.l.wrap.b32 %r9599, %r9605, %r9604, %r9276; // end inline asm - ld.const.u32 %r5565, [matrix+3736]; // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5894, %r5560; + shf.l.wrap.b32 %r9603, %r9604, %r9605, %r9276; // end inline asm - ld.const.u32 %r5569, [matrix+3740]; // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5898, %r5564; + // chi + lop3.b32 %r9607, %r9642, %r9415, %r9463, 0xD2; + lop3.b32 %r9608, %r9645, %r9419, %r9467, 0xD2; // end inline asm - ld.const.u32 %r5573, [matrix+3744]; // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5902, %r5568; + // chi + lop3.b32 %r29773, %r9415, %r9463, %r9559, 0xD2; + lop3.b32 %r29774, %r9419, %r9467, %r9563, 0xD2; // end inline asm - ld.const.u32 %r5577, [matrix+3748]; // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5906, %r5572; + // chi + lop3.b32 %r29769, %r9463, %r9559, %r9511, 0xD2; + lop3.b32 %r29770, %r9467, %r9563, %r9515, 0xD2; // end inline asm - ld.const.u32 %r5581, [matrix+3752]; // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5910, %r5576; + // chi + lop3.b32 %r29765, %r9559, %r9511, %r9642, 0xD2; + lop3.b32 %r29766, %r9563, %r9515, %r9645, 0xD2; // end inline asm - ld.const.u32 %r5585, [matrix+3756]; // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5914, %r5580; + // chi + lop3.b32 %r29763, %r9511, %r9642, %r9415, 0xD2; + lop3.b32 %r29764, %r9515, %r9645, %r9419, 0xD2; // end inline asm - ld.const.u32 %r5589, [matrix+3760]; // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5918, %r5584; + // chi + lop3.b32 %r29759, %r9551, %r9423, %r9591, 0xD2; + lop3.b32 %r29760, %r9555, %r9427, %r9595, 0xD2; // end inline asm - ld.const.u32 %r5593, [matrix+3764]; // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5922, %r5588; + // chi + lop3.b32 %r29771, %r9423, %r9591, %r9535, 0xD2; + lop3.b32 %r29772, %r9427, %r9595, %r9539, 0xD2; // end inline asm - ld.const.u32 %r5597, [matrix+3768]; // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5926, %r5592; + // chi + lop3.b32 %r29767, %r9591, %r9535, %r9431, 0xD2; + lop3.b32 %r29768, %r9595, %r9539, %r9435, 0xD2; // end inline asm - ld.const.u32 %r5601, [matrix+3772]; // begin inline asm - 
dp4a.u32.u32 %r5600, %r5601, %r5930, %r5596; + // chi + lop3.b32 %r29739, %r9535, %r9431, %r9551, 0xD2; + lop3.b32 %r29740, %r9539, %r9435, %r9555, 0xD2; // end inline asm - ld.const.u32 %r5605, [matrix+3776]; + st.local.v2.u32 [%rd55+88], {%r29739, %r29740}; // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5870, %r6249; + // chi + lop3.b32 %r29731, %r9431, %r9551, %r9423, 0xD2; + lop3.b32 %r29732, %r9435, %r9555, %r9427, 0xD2; // end inline asm - ld.const.u32 %r5609, [matrix+3780]; + st.local.v2.u32 [%rd55+96], {%r29731, %r29732}; // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5874, %r5604; + // chi + lop3.b32 %r29757, %r9599, %r9583, %r9471, 0xD2; + lop3.b32 %r29758, %r9603, %r9587, %r9475, 0xD2; // end inline asm - ld.const.u32 %r5613, [matrix+3784]; + st.local.v2.u32 [%rd55+104], {%r29757, %r29758}; // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5878, %r5608; + // chi + lop3.b32 %r29751, %r9583, %r9471, %r9479, 0xD2; + lop3.b32 %r29752, %r9587, %r9475, %r9483, 0xD2; // end inline asm - ld.const.u32 %r5617, [matrix+3788]; + st.local.v2.u32 [%rd55+112], {%r29751, %r29752}; // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5882, %r5612; + // chi + lop3.b32 %r29745, %r9471, %r9479, %r9447, 0xD2; + lop3.b32 %r29746, %r9475, %r9483, %r9451, 0xD2; // end inline asm - ld.const.u32 %r5621, [matrix+3792]; + st.local.v2.u32 [%rd55+120], {%r29745, %r29746}; // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5886, %r5616; + // chi + lop3.b32 %r29737, %r9479, %r9447, %r9599, 0xD2; + lop3.b32 %r29738, %r9483, %r9451, %r9603, 0xD2; // end inline asm - ld.const.u32 %r5625, [matrix+3796]; + st.local.v2.u32 [%rd55+128], {%r29737, %r29738}; // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5890, %r5620; + // chi + lop3.b32 %r29729, %r9447, %r9599, %r9583, 0xD2; + lop3.b32 %r29730, %r9451, %r9603, %r9587, 0xD2; // end inline asm - ld.const.u32 %r5629, [matrix+3800]; + st.local.v2.u32 [%rd55+136], {%r29729, %r29730}; // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5894, %r5624; + // chi + lop3.b32 %r29755, %r9503, %r9543, %r9575, 0xD2; + lop3.b32 %r29756, %r9507, %r9547, %r9579, 0xD2; // end inline asm - ld.const.u32 %r5633, [matrix+3804]; + st.local.v2.u32 [%rd55+144], {%r29755, %r29756}; // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5898, %r5628; + // chi + lop3.b32 %r29749, %r9543, %r9575, %r9567, 0xD2; + lop3.b32 %r29750, %r9547, %r9579, %r9571, 0xD2; // end inline asm - ld.const.u32 %r5637, [matrix+3808]; + st.local.v2.u32 [%rd55+152], {%r29749, %r29750}; // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5902, %r5632; + // chi + lop3.b32 %r29743, %r9575, %r9567, %r9487, 0xD2; + lop3.b32 %r29744, %r9579, %r9571, %r9491, 0xD2; // end inline asm - ld.const.u32 %r5641, [matrix+3812]; + st.local.v2.u32 [%rd55+160], {%r29743, %r29744}; // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5906, %r5636; + // chi + lop3.b32 %r29735, %r9567, %r9487, %r9503, 0xD2; + lop3.b32 %r29736, %r9571, %r9491, %r9507, 0xD2; // end inline asm - ld.const.u32 %r5645, [matrix+3816]; + st.local.v2.u32 [%rd55+168], {%r29735, %r29736}; // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5910, %r5640; + // chi + lop3.b32 %r29727, %r9487, %r9503, %r9543, 0xD2; + lop3.b32 %r29728, %r9491, %r9507, %r9547, 0xD2; // end inline asm - ld.const.u32 %r5649, [matrix+3820]; + st.local.v2.u32 [%rd55+176], {%r29727, %r29728}; // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5914, %r5644; + // chi + lop3.b32 %r29753, %r9455, %r9527, %r9439, 0xD2; + lop3.b32 %r29754, %r9459, %r9531, %r9443, 0xD2; // end 
inline asm - ld.const.u32 %r5653, [matrix+3824]; + st.local.v2.u32 [%rd55+184], {%r29753, %r29754}; // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5918, %r5648; + // chi + lop3.b32 %r29747, %r9527, %r9439, %r9495, 0xD2; + lop3.b32 %r29748, %r9531, %r9443, %r9499, 0xD2; // end inline asm - ld.const.u32 %r5657, [matrix+3828]; + st.local.v2.u32 [%rd55+192], {%r29747, %r29748}; // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5922, %r5652; + // chi + lop3.b32 %r29741, %r9439, %r9495, %r9519, 0xD2; + lop3.b32 %r29742, %r9443, %r9499, %r9523, 0xD2; // end inline asm - ld.const.u32 %r5661, [matrix+3832]; + st.local.v2.u32 [%rd55+200], {%r29741, %r29742}; // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5926, %r5656; + // chi + lop3.b32 %r29733, %r9495, %r9519, %r9455, 0xD2; + lop3.b32 %r29734, %r9499, %r9523, %r9459, 0xD2; // end inline asm - ld.const.u32 %r5665, [matrix+3836]; + st.local.v2.u32 [%rd55+208], {%r29733, %r29734}; // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5930, %r5660; + // chi + lop3.b32 %r29725, %r9519, %r9455, %r9527, 0xD2; + lop3.b32 %r29726, %r9523, %r9459, %r9531, 0xD2; // end inline asm - shr.u32 %r6109, %r5600, 6; - and.b32 %r5669, %r6109, 240; - shr.u32 %r5670, %r5664, 10; - and.b32 %r5671, %r6060, 255; + st.local.v2.u32 [%rd55+216], {%r29725, %r29726}; + mul.wide.s32 %rd572, %r29775, 8; + add.s64 %rd571, %rd497, %rd572; // begin inline asm - lop3.b32 %r5668, %r5669, %r5670, %r5671, 0x56; + ld.global.nc.v2.u32 {%r9807,%r9808}, [%rd571]; // end inline asm - ld.const.u32 %r5673, [matrix+3840]; + xor.b32 %r29761, %r9607, %r9807; + xor.b32 %r29762, %r9608, %r9808; + add.s32 %r29775, %r29775, 1; + setp.lt.u32 %p22, %r29775, 23; + @%p22 bra $L__BB2_31; + + mov.u32 %r9918, 1; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5870, %r6249; + // xor5 + lop3.b32 %r9819, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9819, %r9819, %r29755, %r29753, 0x96; + lop3.b32 %r9820, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9820, %r9820, %r29756, %r29754, 0x96; // end inline asm - ld.const.u32 %r5677, [matrix+3844]; // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5874, %r5672; + // xor5 + lop3.b32 %r9831, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9831, %r9831, %r29749, %r29747, 0x96; + lop3.b32 %r9832, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9832, %r9832, %r29750, %r29748, 0x96; // end inline asm - ld.const.u32 %r5681, [matrix+3848]; // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5878, %r5676; + // xor5 + lop3.b32 %r9843, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9843, %r9843, %r29743, %r29741, 0x96; + lop3.b32 %r9844, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9844, %r9844, %r29744, %r29742, 0x96; // end inline asm - ld.const.u32 %r5685, [matrix+3852]; // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5882, %r5680; + // xor5 + lop3.b32 %r9855, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9855, %r9855, %r29735, %r29733, 0x96; + lop3.b32 %r9856, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9856, %r9856, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5689, [matrix+3856]; // begin inline asm - dp4a.u32.u32 %r5688, %r5689, %r5886, %r5684; + // xor5 + lop3.b32 %r9867, %r29763, 
%r29731, %r29729, 0x96; + lop3.b32 %r9867, %r9867, %r29727, %r29725, 0x96; + lop3.b32 %r9868, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9868, %r9868, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5693, [matrix+3860]; // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5890, %r5688; + shf.l.wrap.b32 %r9879, %r9832, %r9831, %r9918; // end inline asm - ld.const.u32 %r5697, [matrix+3864]; // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5894, %r5692; + shf.l.wrap.b32 %r9883, %r9831, %r9832, %r9918; // end inline asm - ld.const.u32 %r5701, [matrix+3868]; + xor.b32 %r10057, %r9879, %r9867; + xor.b32 %r10058, %r9883, %r9868; + xor.b32 %r10026, %r29761, %r10057; + xor.b32 %r10029, %r29762, %r10058; + xor.b32 %r9989, %r29758, %r10058; + xor.b32 %r9988, %r29757, %r10057; + st.local.v2.u32 [%rd55+104], {%r9988, %r9989}; // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5898, %r5696; + shf.l.wrap.b32 %r9887, %r9844, %r9843, %r9918; // end inline asm - ld.const.u32 %r5705, [matrix+3872]; // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5902, %r5700; + shf.l.wrap.b32 %r9891, %r9843, %r9844, %r9918; // end inline asm - ld.const.u32 %r5709, [matrix+3876]; + xor.b32 %r10059, %r9887, %r9819; + xor.b32 %r10060, %r9891, %r9820; + xor.b32 %r9925, %r29771, %r10059; + xor.b32 %r9924, %r29772, %r10060; + xor.b32 %r9964, %r29750, %r10060; + xor.b32 %r9965, %r29749, %r10059; + st.local.v2.u32 [%rd55+152], {%r9965, %r9964}; // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5906, %r5704; + shf.l.wrap.b32 %r9895, %r9856, %r9855, %r9918; // end inline asm - ld.const.u32 %r5713, [matrix+3880]; // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5910, %r5708; + shf.l.wrap.b32 %r9899, %r9855, %r9856, %r9918; // end inline asm - ld.const.u32 %r5717, [matrix+3884]; + xor.b32 %r10061, %r9895, %r9831; + xor.b32 %r10062, %r9899, %r9832; + xor.b32 %r9948, %r29746, %r10062; + xor.b32 %r9949, %r29745, %r10061; + st.local.v2.u32 [%rd55+120], {%r9949, %r9948}; + xor.b32 %r9940, %r29742, %r10062; + xor.b32 %r9941, %r29741, %r10061; + st.local.v2.u32 [%rd55+200], {%r9941, %r9940}; // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5914, %r5712; + shf.l.wrap.b32 %r9903, %r9868, %r9867, %r9918; // end inline asm - ld.const.u32 %r5721, [matrix+3888]; // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5918, %r5716; + shf.l.wrap.b32 %r9907, %r9867, %r9868, %r9918; // end inline asm - ld.const.u32 %r5725, [matrix+3892]; + xor.b32 %r10063, %r9903, %r9843; + xor.b32 %r10064, %r9907, %r9844; + xor.b32 %r9972, %r29765, %r10063; + xor.b32 %r9973, %r29766, %r10064; + xor.b32 %r9981, %r29736, %r10064; + xor.b32 %r9980, %r29735, %r10063; + st.local.v2.u32 [%rd55+168], {%r9980, %r9981}; // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5922, %r5720; + shf.l.wrap.b32 %r9911, %r9820, %r9819, %r9918; // end inline asm - ld.const.u32 %r5729, [matrix+3896]; // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5926, %r5724; + shf.l.wrap.b32 %r9915, %r9819, %r9820, %r9918; // end inline asm - ld.const.u32 %r5733, [matrix+3900]; + xor.b32 %r10065, %r9911, %r9855; + xor.b32 %r10066, %r9915, %r9856; + xor.b32 %r9932, %r29731, %r10065; + xor.b32 %r9933, %r29732, %r10066; + xor.b32 %r9957, %r29726, %r10066; + xor.b32 %r9956, %r29725, %r10065; + st.local.v2.u32 [%rd55+216], {%r9956, %r9957}; // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5930, %r5728; + shf.l.wrap.b32 %r9919, %r9925, %r9924, %r9422; // end inline asm - ld.const.u32 %r5737, [matrix+3904]; // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5870, 
%r6249; + shf.l.wrap.b32 %r9923, %r9924, %r9925, %r9422; // end inline asm - ld.const.u32 %r5741, [matrix+3908]; // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5874, %r5736; + shf.l.wrap.b32 %r9927, %r9933, %r9932, %r9430; // end inline asm - ld.const.u32 %r5745, [matrix+3912]; // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5878, %r5740; + shf.l.wrap.b32 %r9931, %r9932, %r9933, %r9430; // end inline asm - ld.const.u32 %r5749, [matrix+3916]; // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5882, %r5744; + shf.l.wrap.b32 %r9939, %r9940, %r9941, %r9438; // end inline asm - ld.const.u32 %r5753, [matrix+3920]; // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5886, %r5748; + shf.l.wrap.b32 %r9935, %r9941, %r9940, %r9438; // end inline asm - ld.const.u32 %r5757, [matrix+3924]; + st.local.v2.u32 [%rd55+96], {%r9935, %r9939}; // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5890, %r5752; + shf.l.wrap.b32 %r9943, %r9949, %r9948, %r9470; // end inline asm - ld.const.u32 %r5761, [matrix+3928]; // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5894, %r5756; + shf.l.wrap.b32 %r9947, %r9948, %r9949, %r9470; // end inline asm - ld.const.u32 %r5765, [matrix+3932]; // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5898, %r5760; + shf.l.wrap.b32 %r9951, %r9957, %r9956, %r9518; // end inline asm - ld.const.u32 %r5769, [matrix+3936]; // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5902, %r5764; + shf.l.wrap.b32 %r9955, %r9956, %r9957, %r9518; // end inline asm - ld.const.u32 %r5773, [matrix+3940]; // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5906, %r5768; + shf.l.wrap.b32 %r9963, %r9964, %r9965, %r9542; // end inline asm - ld.const.u32 %r5777, [matrix+3944]; // begin inline asm - dp4a.u32.u32 %r5776, %r5777, %r5910, %r5772; + shf.l.wrap.b32 %r9959, %r9965, %r9964, %r9542; // end inline asm - ld.const.u32 %r5781, [matrix+3948]; + st.local.v2.u32 [%rd55+88], {%r9959, %r9963}; // begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5914, %r5776; + shf.l.wrap.b32 %r9967, %r9973, %r9972, %r9558; // end inline asm - ld.const.u32 %r5785, [matrix+3952]; // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5918, %r5780; + shf.l.wrap.b32 %r9971, %r9972, %r9973, %r9558; // end inline asm - ld.const.u32 %r5789, [matrix+3956]; // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5922, %r5784; + shf.l.wrap.b32 %r9975, %r9981, %r9980, %r9566; // end inline asm - ld.const.u32 %r5793, [matrix+3960]; // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5926, %r5788; + shf.l.wrap.b32 %r9979, %r9980, %r9981, %r9566; // end inline asm - ld.const.u32 %r5797, [matrix+3964]; // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5930, %r5792; + shf.l.wrap.b32 %r9983, %r9989, %r9988, %r9598; // end inline asm - shr.u32 %r6110, %r5732, 6; - and.b32 %r5801, %r6110, 240; - shr.u32 %r5802, %r5796, 10; - and.b32 %r5803, %r6068, 255; // begin inline asm - lop3.b32 %r5800, %r5801, %r5802, %r5803, 0x56; + shf.l.wrap.b32 %r9987, %r9988, %r9989, %r9598; // end inline asm - ld.const.u32 %r5805, [matrix+3968]; // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5870, %r6249; + // chi + lop3.b32 %r9991, %r10026, %r9919, %r9943, 0xD2; + lop3.b32 %r9992, %r10029, %r9923, %r9947, 0xD2; // end inline asm - ld.const.u32 %r5809, [matrix+3972]; // begin inline asm - dp4a.u32.u32 %r5808, %r5809, %r5874, %r5804; + // chi + lop3.b32 %r9999, %r9919, %r9943, %r9975, 0xD2; + lop3.b32 %r10000, %r9923, %r9947, %r9979, 0xD2; // end inline asm - ld.const.u32 %r5813, [matrix+3976]; + st.local.v2.u32 [%rd55+32], {%r9999, 
%r10000}; // begin inline asm - dp4a.u32.u32 %r5812, %r5813, %r5878, %r5808; + // chi + lop3.b32 %r10007, %r9943, %r9975, %r9951, 0xD2; + lop3.b32 %r10008, %r9947, %r9979, %r9955, 0xD2; // end inline asm - ld.const.u32 %r5817, [matrix+3980]; + st.local.v2.u32 [%rd55+40], {%r10007, %r10008}; // begin inline asm - dp4a.u32.u32 %r5816, %r5817, %r5882, %r5812; + // chi + lop3.b32 %r10015, %r9975, %r9951, %r10026, 0xD2; + lop3.b32 %r10016, %r9979, %r9955, %r10029, 0xD2; // end inline asm - ld.const.u32 %r5821, [matrix+3984]; + st.local.v2.u32 [%rd55+48], {%r10015, %r10016}; // begin inline asm - dp4a.u32.u32 %r5820, %r5821, %r5886, %r5816; + // chi + lop3.b32 %r10023, %r9951, %r10026, %r9919, 0xD2; + lop3.b32 %r10024, %r9955, %r10029, %r9923, 0xD2; // end inline asm - ld.const.u32 %r5825, [matrix+3988]; + st.local.v2.u32 [%rd55+56], {%r10023, %r10024}; // begin inline asm - dp4a.u32.u32 %r5824, %r5825, %r5890, %r5820; + // chi + lop3.b32 %r10031, %r9967, %r9927, %r9983, 0xD2; + lop3.b32 %r10032, %r9971, %r9931, %r9987, 0xD2; // end inline asm - ld.const.u32 %r5829, [matrix+3992]; + st.local.v2.u32 [%rd55+64], {%r10031, %r10032}; // begin inline asm - dp4a.u32.u32 %r5828, %r5829, %r5894, %r5824; + // chi + lop3.b32 %r10039, %r9927, %r9983, %r9959, 0xD2; + lop3.b32 %r10040, %r9931, %r9987, %r9963, 0xD2; // end inline asm - ld.const.u32 %r5833, [matrix+3996]; + st.local.v2.u32 [%rd55+72], {%r10039, %r10040}; // begin inline asm - dp4a.u32.u32 %r5832, %r5833, %r5898, %r5828; + // chi + lop3.b32 %r10047, %r9983, %r9959, %r9935, 0xD2; + lop3.b32 %r10048, %r9987, %r9963, %r9939, 0xD2; // end inline asm - ld.const.u32 %r5837, [matrix+4000]; + st.local.v2.u32 [%rd55+80], {%r10047, %r10048}; // begin inline asm - dp4a.u32.u32 %r5836, %r5837, %r5902, %r5832; + ld.global.nc.v2.u32 {%r10055,%r10056}, [%rd498]; // end inline asm - ld.const.u32 %r5841, [matrix+4004]; + xor.b32 %r10067, %r9992, %r10056; + xor.b32 %r10068, %r9991, %r10055; + st.local.v2.u32 [%rd55+24], {%r10068, %r10067}; + st.global.u64 [%rd36], %rd1261; + st.global.u64 [%rd36+8], %rd1262; + st.global.u64 [%rd36+16], %rd1263; + st.global.u64 [%rd36+24], %rd62; + st.global.u64 [%rd36+32], %rd1264; + st.global.u64 [%rd36+40], %rd64; + st.global.u64 [%rd36+48], %rd65; + st.global.u64 [%rd36+56], %rd66; + st.global.v2.u32 [%rd36+64], {%r10068, %r10067}; + st.global.v2.u32 [%rd36+72], {%r9999, %r10000}; + st.global.v2.u32 [%rd36+80], {%r10007, %r10008}; + st.global.v2.u32 [%rd36+88], {%r10015, %r10016}; + st.global.v2.u32 [%rd36+96], {%r10023, %r10024}; + st.global.v2.u32 [%rd36+104], {%r10031, %r10032}; + st.global.v2.u32 [%rd36+112], {%r10039, %r10040}; + st.global.v2.u32 [%rd36+120], {%r10047, %r10048}; + +$L__BB2_44: + shl.b32 %r1678, %r25, 1; + mul.wide.u32 %rd678, %r1678, -954391867; + shr.u64 %rd679, %rd678, 32; + cvt.u32.u64 %r13353, %rd679; + sub.s32 %r13354, %r1678, %r13353; + shr.u32 %r13355, %r13354, 1; + add.s32 %r13356, %r13355, %r13353; + shr.u32 %r13357, %r13356, 20; + mul.lo.s32 %r13358, %r13357, 1179641; + sub.s32 %r13359, %r1678, %r13358; + mul.wide.u32 %rd681, %r13359, 64; + add.s64 %rd128, %rd471, %rd681; + or.b32 %r1679, %r1678, 1; + mul.wide.u32 %rd682, %r1679, -954391867; + shr.u64 %rd683, %rd682, 32; + cvt.u32.u64 %r13360, %rd683; + sub.s32 %r13361, %r1679, %r13360; + shr.u32 %r13362, %r13361, 1; + add.s32 %r13363, %r13362, %r13360; + shr.u32 %r13364, %r13363, 20; + mul.lo.s32 %r13365, %r13364, 1179641; + sub.s32 %r13366, %r1679, %r13365; + mul.wide.u32 %rd684, %r13366, 64; + add.s64 %rd129, %rd471, %rd684; + @%p16 bra 
$L__BB2_58; + + cvta.to.global.u64 %rd685, %rd353; + mul.wide.u32 %rd686, %r25, 128; + add.s64 %rd130, %rd685, %rd686; + ld.global.u64 %rd1265, [%rd130]; + setp.eq.s64 %p29, %rd1265, 0; + @%p29 bra $L__BB2_47; + + ld.global.u64 %rd1268, [%rd130+32]; + ld.global.u64 %rd1267, [%rd130+16]; + ld.global.u64 %rd1266, [%rd130+8]; + bra.uni $L__BB2_69; + +$L__BB2_58: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd788, 1179641; + st.local.u64 [%rd3+8], %rd788; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd789, [%rd128]; + ld.global.u64 %rd790, [%rd128+8]; + ld.global.u64 %rd791, [%rd128+16]; + ld.global.u64 %rd792, [%rd128+24]; + ld.global.u64 %rd793, [%rd128+32]; + ld.global.u64 %rd794, [%rd128+40]; + ld.global.u64 %rd795, [%rd128+48]; + ld.global.u64 %rd796, [%rd128+56]; + st.local.u64 [%rd3+24], %rd789; + st.local.u64 [%rd3+32], %rd790; + st.local.u64 [%rd3+40], %rd791; + st.local.u64 [%rd3+48], %rd792; + st.local.u64 [%rd3+56], %rd793; + st.local.u64 [%rd3+64], %rd794; + st.local.u64 [%rd3+72], %rd795; + st.local.u64 [%rd3+80], %rd796; + cvt.u32.u64 %r16692, %rd789; + xor.b32 %r16693, %r1678, %r16692; + st.local.u32 [%rd3+24], %r16693; + mov.u32 %r30250, 0; + st.local.v2.u32 [%rd3+96], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+104], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+112], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+120], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+128], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+136], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+144], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+152], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+160], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+168], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+176], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+184], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+192], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+200], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+208], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+216], {%r30250, %r30250}; + mov.u32 %r30265, -2147483648; + mov.u32 %r16665, 1; + st.local.v2.u32 [%rd3+88], {%r16665, %r30265}; + ld.local.v2.u32 {%r30286, %r30287}, [%rd3+24]; + mov.b64 {%r30284, %r30285}, %rd794; + shr.u64 %rd797, %rd790, 32; + cvt.u32.u64 %r30298, %rd790; + cvt.u32.u64 %r30299, %rd797; + shr.u64 %rd798, %rd795, 32; + cvt.u32.u64 %r30296, %rd795; + cvt.u32.u64 %r30297, %rd798; + shr.u64 %rd799, %rd791, 32; + cvt.u32.u64 %r30294, %rd791; + cvt.u32.u64 %r30295, %rd799; + shr.u64 %rd800, %rd796, 32; + cvt.u32.u64 %r30292, %rd796; + cvt.u32.u64 %r30293, %rd800; + shr.u64 %rd801, %rd792, 32; + cvt.u32.u64 %r30290, %rd792; + cvt.u32.u64 %r30291, %rd801; + shr.u64 %rd802, %rd793, 32; + cvt.u32.u64 %r30288, %rd793; + cvt.u32.u64 %r30289, %rd802; + mov.u32 %r30251, %r30250; + mov.u32 %r30252, %r30250; + mov.u32 %r30253, %r30250; + mov.u32 %r30254, %r30250; + mov.u32 %r30255, %r30250; + mov.u32 %r30256, %r30250; + mov.u32 %r30257, %r30250; + mov.u32 %r30258, %r30250; + mov.u32 %r30259, %r30250; + mov.u32 %r30260, %r30250; + mov.u32 %r30261, %r30250; + mov.u32 %r30262, %r30250; + mov.u32 %r30263, %r30250; + mov.u32 %r30264, %r16665; + mov.u32 %r30266, %r30250; + mov.u32 %r30267, %r30250; + mov.u32 %r30268, %r30250; + mov.u32 %r30269, %r30250; + mov.u32 %r30270, %r30250; + mov.u32 %r30271, %r30250; + mov.u32 %r30272, %r30250; + mov.u32 %r30273, %r30250; + mov.u32 %r30274, %r30250; + mov.u32 %r30275, %r30250; + mov.u32 %r30276, %r30250; + mov.u32 %r30277, %r30250; + mov.u32 %r30278, %r30250; + mov.u32 %r30279, %r30250; + mov.u32 %r30280, %r30250; + mov.u32 %r30281, 
%r30250; + mov.u32 %r30282, %r30250; + mov.u32 %r30283, %r30250; + mov.u32 %r30300, %r30250; + +$L__BB2_59: // begin inline asm - dp4a.u32.u32 %r5840, %r5841, %r5906, %r5836; + // xor5 + lop3.b32 %r16696, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r16696, %r16696, %r30280, %r30278, 0x96; + lop3.b32 %r16697, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r16697, %r16697, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16708, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r16708, %r16708, %r30274, %r30272, 0x96; + lop3.b32 %r16709, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r16709, %r16709, %r30275, %r30273, 0x96; // end inline asm - ld.const.u32 %r5845, [matrix+4008]; // begin inline asm - dp4a.u32.u32 %r5844, %r5845, %r5910, %r5840; + // xor5 + lop3.b32 %r16720, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r16720, %r16720, %r30268, %r30266, 0x96; + lop3.b32 %r16721, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r16721, %r16721, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16732, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r16732, %r16732, %r30260, %r30258, 0x96; + lop3.b32 %r16733, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r16733, %r16733, %r30261, %r30259, 0x96; // end inline asm - ld.const.u32 %r5849, [matrix+4012]; // begin inline asm - dp4a.u32.u32 %r5848, %r5849, %r5914, %r5844; + // xor5 + lop3.b32 %r16744, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r16744, %r16744, %r30252, %r30250, 0x96; + lop3.b32 %r16745, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r16745, %r16745, %r30253, %r30251, 0x96; // end inline asm - ld.const.u32 %r5853, [matrix+4016]; // begin inline asm - dp4a.u32.u32 %r5852, %r5853, %r5918, %r5848; + shf.l.wrap.b32 %r16756, %r16709, %r16708, %r16665; // end inline asm - ld.const.u32 %r5857, [matrix+4020]; // begin inline asm - dp4a.u32.u32 %r5856, %r5857, %r5922, %r5852; + shf.l.wrap.b32 %r16760, %r16708, %r16709, %r16665; // end inline asm - ld.const.u32 %r5861, [matrix+4024]; + xor.b32 %r17190, %r16756, %r16744; + xor.b32 %r17191, %r16760, %r16745; + xor.b32 %r17023, %r30286, %r17190; + xor.b32 %r17026, %r30287, %r17191; + xor.b32 %r16930, %r30284, %r17190; + xor.b32 %r16929, %r30285, %r17191; + xor.b32 %r16977, %r30282, %r17190; + xor.b32 %r16978, %r30283, %r17191; + xor.b32 %r16882, %r30280, %r17190; + xor.b32 %r16881, %r30281, %r17191; + xor.b32 %r16833, %r30278, %r17190; + xor.b32 %r16834, %r30279, %r17191; // begin inline asm - dp4a.u32.u32 %r5860, %r5861, %r5926, %r5856; + shf.l.wrap.b32 %r16764, %r16721, %r16720, %r16665; // end inline asm - ld.const.u32 %r5865, [matrix+4028]; // begin inline asm - dp4a.u32.u32 %r5864, %r5865, %r5930, %r5860; + shf.l.wrap.b32 %r16768, %r16720, %r16721, %r16665; // end inline asm - ld.const.u32 %r5869, [matrix+4032]; + xor.b32 %r17192, %r16764, %r16696; + xor.b32 %r17193, %r16768, %r16697; + xor.b32 %r16985, %r30298, %r17192; + xor.b32 %r16986, %r30299, %r17193; + xor.b32 %r16802, %r30296, %r17192; + xor.b32 %r16801, %r30297, %r17193; + xor.b32 %r16961, %r30276, %r17192; + xor.b32 %r16962, %r30277, %r17193; + xor.b32 %r16922, %r30274, %r17192; + xor.b32 %r16921, %r30275, %r17193; + xor.b32 %r16905, %r30272, %r17192; + xor.b32 %r16906, %r30273, %r17193; // begin inline asm - dp4a.u32.u32 %r5868, %r5869, %r5870, %r6249; + shf.l.wrap.b32 %r16772, %r16733, %r16732, %r16665; // end inline asm - ld.const.u32 %r5873, [matrix+4036]; // begin inline asm - dp4a.u32.u32 %r5872, %r5873, %r5874, %r5868; + shf.l.wrap.b32 %r16776, %r16732, %r16733, 
%r16665; // end inline asm - ld.const.u32 %r5877, [matrix+4040]; + xor.b32 %r17194, %r16772, %r16708; + xor.b32 %r17195, %r16776, %r16709; + xor.b32 %r16842, %r30294, %r17194; + xor.b32 %r16841, %r30295, %r17195; + xor.b32 %r16969, %r30292, %r17194; + xor.b32 %r16970, %r30293, %r17195; + xor.b32 %r16850, %r30270, %r17194; + xor.b32 %r16849, %r30271, %r17195; + xor.b32 %r16953, %r30268, %r17194; + xor.b32 %r16954, %r30269, %r17195; + xor.b32 %r16818, %r30266, %r17194; + xor.b32 %r16817, %r30267, %r17195; // begin inline asm - dp4a.u32.u32 %r5876, %r5877, %r5878, %r5872; + shf.l.wrap.b32 %r16780, %r16745, %r16744, %r16665; // end inline asm - ld.const.u32 %r5881, [matrix+4044]; // begin inline asm - dp4a.u32.u32 %r5880, %r5881, %r5882, %r5876; + shf.l.wrap.b32 %r16784, %r16744, %r16745, %r16665; // end inline asm - ld.const.u32 %r5885, [matrix+4048]; + xor.b32 %r17196, %r16780, %r16720; + xor.b32 %r17197, %r16784, %r16721; + xor.b32 %r16937, %r30290, %r17196; + xor.b32 %r16938, %r30291, %r17197; + xor.b32 %r16914, %r30264, %r17196; + xor.b32 %r16913, %r30265, %r17197; + xor.b32 %r16857, %r30262, %r17196; + xor.b32 %r16858, %r30263, %r17197; + xor.b32 %r16945, %r30260, %r17196; + xor.b32 %r16946, %r30261, %r17197; + xor.b32 %r16874, %r30258, %r17196; + xor.b32 %r16873, %r30259, %r17197; // begin inline asm - dp4a.u32.u32 %r5884, %r5885, %r5886, %r5880; + shf.l.wrap.b32 %r16788, %r16697, %r16696, %r16665; // end inline asm - ld.const.u32 %r5889, [matrix+4052]; // begin inline asm - dp4a.u32.u32 %r5888, %r5889, %r5890, %r5884; + shf.l.wrap.b32 %r16792, %r16696, %r16697, %r16665; // end inline asm - ld.const.u32 %r5893, [matrix+4056]; + xor.b32 %r17198, %r16788, %r16732; + xor.b32 %r17199, %r16792, %r16733; + xor.b32 %r16889, %r30288, %r17198; + xor.b32 %r16890, %r30289, %r17199; + xor.b32 %r16809, %r30256, %r17198; + xor.b32 %r16810, %r30257, %r17199; + xor.b32 %r16826, %r30254, %r17198; + xor.b32 %r16825, %r30255, %r17199; + xor.b32 %r16865, %r30252, %r17198; + xor.b32 %r16866, %r30253, %r17199; + xor.b32 %r16897, %r30250, %r17198; + xor.b32 %r16898, %r30251, %r17199; + mov.u32 %r16803, 44; // begin inline asm - dp4a.u32.u32 %r5892, %r5893, %r5894, %r5888; + shf.l.wrap.b32 %r16796, %r16802, %r16801, %r16803; // end inline asm - ld.const.u32 %r5897, [matrix+4060]; // begin inline asm - dp4a.u32.u32 %r5896, %r5897, %r5898, %r5892; + shf.l.wrap.b32 %r16800, %r16801, %r16802, %r16803; // end inline asm - ld.const.u32 %r5901, [matrix+4064]; + mov.u32 %r16811, 20; // begin inline asm - dp4a.u32.u32 %r5900, %r5901, %r5902, %r5896; + shf.l.wrap.b32 %r16804, %r16810, %r16809, %r16811; // end inline asm - ld.const.u32 %r5905, [matrix+4068]; // begin inline asm - dp4a.u32.u32 %r5904, %r5905, %r5906, %r5900; + shf.l.wrap.b32 %r16808, %r16809, %r16810, %r16811; // end inline asm - ld.const.u32 %r5909, [matrix+4072]; + mov.u32 %r16819, 61; // begin inline asm - dp4a.u32.u32 %r5908, %r5909, %r5910, %r5904; + shf.l.wrap.b32 %r16812, %r16818, %r16817, %r16819; // end inline asm - ld.const.u32 %r5913, [matrix+4076]; // begin inline asm - dp4a.u32.u32 %r5912, %r5913, %r5914, %r5908; + shf.l.wrap.b32 %r16816, %r16817, %r16818, %r16819; // end inline asm - ld.const.u32 %r5917, [matrix+4080]; + mov.u32 %r16827, 39; // begin inline asm - dp4a.u32.u32 %r5916, %r5917, %r5918, %r5912; + shf.l.wrap.b32 %r16820, %r16826, %r16825, %r16827; // end inline asm - ld.const.u32 %r5921, [matrix+4084]; // begin inline asm - dp4a.u32.u32 %r5920, %r5921, %r5922, %r5916; + shf.l.wrap.b32 %r16824, %r16825, %r16826, %r16827; // end 
inline asm - ld.const.u32 %r5925, [matrix+4088]; + mov.u32 %r16835, 18; // begin inline asm - dp4a.u32.u32 %r5924, %r5925, %r5926, %r5920; + shf.l.wrap.b32 %r16828, %r16834, %r16833, %r16835; // end inline asm - ld.const.u32 %r5929, [matrix+4092]; // begin inline asm - dp4a.u32.u32 %r5928, %r5929, %r5930, %r5924; + shf.l.wrap.b32 %r16832, %r16833, %r16834, %r16835; // end inline asm - shr.u32 %r6111, %r5864, 6; - and.b32 %r5933, %r6111, 240; - shr.u32 %r5934, %r5928, 10; + mov.u32 %r16843, 62; // begin inline asm - lop3.b32 %r5932, %r5933, %r5934, %r5935, 0x56; + shf.l.wrap.b32 %r16836, %r16842, %r16841, %r16843; // end inline asm - shl.b32 %r6112, %r2236, 24; - cvt.u64.u32 %rd213, %r6112; - shl.b32 %r6113, %r2104, 16; - and.b32 %r6114, %r6113, 16711680; - cvt.u64.u32 %rd214, %r6114; - shl.b32 %r6115, %r1972, 8; - and.b32 %r6116, %r6115, 65280; - cvt.u64.u32 %rd215, %r6116; - shl.b32 %r6117, %r3292, 24; - cvt.u64.u32 %rd216, %r6117; - shl.b32 %r6118, %r3160, 16; - and.b32 %r6119, %r6118, 16711680; - cvt.u64.u32 %rd217, %r6119; - shl.b32 %r6120, %r3028, 8; - and.b32 %r6121, %r6120, 65280; - cvt.u64.u32 %rd218, %r6121; - shl.b32 %r6122, %r4348, 24; - cvt.u64.u32 %rd219, %r6122; - shl.b32 %r6123, %r4216, 16; - and.b32 %r6124, %r6123, 16711680; - cvt.u64.u32 %rd220, %r6124; - shl.b32 %r6125, %r4084, 8; - and.b32 %r6126, %r6125, 65280; - cvt.u64.u32 %rd221, %r6126; - cvt.u64.u32 %rd222, %r2764; - shl.b64 %rd223, %rd222, 56; - cvt.u64.u32 %rd224, %r2632; - shl.b64 %rd225, %rd224, 48; - and.b64 %rd226, %rd225, 71776119061217280; - or.b64 %rd227, %rd223, %rd226; - cvt.u64.u32 %rd228, %r2500; - shl.b64 %rd229, %rd228, 40; - and.b64 %rd230, %rd229, 280375465082880; - or.b64 %rd231, %rd227, %rd230; - cvt.u64.u32 %rd232, %r2368; - shl.b64 %rd233, %rd232, 32; - and.b64 %rd234, %rd233, 1095216660480; - or.b64 %rd235, %rd231, %rd234; - or.b64 %rd236, %rd235, %rd213; - or.b64 %rd237, %rd236, %rd214; - and.b32 %r6127, %r1840, 255; - cvt.u64.u32 %rd238, %r6127; - or.b64 %rd239, %rd237, %rd215; - or.b64 %rd240, %rd239, %rd238; - xor.b64 %rd73, %rd240, 4239941492252378377; - cvt.u64.u32 %rd241, %r3820; - shl.b64 %rd242, %rd241, 56; - cvt.u64.u32 %rd243, %r3688; - shl.b64 %rd244, %rd243, 48; - and.b64 %rd245, %rd244, 71776119061217280; - or.b64 %rd246, %rd242, %rd245; - cvt.u64.u32 %rd247, %r3556; - shl.b64 %rd248, %rd247, 40; - and.b64 %rd249, %rd248, 280375465082880; - or.b64 %rd250, %rd246, %rd249; - cvt.u64.u32 %rd251, %r3424; - shl.b64 %rd252, %rd251, 32; - and.b64 %rd253, %rd252, 1095216660480; - or.b64 %rd254, %rd250, %rd253; - or.b64 %rd255, %rd254, %rd216; - or.b64 %rd256, %rd255, %rd217; - and.b32 %r6128, %r2896, 255; - cvt.u64.u32 %rd257, %r6128; - or.b64 %rd258, %rd256, %rd218; - or.b64 %rd259, %rd258, %rd257; - xor.b64 %rd460, %rd259, 8746723911537738262; - cvt.u64.u32 %rd260, %r4876; - shl.b64 %rd261, %rd260, 56; - cvt.u64.u32 %rd262, %r4744; - shl.b64 %rd263, %rd262, 48; - and.b64 %rd264, %rd263, 71776119061217280; - or.b64 %rd265, %rd261, %rd264; - cvt.u64.u32 %rd266, %r4612; - shl.b64 %rd267, %rd266, 40; - and.b64 %rd268, %rd267, 280375465082880; - or.b64 %rd269, %rd265, %rd268; - cvt.u64.u32 %rd270, %r4480; - shl.b64 %rd271, %rd270, 32; - and.b64 %rd272, %rd271, 1095216660480; - or.b64 %rd273, %rd269, %rd272; - or.b64 %rd274, %rd273, %rd219; - or.b64 %rd275, %rd274, %rd220; - and.b32 %r6129, %r3952, 255; - cvt.u64.u32 %rd276, %r6129; - or.b64 %rd277, %rd275, %rd221; - or.b64 %rd278, %rd277, %rd276; - xor.b64 %rd455, %rd278, 8796936657246353646; - cvt.u64.u32 %rd279, %r5932; - shl.b64 %rd280, 
%rd279, 56; - cvt.u64.u32 %rd281, %r5800; - shl.b64 %rd282, %rd281, 48; - and.b64 %rd283, %rd282, 71776119061217280; - or.b64 %rd284, %rd280, %rd283; - cvt.u64.u32 %rd285, %r5668; - shl.b64 %rd286, %rd285, 40; - and.b64 %rd287, %rd286, 280375465082880; - or.b64 %rd288, %rd284, %rd287; - cvt.u64.u32 %rd289, %r5536; - shl.b64 %rd290, %rd289, 32; - and.b64 %rd291, %rd290, 1095216660480; - or.b64 %rd292, %rd288, %rd291; - or.b64 %rd293, %rd292, %rd212; - shl.b32 %r6130, %r5140, 8; - and.b32 %r6131, %r6130, 65280; - cvt.u64.u32 %rd294, %r6131; - or.b64 %rd295, %rd293, %rd211; - and.b32 %r6132, %r5008, 255; - cvt.u64.u32 %rd296, %r6132; - or.b64 %rd297, %rd295, %rd294; - or.b64 %rd298, %rd297, %rd296; - xor.b64 %rd450, %rd298, 1272090201925444760; - mov.u64 %rd464, 8270816933120786537; - mov.u64 %rd463, -850687345431043546; - mov.u64 %rd462, 8596393687355028144; - mov.u64 %rd461, -4073852189716399785; - mov.u64 %rd459, -4539347866060507718; - mov.u64 %rd458, -3233781605604422593; - mov.u64 %rd457, 570094237299545110; - mov.u64 %rd456, 5171152063242093102; - mov.u64 %rd454, 6782861118970774626; - mov.u64 %rd453, 7812475424661425213; - mov.u64 %rd452, 9119540418498120711; - mov.u64 %rd451, -7873636174015165430; - mov.u64 %rd449, -9207053471590684088; - mov.u64 %rd448, 3370482334374859748; - mov.u64 %rd447, -1544774801229058759; - mov.u64 %rd446, 6096431547456407061; - mov.u64 %rd445, -1792185402154627366; - mov.u64 %rd444, -6864424130110145268; - mov.u64 %rd443, 5690099369266491460; - mov.u64 %rd442, -5074726839974049192; - mov.u64 %rd441, 1592359455985097269; - mov.u64 %rd440, RC; - -$L__BB0_9: - xor.b64 %rd299, %rd464, %rd73; - xor.b64 %rd300, %rd299, %rd463; - xor.b64 %rd301, %rd300, %rd462; - xor.b64 %rd302, %rd301, %rd461; - xor.b64 %rd303, %rd459, %rd460; - xor.b64 %rd304, %rd303, %rd458; - xor.b64 %rd305, %rd304, %rd457; - xor.b64 %rd306, %rd305, %rd456; - xor.b64 %rd307, %rd454, %rd455; - xor.b64 %rd308, %rd307, %rd453; - xor.b64 %rd309, %rd308, %rd452; - xor.b64 %rd310, %rd309, %rd451; - xor.b64 %rd311, %rd449, %rd450; - xor.b64 %rd312, %rd311, %rd448; - xor.b64 %rd313, %rd312, %rd447; - xor.b64 %rd314, %rd313, %rd446; - xor.b64 %rd315, %rd444, %rd445; - xor.b64 %rd316, %rd315, %rd443; - xor.b64 %rd317, %rd316, %rd442; - xor.b64 %rd318, %rd317, %rd441; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6133}, %rd306; - } - { - .reg .b32 %dummy; - mov.b64 {%r6134,%dummy}, %rd306; - } - shf.l.wrap.b32 %r6135, %r6134, %r6133, 1; - shf.l.wrap.b32 %r6136, %r6133, %r6134, 1; - mov.b64 %rd319, {%r6136, %r6135}; - xor.b64 %rd320, %rd318, %rd319; - xor.b64 %rd321, %rd320, %rd73; - xor.b64 %rd322, %rd464, %rd320; - xor.b64 %rd323, %rd463, %rd320; - xor.b64 %rd324, %rd462, %rd320; - xor.b64 %rd325, %rd461, %rd320; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6137}, %rd310; - } - { - .reg .b32 %dummy; - mov.b64 {%r6138,%dummy}, %rd310; - } - shf.l.wrap.b32 %r6139, %r6138, %r6137, 1; - shf.l.wrap.b32 %r6140, %r6137, %r6138, 1; - mov.b64 %rd326, {%r6140, %r6139}; - xor.b64 %rd327, %rd326, %rd302; - xor.b64 %rd328, %rd460, %rd327; - xor.b64 %rd329, %rd459, %rd327; - xor.b64 %rd330, %rd458, %rd327; - xor.b64 %rd331, %rd457, %rd327; - xor.b64 %rd332, %rd456, %rd327; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6141}, %rd314; - } - { - .reg .b32 %dummy; - mov.b64 {%r6142,%dummy}, %rd314; - } - shf.l.wrap.b32 %r6143, %r6142, %r6141, 1; - shf.l.wrap.b32 %r6144, %r6141, %r6142, 1; - mov.b64 %rd333, {%r6144, %r6143}; - xor.b64 %rd334, %rd333, %rd306; - xor.b64 %rd335, %rd455, %rd334; - xor.b64 %rd336, %rd454, %rd334; - 
xor.b64 %rd337, %rd453, %rd334; - xor.b64 %rd338, %rd452, %rd334; - xor.b64 %rd339, %rd451, %rd334; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6145}, %rd318; - } - { - .reg .b32 %dummy; - mov.b64 {%r6146,%dummy}, %rd318; - } - shf.l.wrap.b32 %r6147, %r6146, %r6145, 1; - shf.l.wrap.b32 %r6148, %r6145, %r6146, 1; - mov.b64 %rd340, {%r6148, %r6147}; - xor.b64 %rd341, %rd340, %rd310; - xor.b64 %rd342, %rd450, %rd341; - xor.b64 %rd343, %rd449, %rd341; - xor.b64 %rd344, %rd448, %rd341; - xor.b64 %rd345, %rd447, %rd341; - xor.b64 %rd346, %rd446, %rd341; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6149}, %rd302; - } - { - .reg .b32 %dummy; - mov.b64 {%r6150,%dummy}, %rd302; - } - shf.l.wrap.b32 %r6151, %r6150, %r6149, 1; - shf.l.wrap.b32 %r6152, %r6149, %r6150, 1; - mov.b64 %rd347, {%r6152, %r6151}; - xor.b64 %rd348, %rd314, %rd347; - xor.b64 %rd349, %rd445, %rd348; - xor.b64 %rd350, %rd444, %rd348; - xor.b64 %rd351, %rd443, %rd348; - xor.b64 %rd352, %rd442, %rd348; - xor.b64 %rd353, %rd441, %rd348; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6153}, %rd328; - } - { - .reg .b32 %dummy; - mov.b64 {%r6154,%dummy}, %rd328; - } - shf.l.wrap.b32 %r6155, %r6154, %r6153, 1; - shf.l.wrap.b32 %r6156, %r6153, %r6154, 1; - mov.b64 %rd354, {%r6156, %r6155}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6157}, %rd323; - } - { - .reg .b32 %dummy; - mov.b64 {%r6158,%dummy}, %rd323; - } - shf.l.wrap.b32 %r6159, %r6158, %r6157, 3; - shf.l.wrap.b32 %r6160, %r6157, %r6158, 3; - mov.b64 %rd355, {%r6160, %r6159}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6161}, %rd336; - } - { - .reg .b32 %dummy; - mov.b64 {%r6162,%dummy}, %rd336; - } - shf.l.wrap.b32 %r6163, %r6162, %r6161, 6; - shf.l.wrap.b32 %r6164, %r6161, %r6162, 6; - mov.b64 %rd356, {%r6164, %r6163}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6165}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6166,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6167, %r6166, %r6165, 10; - shf.l.wrap.b32 %r6168, %r6165, %r6166, 10; - mov.b64 %rd357, {%r6168, %r6167}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6169}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6170,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6171, %r6170, %r6169, 15; - shf.l.wrap.b32 %r6172, %r6169, %r6170, 15; - mov.b64 %rd358, {%r6172, %r6171}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6173}, %rd345; - } - { - .reg .b32 %dummy; - mov.b64 {%r6174,%dummy}, %rd345; - } - shf.l.wrap.b32 %r6175, %r6174, %r6173, 21; - shf.l.wrap.b32 %r6176, %r6173, %r6174, 21; - mov.b64 %rd359, {%r6176, %r6175}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, %rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6178,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6179, %r6178, %r6177, 28; - shf.l.wrap.b32 %r6180, %r6177, %r6178, 28; - mov.b64 %rd360, {%r6180, %r6179}; - { - .reg .b32 %dummy; - mov.b64 {%r6181,%dummy}, %rd322; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6182}, %rd322; - } - shf.r.wrap.b32 %r6183, %r6182, %r6181, 28; - shf.r.wrap.b32 %r6184, %r6181, %r6182, 28; - mov.b64 %rd361, {%r6184, %r6183}; - { - .reg .b32 %dummy; - mov.b64 {%r6185,%dummy}, %rd331; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6186}, %rd331; - } - shf.r.wrap.b32 %r6187, %r6186, %r6185, 19; - shf.r.wrap.b32 %r6188, %r6185, %r6186, 19; - mov.b64 %rd362, {%r6188, %r6187}; - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd343; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6190}, %rd343; - } - shf.r.wrap.b32 %r6191, %r6190, %r6189, 9; - shf.r.wrap.b32 %r6192, %r6189, %r6190, 9; - mov.b64 %rd363, {%r6192, %r6191}; - { - .reg .b32 %dummy; - mov.b64 
{%dummy,%r6193}, %rd332; - } - { - .reg .b32 %dummy; - mov.b64 {%r6194,%dummy}, %rd332; - } - shf.l.wrap.b32 %r6195, %r6194, %r6193, 2; - shf.l.wrap.b32 %r6196, %r6193, %r6194, 2; - mov.b64 %rd364, {%r6196, %r6195}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6197}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%r6198,%dummy}, %rd353; - } - shf.l.wrap.b32 %r6199, %r6198, %r6197, 14; - shf.l.wrap.b32 %r6200, %r6197, %r6198, 14; - mov.b64 %rd365, {%r6200, %r6199}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6202,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6203, %r6202, %r6201, 27; - shf.l.wrap.b32 %r6204, %r6201, %r6202, 27; - mov.b64 %rd366, {%r6204, %r6203}; - { - .reg .b32 %dummy; - mov.b64 {%r6205,%dummy}, %rd324; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6206}, %rd324; - } - shf.r.wrap.b32 %r6207, %r6206, %r6205, 23; - shf.r.wrap.b32 %r6208, %r6205, %r6206, 23; - mov.b64 %rd367, {%r6208, %r6207}; - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6210}, %rd346; - } - shf.r.wrap.b32 %r6211, %r6210, %r6209, 8; - shf.r.wrap.b32 %r6212, %r6209, %r6210, 8; - mov.b64 %rd368, {%r6212, %r6211}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6213}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6214,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6215, %r6214, %r6213, 8; - shf.l.wrap.b32 %r6216, %r6213, %r6214, 8; - mov.b64 %rd369, {%r6216, %r6215}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd344; - } - { - .reg .b32 %dummy; - mov.b64 {%r6218,%dummy}, %rd344; - } - shf.l.wrap.b32 %r6219, %r6218, %r6217, 25; - shf.l.wrap.b32 %r6220, %r6217, %r6218, 25; - mov.b64 %rd370, {%r6220, %r6219}; - { - .reg .b32 %dummy; - mov.b64 {%r6221,%dummy}, %rd337; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6222}, %rd337; - } - shf.r.wrap.b32 %r6223, %r6222, %r6221, 21; - shf.r.wrap.b32 %r6224, %r6221, %r6222, 21; - mov.b64 %rd371, {%r6224, %r6223}; - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd335; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6226}, %rd335; - } - shf.r.wrap.b32 %r6227, %r6226, %r6225, 2; - shf.r.wrap.b32 %r6228, %r6225, %r6226, 2; - mov.b64 %rd372, {%r6228, %r6227}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd325; - } - { - .reg .b32 %dummy; - mov.b64 {%r6230,%dummy}, %rd325; - } - shf.l.wrap.b32 %r6231, %r6230, %r6229, 18; - shf.l.wrap.b32 %r6232, %r6229, %r6230, 18; - mov.b64 %rd373, {%r6232, %r6231}; - { - .reg .b32 %dummy; - mov.b64 {%r6233,%dummy}, %rd351; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6234}, %rd351; - } - shf.r.wrap.b32 %r6235, %r6234, %r6233, 25; - shf.r.wrap.b32 %r6236, %r6233, %r6234, 25; - mov.b64 %rd374, {%r6236, %r6235}; - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd339; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6238}, %rd339; - } - shf.r.wrap.b32 %r6239, %r6238, %r6237, 3; - shf.r.wrap.b32 %r6240, %r6237, %r6238, 3; - mov.b64 %rd375, {%r6240, %r6239}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd350; - } - { - .reg .b32 %dummy; - mov.b64 {%r6242,%dummy}, %rd350; - } - shf.l.wrap.b32 %r6243, %r6242, %r6241, 20; - shf.l.wrap.b32 %r6244, %r6241, %r6242, 20; - mov.b64 %rd376, {%r6244, %r6243}; - { - .reg .b32 %dummy; - mov.b64 {%r6245,%dummy}, %rd329; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6246}, %rd329; - } - shf.r.wrap.b32 %r6247, %r6246, %r6245, 20; - shf.r.wrap.b32 %r6248, %r6245, %r6246, 20; - mov.b64 %rd377, {%r6248, %r6247}; - not.b64 %rd378, %rd377; - and.b64 %rd379, %rd371, %rd378; - xor.b64 
%rd380, %rd379, %rd321; - not.b64 %rd381, %rd371; - and.b64 %rd382, %rd359, %rd381; - xor.b64 %rd460, %rd382, %rd377; - not.b64 %rd383, %rd359; - and.b64 %rd384, %rd365, %rd383; - xor.b64 %rd455, %rd384, %rd371; - not.b64 %rd385, %rd365; - and.b64 %rd386, %rd321, %rd385; - xor.b64 %rd450, %rd386, %rd359; - not.b64 %rd387, %rd321; - and.b64 %rd388, %rd377, %rd387; - xor.b64 %rd445, %rd365, %rd388; - not.b64 %rd389, %rd376; - and.b64 %rd390, %rd355, %rd389; - xor.b64 %rd464, %rd390, %rd360; - not.b64 %rd391, %rd355; - and.b64 %rd392, %rd362, %rd391; - xor.b64 %rd459, %rd392, %rd376; - not.b64 %rd393, %rd362; - and.b64 %rd394, %rd375, %rd393; - xor.b64 %rd454, %rd394, %rd355; - not.b64 %rd395, %rd375; - and.b64 %rd396, %rd360, %rd395; - xor.b64 %rd449, %rd396, %rd362; - not.b64 %rd397, %rd360; - and.b64 %rd398, %rd376, %rd397; - xor.b64 %rd444, %rd375, %rd398; - not.b64 %rd399, %rd356; - and.b64 %rd400, %rd370, %rd399; - xor.b64 %rd463, %rd400, %rd354; - not.b64 %rd401, %rd370; - and.b64 %rd402, %rd369, %rd401; - xor.b64 %rd458, %rd402, %rd356; - not.b64 %rd403, %rd369; - and.b64 %rd404, %rd373, %rd403; - xor.b64 %rd453, %rd404, %rd370; - not.b64 %rd405, %rd373; - and.b64 %rd406, %rd354, %rd405; - xor.b64 %rd448, %rd406, %rd369; - not.b64 %rd407, %rd354; - and.b64 %rd408, %rd356, %rd407; - xor.b64 %rd443, %rd373, %rd408; - not.b64 %rd409, %rd361; - and.b64 %rd410, %rd357, %rd409; - xor.b64 %rd462, %rd410, %rd366; - not.b64 %rd411, %rd357; - and.b64 %rd412, %rd358, %rd411; - xor.b64 %rd457, %rd412, %rd361; - not.b64 %rd413, %rd358; - and.b64 %rd414, %rd368, %rd413; - xor.b64 %rd452, %rd414, %rd357; - not.b64 %rd415, %rd368; - and.b64 %rd416, %rd366, %rd415; - xor.b64 %rd447, %rd416, %rd358; - not.b64 %rd417, %rd366; - and.b64 %rd418, %rd361, %rd417; - xor.b64 %rd442, %rd368, %rd418; - not.b64 %rd419, %rd363; - and.b64 %rd420, %rd374, %rd419; - xor.b64 %rd461, %rd420, %rd372; - not.b64 %rd421, %rd374; - and.b64 %rd422, %rd367, %rd421; - xor.b64 %rd456, %rd422, %rd363; - not.b64 %rd423, %rd367; - and.b64 %rd424, %rd364, %rd423; - xor.b64 %rd451, %rd424, %rd374; - not.b64 %rd425, %rd364; - and.b64 %rd426, %rd372, %rd425; - xor.b64 %rd446, %rd426, %rd367; - not.b64 %rd427, %rd372; - and.b64 %rd428, %rd363, %rd427; - xor.b64 %rd441, %rd364, %rd428; - ld.global.nc.u64 %rd429, [%rd440]; - xor.b64 %rd73, %rd380, %rd429; - add.s64 %rd440, %rd440, 8; - add.s32 %r6249, %r6249, 1; - setp.ne.s32 %p11, %r6249, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd450, %rd75; - @%p12 bra $L__BB0_12; - bra.uni $L__BB0_11; + // begin inline asm + shf.l.wrap.b32 %r16840, %r16841, %r16842, %r16843; + // end inline asm + mov.u32 %r16851, 43; + // begin inline asm + shf.l.wrap.b32 %r16844, %r16850, %r16849, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16848, %r16849, %r16850, %r16851; + // end inline asm + mov.u32 %r16859, 25; + // begin inline asm + shf.l.wrap.b32 %r16852, %r16858, %r16857, %r16859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16856, %r16857, %r16858, %r16859; + // end inline asm + mov.u32 %r16867, 8; + // begin inline asm + shf.l.wrap.b32 %r16860, %r16866, %r16865, %r16867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16864, %r16865, %r16866, %r16867; + // end inline asm + mov.u32 %r16875, 56; + // begin inline asm + shf.l.wrap.b32 %r16868, %r16874, %r16873, %r16875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16872, %r16873, %r16874, %r16875; + // end inline asm + mov.u32 %r16883, 41; + // 
begin inline asm + shf.l.wrap.b32 %r16876, %r16882, %r16881, %r16883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16880, %r16881, %r16882, %r16883; + // end inline asm + mov.u32 %r16891, 27; + // begin inline asm + shf.l.wrap.b32 %r16884, %r16890, %r16889, %r16891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16888, %r16889, %r16890, %r16891; + // end inline asm + mov.u32 %r16899, 14; + // begin inline asm + shf.l.wrap.b32 %r16892, %r16898, %r16897, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16896, %r16897, %r16898, %r16899; + // end inline asm + mov.u32 %r16907, 2; + // begin inline asm + shf.l.wrap.b32 %r16900, %r16906, %r16905, %r16907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16904, %r16905, %r16906, %r16907; + // end inline asm + mov.u32 %r16915, 55; + // begin inline asm + shf.l.wrap.b32 %r16908, %r16914, %r16913, %r16915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16912, %r16913, %r16914, %r16915; + // end inline asm + mov.u32 %r16923, 45; + // begin inline asm + shf.l.wrap.b32 %r16916, %r16922, %r16921, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16920, %r16921, %r16922, %r16923; + // end inline asm + mov.u32 %r16931, 36; + // begin inline asm + shf.l.wrap.b32 %r16924, %r16930, %r16929, %r16931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16928, %r16929, %r16930, %r16931; + // end inline asm + mov.u32 %r16939, 28; + // begin inline asm + shf.l.wrap.b32 %r16932, %r16938, %r16937, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16936, %r16937, %r16938, %r16939; + // end inline asm + mov.u32 %r16947, 21; + // begin inline asm + shf.l.wrap.b32 %r16940, %r16946, %r16945, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16944, %r16945, %r16946, %r16947; + // end inline asm + mov.u32 %r16955, 15; + // begin inline asm + shf.l.wrap.b32 %r16948, %r16954, %r16953, %r16955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16952, %r16953, %r16954, %r16955; + // end inline asm + mov.u32 %r16963, 10; + // begin inline asm + shf.l.wrap.b32 %r16956, %r16962, %r16961, %r16963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16960, %r16961, %r16962, %r16963; + // end inline asm + mov.u32 %r16971, 6; + // begin inline asm + shf.l.wrap.b32 %r16964, %r16970, %r16969, %r16971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16968, %r16969, %r16970, %r16971; + // end inline asm + mov.u32 %r16979, 3; + // begin inline asm + shf.l.wrap.b32 %r16972, %r16978, %r16977, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16976, %r16977, %r16978, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16980, %r16986, %r16985, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16984, %r16985, %r16986, %r16665; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16988, %r17023, %r16796, %r16844, 0xD2; + lop3.b32 %r16989, %r17026, %r16800, %r16848, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30298, %r16796, %r16844, %r16940, 0xD2; + lop3.b32 %r30299, %r16800, %r16848, %r16944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30294, %r16844, %r16940, %r16892, 0xD2; + lop3.b32 %r30295, %r16848, %r16944, %r16896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30290, %r16940, %r16892, %r17023, 0xD2; + lop3.b32 %r30291, %r16944, %r16896, %r17026, 0xD2; + // end inline 
asm + // begin inline asm + // chi + lop3.b32 %r30288, %r16892, %r17023, %r16796, 0xD2; + lop3.b32 %r30289, %r16896, %r17026, %r16800, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30284, %r16932, %r16804, %r16972, 0xD2; + lop3.b32 %r30285, %r16936, %r16808, %r16976, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30296, %r16804, %r16972, %r16916, 0xD2; + lop3.b32 %r30297, %r16808, %r16976, %r16920, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30292, %r16972, %r16916, %r16812, 0xD2; + lop3.b32 %r30293, %r16976, %r16920, %r16816, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30264, %r16916, %r16812, %r16932, 0xD2; + lop3.b32 %r30265, %r16920, %r16816, %r16936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30264, %r30265}; + // begin inline asm + // chi + lop3.b32 %r30256, %r16812, %r16932, %r16804, 0xD2; + lop3.b32 %r30257, %r16816, %r16936, %r16808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30256, %r30257}; + // begin inline asm + // chi + lop3.b32 %r30282, %r16980, %r16964, %r16852, 0xD2; + lop3.b32 %r30283, %r16984, %r16968, %r16856, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30282, %r30283}; + // begin inline asm + // chi + lop3.b32 %r30276, %r16964, %r16852, %r16860, 0xD2; + lop3.b32 %r30277, %r16968, %r16856, %r16864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30276, %r30277}; + // begin inline asm + // chi + lop3.b32 %r30270, %r16852, %r16860, %r16828, 0xD2; + lop3.b32 %r30271, %r16856, %r16864, %r16832, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30270, %r30271}; + // begin inline asm + // chi + lop3.b32 %r30262, %r16860, %r16828, %r16980, 0xD2; + lop3.b32 %r30263, %r16864, %r16832, %r16984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30262, %r30263}; + // begin inline asm + // chi + lop3.b32 %r30254, %r16828, %r16980, %r16964, 0xD2; + lop3.b32 %r30255, %r16832, %r16984, %r16968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30254, %r30255}; + // begin inline asm + // chi + lop3.b32 %r30280, %r16884, %r16924, %r16956, 0xD2; + lop3.b32 %r30281, %r16888, %r16928, %r16960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30280, %r30281}; + // begin inline asm + // chi + lop3.b32 %r30274, %r16924, %r16956, %r16948, 0xD2; + lop3.b32 %r30275, %r16928, %r16960, %r16952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30274, %r30275}; + // begin inline asm + // chi + lop3.b32 %r30268, %r16956, %r16948, %r16868, 0xD2; + lop3.b32 %r30269, %r16960, %r16952, %r16872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30268, %r30269}; + // begin inline asm + // chi + lop3.b32 %r30260, %r16948, %r16868, %r16884, 0xD2; + lop3.b32 %r30261, %r16952, %r16872, %r16888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30260, %r30261}; + // begin inline asm + // chi + lop3.b32 %r30252, %r16868, %r16884, %r16924, 0xD2; + lop3.b32 %r30253, %r16872, %r16888, %r16928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30252, %r30253}; + // begin inline asm + // chi + lop3.b32 %r30278, %r16836, %r16908, %r16820, 0xD2; + lop3.b32 %r30279, %r16840, %r16912, %r16824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30278, %r30279}; + // begin inline asm + // chi + lop3.b32 %r30272, %r16908, %r16820, %r16876, 0xD2; + lop3.b32 %r30273, %r16912, %r16824, %r16880, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30272, %r30273}; + // begin inline asm + // 
chi + lop3.b32 %r30266, %r16820, %r16876, %r16900, 0xD2; + lop3.b32 %r30267, %r16824, %r16880, %r16904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30266, %r30267}; + // begin inline asm + // chi + lop3.b32 %r30258, %r16876, %r16900, %r16836, 0xD2; + lop3.b32 %r30259, %r16880, %r16904, %r16840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30258, %r30259}; + // begin inline asm + // chi + lop3.b32 %r30250, %r16900, %r16836, %r16908, 0xD2; + lop3.b32 %r30251, %r16904, %r16840, %r16912, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30250, %r30251}; + mul.wide.s32 %rd804, %r30300, 8; + mov.u64 %rd805, keccak_round_constants; + cvta.const.u64 %rd806, %rd805; + add.s64 %rd803, %rd806, %rd804; + // begin inline asm + ld.global.nc.v2.u32 {%r17188,%r17189}, [%rd803]; + // end inline asm + xor.b32 %r30286, %r16988, %r17188; + xor.b32 %r30287, %r16989, %r17189; + add.s32 %r30300, %r30300, 1; + setp.lt.u32 %p35, %r30300, 23; + @%p35 bra $L__BB2_59; + + add.u64 %rd178, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30298, %r30299}; + st.local.v2.u32 [%rd3+72], {%r30296, %r30297}; + st.local.v2.u32 [%rd3+40], {%r30294, %r30295}; + st.local.v2.u32 [%rd3+80], {%r30292, %r30293}; + st.local.v2.u32 [%rd3+48], {%r30290, %r30291}; + st.local.v2.u32 [%rd3+56], {%r30288, %r30289}; + st.local.v2.u32 [%rd3+24], {%r30286, %r30287}; + // begin inline asm + // xor5 + lop3.b32 %r17200, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r17200, %r17200, %r30280, %r30278, 0x96; + lop3.b32 %r17201, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r17201, %r17201, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17212, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r17212, %r17212, %r30274, %r30272, 0x96; + lop3.b32 %r17213, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r17213, %r17213, %r30275, %r30273, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17224, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r17224, %r17224, %r30268, %r30266, 0x96; + lop3.b32 %r17225, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r17225, %r17225, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17236, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r17236, %r17236, %r30260, %r30258, 0x96; + lop3.b32 %r17237, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r17237, %r17237, %r30261, %r30259, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17248, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r17248, %r17248, %r30252, %r30250, 0x96; + lop3.b32 %r17249, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r17249, %r17249, %r30253, %r30251, 0x96; + // end inline asm + mov.u32 %r17452, 1; + // begin inline asm + shf.l.wrap.b32 %r17260, %r17213, %r17212, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17264, %r17212, %r17213, %r17452; + // end inline asm + xor.b32 %r17479, %r17260, %r17248; + xor.b32 %r17480, %r17264, %r17249; + xor.b32 %r17407, %r30286, %r17479; + xor.b32 %r17410, %r30287, %r17480; + xor.b32 %r17370, %r30283, %r17480; + xor.b32 %r17369, %r30282, %r17479; + st.local.v2.u32 [%rd3+104], {%r17369, %r17370}; + // begin inline asm + shf.l.wrap.b32 %r17268, %r17225, %r17224, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17272, %r17224, %r17225, %r17452; + // end inline asm + xor.b32 %r17481, %r17268, %r17200; + xor.b32 %r17482, %r17272, %r17201; + xor.b32 %r17306, %r30296, %r17481; + xor.b32 %r17305, %r30297, %r17482; + xor.b32 %r17345, %r30275, %r17482; + 
xor.b32 %r17346, %r30274, %r17481; + st.local.v2.u32 [%rd3+152], {%r17346, %r17345}; + // begin inline asm + shf.l.wrap.b32 %r17276, %r17237, %r17236, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17280, %r17236, %r17237, %r17452; + // end inline asm + xor.b32 %r17483, %r17276, %r17212; + xor.b32 %r17484, %r17280, %r17213; + xor.b32 %r17329, %r30271, %r17484; + xor.b32 %r17330, %r30270, %r17483; + st.local.v2.u32 [%rd3+120], {%r17330, %r17329}; + xor.b32 %r17321, %r30267, %r17484; + xor.b32 %r17322, %r30266, %r17483; + st.local.v2.u32 [%rd3+200], {%r17322, %r17321}; + // begin inline asm + shf.l.wrap.b32 %r17284, %r17249, %r17248, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17288, %r17248, %r17249, %r17452; + // end inline asm + xor.b32 %r17485, %r17284, %r17224; + xor.b32 %r17486, %r17288, %r17225; + xor.b32 %r17353, %r30290, %r17485; + xor.b32 %r17354, %r30291, %r17486; + xor.b32 %r17362, %r30261, %r17486; + xor.b32 %r17361, %r30260, %r17485; + st.local.v2.u32 [%rd3+168], {%r17361, %r17362}; + // begin inline asm + shf.l.wrap.b32 %r17292, %r17201, %r17200, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17296, %r17200, %r17201, %r17452; + // end inline asm + xor.b32 %r17487, %r17292, %r17236; + xor.b32 %r17488, %r17296, %r17237; + xor.b32 %r17313, %r30256, %r17487; + xor.b32 %r17314, %r30257, %r17488; + xor.b32 %r17338, %r30251, %r17488; + xor.b32 %r17337, %r30250, %r17487; + st.local.v2.u32 [%rd3+216], {%r17337, %r17338}; + // begin inline asm + shf.l.wrap.b32 %r17300, %r17306, %r17305, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17304, %r17305, %r17306, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17308, %r17314, %r17313, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17312, %r17313, %r17314, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17320, %r17321, %r17322, %r16819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17316, %r17322, %r17321, %r16819; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r17316, %r17320}; + // begin inline asm + shf.l.wrap.b32 %r17324, %r17330, %r17329, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17328, %r17329, %r17330, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17332, %r17338, %r17337, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17336, %r17337, %r17338, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17344, %r17345, %r17346, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17340, %r17346, %r17345, %r16923; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r17340, %r17344}; + // begin inline asm + shf.l.wrap.b32 %r17348, %r17354, %r17353, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17352, %r17353, %r17354, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17356, %r17362, %r17361, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17360, %r17361, %r17362, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17364, %r17370, %r17369, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17368, %r17369, %r17370, %r16979; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17372, %r17407, %r17300, %r17324, 0xD2; + lop3.b32 %r17373, %r17410, %r17304, %r17328, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r17300, %r17324, 
%r17356, 0xD2; + lop3.b32 %r30434, %r17304, %r17328, %r17360, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30429, %r17324, %r17356, %r17332, 0xD2; + lop3.b32 %r30430, %r17328, %r17360, %r17336, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + // begin inline asm + // chi + lop3.b32 %r30425, %r17356, %r17332, %r17407, 0xD2; + lop3.b32 %r30426, %r17360, %r17336, %r17410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + // begin inline asm + // chi + lop3.b32 %r30423, %r17332, %r17407, %r17300, 0xD2; + lop3.b32 %r30424, %r17336, %r17410, %r17304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + // begin inline asm + // chi + lop3.b32 %r30419, %r17348, %r17308, %r17364, 0xD2; + lop3.b32 %r30420, %r17352, %r17312, %r17368, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + // begin inline asm + // chi + lop3.b32 %r30431, %r17308, %r17364, %r17340, 0xD2; + lop3.b32 %r30432, %r17312, %r17368, %r17344, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + // begin inline asm + // chi + lop3.b32 %r30427, %r17364, %r17340, %r17316, 0xD2; + lop3.b32 %r30428, %r17368, %r17344, %r17320, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + add.s64 %rd807, %rd806, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17436,%r17437}, [%rd807]; + // end inline asm + xor.b32 %r30421, %r17372, %r17436; + xor.b32 %r30422, %r17373, %r17437; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.u64 [%rd178], %rd354; + mov.u64 %rd811, 1179641; + st.local.u64 [%rd178+8], %rd811; + st.local.u32 [%rd178+16], %r1679; + ld.global.u64 %rd812, [%rd129]; + ld.global.u64 %rd813, [%rd129+8]; + ld.global.u64 %rd814, [%rd129+16]; + ld.global.u64 %rd815, [%rd129+24]; + ld.global.u64 %rd816, [%rd129+32]; + ld.global.u64 %rd817, [%rd129+40]; + ld.global.u64 %rd818, [%rd129+48]; + ld.global.u64 %rd819, [%rd129+56]; + st.local.u64 [%rd178+32], %rd813; + st.local.u64 [%rd178+40], %rd814; + st.local.u64 [%rd178+48], %rd815; + st.local.u64 [%rd178+56], %rd816; + st.local.u64 [%rd178+64], %rd817; + st.local.u64 [%rd178+72], %rd818; + st.local.u64 [%rd178+80], %rd819; + cvt.u32.u64 %r17489, %rd812; + xor.b32 %r17490, %r1679, %r17489; + st.local.u64 [%rd178+24], %rd812; + st.local.u32 [%rd178+24], %r17490; + mov.u32 %r30301, 0; + st.local.v2.u32 [%rd178+96], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+104], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+112], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+120], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+128], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+136], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+144], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+152], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+160], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+168], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+176], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+184], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+192], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+200], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+208], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+216], {%r30301, %r30301}; + mov.u32 %r30316, -2147483648; + st.local.v2.u32 [%rd178+88], {%r17452, %r30316}; + ld.local.v2.u32 {%r30337, %r30338}, [%rd178+24]; + mov.b64 {%r30335, %r30336}, %rd817; + shr.u64 %rd820, %rd813, 32; + cvt.u32.u64 %r30349, %rd813; + cvt.u32.u64 %r30350, %rd820; + 
shr.u64 %rd821, %rd818, 32; + cvt.u32.u64 %r30347, %rd818; + cvt.u32.u64 %r30348, %rd821; + shr.u64 %rd822, %rd814, 32; + cvt.u32.u64 %r30345, %rd814; + cvt.u32.u64 %r30346, %rd822; + shr.u64 %rd823, %rd819, 32; + cvt.u32.u64 %r30343, %rd819; + cvt.u32.u64 %r30344, %rd823; + shr.u64 %rd824, %rd815, 32; + cvt.u32.u64 %r30341, %rd815; + cvt.u32.u64 %r30342, %rd824; + shr.u64 %rd825, %rd816, 32; + cvt.u32.u64 %r30339, %rd816; + cvt.u32.u64 %r30340, %rd825; + mov.u32 %r30302, %r30301; + mov.u32 %r30303, %r30301; + mov.u32 %r30304, %r30301; + mov.u32 %r30305, %r30301; + mov.u32 %r30306, %r30301; + mov.u32 %r30307, %r30301; + mov.u32 %r30308, %r30301; + mov.u32 %r30309, %r30301; + mov.u32 %r30310, %r30301; + mov.u32 %r30311, %r30301; + mov.u32 %r30312, %r30301; + mov.u32 %r30313, %r30301; + mov.u32 %r30314, %r30301; + mov.u32 %r30315, %r17452; + mov.u32 %r30317, %r30301; + mov.u32 %r30318, %r30301; + mov.u32 %r30319, %r30301; + mov.u32 %r30320, %r30301; + mov.u32 %r30321, %r30301; + mov.u32 %r30322, %r30301; + mov.u32 %r30323, %r30301; + mov.u32 %r30324, %r30301; + mov.u32 %r30325, %r30301; + mov.u32 %r30326, %r30301; + mov.u32 %r30327, %r30301; + mov.u32 %r30328, %r30301; + mov.u32 %r30329, %r30301; + mov.u32 %r30330, %r30301; + mov.u32 %r30331, %r30301; + mov.u32 %r30332, %r30301; + mov.u32 %r30333, %r30301; + mov.u32 %r30334, %r30301; + mov.u32 %r30351, %r30301; + +$L__BB2_61: + // begin inline asm + // xor5 + lop3.b32 %r17493, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17493, %r17493, %r30331, %r30329, 0x96; + lop3.b32 %r17494, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17494, %r17494, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17505, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r17505, %r17505, %r30325, %r30323, 0x96; + lop3.b32 %r17506, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r17506, %r17506, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17517, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r17517, %r17517, %r30319, %r30317, 0x96; + lop3.b32 %r17518, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r17518, %r17518, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17529, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r17529, %r17529, %r30311, %r30309, 0x96; + lop3.b32 %r17530, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r17530, %r17530, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17541, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r17541, %r17541, %r30303, %r30301, 0x96; + lop3.b32 %r17542, %r30340, %r30308, %r30306, 0x96; + lop3.b32 %r17542, %r17542, %r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17553, %r17506, %r17505, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17557, %r17505, %r17506, %r17452; + // end inline asm + xor.b32 %r17987, %r17553, %r17541; + xor.b32 %r17988, %r17557, %r17542; + xor.b32 %r17820, %r30337, %r17987; + xor.b32 %r17823, %r30338, %r17988; + xor.b32 %r17727, %r30335, %r17987; + xor.b32 %r17726, %r30336, %r17988; + xor.b32 %r17774, %r30333, %r17987; + xor.b32 %r17775, %r30334, %r17988; + xor.b32 %r17679, %r30331, %r17987; + xor.b32 %r17678, %r30332, %r17988; + xor.b32 %r17630, %r30329, %r17987; + xor.b32 %r17631, %r30330, %r17988; + // begin inline asm + shf.l.wrap.b32 %r17561, %r17518, %r17517, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17565, %r17517, %r17518, %r17452; + // end inline asm + xor.b32 
%r17989, %r17561, %r17493; + xor.b32 %r17990, %r17565, %r17494; + xor.b32 %r17782, %r30349, %r17989; + xor.b32 %r17783, %r30350, %r17990; + xor.b32 %r17599, %r30347, %r17989; + xor.b32 %r17598, %r30348, %r17990; + xor.b32 %r17758, %r30327, %r17989; + xor.b32 %r17759, %r30328, %r17990; + xor.b32 %r17719, %r30325, %r17989; + xor.b32 %r17718, %r30326, %r17990; + xor.b32 %r17702, %r30323, %r17989; + xor.b32 %r17703, %r30324, %r17990; + // begin inline asm + shf.l.wrap.b32 %r17569, %r17530, %r17529, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17573, %r17529, %r17530, %r17452; + // end inline asm + xor.b32 %r17991, %r17569, %r17505; + xor.b32 %r17992, %r17573, %r17506; + xor.b32 %r17639, %r30345, %r17991; + xor.b32 %r17638, %r30346, %r17992; + xor.b32 %r17766, %r30343, %r17991; + xor.b32 %r17767, %r30344, %r17992; + xor.b32 %r17647, %r30321, %r17991; + xor.b32 %r17646, %r30322, %r17992; + xor.b32 %r17750, %r30319, %r17991; + xor.b32 %r17751, %r30320, %r17992; + xor.b32 %r17615, %r30317, %r17991; + xor.b32 %r17614, %r30318, %r17992; + // begin inline asm + shf.l.wrap.b32 %r17577, %r17542, %r17541, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17581, %r17541, %r17542, %r17452; + // end inline asm + xor.b32 %r17993, %r17577, %r17517; + xor.b32 %r17994, %r17581, %r17518; + xor.b32 %r17734, %r30341, %r17993; + xor.b32 %r17735, %r30342, %r17994; + xor.b32 %r17711, %r30315, %r17993; + xor.b32 %r17710, %r30316, %r17994; + xor.b32 %r17654, %r30313, %r17993; + xor.b32 %r17655, %r30314, %r17994; + xor.b32 %r17742, %r30311, %r17993; + xor.b32 %r17743, %r30312, %r17994; + xor.b32 %r17671, %r30309, %r17993; + xor.b32 %r17670, %r30310, %r17994; + // begin inline asm + shf.l.wrap.b32 %r17585, %r17494, %r17493, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17589, %r17493, %r17494, %r17452; + // end inline asm + xor.b32 %r17995, %r17585, %r17529; + xor.b32 %r17996, %r17589, %r17530; + xor.b32 %r17686, %r30339, %r17995; + xor.b32 %r17687, %r30340, %r17996; + xor.b32 %r17606, %r30307, %r17995; + xor.b32 %r17607, %r30308, %r17996; + xor.b32 %r17623, %r30305, %r17995; + xor.b32 %r17622, %r30306, %r17996; + xor.b32 %r17662, %r30303, %r17995; + xor.b32 %r17663, %r30304, %r17996; + xor.b32 %r17694, %r30301, %r17995; + xor.b32 %r17695, %r30302, %r17996; + mov.u32 %r17600, 44; + // begin inline asm + shf.l.wrap.b32 %r17593, %r17599, %r17598, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17597, %r17598, %r17599, %r17600; + // end inline asm + mov.u32 %r17608, 20; + // begin inline asm + shf.l.wrap.b32 %r17601, %r17607, %r17606, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17605, %r17606, %r17607, %r17608; + // end inline asm + mov.u32 %r17616, 61; + // begin inline asm + shf.l.wrap.b32 %r17609, %r17615, %r17614, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17613, %r17614, %r17615, %r17616; + // end inline asm + mov.u32 %r17624, 39; + // begin inline asm + shf.l.wrap.b32 %r17617, %r17623, %r17622, %r17624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17621, %r17622, %r17623, %r17624; + // end inline asm + mov.u32 %r17632, 18; + // begin inline asm + shf.l.wrap.b32 %r17625, %r17631, %r17630, %r17632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17629, %r17630, %r17631, %r17632; + // end inline asm + mov.u32 %r17640, 62; + // begin inline asm + shf.l.wrap.b32 %r17633, %r17639, %r17638, %r17640; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r17637, %r17638, %r17639, %r17640; + // end inline asm + mov.u32 %r17648, 43; + // begin inline asm + shf.l.wrap.b32 %r17641, %r17647, %r17646, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17645, %r17646, %r17647, %r17648; + // end inline asm + mov.u32 %r17656, 25; + // begin inline asm + shf.l.wrap.b32 %r17649, %r17655, %r17654, %r17656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17653, %r17654, %r17655, %r17656; + // end inline asm + mov.u32 %r17664, 8; + // begin inline asm + shf.l.wrap.b32 %r17657, %r17663, %r17662, %r17664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17661, %r17662, %r17663, %r17664; + // end inline asm + mov.u32 %r17672, 56; + // begin inline asm + shf.l.wrap.b32 %r17665, %r17671, %r17670, %r17672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17669, %r17670, %r17671, %r17672; + // end inline asm + mov.u32 %r17680, 41; + // begin inline asm + shf.l.wrap.b32 %r17673, %r17679, %r17678, %r17680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17677, %r17678, %r17679, %r17680; + // end inline asm + mov.u32 %r17688, 27; + // begin inline asm + shf.l.wrap.b32 %r17681, %r17687, %r17686, %r17688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17685, %r17686, %r17687, %r17688; + // end inline asm + mov.u32 %r17696, 14; + // begin inline asm + shf.l.wrap.b32 %r17689, %r17695, %r17694, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17693, %r17694, %r17695, %r17696; + // end inline asm + mov.u32 %r17704, 2; + // begin inline asm + shf.l.wrap.b32 %r17697, %r17703, %r17702, %r17704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17701, %r17702, %r17703, %r17704; + // end inline asm + mov.u32 %r17712, 55; + // begin inline asm + shf.l.wrap.b32 %r17705, %r17711, %r17710, %r17712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17709, %r17710, %r17711, %r17712; + // end inline asm + mov.u32 %r17720, 45; + // begin inline asm + shf.l.wrap.b32 %r17713, %r17719, %r17718, %r17720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17717, %r17718, %r17719, %r17720; + // end inline asm + mov.u32 %r17728, 36; + // begin inline asm + shf.l.wrap.b32 %r17721, %r17727, %r17726, %r17728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17725, %r17726, %r17727, %r17728; + // end inline asm + mov.u32 %r17736, 28; + // begin inline asm + shf.l.wrap.b32 %r17729, %r17735, %r17734, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17733, %r17734, %r17735, %r17736; + // end inline asm + mov.u32 %r17744, 21; + // begin inline asm + shf.l.wrap.b32 %r17737, %r17743, %r17742, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17741, %r17742, %r17743, %r17744; + // end inline asm + mov.u32 %r17752, 15; + // begin inline asm + shf.l.wrap.b32 %r17745, %r17751, %r17750, %r17752; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17749, %r17750, %r17751, %r17752; + // end inline asm + mov.u32 %r17760, 10; + // begin inline asm + shf.l.wrap.b32 %r17753, %r17759, %r17758, %r17760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17757, %r17758, %r17759, %r17760; + // end inline asm + mov.u32 %r17768, 6; + // begin inline asm + shf.l.wrap.b32 %r17761, %r17767, %r17766, %r17768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17765, %r17766, %r17767, %r17768; + // end inline asm + mov.u32 %r17776, 3; + // begin inline asm + shf.l.wrap.b32 %r17769, %r17775, 
%r17774, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17773, %r17774, %r17775, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17777, %r17783, %r17782, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17781, %r17782, %r17783, %r17452; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17785, %r17820, %r17593, %r17641, 0xD2; + lop3.b32 %r17786, %r17823, %r17597, %r17645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30349, %r17593, %r17641, %r17737, 0xD2; + lop3.b32 %r30350, %r17597, %r17645, %r17741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30345, %r17641, %r17737, %r17689, 0xD2; + lop3.b32 %r30346, %r17645, %r17741, %r17693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30341, %r17737, %r17689, %r17820, 0xD2; + lop3.b32 %r30342, %r17741, %r17693, %r17823, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30339, %r17689, %r17820, %r17593, 0xD2; + lop3.b32 %r30340, %r17693, %r17823, %r17597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30335, %r17729, %r17601, %r17769, 0xD2; + lop3.b32 %r30336, %r17733, %r17605, %r17773, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30347, %r17601, %r17769, %r17713, 0xD2; + lop3.b32 %r30348, %r17605, %r17773, %r17717, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30343, %r17769, %r17713, %r17609, 0xD2; + lop3.b32 %r30344, %r17773, %r17717, %r17613, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30315, %r17713, %r17609, %r17729, 0xD2; + lop3.b32 %r30316, %r17717, %r17613, %r17733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30315, %r30316}; + // begin inline asm + // chi + lop3.b32 %r30307, %r17609, %r17729, %r17601, 0xD2; + lop3.b32 %r30308, %r17613, %r17733, %r17605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30307, %r30308}; + // begin inline asm + // chi + lop3.b32 %r30333, %r17777, %r17761, %r17649, 0xD2; + lop3.b32 %r30334, %r17781, %r17765, %r17653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30333, %r30334}; + // begin inline asm + // chi + lop3.b32 %r30327, %r17761, %r17649, %r17657, 0xD2; + lop3.b32 %r30328, %r17765, %r17653, %r17661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30327, %r30328}; + // begin inline asm + // chi + lop3.b32 %r30321, %r17649, %r17657, %r17625, 0xD2; + lop3.b32 %r30322, %r17653, %r17661, %r17629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30321, %r30322}; + // begin inline asm + // chi + lop3.b32 %r30313, %r17657, %r17625, %r17777, 0xD2; + lop3.b32 %r30314, %r17661, %r17629, %r17781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30313, %r30314}; + // begin inline asm + // chi + lop3.b32 %r30305, %r17625, %r17777, %r17761, 0xD2; + lop3.b32 %r30306, %r17629, %r17781, %r17765, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30305, %r30306}; + // begin inline asm + // chi + lop3.b32 %r30331, %r17681, %r17721, %r17753, 0xD2; + lop3.b32 %r30332, %r17685, %r17725, %r17757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30331, %r30332}; + // begin inline asm + // chi + lop3.b32 %r30325, %r17721, %r17753, %r17745, 0xD2; + lop3.b32 %r30326, %r17725, %r17757, %r17749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30325, %r30326}; + // begin inline asm + // chi + lop3.b32 %r30319, %r17753, %r17745, %r17665, 0xD2; + lop3.b32 
%r30320, %r17757, %r17749, %r17669, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30319, %r30320}; + // begin inline asm + // chi + lop3.b32 %r30311, %r17745, %r17665, %r17681, 0xD2; + lop3.b32 %r30312, %r17749, %r17669, %r17685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30311, %r30312}; + // begin inline asm + // chi + lop3.b32 %r30303, %r17665, %r17681, %r17721, 0xD2; + lop3.b32 %r30304, %r17669, %r17685, %r17725, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30303, %r30304}; + // begin inline asm + // chi + lop3.b32 %r30329, %r17633, %r17705, %r17617, 0xD2; + lop3.b32 %r30330, %r17637, %r17709, %r17621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30329, %r30330}; + // begin inline asm + // chi + lop3.b32 %r30323, %r17705, %r17617, %r17673, 0xD2; + lop3.b32 %r30324, %r17709, %r17621, %r17677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30323, %r30324}; + // begin inline asm + // chi + lop3.b32 %r30317, %r17617, %r17673, %r17697, 0xD2; + lop3.b32 %r30318, %r17621, %r17677, %r17701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30317, %r30318}; + // begin inline asm + // chi + lop3.b32 %r30309, %r17673, %r17697, %r17633, 0xD2; + lop3.b32 %r30310, %r17677, %r17701, %r17637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30309, %r30310}; + // begin inline asm + // chi + lop3.b32 %r30301, %r17697, %r17633, %r17705, 0xD2; + lop3.b32 %r30302, %r17701, %r17637, %r17709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30301, %r30302}; + mul.wide.s32 %rd827, %r30351, 8; + add.s64 %rd826, %rd806, %rd827; + // begin inline asm + ld.global.nc.v2.u32 {%r17985,%r17986}, [%rd826]; + // end inline asm + xor.b32 %r30337, %r17785, %r17985; + xor.b32 %r30338, %r17786, %r17986; + add.s32 %r30351, %r30351, 1; + setp.lt.u32 %p36, %r30351, 23; + @%p36 bra $L__BB2_61; + + mov.u32 %r30384, 0; + mov.u32 %r18096, 1; + st.local.v2.u32 [%rd178+32], {%r30349, %r30350}; + st.local.v2.u32 [%rd178+72], {%r30347, %r30348}; + st.local.v2.u32 [%rd178+40], {%r30345, %r30346}; + st.local.v2.u32 [%rd178+80], {%r30343, %r30344}; + st.local.v2.u32 [%rd178+48], {%r30341, %r30342}; + st.local.v2.u32 [%rd178+56], {%r30339, %r30340}; + st.local.v2.u32 [%rd178+24], {%r30337, %r30338}; + // begin inline asm + // xor5 + lop3.b32 %r17997, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17997, %r17997, %r30331, %r30329, 0x96; + lop3.b32 %r17998, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17998, %r17998, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18009, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r18009, %r18009, %r30325, %r30323, 0x96; + lop3.b32 %r18010, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r18010, %r18010, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18021, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r18021, %r18021, %r30319, %r30317, 0x96; + lop3.b32 %r18022, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r18022, %r18022, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18033, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r18033, %r18033, %r30311, %r30309, 0x96; + lop3.b32 %r18034, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r18034, %r18034, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18045, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r18045, %r18045, %r30303, %r30301, 0x96; + lop3.b32 %r18046, %r30340, %r30308, %r30306, 0x96; + 
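// xor5: a five-input XOR (the Keccak theta column parity), chained as two lop3 0x96 (3-input XOR) ops per 32-bit half of each 64-bit lane +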
lop3.b32 %r18046, %r18046, %r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18057, %r18010, %r18009, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18061, %r18009, %r18010, %r18096; + // end inline asm + xor.b32 %r18236, %r18057, %r18045; + xor.b32 %r18237, %r18061, %r18046; + xor.b32 %r18204, %r30337, %r18236; + xor.b32 %r18207, %r30338, %r18237; + xor.b32 %r18167, %r30334, %r18237; + xor.b32 %r18166, %r30333, %r18236; + st.local.v2.u32 [%rd178+104], {%r18166, %r18167}; + // begin inline asm + shf.l.wrap.b32 %r18065, %r18022, %r18021, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18069, %r18021, %r18022, %r18096; + // end inline asm + xor.b32 %r18238, %r18065, %r17997; + xor.b32 %r18239, %r18069, %r17998; + xor.b32 %r18103, %r30347, %r18238; + xor.b32 %r18102, %r30348, %r18239; + xor.b32 %r18142, %r30326, %r18239; + xor.b32 %r18143, %r30325, %r18238; + st.local.v2.u32 [%rd178+152], {%r18143, %r18142}; + // begin inline asm + shf.l.wrap.b32 %r18073, %r18034, %r18033, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18077, %r18033, %r18034, %r18096; + // end inline asm + xor.b32 %r18240, %r18073, %r18009; + xor.b32 %r18241, %r18077, %r18010; + xor.b32 %r18126, %r30322, %r18241; + xor.b32 %r18127, %r30321, %r18240; + st.local.v2.u32 [%rd178+120], {%r18127, %r18126}; + xor.b32 %r18118, %r30318, %r18241; + xor.b32 %r18119, %r30317, %r18240; + st.local.v2.u32 [%rd178+200], {%r18119, %r18118}; + // begin inline asm + shf.l.wrap.b32 %r18081, %r18046, %r18045, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18085, %r18045, %r18046, %r18096; + // end inline asm + xor.b32 %r18242, %r18081, %r18021; + xor.b32 %r18243, %r18085, %r18022; + xor.b32 %r18150, %r30341, %r18242; + xor.b32 %r18151, %r30342, %r18243; + xor.b32 %r18159, %r30312, %r18243; + xor.b32 %r18158, %r30311, %r18242; + st.local.v2.u32 [%rd178+168], {%r18158, %r18159}; + // begin inline asm + shf.l.wrap.b32 %r18089, %r17998, %r17997, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18093, %r17997, %r17998, %r18096; + // end inline asm + xor.b32 %r18244, %r18089, %r18033; + xor.b32 %r18245, %r18093, %r18034; + xor.b32 %r18110, %r30307, %r18244; + xor.b32 %r18111, %r30308, %r18245; + xor.b32 %r18135, %r30302, %r18245; + xor.b32 %r18134, %r30301, %r18244; + st.local.v2.u32 [%rd178+216], {%r18134, %r18135}; + // begin inline asm + shf.l.wrap.b32 %r18097, %r18103, %r18102, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18101, %r18102, %r18103, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18105, %r18111, %r18110, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18109, %r18110, %r18111, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18117, %r18118, %r18119, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18113, %r18119, %r18118, %r17616; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r18113, %r18117}; + // begin inline asm + shf.l.wrap.b32 %r18121, %r18127, %r18126, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18125, %r18126, %r18127, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18129, %r18135, %r18134, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18133, %r18134, %r18135, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18141, %r18142, %r18143, %r17720; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r18137, %r18143, %r18142, %r17720; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r18137, %r18141}; + // begin inline asm + shf.l.wrap.b32 %r18145, %r18151, %r18150, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18149, %r18150, %r18151, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18153, %r18159, %r18158, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18157, %r18158, %r18159, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18161, %r18167, %r18166, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18165, %r18166, %r18167, %r17776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18169, %r18204, %r18097, %r18121, 0xD2; + lop3.b32 %r18170, %r18207, %r18101, %r18125, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r18097, %r18121, %r18153, 0xD2; + lop3.b32 %r30485, %r18101, %r18125, %r18157, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30480, %r18121, %r18153, %r18129, 0xD2; + lop3.b32 %r30481, %r18125, %r18157, %r18133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + // begin inline asm + // chi + lop3.b32 %r30476, %r18153, %r18129, %r18204, 0xD2; + lop3.b32 %r30477, %r18157, %r18133, %r18207, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + // begin inline asm + // chi + lop3.b32 %r30474, %r18129, %r18204, %r18097, 0xD2; + lop3.b32 %r30475, %r18133, %r18207, %r18101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + // begin inline asm + // chi + lop3.b32 %r30470, %r18145, %r18105, %r18161, 0xD2; + lop3.b32 %r30471, %r18149, %r18109, %r18165, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + // begin inline asm + // chi + lop3.b32 %r30482, %r18105, %r18161, %r18137, 0xD2; + lop3.b32 %r30483, %r18109, %r18165, %r18141, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30478, %r18161, %r18137, %r18113, 0xD2; + lop3.b32 %r30479, %r18165, %r18141, %r18117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + // begin inline asm + ld.global.nc.v2.u32 {%r18233,%r18234}, [%rd807]; + // end inline asm + xor.b32 %r30472, %r18169, %r18233; + xor.b32 %r30473, %r18170, %r18234; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + add.s64 %rd180, %rd178, 24; + add.s64 %rd181, %rd3, 24; + +$L__BB2_63: + shl.b32 %r18246, %r30384, 2; + cvt.u64.u32 %rd837, %r18246; + and.b64 %rd838, %rd837, 60; + add.s64 %rd839, %rd181, %rd838; + xor.b32 %r18247, %r1678, %r30384; + mul.lo.s32 %r18248, %r18247, 16777619; + ld.local.u32 %r18249, [%rd839]; + xor.b32 %r18250, %r18248, %r18249; + mul.wide.u32 %rd840, %r18250, -954391867; + shr.u64 %rd841, %rd840, 32; + cvt.u32.u64 %r18251, %rd841; + sub.s32 %r18252, %r18250, %r18251; + shr.u32 %r18253, %r18252, 1; + add.s32 %r18254, %r18253, %r18251; + shr.u32 %r18255, %r18254, 20; + mul.lo.s32 %r18256, %r18255, 1179641; + sub.s32 %r18257, %r18250, %r18256; + mul.wide.u32 %rd842, %r18257, 64; + add.s64 %rd843, %rd471, %rd842; + mul.lo.s32 %r18258, %r30421, 16777619; + ld.global.u32 %r18259, [%rd843]; + xor.b32 %r30421, %r18258, %r18259; + mul.lo.s32 %r18260, %r30422, 16777619; + ld.global.u32 %r18261, [%rd843+4]; + xor.b32 %r30422, %r18260, %r18261; + mul.lo.s32 %r18262, %r30433, 16777619; + 
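// BB2_63 mixing loop: each 32-bit state word w is updated as (w * 16777619) ^ item_word, an FNV-1a-style mix with the 32-bit FNV prime, over 64-byte items fetched at the index reduced mod 1179641 above +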
ld.global.u32 %r18263, [%rd843+8]; + mul.lo.s32 %r18264, %r30434, 16777619; + ld.global.u32 %r18265, [%rd843+12]; + xor.b32 %r18266, %r18264, %r18265; + xor.b32 %r30433, %r18262, %r18263; + mov.b64 %rd844, {%r30433, %r18266}; + mul.lo.s32 %r18267, %r30429, 16777619; + ld.global.u32 %r18268, [%rd843+16]; + mul.lo.s32 %r18269, %r30430, 16777619; + ld.global.u32 %r18270, [%rd843+20]; + xor.b32 %r18271, %r18269, %r18270; + xor.b32 %r30429, %r18267, %r18268; + mov.b64 %rd845, {%r30429, %r18271}; + mul.lo.s32 %r18272, %r30425, 16777619; + ld.global.u32 %r18273, [%rd843+24]; + mul.lo.s32 %r18274, %r30426, 16777619; + ld.global.u32 %r18275, [%rd843+28]; + xor.b32 %r18276, %r18274, %r18275; + xor.b32 %r30425, %r18272, %r18273; + mov.b64 %rd846, {%r30425, %r18276}; + mul.lo.s32 %r18277, %r30423, 16777619; + ld.global.u32 %r18278, [%rd843+32]; + mul.lo.s32 %r18279, %r30424, 16777619; + ld.global.u32 %r18280, [%rd843+36]; + xor.b32 %r18281, %r18279, %r18280; + xor.b32 %r30423, %r18277, %r18278; + mov.b64 %rd847, {%r30423, %r18281}; + mul.lo.s32 %r18282, %r30419, 16777619; + ld.global.u32 %r18283, [%rd843+40]; + xor.b32 %r30419, %r18282, %r18283; + mul.lo.s32 %r18284, %r30420, 16777619; + ld.global.u32 %r18285, [%rd843+44]; + xor.b32 %r30420, %r18284, %r18285; + mul.lo.s32 %r18286, %r30431, 16777619; + ld.global.u32 %r18287, [%rd843+48]; + mul.lo.s32 %r18288, %r30432, 16777619; + ld.global.u32 %r18289, [%rd843+52]; + xor.b32 %r18290, %r18288, %r18289; + xor.b32 %r30431, %r18286, %r18287; + mov.b64 %rd848, {%r30431, %r18290}; + mul.lo.s32 %r18291, %r30427, 16777619; + ld.global.u32 %r18292, [%rd843+56]; + mul.lo.s32 %r18293, %r30428, 16777619; + ld.global.u32 %r18294, [%rd843+60]; + xor.b32 %r18295, %r18293, %r18294; + xor.b32 %r30427, %r18291, %r18292; + mov.b64 %rd849, {%r30427, %r18295}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.v2.u32 [%rd3+32], {%r30433, %r18266}; + st.local.v2.u32 [%rd3+40], {%r30429, %r18271}; + st.local.v2.u32 [%rd3+48], {%r30425, %r18276}; + st.local.v2.u32 [%rd3+56], {%r30423, %r18281}; + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + st.local.v2.u32 [%rd3+72], {%r30431, %r18290}; + st.local.v2.u32 [%rd3+80], {%r30427, %r18295}; + add.s64 %rd850, %rd180, %rd838; + xor.b32 %r18296, %r1679, %r30384; + mul.lo.s32 %r18297, %r18296, 16777619; + ld.local.u32 %r18298, [%rd850]; + xor.b32 %r18299, %r18297, %r18298; + mul.wide.u32 %rd851, %r18299, -954391867; + shr.u64 %rd852, %rd851, 32; + cvt.u32.u64 %r18300, %rd852; + sub.s32 %r18301, %r18299, %r18300; + shr.u32 %r18302, %r18301, 1; + add.s32 %r18303, %r18302, %r18300; + shr.u32 %r18304, %r18303, 20; + mul.lo.s32 %r18305, %r18304, 1179641; + sub.s32 %r18306, %r18299, %r18305; + mul.wide.u32 %rd853, %r18306, 64; + add.s64 %rd854, %rd471, %rd853; + mul.lo.s32 %r18307, %r30472, 16777619; + ld.global.u32 %r18308, [%rd854]; + xor.b32 %r30472, %r18307, %r18308; + mul.lo.s32 %r18309, %r30473, 16777619; + ld.global.u32 %r18310, [%rd854+4]; + xor.b32 %r30473, %r18309, %r18310; + mul.lo.s32 %r18311, %r30484, 16777619; + ld.global.u32 %r18312, [%rd854+8]; + mul.lo.s32 %r18313, %r30485, 16777619; + ld.global.u32 %r18314, [%rd854+12]; + xor.b32 %r18315, %r18313, %r18314; + xor.b32 %r30484, %r18311, %r18312; + mov.b64 %rd855, {%r30484, %r18315}; + mul.lo.s32 %r18316, %r30480, 16777619; + ld.global.u32 %r18317, [%rd854+16]; + mul.lo.s32 %r18318, %r30481, 16777619; + ld.global.u32 %r18319, [%rd854+20]; + xor.b32 %r18320, %r18318, %r18319; + xor.b32 %r30480, %r18316, %r18317; + mov.b64 %rd856, {%r30480, %r18320}; + mul.lo.s32 
%r18321, %r30476, 16777619; + ld.global.u32 %r18322, [%rd854+24]; + mul.lo.s32 %r18323, %r30477, 16777619; + ld.global.u32 %r18324, [%rd854+28]; + xor.b32 %r18325, %r18323, %r18324; + xor.b32 %r30476, %r18321, %r18322; + mov.b64 %rd857, {%r30476, %r18325}; + mul.lo.s32 %r18326, %r30474, 16777619; + ld.global.u32 %r18327, [%rd854+32]; + mul.lo.s32 %r18328, %r30475, 16777619; + ld.global.u32 %r18329, [%rd854+36]; + xor.b32 %r18330, %r18328, %r18329; + xor.b32 %r30474, %r18326, %r18327; + mov.b64 %rd858, {%r30474, %r18330}; + mul.lo.s32 %r18331, %r30470, 16777619; + ld.global.u32 %r18332, [%rd854+40]; + xor.b32 %r30470, %r18331, %r18332; + mul.lo.s32 %r18333, %r30471, 16777619; + ld.global.u32 %r18334, [%rd854+44]; + xor.b32 %r30471, %r18333, %r18334; + mul.lo.s32 %r18335, %r30482, 16777619; + ld.global.u32 %r18336, [%rd854+48]; + mul.lo.s32 %r18337, %r30483, 16777619; + ld.global.u32 %r18338, [%rd854+52]; + xor.b32 %r18339, %r18337, %r18338; + xor.b32 %r30482, %r18335, %r18336; + mov.b64 %rd859, {%r30482, %r18339}; + mul.lo.s32 %r18340, %r30478, 16777619; + ld.global.u32 %r18341, [%rd854+56]; + mul.lo.s32 %r18342, %r30479, 16777619; + ld.global.u32 %r18343, [%rd854+60]; + xor.b32 %r18344, %r18342, %r18343; + xor.b32 %r30478, %r18340, %r18341; + mov.b64 %rd860, {%r30478, %r18344}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + st.local.v2.u32 [%rd178+32], {%r30484, %r18315}; + st.local.v2.u32 [%rd178+40], {%r30480, %r18320}; + st.local.v2.u32 [%rd178+48], {%r30476, %r18325}; + st.local.v2.u32 [%rd178+56], {%r30474, %r18330}; + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + st.local.v2.u32 [%rd178+72], {%r30482, %r18339}; + st.local.v2.u32 [%rd178+80], {%r30478, %r18344}; + add.s32 %r30384, %r30384, 1; + setp.lt.u32 %p37, %r30384, 512; + shr.u64 %rd861, %rd844, 32; + cvt.u32.u64 %r30434, %rd861; + shr.u64 %rd862, %rd845, 32; + cvt.u32.u64 %r30430, %rd862; + shr.u64 %rd863, %rd846, 32; + cvt.u32.u64 %r30426, %rd863; + shr.u64 %rd864, %rd847, 32; + cvt.u32.u64 %r30424, %rd864; + shr.u64 %rd865, %rd848, 32; + cvt.u32.u64 %r30432, %rd865; + shr.u64 %rd866, %rd849, 32; + cvt.u32.u64 %r30428, %rd866; + shr.u64 %rd867, %rd855, 32; + cvt.u32.u64 %r30485, %rd867; + shr.u64 %rd868, %rd856, 32; + cvt.u32.u64 %r30481, %rd868; + shr.u64 %rd869, %rd857, 32; + cvt.u32.u64 %r30477, %rd869; + shr.u64 %rd870, %rd858, 32; + cvt.u32.u64 %r30475, %rd870; + shr.u64 %rd871, %rd859, 32; + cvt.u32.u64 %r30483, %rd871; + shr.u64 %rd872, %rd860, 32; + cvt.u32.u64 %r30479, %rd872; + @%p37 bra $L__BB2_63; + + mov.u32 %r30385, 0; + st.local.v2.u32 [%rd3+96], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+104], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+112], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+120], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+128], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+136], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+144], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+152], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+160], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+168], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+176], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+184], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+192], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+200], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+208], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+216], {%r30385, %r30385}; + mov.u32 %r30400, -2147483648; + mov.u32 %r18359, 1; + st.local.v2.u32 [%rd3+88], {%r18359, %r30400}; + mov.u32 %r30386, %r30385; + mov.u32 %r30387, %r30385; + mov.u32 %r30388, %r30385; + mov.u32 
%r30389, %r30385; + mov.u32 %r30390, %r30385; + mov.u32 %r30391, %r30385; + mov.u32 %r30392, %r30385; + mov.u32 %r30393, %r30385; + mov.u32 %r30394, %r30385; + mov.u32 %r30395, %r30385; + mov.u32 %r30396, %r30385; + mov.u32 %r30397, %r30385; + mov.u32 %r30398, %r30385; + mov.u32 %r30399, %r18359; + mov.u32 %r30401, %r30385; + mov.u32 %r30402, %r30385; + mov.u32 %r30403, %r30385; + mov.u32 %r30404, %r30385; + mov.u32 %r30405, %r30385; + mov.u32 %r30406, %r30385; + mov.u32 %r30407, %r30385; + mov.u32 %r30408, %r30385; + mov.u32 %r30409, %r30385; + mov.u32 %r30410, %r30385; + mov.u32 %r30411, %r30385; + mov.u32 %r30412, %r30385; + mov.u32 %r30413, %r30385; + mov.u32 %r30414, %r30385; + mov.u32 %r30415, %r30385; + mov.u32 %r30416, %r30385; + mov.u32 %r30417, %r30385; + mov.u32 %r30418, %r30385; + mov.u32 %r30435, %r30385; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r18386, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18386, %r18386, %r30415, %r30413, 0x96; + lop3.b32 %r18387, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18387, %r18387, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18398, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18398, %r18398, %r30409, %r30407, 0x96; + lop3.b32 %r18399, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18399, %r18399, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18410, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18410, %r18410, %r30403, %r30401, 0x96; + lop3.b32 %r18411, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18411, %r18411, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18422, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18422, %r18422, %r30395, %r30393, 0x96; + lop3.b32 %r18423, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18423, %r18423, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18434, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18434, %r18434, %r30387, %r30385, 0x96; + lop3.b32 %r18435, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18435, %r18435, %r30388, %r30386, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18446, %r18399, %r18398, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18450, %r18398, %r18399, %r18359; + // end inline asm + xor.b32 %r18880, %r18446, %r18434; + xor.b32 %r18881, %r18450, %r18435; + xor.b32 %r18713, %r30421, %r18880; + xor.b32 %r18716, %r30422, %r18881; + xor.b32 %r18620, %r30419, %r18880; + xor.b32 %r18619, %r30420, %r18881; + xor.b32 %r18667, %r30417, %r18880; + xor.b32 %r18668, %r30418, %r18881; + xor.b32 %r18572, %r30415, %r18880; + xor.b32 %r18571, %r30416, %r18881; + xor.b32 %r18523, %r30413, %r18880; + xor.b32 %r18524, %r30414, %r18881; + // begin inline asm + shf.l.wrap.b32 %r18454, %r18411, %r18410, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18458, %r18410, %r18411, %r18359; + // end inline asm + xor.b32 %r18882, %r18454, %r18386; + xor.b32 %r18883, %r18458, %r18387; + xor.b32 %r18675, %r30433, %r18882; + xor.b32 %r18676, %r30434, %r18883; + xor.b32 %r18492, %r30431, %r18882; + xor.b32 %r18491, %r30432, %r18883; + xor.b32 %r18651, %r30411, %r18882; + xor.b32 %r18652, %r30412, %r18883; + xor.b32 %r18612, %r30409, %r18882; + xor.b32 %r18611, %r30410, %r18883; + xor.b32 %r18595, %r30407, %r18882; + xor.b32 %r18596, %r30408, %r18883; + // begin inline asm + shf.l.wrap.b32 %r18462, %r18423, %r18422, %r18359; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r18466, %r18422, %r18423, %r18359; + // end inline asm + xor.b32 %r18884, %r18462, %r18398; + xor.b32 %r18885, %r18466, %r18399; + xor.b32 %r18532, %r30429, %r18884; + xor.b32 %r18531, %r30430, %r18885; + xor.b32 %r18659, %r30427, %r18884; + xor.b32 %r18660, %r30428, %r18885; + xor.b32 %r18540, %r30405, %r18884; + xor.b32 %r18539, %r30406, %r18885; + xor.b32 %r18643, %r30403, %r18884; + xor.b32 %r18644, %r30404, %r18885; + xor.b32 %r18508, %r30401, %r18884; + xor.b32 %r18507, %r30402, %r18885; + // begin inline asm + shf.l.wrap.b32 %r18470, %r18435, %r18434, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18474, %r18434, %r18435, %r18359; + // end inline asm + xor.b32 %r18886, %r18470, %r18410; + xor.b32 %r18887, %r18474, %r18411; + xor.b32 %r18627, %r30425, %r18886; + xor.b32 %r18628, %r30426, %r18887; + xor.b32 %r18604, %r30399, %r18886; + xor.b32 %r18603, %r30400, %r18887; + xor.b32 %r18547, %r30397, %r18886; + xor.b32 %r18548, %r30398, %r18887; + xor.b32 %r18635, %r30395, %r18886; + xor.b32 %r18636, %r30396, %r18887; + xor.b32 %r18564, %r30393, %r18886; + xor.b32 %r18563, %r30394, %r18887; + // begin inline asm + shf.l.wrap.b32 %r18478, %r18387, %r18386, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18482, %r18386, %r18387, %r18359; + // end inline asm + xor.b32 %r18888, %r18478, %r18422; + xor.b32 %r18889, %r18482, %r18423; + xor.b32 %r18579, %r30423, %r18888; + xor.b32 %r18580, %r30424, %r18889; + xor.b32 %r18499, %r30391, %r18888; + xor.b32 %r18500, %r30392, %r18889; + xor.b32 %r18516, %r30389, %r18888; + xor.b32 %r18515, %r30390, %r18889; + xor.b32 %r18555, %r30387, %r18888; + xor.b32 %r18556, %r30388, %r18889; + xor.b32 %r18587, %r30385, %r18888; + xor.b32 %r18588, %r30386, %r18889; + mov.u32 %r18493, 44; + // begin inline asm + shf.l.wrap.b32 %r18486, %r18492, %r18491, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18490, %r18491, %r18492, %r18493; + // end inline asm + mov.u32 %r18501, 20; + // begin inline asm + shf.l.wrap.b32 %r18494, %r18500, %r18499, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18498, %r18499, %r18500, %r18501; + // end inline asm + mov.u32 %r18509, 61; + // begin inline asm + shf.l.wrap.b32 %r18502, %r18508, %r18507, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18506, %r18507, %r18508, %r18509; + // end inline asm + mov.u32 %r18517, 39; + // begin inline asm + shf.l.wrap.b32 %r18510, %r18516, %r18515, %r18517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18514, %r18515, %r18516, %r18517; + // end inline asm + mov.u32 %r18525, 18; + // begin inline asm + shf.l.wrap.b32 %r18518, %r18524, %r18523, %r18525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18522, %r18523, %r18524, %r18525; + // end inline asm + mov.u32 %r18533, 62; + // begin inline asm + shf.l.wrap.b32 %r18526, %r18532, %r18531, %r18533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18530, %r18531, %r18532, %r18533; + // end inline asm + mov.u32 %r18541, 43; + // begin inline asm + shf.l.wrap.b32 %r18534, %r18540, %r18539, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18538, %r18539, %r18540, %r18541; + // end inline asm + mov.u32 %r18549, 25; + // begin inline asm + shf.l.wrap.b32 %r18542, %r18548, %r18547, %r18549; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18546, %r18547, %r18548, %r18549; + // end inline asm + mov.u32 %r18557, 8; + // begin inline asm + shf.l.wrap.b32 
%r18550, %r18556, %r18555, %r18557; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18554, %r18555, %r18556, %r18557; + // end inline asm + mov.u32 %r18565, 56; + // begin inline asm + shf.l.wrap.b32 %r18558, %r18564, %r18563, %r18565; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18562, %r18563, %r18564, %r18565; + // end inline asm + mov.u32 %r18573, 41; + // begin inline asm + shf.l.wrap.b32 %r18566, %r18572, %r18571, %r18573; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18570, %r18571, %r18572, %r18573; + // end inline asm + mov.u32 %r18581, 27; + // begin inline asm + shf.l.wrap.b32 %r18574, %r18580, %r18579, %r18581; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18578, %r18579, %r18580, %r18581; + // end inline asm + mov.u32 %r18589, 14; + // begin inline asm + shf.l.wrap.b32 %r18582, %r18588, %r18587, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18586, %r18587, %r18588, %r18589; + // end inline asm + mov.u32 %r18597, 2; + // begin inline asm + shf.l.wrap.b32 %r18590, %r18596, %r18595, %r18597; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18594, %r18595, %r18596, %r18597; + // end inline asm + mov.u32 %r18605, 55; + // begin inline asm + shf.l.wrap.b32 %r18598, %r18604, %r18603, %r18605; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18602, %r18603, %r18604, %r18605; + // end inline asm + mov.u32 %r18613, 45; + // begin inline asm + shf.l.wrap.b32 %r18606, %r18612, %r18611, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18610, %r18611, %r18612, %r18613; + // end inline asm + mov.u32 %r18621, 36; + // begin inline asm + shf.l.wrap.b32 %r18614, %r18620, %r18619, %r18621; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18618, %r18619, %r18620, %r18621; + // end inline asm + mov.u32 %r18629, 28; + // begin inline asm + shf.l.wrap.b32 %r18622, %r18628, %r18627, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18626, %r18627, %r18628, %r18629; + // end inline asm + mov.u32 %r18637, 21; + // begin inline asm + shf.l.wrap.b32 %r18630, %r18636, %r18635, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18634, %r18635, %r18636, %r18637; + // end inline asm + mov.u32 %r18645, 15; + // begin inline asm + shf.l.wrap.b32 %r18638, %r18644, %r18643, %r18645; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18642, %r18643, %r18644, %r18645; + // end inline asm + mov.u32 %r18653, 10; + // begin inline asm + shf.l.wrap.b32 %r18646, %r18652, %r18651, %r18653; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18650, %r18651, %r18652, %r18653; + // end inline asm + mov.u32 %r18661, 6; + // begin inline asm + shf.l.wrap.b32 %r18654, %r18660, %r18659, %r18661; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18658, %r18659, %r18660, %r18661; + // end inline asm + mov.u32 %r18669, 3; + // begin inline asm + shf.l.wrap.b32 %r18662, %r18668, %r18667, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18666, %r18667, %r18668, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18670, %r18676, %r18675, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18674, %r18675, %r18676, %r18359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18678, %r18713, %r18486, %r18534, 0xD2; + lop3.b32 %r18679, %r18716, %r18490, %r18538, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r18486, %r18534, %r18630, 
0xD2; + lop3.b32 %r30434, %r18490, %r18538, %r18634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30429, %r18534, %r18630, %r18582, 0xD2; + lop3.b32 %r30430, %r18538, %r18634, %r18586, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30425, %r18630, %r18582, %r18713, 0xD2; + lop3.b32 %r30426, %r18634, %r18586, %r18716, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30423, %r18582, %r18713, %r18486, 0xD2; + lop3.b32 %r30424, %r18586, %r18716, %r18490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30419, %r18622, %r18494, %r18662, 0xD2; + lop3.b32 %r30420, %r18626, %r18498, %r18666, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30431, %r18494, %r18662, %r18606, 0xD2; + lop3.b32 %r30432, %r18498, %r18666, %r18610, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30427, %r18662, %r18606, %r18502, 0xD2; + lop3.b32 %r30428, %r18666, %r18610, %r18506, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30399, %r18606, %r18502, %r18622, 0xD2; + lop3.b32 %r30400, %r18610, %r18506, %r18626, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30399, %r30400}; + // begin inline asm + // chi + lop3.b32 %r30391, %r18502, %r18622, %r18494, 0xD2; + lop3.b32 %r30392, %r18506, %r18626, %r18498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30391, %r30392}; + // begin inline asm + // chi + lop3.b32 %r30417, %r18670, %r18654, %r18542, 0xD2; + lop3.b32 %r30418, %r18674, %r18658, %r18546, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30417, %r30418}; + // begin inline asm + // chi + lop3.b32 %r30411, %r18654, %r18542, %r18550, 0xD2; + lop3.b32 %r30412, %r18658, %r18546, %r18554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30411, %r30412}; + // begin inline asm + // chi + lop3.b32 %r30405, %r18542, %r18550, %r18518, 0xD2; + lop3.b32 %r30406, %r18546, %r18554, %r18522, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30405, %r30406}; + // begin inline asm + // chi + lop3.b32 %r30397, %r18550, %r18518, %r18670, 0xD2; + lop3.b32 %r30398, %r18554, %r18522, %r18674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30397, %r30398}; + // begin inline asm + // chi + lop3.b32 %r30389, %r18518, %r18670, %r18654, 0xD2; + lop3.b32 %r30390, %r18522, %r18674, %r18658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30389, %r30390}; + // begin inline asm + // chi + lop3.b32 %r30415, %r18574, %r18614, %r18646, 0xD2; + lop3.b32 %r30416, %r18578, %r18618, %r18650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30415, %r30416}; + // begin inline asm + // chi + lop3.b32 %r30409, %r18614, %r18646, %r18638, 0xD2; + lop3.b32 %r30410, %r18618, %r18650, %r18642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30409, %r30410}; + // begin inline asm + // chi + lop3.b32 %r30403, %r18646, %r18638, %r18558, 0xD2; + lop3.b32 %r30404, %r18650, %r18642, %r18562, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30403, %r30404}; + // begin inline asm + // chi + lop3.b32 %r30395, %r18638, %r18558, %r18574, 0xD2; + lop3.b32 %r30396, %r18642, %r18562, %r18578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30395, %r30396}; + // begin inline asm + // chi + lop3.b32 %r30387, %r18558, %r18574, %r18614, 0xD2; + lop3.b32 %r30388, %r18562, %r18578, %r18618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30387, %r30388}; + // begin inline asm + // chi + lop3.b32 %r30413, 
%r18526, %r18598, %r18510, 0xD2; + lop3.b32 %r30414, %r18530, %r18602, %r18514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30413, %r30414}; + // begin inline asm + // chi + lop3.b32 %r30407, %r18598, %r18510, %r18566, 0xD2; + lop3.b32 %r30408, %r18602, %r18514, %r18570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30407, %r30408}; + // begin inline asm + // chi + lop3.b32 %r30401, %r18510, %r18566, %r18590, 0xD2; + lop3.b32 %r30402, %r18514, %r18570, %r18594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30401, %r30402}; + // begin inline asm + // chi + lop3.b32 %r30393, %r18566, %r18590, %r18526, 0xD2; + lop3.b32 %r30394, %r18570, %r18594, %r18530, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30393, %r30394}; + // begin inline asm + // chi + lop3.b32 %r30385, %r18590, %r18526, %r18598, 0xD2; + lop3.b32 %r30386, %r18594, %r18530, %r18602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30385, %r30386}; + mul.wide.s32 %rd874, %r30435, 8; + add.s64 %rd873, %rd806, %rd874; + // begin inline asm + ld.global.nc.v2.u32 {%r18878,%r18879}, [%rd873]; + // end inline asm + xor.b32 %r30421, %r18678, %r18878; + xor.b32 %r30422, %r18679, %r18879; + add.s32 %r30435, %r30435, 1; + setp.lt.u32 %p38, %r30435, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + // begin inline asm + // xor5 + lop3.b32 %r18890, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18890, %r18890, %r30415, %r30413, 0x96; + lop3.b32 %r18891, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18891, %r18891, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18902, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18902, %r18902, %r30409, %r30407, 0x96; + lop3.b32 %r18903, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18903, %r18903, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18914, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18914, %r18914, %r30403, %r30401, 0x96; + lop3.b32 %r18915, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18915, %r18915, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18926, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18926, %r18926, %r30395, %r30393, 0x96; + lop3.b32 %r18927, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18927, %r18927, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18938, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18938, %r18938, %r30387, %r30385, 0x96; + lop3.b32 %r18939, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18939, %r18939, %r30388, %r30386, 0x96; + // end inline asm + mov.u32 %r19142, 1; + // begin inline asm + shf.l.wrap.b32 %r18950, %r18903, %r18902, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18954, %r18902, %r18903, %r19142; + // end inline asm + xor.b32 %r19169, %r18950, %r18938; + xor.b32 %r19170, %r18954, %r18939; + xor.b32 %r19097, %r30421, %r19169; + xor.b32 %r19100, %r30422, %r19170; + xor.b32 %r19060, %r30418, %r19170; + xor.b32 %r19059, %r30417, %r19169; + st.local.v2.u32 [%rd3+104], {%r19059, %r19060}; + // begin inline asm + shf.l.wrap.b32 %r18958, %r18915, %r18914, %r19142; + // end inline asm + 
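// one Keccak-f[1600] round: theta (xor5 0x96), rho/pi rotates as shf.l.wrap funnel-shift pairs over the 32-bit register halves, then in the chi groups below lop3.b32 dst, a, b, c, 0xD2 computes dst = a ^ (~b & c), and iota XORs in the round constant +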
// begin inline asm + shf.l.wrap.b32 %r18962, %r18914, %r18915, %r19142; + // end inline asm + xor.b32 %r19171, %r18958, %r18890; + xor.b32 %r19172, %r18962, %r18891; + xor.b32 %r18996, %r30431, %r19171; + xor.b32 %r18995, %r30432, %r19172; + xor.b32 %r19035, %r30410, %r19172; + xor.b32 %r19036, %r30409, %r19171; + st.local.v2.u32 [%rd3+152], {%r19036, %r19035}; + // begin inline asm + shf.l.wrap.b32 %r18966, %r18927, %r18926, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18970, %r18926, %r18927, %r19142; + // end inline asm + xor.b32 %r19173, %r18966, %r18902; + xor.b32 %r19174, %r18970, %r18903; + xor.b32 %r19019, %r30406, %r19174; + xor.b32 %r19020, %r30405, %r19173; + st.local.v2.u32 [%rd3+120], {%r19020, %r19019}; + xor.b32 %r19011, %r30402, %r19174; + xor.b32 %r19012, %r30401, %r19173; + st.local.v2.u32 [%rd3+200], {%r19012, %r19011}; + // begin inline asm + shf.l.wrap.b32 %r18974, %r18939, %r18938, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18978, %r18938, %r18939, %r19142; + // end inline asm + xor.b32 %r19175, %r18974, %r18914; + xor.b32 %r19176, %r18978, %r18915; + xor.b32 %r19043, %r30425, %r19175; + xor.b32 %r19044, %r30426, %r19176; + xor.b32 %r19052, %r30396, %r19176; + xor.b32 %r19051, %r30395, %r19175; + st.local.v2.u32 [%rd3+168], {%r19051, %r19052}; + // begin inline asm + shf.l.wrap.b32 %r18982, %r18891, %r18890, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18986, %r18890, %r18891, %r19142; + // end inline asm + xor.b32 %r19177, %r18982, %r18926; + xor.b32 %r19178, %r18986, %r18927; + xor.b32 %r19003, %r30391, %r19177; + xor.b32 %r19004, %r30392, %r19178; + xor.b32 %r19028, %r30386, %r19178; + xor.b32 %r19027, %r30385, %r19177; + st.local.v2.u32 [%rd3+216], {%r19027, %r19028}; + // begin inline asm + shf.l.wrap.b32 %r18990, %r18996, %r18995, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18994, %r18995, %r18996, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18998, %r19004, %r19003, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19002, %r19003, %r19004, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19010, %r19011, %r19012, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19006, %r19012, %r19011, %r18509; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r19006, %r19010}; + // begin inline asm + shf.l.wrap.b32 %r19014, %r19020, %r19019, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19018, %r19019, %r19020, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19022, %r19028, %r19027, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19026, %r19027, %r19028, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19034, %r19035, %r19036, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19030, %r19036, %r19035, %r18613; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r19030, %r19034}; + // begin inline asm + shf.l.wrap.b32 %r19038, %r19044, %r19043, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19042, %r19043, %r19044, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19046, %r19052, %r19051, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19050, %r19051, %r19052, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19054, %r19060, %r19059, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r19058, %r19059, %r19060, %r18669; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19062, %r19097, %r18990, %r19014, 0xD2; + lop3.b32 %r19063, %r19100, %r18994, %r19018, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19070, %r18990, %r19014, %r19046, 0xD2; + lop3.b32 %r19071, %r18994, %r19018, %r19050, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r19070, %r19071}; + // begin inline asm + // chi + lop3.b32 %r19078, %r19014, %r19046, %r19022, 0xD2; + lop3.b32 %r19079, %r19018, %r19050, %r19026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r19078, %r19079}; + // begin inline asm + // chi + lop3.b32 %r19086, %r19046, %r19022, %r19097, 0xD2; + lop3.b32 %r19087, %r19050, %r19026, %r19100, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r19086, %r19087}; + // begin inline asm + // chi + lop3.b32 %r19094, %r19022, %r19097, %r18990, 0xD2; + lop3.b32 %r19095, %r19026, %r19100, %r18994, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r19094, %r19095}; + // begin inline asm + // chi + lop3.b32 %r19102, %r19038, %r18998, %r19054, 0xD2; + lop3.b32 %r19103, %r19042, %r19002, %r19058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r19102, %r19103}; + // begin inline asm + // chi + lop3.b32 %r19110, %r18998, %r19054, %r19030, 0xD2; + lop3.b32 %r19111, %r19002, %r19058, %r19034, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r19110, %r19111}; + // begin inline asm + // chi + lop3.b32 %r19118, %r19054, %r19030, %r19006, 0xD2; + lop3.b32 %r19119, %r19058, %r19034, %r19010, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r19118, %r19119}; + // begin inline asm + ld.global.nc.v2.u32 {%r19126,%r19127}, [%rd807]; + // end inline asm + xor.b32 %r19179, %r19063, %r19127; + xor.b32 %r19180, %r19062, %r19126; + mov.b64 %rd1265, {%r19180, %r19179}; + mov.b64 %rd1266, {%r19070, %r19071}; + mov.b64 %rd1267, {%r19078, %r19079}; + mov.b64 %rd1268, {%r19094, %r19095}; + mov.u32 %r30436, 0; + st.local.v2.u32 [%rd3+24], {%r19180, %r19179}; + st.local.v2.u32 [%rd178+96], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+104], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+112], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+120], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+128], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+136], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+144], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+152], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+160], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+168], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+176], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+184], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+192], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+200], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+208], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+216], {%r30436, %r30436}; + mov.u32 %r30451, -2147483648; + st.local.v2.u32 [%rd178+88], {%r19142, %r30451}; + mov.u32 %r30437, %r30436; + mov.u32 %r30438, %r30436; + mov.u32 %r30439, %r30436; + mov.u32 %r30440, %r30436; + mov.u32 %r30441, %r30436; + mov.u32 %r30442, %r30436; + mov.u32 %r30443, %r30436; + mov.u32 %r30444, %r30436; + mov.u32 %r30445, %r30436; + mov.u32 %r30446, %r30436; + mov.u32 %r30447, %r30436; + mov.u32 %r30448, %r30436; + mov.u32 %r30449, %r30436; + mov.u32 %r30450, %r19142; + mov.u32 %r30452, %r30436; + mov.u32 %r30453, %r30436; + mov.u32 %r30454, %r30436; + mov.u32 %r30455, %r30436; + mov.u32 %r30456, %r30436; + mov.u32 %r30457, %r30436; + mov.u32 %r30458, %r30436; + 
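// re-initialize the sponge for the next absorb: the remaining lane registers are zeroed, and the word stored at [%rd178+88] above is preset to 0x8000000000000001 (the Keccak pad10*1 bits) +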
mov.u32 %r30459, %r30436; + mov.u32 %r30460, %r30436; + mov.u32 %r30461, %r30436; + mov.u32 %r30462, %r30436; + mov.u32 %r30463, %r30436; + mov.u32 %r30464, %r30436; + mov.u32 %r30465, %r30436; + mov.u32 %r30466, %r30436; + mov.u32 %r30467, %r30436; + mov.u32 %r30468, %r30436; + mov.u32 %r30469, %r30436; + mov.u32 %r30486, %r30436; + +$L__BB2_67: + // begin inline asm + // xor5 + lop3.b32 %r19181, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19181, %r19181, %r30466, %r30464, 0x96; + lop3.b32 %r19182, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19182, %r19182, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19193, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19193, %r19193, %r30460, %r30458, 0x96; + lop3.b32 %r19194, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19194, %r19194, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19205, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19205, %r19205, %r30454, %r30452, 0x96; + lop3.b32 %r19206, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19206, %r19206, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19217, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19217, %r19217, %r30446, %r30444, 0x96; + lop3.b32 %r19218, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19218, %r19218, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19229, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19229, %r19229, %r30438, %r30436, 0x96; + lop3.b32 %r19230, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19230, %r19230, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19241, %r19194, %r19193, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19245, %r19193, %r19194, %r19142; + // end inline asm + xor.b32 %r19675, %r19241, %r19229; + xor.b32 %r19676, %r19245, %r19230; + xor.b32 %r19508, %r30472, %r19675; + xor.b32 %r19511, %r30473, %r19676; + xor.b32 %r19415, %r30470, %r19675; + xor.b32 %r19414, %r30471, %r19676; + xor.b32 %r19462, %r30468, %r19675; + xor.b32 %r19463, %r30469, %r19676; + xor.b32 %r19367, %r30466, %r19675; + xor.b32 %r19366, %r30467, %r19676; + xor.b32 %r19318, %r30464, %r19675; + xor.b32 %r19319, %r30465, %r19676; + // begin inline asm + shf.l.wrap.b32 %r19249, %r19206, %r19205, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19253, %r19205, %r19206, %r19142; + // end inline asm + xor.b32 %r19677, %r19249, %r19181; + xor.b32 %r19678, %r19253, %r19182; + xor.b32 %r19470, %r30484, %r19677; + xor.b32 %r19471, %r30485, %r19678; + xor.b32 %r19287, %r30482, %r19677; + xor.b32 %r19286, %r30483, %r19678; + xor.b32 %r19446, %r30462, %r19677; + xor.b32 %r19447, %r30463, %r19678; + xor.b32 %r19407, %r30460, %r19677; + xor.b32 %r19406, %r30461, %r19678; + xor.b32 %r19390, %r30458, %r19677; + xor.b32 %r19391, %r30459, %r19678; + // begin inline asm + shf.l.wrap.b32 %r19257, %r19218, %r19217, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19261, %r19217, %r19218, %r19142; + // end inline asm + xor.b32 %r19679, %r19257, %r19193; + xor.b32 %r19680, %r19261, %r19194; + xor.b32 %r19327, %r30480, %r19679; + xor.b32 %r19326, %r30481, %r19680; + xor.b32 %r19454, %r30478, %r19679; + xor.b32 %r19455, %r30479, %r19680; + xor.b32 %r19335, %r30456, %r19679; + xor.b32 %r19334, %r30457, %r19680; + xor.b32 %r19438, %r30454, %r19679; + xor.b32 %r19439, %r30455, %r19680; + xor.b32 %r19303, %r30452, %r19679; + xor.b32 %r19302, 
%r30453, %r19680; + // begin inline asm + shf.l.wrap.b32 %r19265, %r19230, %r19229, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19269, %r19229, %r19230, %r19142; + // end inline asm + xor.b32 %r19681, %r19265, %r19205; + xor.b32 %r19682, %r19269, %r19206; + xor.b32 %r19422, %r30476, %r19681; + xor.b32 %r19423, %r30477, %r19682; + xor.b32 %r19399, %r30450, %r19681; + xor.b32 %r19398, %r30451, %r19682; + xor.b32 %r19342, %r30448, %r19681; + xor.b32 %r19343, %r30449, %r19682; + xor.b32 %r19430, %r30446, %r19681; + xor.b32 %r19431, %r30447, %r19682; + xor.b32 %r19359, %r30444, %r19681; + xor.b32 %r19358, %r30445, %r19682; + // begin inline asm + shf.l.wrap.b32 %r19273, %r19182, %r19181, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19277, %r19181, %r19182, %r19142; + // end inline asm + xor.b32 %r19683, %r19273, %r19217; + xor.b32 %r19684, %r19277, %r19218; + xor.b32 %r19374, %r30474, %r19683; + xor.b32 %r19375, %r30475, %r19684; + xor.b32 %r19294, %r30442, %r19683; + xor.b32 %r19295, %r30443, %r19684; + xor.b32 %r19311, %r30440, %r19683; + xor.b32 %r19310, %r30441, %r19684; + xor.b32 %r19350, %r30438, %r19683; + xor.b32 %r19351, %r30439, %r19684; + xor.b32 %r19382, %r30436, %r19683; + xor.b32 %r19383, %r30437, %r19684; + mov.u32 %r19288, 44; + // begin inline asm + shf.l.wrap.b32 %r19281, %r19287, %r19286, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19285, %r19286, %r19287, %r19288; + // end inline asm + mov.u32 %r19296, 20; + // begin inline asm + shf.l.wrap.b32 %r19289, %r19295, %r19294, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19293, %r19294, %r19295, %r19296; + // end inline asm + mov.u32 %r19304, 61; + // begin inline asm + shf.l.wrap.b32 %r19297, %r19303, %r19302, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19301, %r19302, %r19303, %r19304; + // end inline asm + mov.u32 %r19312, 39; + // begin inline asm + shf.l.wrap.b32 %r19305, %r19311, %r19310, %r19312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19309, %r19310, %r19311, %r19312; + // end inline asm + mov.u32 %r19320, 18; + // begin inline asm + shf.l.wrap.b32 %r19313, %r19319, %r19318, %r19320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19317, %r19318, %r19319, %r19320; + // end inline asm + mov.u32 %r19328, 62; + // begin inline asm + shf.l.wrap.b32 %r19321, %r19327, %r19326, %r19328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19325, %r19326, %r19327, %r19328; + // end inline asm + mov.u32 %r19336, 43; + // begin inline asm + shf.l.wrap.b32 %r19329, %r19335, %r19334, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19333, %r19334, %r19335, %r19336; + // end inline asm + mov.u32 %r19344, 25; + // begin inline asm + shf.l.wrap.b32 %r19337, %r19343, %r19342, %r19344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19341, %r19342, %r19343, %r19344; + // end inline asm + mov.u32 %r19352, 8; + // begin inline asm + shf.l.wrap.b32 %r19345, %r19351, %r19350, %r19352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19349, %r19350, %r19351, %r19352; + // end inline asm + mov.u32 %r19360, 56; + // begin inline asm + shf.l.wrap.b32 %r19353, %r19359, %r19358, %r19360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19357, %r19358, %r19359, %r19360; + // end inline asm + mov.u32 %r19368, 41; + // begin inline asm + shf.l.wrap.b32 %r19361, %r19367, %r19366, %r19368; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r19365, %r19366, %r19367, %r19368; + // end inline asm + mov.u32 %r19376, 27; + // begin inline asm + shf.l.wrap.b32 %r19369, %r19375, %r19374, %r19376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19373, %r19374, %r19375, %r19376; + // end inline asm + mov.u32 %r19384, 14; + // begin inline asm + shf.l.wrap.b32 %r19377, %r19383, %r19382, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19381, %r19382, %r19383, %r19384; + // end inline asm + mov.u32 %r19392, 2; + // begin inline asm + shf.l.wrap.b32 %r19385, %r19391, %r19390, %r19392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19389, %r19390, %r19391, %r19392; + // end inline asm + mov.u32 %r19400, 55; + // begin inline asm + shf.l.wrap.b32 %r19393, %r19399, %r19398, %r19400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19397, %r19398, %r19399, %r19400; + // end inline asm + mov.u32 %r19408, 45; + // begin inline asm + shf.l.wrap.b32 %r19401, %r19407, %r19406, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19405, %r19406, %r19407, %r19408; + // end inline asm + mov.u32 %r19416, 36; + // begin inline asm + shf.l.wrap.b32 %r19409, %r19415, %r19414, %r19416; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19413, %r19414, %r19415, %r19416; + // end inline asm + mov.u32 %r19424, 28; + // begin inline asm + shf.l.wrap.b32 %r19417, %r19423, %r19422, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19421, %r19422, %r19423, %r19424; + // end inline asm + mov.u32 %r19432, 21; + // begin inline asm + shf.l.wrap.b32 %r19425, %r19431, %r19430, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19429, %r19430, %r19431, %r19432; + // end inline asm + mov.u32 %r19440, 15; + // begin inline asm + shf.l.wrap.b32 %r19433, %r19439, %r19438, %r19440; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19437, %r19438, %r19439, %r19440; + // end inline asm + mov.u32 %r19448, 10; + // begin inline asm + shf.l.wrap.b32 %r19441, %r19447, %r19446, %r19448; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19445, %r19446, %r19447, %r19448; + // end inline asm + mov.u32 %r19456, 6; + // begin inline asm + shf.l.wrap.b32 %r19449, %r19455, %r19454, %r19456; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19453, %r19454, %r19455, %r19456; + // end inline asm + mov.u32 %r19464, 3; + // begin inline asm + shf.l.wrap.b32 %r19457, %r19463, %r19462, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19461, %r19462, %r19463, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19465, %r19471, %r19470, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19469, %r19470, %r19471, %r19142; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19473, %r19508, %r19281, %r19329, 0xD2; + lop3.b32 %r19474, %r19511, %r19285, %r19333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r19281, %r19329, %r19425, 0xD2; + lop3.b32 %r30485, %r19285, %r19333, %r19429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30480, %r19329, %r19425, %r19377, 0xD2; + lop3.b32 %r30481, %r19333, %r19429, %r19381, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30476, %r19425, %r19377, %r19508, 0xD2; + lop3.b32 %r30477, %r19429, %r19381, %r19511, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30474, %r19377, %r19508, %r19281, 0xD2; + lop3.b32 %r30475, 
%r19381, %r19511, %r19285, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30470, %r19417, %r19289, %r19457, 0xD2; + lop3.b32 %r30471, %r19421, %r19293, %r19461, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30482, %r19289, %r19457, %r19401, 0xD2; + lop3.b32 %r30483, %r19293, %r19461, %r19405, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30478, %r19457, %r19401, %r19297, 0xD2; + lop3.b32 %r30479, %r19461, %r19405, %r19301, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30450, %r19401, %r19297, %r19417, 0xD2; + lop3.b32 %r30451, %r19405, %r19301, %r19421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30450, %r30451}; + // begin inline asm + // chi + lop3.b32 %r30442, %r19297, %r19417, %r19289, 0xD2; + lop3.b32 %r30443, %r19301, %r19421, %r19293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30442, %r30443}; + // begin inline asm + // chi + lop3.b32 %r30468, %r19465, %r19449, %r19337, 0xD2; + lop3.b32 %r30469, %r19469, %r19453, %r19341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30468, %r30469}; + // begin inline asm + // chi + lop3.b32 %r30462, %r19449, %r19337, %r19345, 0xD2; + lop3.b32 %r30463, %r19453, %r19341, %r19349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30462, %r30463}; + // begin inline asm + // chi + lop3.b32 %r30456, %r19337, %r19345, %r19313, 0xD2; + lop3.b32 %r30457, %r19341, %r19349, %r19317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30456, %r30457}; + // begin inline asm + // chi + lop3.b32 %r30448, %r19345, %r19313, %r19465, 0xD2; + lop3.b32 %r30449, %r19349, %r19317, %r19469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30448, %r30449}; + // begin inline asm + // chi + lop3.b32 %r30440, %r19313, %r19465, %r19449, 0xD2; + lop3.b32 %r30441, %r19317, %r19469, %r19453, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30440, %r30441}; + // begin inline asm + // chi + lop3.b32 %r30466, %r19369, %r19409, %r19441, 0xD2; + lop3.b32 %r30467, %r19373, %r19413, %r19445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30466, %r30467}; + // begin inline asm + // chi + lop3.b32 %r30460, %r19409, %r19441, %r19433, 0xD2; + lop3.b32 %r30461, %r19413, %r19445, %r19437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30460, %r30461}; + // begin inline asm + // chi + lop3.b32 %r30454, %r19441, %r19433, %r19353, 0xD2; + lop3.b32 %r30455, %r19445, %r19437, %r19357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30454, %r30455}; + // begin inline asm + // chi + lop3.b32 %r30446, %r19433, %r19353, %r19369, 0xD2; + lop3.b32 %r30447, %r19437, %r19357, %r19373, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30446, %r30447}; + // begin inline asm + // chi + lop3.b32 %r30438, %r19353, %r19369, %r19409, 0xD2; + lop3.b32 %r30439, %r19357, %r19373, %r19413, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30438, %r30439}; + // begin inline asm + // chi + lop3.b32 %r30464, %r19321, %r19393, %r19305, 0xD2; + lop3.b32 %r30465, %r19325, %r19397, %r19309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30464, %r30465}; + // begin inline asm + // chi + lop3.b32 %r30458, %r19393, %r19305, %r19361, 0xD2; + lop3.b32 %r30459, %r19397, %r19309, %r19365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30458, %r30459}; + // begin inline asm + // chi + lop3.b32 %r30452, %r19305, %r19361, %r19385, 0xD2; + lop3.b32 %r30453, 
%r19309, %r19365, %r19389, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30452, %r30453}; + // begin inline asm + // chi + lop3.b32 %r30444, %r19361, %r19385, %r19321, 0xD2; + lop3.b32 %r30445, %r19365, %r19389, %r19325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30444, %r30445}; + // begin inline asm + // chi + lop3.b32 %r30436, %r19385, %r19321, %r19393, 0xD2; + lop3.b32 %r30437, %r19389, %r19325, %r19397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30436, %r30437}; + mul.wide.s32 %rd881, %r30486, 8; + add.s64 %rd880, %rd806, %rd881; + // begin inline asm + ld.global.nc.v2.u32 {%r19673,%r19674}, [%rd880]; + // end inline asm + xor.b32 %r30472, %r19473, %r19673; + xor.b32 %r30473, %r19474, %r19674; + add.s32 %r30486, %r30486, 1; + setp.lt.u32 %p39, %r30486, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r19784, 1; + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + // begin inline asm + // xor5 + lop3.b32 %r19685, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19685, %r19685, %r30466, %r30464, 0x96; + lop3.b32 %r19686, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19686, %r19686, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19697, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19697, %r19697, %r30460, %r30458, 0x96; + lop3.b32 %r19698, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19698, %r19698, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19709, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19709, %r19709, %r30454, %r30452, 0x96; + lop3.b32 %r19710, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19710, %r19710, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19721, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19721, %r19721, %r30446, %r30444, 0x96; + lop3.b32 %r19722, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19722, %r19722, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19733, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19733, %r19733, %r30438, %r30436, 0x96; + lop3.b32 %r19734, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19734, %r19734, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19745, %r19698, %r19697, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19749, %r19697, %r19698, %r19784; + // end inline asm + xor.b32 %r19923, %r19745, %r19733; + xor.b32 %r19924, %r19749, %r19734; + xor.b32 %r19892, %r30472, %r19923; + xor.b32 %r19895, %r30473, %r19924; + xor.b32 %r19855, %r30469, %r19924; + xor.b32 %r19854, %r30468, %r19923; + st.local.v2.u32 [%rd178+104], {%r19854, %r19855}; + // begin inline asm + shf.l.wrap.b32 %r19753, %r19710, %r19709, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19757, %r19709, %r19710, %r19784; + // end inline asm + xor.b32 %r19925, %r19753, %r19685; + xor.b32 %r19926, %r19757, %r19686; + xor.b32 %r19791, %r30482, %r19925; + xor.b32 %r19790, %r30483, %r19926; + xor.b32 %r19830, %r30461, %r19926; + xor.b32 %r19831, %r30460, %r19925; + st.local.v2.u32 [%rd178+152], {%r19831, %r19830}; + // begin inline asm + shf.l.wrap.b32 %r19761, %r19722, %r19721, %r19784; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r19765, %r19721, %r19722, %r19784; + // end inline asm + xor.b32 %r19927, %r19761, %r19697; + xor.b32 %r19928, %r19765, %r19698; + xor.b32 %r19814, %r30457, %r19928; + xor.b32 %r19815, %r30456, %r19927; + st.local.v2.u32 [%rd178+120], {%r19815, %r19814}; + xor.b32 %r19806, %r30453, %r19928; + xor.b32 %r19807, %r30452, %r19927; + st.local.v2.u32 [%rd178+200], {%r19807, %r19806}; + // begin inline asm + shf.l.wrap.b32 %r19769, %r19734, %r19733, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19773, %r19733, %r19734, %r19784; + // end inline asm + xor.b32 %r19929, %r19769, %r19709; + xor.b32 %r19930, %r19773, %r19710; + xor.b32 %r19838, %r30476, %r19929; + xor.b32 %r19839, %r30477, %r19930; + xor.b32 %r19847, %r30447, %r19930; + xor.b32 %r19846, %r30446, %r19929; + st.local.v2.u32 [%rd178+168], {%r19846, %r19847}; + // begin inline asm + shf.l.wrap.b32 %r19777, %r19686, %r19685, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19781, %r19685, %r19686, %r19784; + // end inline asm + xor.b32 %r19931, %r19777, %r19721; + xor.b32 %r19932, %r19781, %r19722; + xor.b32 %r19798, %r30442, %r19931; + xor.b32 %r19799, %r30443, %r19932; + xor.b32 %r19823, %r30437, %r19932; + xor.b32 %r19822, %r30436, %r19931; + st.local.v2.u32 [%rd178+216], {%r19822, %r19823}; + // begin inline asm + shf.l.wrap.b32 %r19785, %r19791, %r19790, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19789, %r19790, %r19791, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19793, %r19799, %r19798, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19797, %r19798, %r19799, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19805, %r19806, %r19807, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19801, %r19807, %r19806, %r19304; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r19801, %r19805}; + // begin inline asm + shf.l.wrap.b32 %r19809, %r19815, %r19814, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19813, %r19814, %r19815, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19817, %r19823, %r19822, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19821, %r19822, %r19823, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19829, %r19830, %r19831, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19825, %r19831, %r19830, %r19408; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r19825, %r19829}; + // begin inline asm + shf.l.wrap.b32 %r19833, %r19839, %r19838, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19837, %r19838, %r19839, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19841, %r19847, %r19846, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19845, %r19846, %r19847, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19849, %r19855, %r19854, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19853, %r19854, %r19855, %r19464; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19857, %r19892, %r19785, %r19809, 0xD2; + lop3.b32 %r19858, %r19895, %r19789, %r19813, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19865, %r19785, %r19809, %r19841, 0xD2; + lop3.b32 %r19866, %r19789, %r19813, %r19845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r19865, %r19866}; + // begin 
inline asm + // chi + lop3.b32 %r19873, %r19809, %r19841, %r19817, 0xD2; + lop3.b32 %r19874, %r19813, %r19845, %r19821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r19873, %r19874}; + // begin inline asm + // chi + lop3.b32 %r19881, %r19841, %r19817, %r19892, 0xD2; + lop3.b32 %r19882, %r19845, %r19821, %r19895, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r19881, %r19882}; + // begin inline asm + // chi + lop3.b32 %r19889, %r19817, %r19892, %r19785, 0xD2; + lop3.b32 %r19890, %r19821, %r19895, %r19789, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r19889, %r19890}; + // begin inline asm + // chi + lop3.b32 %r19897, %r19833, %r19793, %r19849, 0xD2; + lop3.b32 %r19898, %r19837, %r19797, %r19853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r19897, %r19898}; + // begin inline asm + // chi + lop3.b32 %r19905, %r19793, %r19849, %r19825, 0xD2; + lop3.b32 %r19906, %r19797, %r19853, %r19829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r19905, %r19906}; + // begin inline asm + // chi + lop3.b32 %r19913, %r19849, %r19825, %r19801, 0xD2; + lop3.b32 %r19914, %r19853, %r19829, %r19805, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r19913, %r19914}; + // begin inline asm + ld.global.nc.v2.u32 {%r19921,%r19922}, [%rd807]; + // end inline asm + xor.b32 %r19933, %r19858, %r19922; + xor.b32 %r19934, %r19857, %r19921; + st.local.v2.u32 [%rd178+24], {%r19934, %r19933}; + bra.uni $L__BB2_69; + +$L__BB2_47: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd688, 1179641; + st.local.u64 [%rd3+8], %rd688; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd689, [%rd128]; + ld.global.u64 %rd690, [%rd128+8]; + ld.global.u64 %rd691, [%rd128+16]; + ld.global.u64 %rd692, [%rd128+24]; + ld.global.u64 %rd693, [%rd128+32]; + ld.global.u64 %rd694, [%rd128+40]; + ld.global.u64 %rd695, [%rd128+48]; + ld.global.u64 %rd696, [%rd128+56]; + st.local.u64 [%rd3+24], %rd689; + st.local.u64 [%rd3+32], %rd690; + st.local.u64 [%rd3+40], %rd691; + st.local.u64 [%rd3+48], %rd692; + st.local.u64 [%rd3+56], %rd693; + st.local.u64 [%rd3+64], %rd694; + st.local.u64 [%rd3+72], %rd695; + st.local.u64 [%rd3+80], %rd696; + cvt.u32.u64 %r13408, %rd689; + xor.b32 %r13409, %r1678, %r13408; + st.local.u32 [%rd3+24], %r13409; + mov.u32 %r30013, 0; + st.local.v2.u32 [%rd3+96], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+104], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+112], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+120], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+128], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+136], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+144], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+152], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+160], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+168], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+176], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+184], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+192], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+200], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+208], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+216], {%r30013, %r30013}; + mov.u32 %r30028, -2147483648; + mov.u32 %r13381, 1; + st.local.v2.u32 [%rd3+88], {%r13381, %r30028}; + ld.local.v2.u32 {%r30049, %r30050}, [%rd3+24]; + mov.b64 {%r30047, %r30048}, %rd694; + shr.u64 %rd697, %rd690, 32; + cvt.u32.u64 %r30061, %rd690; + cvt.u32.u64 %r30062, %rd697; + shr.u64 %rd698, %rd695, 32; + cvt.u32.u64 %r30059, %rd695; + cvt.u32.u64 %r30060, %rd698; + shr.u64 %rd699, %rd691, 32; + cvt.u32.u64 %r30057, %rd691; + cvt.u32.u64 
%r30058, %rd699; + shr.u64 %rd700, %rd696, 32; + cvt.u32.u64 %r30055, %rd696; + cvt.u32.u64 %r30056, %rd700; + shr.u64 %rd701, %rd692, 32; + cvt.u32.u64 %r30053, %rd692; + cvt.u32.u64 %r30054, %rd701; + shr.u64 %rd702, %rd693, 32; + cvt.u32.u64 %r30051, %rd693; + cvt.u32.u64 %r30052, %rd702; + mov.u32 %r30014, %r30013; + mov.u32 %r30015, %r30013; + mov.u32 %r30016, %r30013; + mov.u32 %r30017, %r30013; + mov.u32 %r30018, %r30013; + mov.u32 %r30019, %r30013; + mov.u32 %r30020, %r30013; + mov.u32 %r30021, %r30013; + mov.u32 %r30022, %r30013; + mov.u32 %r30023, %r30013; + mov.u32 %r30024, %r30013; + mov.u32 %r30025, %r30013; + mov.u32 %r30026, %r30013; + mov.u32 %r30027, %r13381; + mov.u32 %r30029, %r30013; + mov.u32 %r30030, %r30013; + mov.u32 %r30031, %r30013; + mov.u32 %r30032, %r30013; + mov.u32 %r30033, %r30013; + mov.u32 %r30034, %r30013; + mov.u32 %r30035, %r30013; + mov.u32 %r30036, %r30013; + mov.u32 %r30037, %r30013; + mov.u32 %r30038, %r30013; + mov.u32 %r30039, %r30013; + mov.u32 %r30040, %r30013; + mov.u32 %r30041, %r30013; + mov.u32 %r30042, %r30013; + mov.u32 %r30043, %r30013; + mov.u32 %r30044, %r30013; + mov.u32 %r30045, %r30013; + mov.u32 %r30046, %r30013; + mov.u32 %r30063, %r30013; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r13412, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13412, %r13412, %r30043, %r30041, 0x96; + lop3.b32 %r13413, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13413, %r13413, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13424, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13424, %r13424, %r30037, %r30035, 0x96; + lop3.b32 %r13425, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13425, %r13425, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13436, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13436, %r13436, %r30031, %r30029, 0x96; + lop3.b32 %r13437, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13437, %r13437, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13448, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13448, %r13448, %r30023, %r30021, 0x96; + lop3.b32 %r13449, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13449, %r13449, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13460, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13460, %r13460, %r30015, %r30013, 0x96; + lop3.b32 %r13461, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13461, %r13461, %r30016, %r30014, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13472, %r13425, %r13424, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13476, %r13424, %r13425, %r13381; + // end inline asm + xor.b32 %r13906, %r13472, %r13460; + xor.b32 %r13907, %r13476, %r13461; + xor.b32 %r13739, %r30049, %r13906; + xor.b32 %r13742, %r30050, %r13907; + xor.b32 %r13646, %r30047, %r13906; + xor.b32 %r13645, %r30048, %r13907; + xor.b32 %r13693, %r30045, %r13906; + xor.b32 %r13694, %r30046, %r13907; + xor.b32 %r13598, %r30043, %r13906; + xor.b32 %r13597, %r30044, %r13907; + xor.b32 %r13549, %r30041, %r13906; + xor.b32 %r13550, %r30042, %r13907; + // begin inline asm + shf.l.wrap.b32 %r13480, %r13437, %r13436, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13484, %r13436, %r13437, %r13381; + // end inline asm + xor.b32 %r13908, %r13480, %r13412; + xor.b32 %r13909, %r13484, %r13413; + xor.b32 %r13701, %r30061, %r13908; + xor.b32 %r13702, %r30062, %r13909; + xor.b32 %r13518, %r30059, 
%r13908; + xor.b32 %r13517, %r30060, %r13909; + xor.b32 %r13677, %r30039, %r13908; + xor.b32 %r13678, %r30040, %r13909; + xor.b32 %r13638, %r30037, %r13908; + xor.b32 %r13637, %r30038, %r13909; + xor.b32 %r13621, %r30035, %r13908; + xor.b32 %r13622, %r30036, %r13909; + // begin inline asm + shf.l.wrap.b32 %r13488, %r13449, %r13448, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13492, %r13448, %r13449, %r13381; + // end inline asm + xor.b32 %r13910, %r13488, %r13424; + xor.b32 %r13911, %r13492, %r13425; + xor.b32 %r13558, %r30057, %r13910; + xor.b32 %r13557, %r30058, %r13911; + xor.b32 %r13685, %r30055, %r13910; + xor.b32 %r13686, %r30056, %r13911; + xor.b32 %r13566, %r30033, %r13910; + xor.b32 %r13565, %r30034, %r13911; + xor.b32 %r13669, %r30031, %r13910; + xor.b32 %r13670, %r30032, %r13911; + xor.b32 %r13534, %r30029, %r13910; + xor.b32 %r13533, %r30030, %r13911; + // begin inline asm + shf.l.wrap.b32 %r13496, %r13461, %r13460, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13500, %r13460, %r13461, %r13381; + // end inline asm + xor.b32 %r13912, %r13496, %r13436; + xor.b32 %r13913, %r13500, %r13437; + xor.b32 %r13653, %r30053, %r13912; + xor.b32 %r13654, %r30054, %r13913; + xor.b32 %r13630, %r30027, %r13912; + xor.b32 %r13629, %r30028, %r13913; + xor.b32 %r13573, %r30025, %r13912; + xor.b32 %r13574, %r30026, %r13913; + xor.b32 %r13661, %r30023, %r13912; + xor.b32 %r13662, %r30024, %r13913; + xor.b32 %r13590, %r30021, %r13912; + xor.b32 %r13589, %r30022, %r13913; + // begin inline asm + shf.l.wrap.b32 %r13504, %r13413, %r13412, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13508, %r13412, %r13413, %r13381; + // end inline asm + xor.b32 %r13914, %r13504, %r13448; + xor.b32 %r13915, %r13508, %r13449; + xor.b32 %r13605, %r30051, %r13914; + xor.b32 %r13606, %r30052, %r13915; + xor.b32 %r13525, %r30019, %r13914; + xor.b32 %r13526, %r30020, %r13915; + xor.b32 %r13542, %r30017, %r13914; + xor.b32 %r13541, %r30018, %r13915; + xor.b32 %r13581, %r30015, %r13914; + xor.b32 %r13582, %r30016, %r13915; + xor.b32 %r13613, %r30013, %r13914; + xor.b32 %r13614, %r30014, %r13915; + mov.u32 %r13519, 44; + // begin inline asm + shf.l.wrap.b32 %r13512, %r13518, %r13517, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13516, %r13517, %r13518, %r13519; + // end inline asm + mov.u32 %r13527, 20; + // begin inline asm + shf.l.wrap.b32 %r13520, %r13526, %r13525, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13524, %r13525, %r13526, %r13527; + // end inline asm + mov.u32 %r13535, 61; + // begin inline asm + shf.l.wrap.b32 %r13528, %r13534, %r13533, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13532, %r13533, %r13534, %r13535; + // end inline asm + mov.u32 %r13543, 39; + // begin inline asm + shf.l.wrap.b32 %r13536, %r13542, %r13541, %r13543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13540, %r13541, %r13542, %r13543; + // end inline asm + mov.u32 %r13551, 18; + // begin inline asm + shf.l.wrap.b32 %r13544, %r13550, %r13549, %r13551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13548, %r13549, %r13550, %r13551; + // end inline asm + mov.u32 %r13559, 62; + // begin inline asm + shf.l.wrap.b32 %r13552, %r13558, %r13557, %r13559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13556, %r13557, %r13558, %r13559; + // end inline asm + mov.u32 %r13567, 43; + // begin inline asm + shf.l.wrap.b32 %r13560, %r13566, %r13565, %r13567; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r13564, %r13565, %r13566, %r13567; + // end inline asm + mov.u32 %r13575, 25; + // begin inline asm + shf.l.wrap.b32 %r13568, %r13574, %r13573, %r13575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13572, %r13573, %r13574, %r13575; + // end inline asm + mov.u32 %r13583, 8; + // begin inline asm + shf.l.wrap.b32 %r13576, %r13582, %r13581, %r13583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13580, %r13581, %r13582, %r13583; + // end inline asm + mov.u32 %r13591, 56; + // begin inline asm + shf.l.wrap.b32 %r13584, %r13590, %r13589, %r13591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13588, %r13589, %r13590, %r13591; + // end inline asm + mov.u32 %r13599, 41; + // begin inline asm + shf.l.wrap.b32 %r13592, %r13598, %r13597, %r13599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13596, %r13597, %r13598, %r13599; + // end inline asm + mov.u32 %r13607, 27; + // begin inline asm + shf.l.wrap.b32 %r13600, %r13606, %r13605, %r13607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13604, %r13605, %r13606, %r13607; + // end inline asm + mov.u32 %r13615, 14; + // begin inline asm + shf.l.wrap.b32 %r13608, %r13614, %r13613, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13612, %r13613, %r13614, %r13615; + // end inline asm + mov.u32 %r13623, 2; + // begin inline asm + shf.l.wrap.b32 %r13616, %r13622, %r13621, %r13623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13620, %r13621, %r13622, %r13623; + // end inline asm + mov.u32 %r13631, 55; + // begin inline asm + shf.l.wrap.b32 %r13624, %r13630, %r13629, %r13631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13628, %r13629, %r13630, %r13631; + // end inline asm + mov.u32 %r13639, 45; + // begin inline asm + shf.l.wrap.b32 %r13632, %r13638, %r13637, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13636, %r13637, %r13638, %r13639; + // end inline asm + mov.u32 %r13647, 36; + // begin inline asm + shf.l.wrap.b32 %r13640, %r13646, %r13645, %r13647; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13644, %r13645, %r13646, %r13647; + // end inline asm + mov.u32 %r13655, 28; + // begin inline asm + shf.l.wrap.b32 %r13648, %r13654, %r13653, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13652, %r13653, %r13654, %r13655; + // end inline asm + mov.u32 %r13663, 21; + // begin inline asm + shf.l.wrap.b32 %r13656, %r13662, %r13661, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13660, %r13661, %r13662, %r13663; + // end inline asm + mov.u32 %r13671, 15; + // begin inline asm + shf.l.wrap.b32 %r13664, %r13670, %r13669, %r13671; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13668, %r13669, %r13670, %r13671; + // end inline asm + mov.u32 %r13679, 10; + // begin inline asm + shf.l.wrap.b32 %r13672, %r13678, %r13677, %r13679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13676, %r13677, %r13678, %r13679; + // end inline asm + mov.u32 %r13687, 6; + // begin inline asm + shf.l.wrap.b32 %r13680, %r13686, %r13685, %r13687; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13684, %r13685, %r13686, %r13687; + // end inline asm + mov.u32 %r13695, 3; + // begin inline asm + shf.l.wrap.b32 %r13688, %r13694, %r13693, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13692, %r13693, %r13694, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r13696, %r13702, %r13701, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13700, %r13701, %r13702, %r13381; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13704, %r13739, %r13512, %r13560, 0xD2; + lop3.b32 %r13705, %r13742, %r13516, %r13564, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30061, %r13512, %r13560, %r13656, 0xD2; + lop3.b32 %r30062, %r13516, %r13564, %r13660, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30057, %r13560, %r13656, %r13608, 0xD2; + lop3.b32 %r30058, %r13564, %r13660, %r13612, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30053, %r13656, %r13608, %r13739, 0xD2; + lop3.b32 %r30054, %r13660, %r13612, %r13742, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30051, %r13608, %r13739, %r13512, 0xD2; + lop3.b32 %r30052, %r13612, %r13742, %r13516, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30047, %r13648, %r13520, %r13688, 0xD2; + lop3.b32 %r30048, %r13652, %r13524, %r13692, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30059, %r13520, %r13688, %r13632, 0xD2; + lop3.b32 %r30060, %r13524, %r13692, %r13636, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30055, %r13688, %r13632, %r13528, 0xD2; + lop3.b32 %r30056, %r13692, %r13636, %r13532, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30027, %r13632, %r13528, %r13648, 0xD2; + lop3.b32 %r30028, %r13636, %r13532, %r13652, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30027, %r30028}; + // begin inline asm + // chi + lop3.b32 %r30019, %r13528, %r13648, %r13520, 0xD2; + lop3.b32 %r30020, %r13532, %r13652, %r13524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30019, %r30020}; + // begin inline asm + // chi + lop3.b32 %r30045, %r13696, %r13680, %r13568, 0xD2; + lop3.b32 %r30046, %r13700, %r13684, %r13572, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30045, %r30046}; + // begin inline asm + // chi + lop3.b32 %r30039, %r13680, %r13568, %r13576, 0xD2; + lop3.b32 %r30040, %r13684, %r13572, %r13580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30039, %r30040}; + // begin inline asm + // chi + lop3.b32 %r30033, %r13568, %r13576, %r13544, 0xD2; + lop3.b32 %r30034, %r13572, %r13580, %r13548, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30033, %r30034}; + // begin inline asm + // chi + lop3.b32 %r30025, %r13576, %r13544, %r13696, 0xD2; + lop3.b32 %r30026, %r13580, %r13548, %r13700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30025, %r30026}; + // begin inline asm + // chi + lop3.b32 %r30017, %r13544, %r13696, %r13680, 0xD2; + lop3.b32 %r30018, %r13548, %r13700, %r13684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30017, %r30018}; + // begin inline asm + // chi + lop3.b32 %r30043, %r13600, %r13640, %r13672, 0xD2; + lop3.b32 %r30044, %r13604, %r13644, %r13676, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30043, %r30044}; + // begin inline asm + // chi + lop3.b32 %r30037, %r13640, %r13672, %r13664, 0xD2; + lop3.b32 %r30038, %r13644, %r13676, %r13668, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30037, %r30038}; + // begin inline asm + // chi + lop3.b32 %r30031, %r13672, %r13664, %r13584, 0xD2; + lop3.b32 %r30032, %r13676, %r13668, %r13588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30031, %r30032}; + // begin inline asm + // chi + lop3.b32 %r30023, %r13664, %r13584, %r13600, 
0xD2; + lop3.b32 %r30024, %r13668, %r13588, %r13604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30023, %r30024}; + // begin inline asm + // chi + lop3.b32 %r30015, %r13584, %r13600, %r13640, 0xD2; + lop3.b32 %r30016, %r13588, %r13604, %r13644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30015, %r30016}; + // begin inline asm + // chi + lop3.b32 %r30041, %r13552, %r13624, %r13536, 0xD2; + lop3.b32 %r30042, %r13556, %r13628, %r13540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30041, %r30042}; + // begin inline asm + // chi + lop3.b32 %r30035, %r13624, %r13536, %r13592, 0xD2; + lop3.b32 %r30036, %r13628, %r13540, %r13596, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30035, %r30036}; + // begin inline asm + // chi + lop3.b32 %r30029, %r13536, %r13592, %r13616, 0xD2; + lop3.b32 %r30030, %r13540, %r13596, %r13620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30029, %r30030}; + // begin inline asm + // chi + lop3.b32 %r30021, %r13592, %r13616, %r13552, 0xD2; + lop3.b32 %r30022, %r13596, %r13620, %r13556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30021, %r30022}; + // begin inline asm + // chi + lop3.b32 %r30013, %r13616, %r13552, %r13624, 0xD2; + lop3.b32 %r30014, %r13620, %r13556, %r13628, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30013, %r30014}; + mul.wide.s32 %rd704, %r30063, 8; + mov.u64 %rd705, keccak_round_constants; + cvta.const.u64 %rd706, %rd705; + add.s64 %rd703, %rd706, %rd704; + // begin inline asm + ld.global.nc.v2.u32 {%r13904,%r13905}, [%rd703]; + // end inline asm + xor.b32 %r30049, %r13704, %r13904; + xor.b32 %r30050, %r13705, %r13905; + add.s32 %r30063, %r30063, 1; + setp.lt.u32 %p30, %r30063, 23; + @%p30 bra $L__BB2_48; + + add.u64 %rd149, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30061, %r30062}; + st.local.v2.u32 [%rd3+72], {%r30059, %r30060}; + st.local.v2.u32 [%rd3+40], {%r30057, %r30058}; + st.local.v2.u32 [%rd3+80], {%r30055, %r30056}; + st.local.v2.u32 [%rd3+48], {%r30053, %r30054}; + st.local.v2.u32 [%rd3+56], {%r30051, %r30052}; + st.local.v2.u32 [%rd3+24], {%r30049, %r30050}; + // begin inline asm + // xor5 + lop3.b32 %r13916, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13916, %r13916, %r30043, %r30041, 0x96; + lop3.b32 %r13917, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13917, %r13917, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13928, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13928, %r13928, %r30037, %r30035, 0x96; + lop3.b32 %r13929, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13929, %r13929, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13940, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13940, %r13940, %r30031, %r30029, 0x96; + lop3.b32 %r13941, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13941, %r13941, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13952, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13952, %r13952, %r30023, %r30021, 0x96; + lop3.b32 %r13953, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13953, %r13953, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13964, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13964, %r13964, %r30015, %r30013, 0x96; + lop3.b32 %r13965, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13965, %r13965, %r30016, %r30014, 0x96; + // end inline asm + mov.u32 %r14168, 1; + // begin inline asm + shf.l.wrap.b32 %r13976, %r13929, %r13928, 
%r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13980, %r13928, %r13929, %r14168; + // end inline asm + xor.b32 %r14195, %r13976, %r13964; + xor.b32 %r14196, %r13980, %r13965; + xor.b32 %r14123, %r30049, %r14195; + xor.b32 %r14126, %r30050, %r14196; + xor.b32 %r14086, %r30046, %r14196; + xor.b32 %r14085, %r30045, %r14195; + st.local.v2.u32 [%rd3+104], {%r14085, %r14086}; + // begin inline asm + shf.l.wrap.b32 %r13984, %r13941, %r13940, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13988, %r13940, %r13941, %r14168; + // end inline asm + xor.b32 %r14197, %r13984, %r13916; + xor.b32 %r14198, %r13988, %r13917; + xor.b32 %r14022, %r30059, %r14197; + xor.b32 %r14021, %r30060, %r14198; + xor.b32 %r14061, %r30038, %r14198; + xor.b32 %r14062, %r30037, %r14197; + st.local.v2.u32 [%rd3+152], {%r14062, %r14061}; + // begin inline asm + shf.l.wrap.b32 %r13992, %r13953, %r13952, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13996, %r13952, %r13953, %r14168; + // end inline asm + xor.b32 %r14199, %r13992, %r13928; + xor.b32 %r14200, %r13996, %r13929; + xor.b32 %r14045, %r30034, %r14200; + xor.b32 %r14046, %r30033, %r14199; + st.local.v2.u32 [%rd3+120], {%r14046, %r14045}; + xor.b32 %r14037, %r30030, %r14200; + xor.b32 %r14038, %r30029, %r14199; + st.local.v2.u32 [%rd3+200], {%r14038, %r14037}; + // begin inline asm + shf.l.wrap.b32 %r14000, %r13965, %r13964, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14004, %r13964, %r13965, %r14168; + // end inline asm + xor.b32 %r14201, %r14000, %r13940; + xor.b32 %r14202, %r14004, %r13941; + xor.b32 %r14069, %r30053, %r14201; + xor.b32 %r14070, %r30054, %r14202; + xor.b32 %r14078, %r30024, %r14202; + xor.b32 %r14077, %r30023, %r14201; + st.local.v2.u32 [%rd3+168], {%r14077, %r14078}; + // begin inline asm + shf.l.wrap.b32 %r14008, %r13917, %r13916, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14012, %r13916, %r13917, %r14168; + // end inline asm + xor.b32 %r14203, %r14008, %r13952; + xor.b32 %r14204, %r14012, %r13953; + xor.b32 %r14029, %r30019, %r14203; + xor.b32 %r14030, %r30020, %r14204; + xor.b32 %r14054, %r30014, %r14204; + xor.b32 %r14053, %r30013, %r14203; + st.local.v2.u32 [%rd3+216], {%r14053, %r14054}; + // begin inline asm + shf.l.wrap.b32 %r14016, %r14022, %r14021, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14020, %r14021, %r14022, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14024, %r14030, %r14029, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14028, %r14029, %r14030, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14036, %r14037, %r14038, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14032, %r14038, %r14037, %r13535; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r14032, %r14036}; + // begin inline asm + shf.l.wrap.b32 %r14040, %r14046, %r14045, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14044, %r14045, %r14046, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14048, %r14054, %r14053, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14052, %r14053, %r14054, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14060, %r14061, %r14062, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14056, %r14062, %r14061, %r13639; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r14056, %r14060}; + // begin inline asm 
+ shf.l.wrap.b32 %r14064, %r14070, %r14069, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14068, %r14069, %r14070, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14072, %r14078, %r14077, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14076, %r14077, %r14078, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14080, %r14086, %r14085, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14084, %r14085, %r14086, %r13695; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14088, %r14123, %r14016, %r14040, 0xD2; + lop3.b32 %r14089, %r14126, %r14020, %r14044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r14016, %r14040, %r14072, 0xD2; + lop3.b32 %r30197, %r14020, %r14044, %r14076, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30192, %r14040, %r14072, %r14048, 0xD2; + lop3.b32 %r30193, %r14044, %r14076, %r14052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + // begin inline asm + // chi + lop3.b32 %r30188, %r14072, %r14048, %r14123, 0xD2; + lop3.b32 %r30189, %r14076, %r14052, %r14126, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + // begin inline asm + // chi + lop3.b32 %r30186, %r14048, %r14123, %r14016, 0xD2; + lop3.b32 %r30187, %r14052, %r14126, %r14020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + // begin inline asm + // chi + lop3.b32 %r30182, %r14064, %r14024, %r14080, 0xD2; + lop3.b32 %r30183, %r14068, %r14028, %r14084, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + // begin inline asm + // chi + lop3.b32 %r30194, %r14024, %r14080, %r14056, 0xD2; + lop3.b32 %r30195, %r14028, %r14084, %r14060, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30190, %r14080, %r14056, %r14032, 0xD2; + lop3.b32 %r30191, %r14084, %r14060, %r14036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + add.s64 %rd707, %rd706, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14152,%r14153}, [%rd707]; + // end inline asm + xor.b32 %r30184, %r14088, %r14152; + xor.b32 %r30185, %r14089, %r14153; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.u64 [%rd149], %rd354; + mov.u64 %rd711, 1179641; + st.local.u64 [%rd149+8], %rd711; + add.s32 %r1874, %r1678, 1; + st.local.u32 [%rd149+16], %r1874; + ld.global.u64 %rd712, [%rd129]; + ld.global.u64 %rd713, [%rd129+8]; + ld.global.u64 %rd714, [%rd129+16]; + ld.global.u64 %rd715, [%rd129+24]; + ld.global.u64 %rd716, [%rd129+32]; + ld.global.u64 %rd717, [%rd129+40]; + ld.global.u64 %rd718, [%rd129+48]; + ld.global.u64 %rd719, [%rd129+56]; + st.local.u64 [%rd149+32], %rd713; + st.local.u64 [%rd149+40], %rd714; + st.local.u64 [%rd149+48], %rd715; + st.local.u64 [%rd149+56], %rd716; + st.local.u64 [%rd149+64], %rd717; + st.local.u64 [%rd149+72], %rd718; + st.local.u64 [%rd149+80], %rd719; + cvt.u32.u64 %r14205, %rd712; + xor.b32 %r14206, %r1874, %r14205; + st.local.u64 [%rd149+24], %rd712; + st.local.u32 [%rd149+24], %r14206; + mov.u32 %r30064, 0; + st.local.v2.u32 [%rd149+96], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+104], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+112], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+120], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+128], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+136], 
{%r30064, %r30064}; + st.local.v2.u32 [%rd149+144], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+152], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+160], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+168], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+176], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+184], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+192], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+200], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+208], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+216], {%r30064, %r30064}; + mov.u32 %r30079, -2147483648; + st.local.v2.u32 [%rd149+88], {%r14168, %r30079}; + ld.local.v2.u32 {%r30100, %r30101}, [%rd149+24]; + mov.b64 {%r30098, %r30099}, %rd717; + shr.u64 %rd720, %rd713, 32; + cvt.u32.u64 %r30112, %rd713; + cvt.u32.u64 %r30113, %rd720; + shr.u64 %rd721, %rd718, 32; + cvt.u32.u64 %r30110, %rd718; + cvt.u32.u64 %r30111, %rd721; + shr.u64 %rd722, %rd714, 32; + cvt.u32.u64 %r30108, %rd714; + cvt.u32.u64 %r30109, %rd722; + shr.u64 %rd723, %rd719, 32; + cvt.u32.u64 %r30106, %rd719; + cvt.u32.u64 %r30107, %rd723; + shr.u64 %rd724, %rd715, 32; + cvt.u32.u64 %r30104, %rd715; + cvt.u32.u64 %r30105, %rd724; + shr.u64 %rd725, %rd716, 32; + cvt.u32.u64 %r30102, %rd716; + cvt.u32.u64 %r30103, %rd725; + mov.u32 %r30065, %r30064; + mov.u32 %r30066, %r30064; + mov.u32 %r30067, %r30064; + mov.u32 %r30068, %r30064; + mov.u32 %r30069, %r30064; + mov.u32 %r30070, %r30064; + mov.u32 %r30071, %r30064; + mov.u32 %r30072, %r30064; + mov.u32 %r30073, %r30064; + mov.u32 %r30074, %r30064; + mov.u32 %r30075, %r30064; + mov.u32 %r30076, %r30064; + mov.u32 %r30077, %r30064; + mov.u32 %r30078, %r14168; + mov.u32 %r30080, %r30064; + mov.u32 %r30081, %r30064; + mov.u32 %r30082, %r30064; + mov.u32 %r30083, %r30064; + mov.u32 %r30084, %r30064; + mov.u32 %r30085, %r30064; + mov.u32 %r30086, %r30064; + mov.u32 %r30087, %r30064; + mov.u32 %r30088, %r30064; + mov.u32 %r30089, %r30064; + mov.u32 %r30090, %r30064; + mov.u32 %r30091, %r30064; + mov.u32 %r30092, %r30064; + mov.u32 %r30093, %r30064; + mov.u32 %r30094, %r30064; + mov.u32 %r30095, %r30064; + mov.u32 %r30096, %r30064; + mov.u32 %r30097, %r30064; + mov.u32 %r30114, %r30064; + +$L__BB2_50: + // begin inline asm + // xor5 + lop3.b32 %r14209, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14209, %r14209, %r30094, %r30092, 0x96; + lop3.b32 %r14210, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14210, %r14210, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14221, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14221, %r14221, %r30088, %r30086, 0x96; + lop3.b32 %r14222, %r30113, %r30111, %r30091, 0x96; + lop3.b32 %r14222, %r14222, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14233, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14233, %r14233, %r30082, %r30080, 0x96; + lop3.b32 %r14234, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14234, %r14234, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14245, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14245, %r14245, %r30074, %r30072, 0x96; + lop3.b32 %r14246, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14246, %r14246, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14257, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14257, %r14257, %r30066, %r30064, 0x96; + lop3.b32 %r14258, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14258, %r14258, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14269, %r14222, %r14221, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14273, %r14221, %r14222, %r14168; + // end inline asm + xor.b32 %r14703, %r14269, %r14257; + xor.b32 %r14704, %r14273, %r14258; + xor.b32 %r14536, %r30100, %r14703; + xor.b32 %r14539, %r30101, %r14704; + xor.b32 %r14443, %r30098, %r14703; + xor.b32 %r14442, %r30099, %r14704; + xor.b32 %r14490, %r30096, %r14703; + xor.b32 %r14491, %r30097, %r14704; + xor.b32 %r14395, %r30094, %r14703; + xor.b32 %r14394, %r30095, %r14704; + xor.b32 %r14346, %r30092, %r14703; + xor.b32 %r14347, %r30093, %r14704; + // begin inline asm + shf.l.wrap.b32 %r14277, %r14234, %r14233, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14281, %r14233, %r14234, %r14168; + // end inline asm + xor.b32 %r14705, %r14277, %r14209; + xor.b32 %r14706, %r14281, %r14210; + xor.b32 %r14498, %r30112, %r14705; + xor.b32 %r14499, %r30113, %r14706; + xor.b32 %r14315, %r30110, %r14705; + xor.b32 %r14314, %r30111, %r14706; + xor.b32 %r14474, %r30090, %r14705; + xor.b32 %r14475, %r30091, %r14706; + xor.b32 %r14435, %r30088, %r14705; + xor.b32 %r14434, %r30089, %r14706; + xor.b32 %r14418, %r30086, %r14705; + xor.b32 %r14419, %r30087, %r14706; + // begin inline asm + shf.l.wrap.b32 %r14285, %r14246, %r14245, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14289, %r14245, %r14246, %r14168; + // end inline asm + xor.b32 %r14707, %r14285, %r14221; + xor.b32 %r14708, %r14289, %r14222; + xor.b32 %r14355, %r30108, %r14707; + xor.b32 %r14354, %r30109, %r14708; + xor.b32 %r14482, %r30106, %r14707; + xor.b32 %r14483, %r30107, %r14708; + xor.b32 %r14363, %r30084, %r14707; + xor.b32 %r14362, %r30085, %r14708; + xor.b32 %r14466, %r30082, %r14707; + xor.b32 %r14467, %r30083, %r14708; + xor.b32 %r14331, %r30080, %r14707; + xor.b32 %r14330, %r30081, %r14708; + // begin inline asm + shf.l.wrap.b32 %r14293, %r14258, %r14257, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14297, %r14257, %r14258, %r14168; + // end inline asm + xor.b32 %r14709, %r14293, %r14233; + xor.b32 %r14710, %r14297, %r14234; + xor.b32 %r14450, %r30104, %r14709; + xor.b32 %r14451, %r30105, %r14710; + xor.b32 %r14427, %r30078, %r14709; + xor.b32 %r14426, %r30079, %r14710; + xor.b32 %r14370, %r30076, %r14709; + xor.b32 %r14371, %r30077, %r14710; + xor.b32 %r14458, %r30074, %r14709; + xor.b32 %r14459, %r30075, %r14710; + xor.b32 %r14387, %r30072, %r14709; + xor.b32 %r14386, %r30073, %r14710; + // begin inline asm + shf.l.wrap.b32 %r14301, %r14210, %r14209, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14305, %r14209, %r14210, %r14168; + // end inline asm + xor.b32 %r14711, %r14301, %r14245; + xor.b32 %r14712, %r14305, %r14246; + xor.b32 %r14402, %r30102, %r14711; + xor.b32 %r14403, %r30103, %r14712; + xor.b32 %r14322, %r30070, %r14711; + xor.b32 %r14323, %r30071, %r14712; + xor.b32 %r14339, %r30068, %r14711; + xor.b32 %r14338, %r30069, %r14712; + xor.b32 %r14378, %r30066, %r14711; + xor.b32 %r14379, %r30067, %r14712; + xor.b32 %r14410, %r30064, %r14711; + xor.b32 %r14411, %r30065, %r14712; + mov.u32 %r14316, 44; + // begin inline asm + shf.l.wrap.b32 %r14309, %r14315, %r14314, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14313, %r14314, %r14315, %r14316; + // end inline asm + mov.u32 %r14324, 20; + // begin inline asm + shf.l.wrap.b32 %r14317, %r14323, %r14322, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14321, %r14322, %r14323, %r14324; + 
// end inline asm + mov.u32 %r14332, 61; + // begin inline asm + shf.l.wrap.b32 %r14325, %r14331, %r14330, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14329, %r14330, %r14331, %r14332; + // end inline asm + mov.u32 %r14340, 39; + // begin inline asm + shf.l.wrap.b32 %r14333, %r14339, %r14338, %r14340; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14337, %r14338, %r14339, %r14340; + // end inline asm + mov.u32 %r14348, 18; + // begin inline asm + shf.l.wrap.b32 %r14341, %r14347, %r14346, %r14348; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14345, %r14346, %r14347, %r14348; + // end inline asm + mov.u32 %r14356, 62; + // begin inline asm + shf.l.wrap.b32 %r14349, %r14355, %r14354, %r14356; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14353, %r14354, %r14355, %r14356; + // end inline asm + mov.u32 %r14364, 43; + // begin inline asm + shf.l.wrap.b32 %r14357, %r14363, %r14362, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14361, %r14362, %r14363, %r14364; + // end inline asm + mov.u32 %r14372, 25; + // begin inline asm + shf.l.wrap.b32 %r14365, %r14371, %r14370, %r14372; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14369, %r14370, %r14371, %r14372; + // end inline asm + mov.u32 %r14380, 8; + // begin inline asm + shf.l.wrap.b32 %r14373, %r14379, %r14378, %r14380; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14377, %r14378, %r14379, %r14380; + // end inline asm + mov.u32 %r14388, 56; + // begin inline asm + shf.l.wrap.b32 %r14381, %r14387, %r14386, %r14388; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14385, %r14386, %r14387, %r14388; + // end inline asm + mov.u32 %r14396, 41; + // begin inline asm + shf.l.wrap.b32 %r14389, %r14395, %r14394, %r14396; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14393, %r14394, %r14395, %r14396; + // end inline asm + mov.u32 %r14404, 27; + // begin inline asm + shf.l.wrap.b32 %r14397, %r14403, %r14402, %r14404; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14401, %r14402, %r14403, %r14404; + // end inline asm + mov.u32 %r14412, 14; + // begin inline asm + shf.l.wrap.b32 %r14405, %r14411, %r14410, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14409, %r14410, %r14411, %r14412; + // end inline asm + mov.u32 %r14420, 2; + // begin inline asm + shf.l.wrap.b32 %r14413, %r14419, %r14418, %r14420; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14417, %r14418, %r14419, %r14420; + // end inline asm + mov.u32 %r14428, 55; + // begin inline asm + shf.l.wrap.b32 %r14421, %r14427, %r14426, %r14428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14425, %r14426, %r14427, %r14428; + // end inline asm + mov.u32 %r14436, 45; + // begin inline asm + shf.l.wrap.b32 %r14429, %r14435, %r14434, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14433, %r14434, %r14435, %r14436; + // end inline asm + mov.u32 %r14444, 36; + // begin inline asm + shf.l.wrap.b32 %r14437, %r14443, %r14442, %r14444; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14441, %r14442, %r14443, %r14444; + // end inline asm + mov.u32 %r14452, 28; + // begin inline asm + shf.l.wrap.b32 %r14445, %r14451, %r14450, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14449, %r14450, %r14451, %r14452; + // end inline asm + mov.u32 %r14460, 21; + // begin inline asm + shf.l.wrap.b32 %r14453, %r14459, %r14458, %r14460; + // end inline asm + // begin 
inline asm + shf.l.wrap.b32 %r14457, %r14458, %r14459, %r14460; + // end inline asm + mov.u32 %r14468, 15; + // begin inline asm + shf.l.wrap.b32 %r14461, %r14467, %r14466, %r14468; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14465, %r14466, %r14467, %r14468; + // end inline asm + mov.u32 %r14476, 10; + // begin inline asm + shf.l.wrap.b32 %r14469, %r14475, %r14474, %r14476; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14473, %r14474, %r14475, %r14476; + // end inline asm + mov.u32 %r14484, 6; + // begin inline asm + shf.l.wrap.b32 %r14477, %r14483, %r14482, %r14484; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14481, %r14482, %r14483, %r14484; + // end inline asm + mov.u32 %r14492, 3; + // begin inline asm + shf.l.wrap.b32 %r14485, %r14491, %r14490, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14489, %r14490, %r14491, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14493, %r14499, %r14498, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14497, %r14498, %r14499, %r14168; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14501, %r14536, %r14309, %r14357, 0xD2; + lop3.b32 %r14502, %r14539, %r14313, %r14361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30112, %r14309, %r14357, %r14453, 0xD2; + lop3.b32 %r30113, %r14313, %r14361, %r14457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30108, %r14357, %r14453, %r14405, 0xD2; + lop3.b32 %r30109, %r14361, %r14457, %r14409, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30104, %r14453, %r14405, %r14536, 0xD2; + lop3.b32 %r30105, %r14457, %r14409, %r14539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30102, %r14405, %r14536, %r14309, 0xD2; + lop3.b32 %r30103, %r14409, %r14539, %r14313, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30098, %r14445, %r14317, %r14485, 0xD2; + lop3.b32 %r30099, %r14449, %r14321, %r14489, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30110, %r14317, %r14485, %r14429, 0xD2; + lop3.b32 %r30111, %r14321, %r14489, %r14433, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30106, %r14485, %r14429, %r14325, 0xD2; + lop3.b32 %r30107, %r14489, %r14433, %r14329, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30078, %r14429, %r14325, %r14445, 0xD2; + lop3.b32 %r30079, %r14433, %r14329, %r14449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30078, %r30079}; + // begin inline asm + // chi + lop3.b32 %r30070, %r14325, %r14445, %r14317, 0xD2; + lop3.b32 %r30071, %r14329, %r14449, %r14321, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30070, %r30071}; + // begin inline asm + // chi + lop3.b32 %r30096, %r14493, %r14477, %r14365, 0xD2; + lop3.b32 %r30097, %r14497, %r14481, %r14369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30096, %r30097}; + // begin inline asm + // chi + lop3.b32 %r30090, %r14477, %r14365, %r14373, 0xD2; + lop3.b32 %r30091, %r14481, %r14369, %r14377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30090, %r30091}; + // begin inline asm + // chi + lop3.b32 %r30084, %r14365, %r14373, %r14341, 0xD2; + lop3.b32 %r30085, %r14369, %r14377, %r14345, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30084, %r30085}; + // begin inline asm + // chi + lop3.b32 %r30076, %r14373, %r14341, %r14493, 0xD2; + lop3.b32 %r30077, %r14377, %r14345, %r14497, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd149+128], {%r30076, %r30077}; + // begin inline asm + // chi + lop3.b32 %r30068, %r14341, %r14493, %r14477, 0xD2; + lop3.b32 %r30069, %r14345, %r14497, %r14481, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30068, %r30069}; + // begin inline asm + // chi + lop3.b32 %r30094, %r14397, %r14437, %r14469, 0xD2; + lop3.b32 %r30095, %r14401, %r14441, %r14473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30094, %r30095}; + // begin inline asm + // chi + lop3.b32 %r30088, %r14437, %r14469, %r14461, 0xD2; + lop3.b32 %r30089, %r14441, %r14473, %r14465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30088, %r30089}; + // begin inline asm + // chi + lop3.b32 %r30082, %r14469, %r14461, %r14381, 0xD2; + lop3.b32 %r30083, %r14473, %r14465, %r14385, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+160], {%r30082, %r30083}; + // begin inline asm + // chi + lop3.b32 %r30074, %r14461, %r14381, %r14397, 0xD2; + lop3.b32 %r30075, %r14465, %r14385, %r14401, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30074, %r30075}; + // begin inline asm + // chi + lop3.b32 %r30066, %r14381, %r14397, %r14437, 0xD2; + lop3.b32 %r30067, %r14385, %r14401, %r14441, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30066, %r30067}; + // begin inline asm + // chi + lop3.b32 %r30092, %r14349, %r14421, %r14333, 0xD2; + lop3.b32 %r30093, %r14353, %r14425, %r14337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30092, %r30093}; + // begin inline asm + // chi + lop3.b32 %r30086, %r14421, %r14333, %r14389, 0xD2; + lop3.b32 %r30087, %r14425, %r14337, %r14393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30086, %r30087}; + // begin inline asm + // chi + lop3.b32 %r30080, %r14333, %r14389, %r14413, 0xD2; + lop3.b32 %r30081, %r14337, %r14393, %r14417, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30080, %r30081}; + // begin inline asm + // chi + lop3.b32 %r30072, %r14389, %r14413, %r14349, 0xD2; + lop3.b32 %r30073, %r14393, %r14417, %r14353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30072, %r30073}; + // begin inline asm + // chi + lop3.b32 %r30064, %r14413, %r14349, %r14421, 0xD2; + lop3.b32 %r30065, %r14417, %r14353, %r14425, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30064, %r30065}; + mul.wide.s32 %rd727, %r30114, 8; + add.s64 %rd726, %rd706, %rd727; + // begin inline asm + ld.global.nc.v2.u32 {%r14701,%r14702}, [%rd726]; + // end inline asm + xor.b32 %r30100, %r14501, %r14701; + xor.b32 %r30101, %r14502, %r14702; + add.s32 %r30114, %r30114, 1; + setp.lt.u32 %p31, %r30114, 23; + @%p31 bra $L__BB2_50; + + mov.u32 %r30147, 0; + mov.u32 %r14812, 1; + st.local.v2.u32 [%rd149+32], {%r30112, %r30113}; + st.local.v2.u32 [%rd149+72], {%r30110, %r30111}; + st.local.v2.u32 [%rd149+40], {%r30108, %r30109}; + st.local.v2.u32 [%rd149+80], {%r30106, %r30107}; + st.local.v2.u32 [%rd149+48], {%r30104, %r30105}; + st.local.v2.u32 [%rd149+56], {%r30102, %r30103}; + st.local.v2.u32 [%rd149+24], {%r30100, %r30101}; + // begin inline asm + // xor5 + lop3.b32 %r14713, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14713, %r14713, %r30094, %r30092, 0x96; + lop3.b32 %r14714, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14714, %r14714, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14725, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14725, %r14725, %r30088, %r30086, 0x96; + lop3.b32 %r14726, %r30113, %r30111, %r30091, 
0x96; + lop3.b32 %r14726, %r14726, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14737, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14737, %r14737, %r30082, %r30080, 0x96; + lop3.b32 %r14738, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14738, %r14738, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14749, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14749, %r14749, %r30074, %r30072, 0x96; + lop3.b32 %r14750, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14750, %r14750, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14761, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14761, %r14761, %r30066, %r30064, 0x96; + lop3.b32 %r14762, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14762, %r14762, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14773, %r14726, %r14725, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14777, %r14725, %r14726, %r14812; + // end inline asm + xor.b32 %r14952, %r14773, %r14761; + xor.b32 %r14953, %r14777, %r14762; + xor.b32 %r14920, %r30100, %r14952; + xor.b32 %r14923, %r30101, %r14953; + xor.b32 %r14883, %r30097, %r14953; + xor.b32 %r14882, %r30096, %r14952; + st.local.v2.u32 [%rd149+104], {%r14882, %r14883}; + // begin inline asm + shf.l.wrap.b32 %r14781, %r14738, %r14737, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14785, %r14737, %r14738, %r14812; + // end inline asm + xor.b32 %r14954, %r14781, %r14713; + xor.b32 %r14955, %r14785, %r14714; + xor.b32 %r14819, %r30110, %r14954; + xor.b32 %r14818, %r30111, %r14955; + xor.b32 %r14858, %r30089, %r14955; + xor.b32 %r14859, %r30088, %r14954; + st.local.v2.u32 [%rd149+152], {%r14859, %r14858}; + // begin inline asm + shf.l.wrap.b32 %r14789, %r14750, %r14749, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14793, %r14749, %r14750, %r14812; + // end inline asm + xor.b32 %r14956, %r14789, %r14725; + xor.b32 %r14957, %r14793, %r14726; + xor.b32 %r14842, %r30085, %r14957; + xor.b32 %r14843, %r30084, %r14956; + st.local.v2.u32 [%rd149+120], {%r14843, %r14842}; + xor.b32 %r14834, %r30081, %r14957; + xor.b32 %r14835, %r30080, %r14956; + st.local.v2.u32 [%rd149+200], {%r14835, %r14834}; + // begin inline asm + shf.l.wrap.b32 %r14797, %r14762, %r14761, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14801, %r14761, %r14762, %r14812; + // end inline asm + xor.b32 %r14958, %r14797, %r14737; + xor.b32 %r14959, %r14801, %r14738; + xor.b32 %r14866, %r30104, %r14958; + xor.b32 %r14867, %r30105, %r14959; + xor.b32 %r14875, %r30075, %r14959; + xor.b32 %r14874, %r30074, %r14958; + st.local.v2.u32 [%rd149+168], {%r14874, %r14875}; + // begin inline asm + shf.l.wrap.b32 %r14805, %r14714, %r14713, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14809, %r14713, %r14714, %r14812; + // end inline asm + xor.b32 %r14960, %r14805, %r14749; + xor.b32 %r14961, %r14809, %r14750; + xor.b32 %r14826, %r30070, %r14960; + xor.b32 %r14827, %r30071, %r14961; + xor.b32 %r14851, %r30065, %r14961; + xor.b32 %r14850, %r30064, %r14960; + st.local.v2.u32 [%rd149+216], {%r14850, %r14851}; + // begin inline asm + shf.l.wrap.b32 %r14813, %r14819, %r14818, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14817, %r14818, %r14819, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14821, %r14827, %r14826, %r14324; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14825, %r14826, %r14827, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14833, %r14834, %r14835, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14829, %r14835, %r14834, %r14332; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r14829, %r14833}; + // begin inline asm + shf.l.wrap.b32 %r14837, %r14843, %r14842, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14841, %r14842, %r14843, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14845, %r14851, %r14850, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14849, %r14850, %r14851, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14857, %r14858, %r14859, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14853, %r14859, %r14858, %r14436; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r14853, %r14857}; + // begin inline asm + shf.l.wrap.b32 %r14861, %r14867, %r14866, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14865, %r14866, %r14867, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14869, %r14875, %r14874, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14873, %r14874, %r14875, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14877, %r14883, %r14882, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14881, %r14882, %r14883, %r14492; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14885, %r14920, %r14813, %r14837, 0xD2; + lop3.b32 %r14886, %r14923, %r14817, %r14841, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r14813, %r14837, %r14869, 0xD2; + lop3.b32 %r30248, %r14817, %r14841, %r14873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30243, %r14837, %r14869, %r14845, 0xD2; + lop3.b32 %r30244, %r14841, %r14873, %r14849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + // begin inline asm + // chi + lop3.b32 %r30239, %r14869, %r14845, %r14920, 0xD2; + lop3.b32 %r30240, %r14873, %r14849, %r14923, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + // begin inline asm + // chi + lop3.b32 %r30237, %r14845, %r14920, %r14813, 0xD2; + lop3.b32 %r30238, %r14849, %r14923, %r14817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + // begin inline asm + // chi + lop3.b32 %r30233, %r14861, %r14821, %r14877, 0xD2; + lop3.b32 %r30234, %r14865, %r14825, %r14881, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + // begin inline asm + // chi + lop3.b32 %r30245, %r14821, %r14877, %r14853, 0xD2; + lop3.b32 %r30246, %r14825, %r14881, %r14857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30241, %r14877, %r14853, %r14829, 0xD2; + lop3.b32 %r30242, %r14881, %r14857, %r14833, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + // begin inline asm + ld.global.nc.v2.u32 {%r14949,%r14950}, [%rd707]; + // end inline asm + xor.b32 %r30235, %r14885, %r14949; + xor.b32 %r30236, %r14886, %r14950; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + add.s64 %rd151, %rd149, 24; + add.s64 %rd152, %rd3, 24; + +$L__BB2_52: + shl.b32 %r14962, %r30147, 2; + cvt.u64.u32 %rd737, %r14962; + and.b64 %rd738, %rd737, 60; + add.s64 %rd739, %rd152, %rd738; + xor.b32 %r14963, 
%r1678, %r30147; + mul.lo.s32 %r14964, %r14963, 16777619; + ld.local.u32 %r14965, [%rd739]; + xor.b32 %r14966, %r14964, %r14965; + mul.wide.u32 %rd740, %r14966, -954391867; + shr.u64 %rd741, %rd740, 32; + cvt.u32.u64 %r14967, %rd741; + sub.s32 %r14968, %r14966, %r14967; + shr.u32 %r14969, %r14968, 1; + add.s32 %r14970, %r14969, %r14967; + shr.u32 %r14971, %r14970, 20; + mul.lo.s32 %r14972, %r14971, 1179641; + sub.s32 %r14973, %r14966, %r14972; + mul.wide.u32 %rd742, %r14973, 64; + add.s64 %rd743, %rd471, %rd742; + mul.lo.s32 %r14974, %r30184, 16777619; + ld.global.u32 %r14975, [%rd743]; + xor.b32 %r30184, %r14974, %r14975; + mul.lo.s32 %r14976, %r30185, 16777619; + ld.global.u32 %r14977, [%rd743+4]; + xor.b32 %r30185, %r14976, %r14977; + mul.lo.s32 %r14978, %r30196, 16777619; + ld.global.u32 %r14979, [%rd743+8]; + mul.lo.s32 %r14980, %r30197, 16777619; + ld.global.u32 %r14981, [%rd743+12]; + xor.b32 %r14982, %r14980, %r14981; + xor.b32 %r30196, %r14978, %r14979; + mov.b64 %rd744, {%r30196, %r14982}; + mul.lo.s32 %r14983, %r30192, 16777619; + ld.global.u32 %r14984, [%rd743+16]; + mul.lo.s32 %r14985, %r30193, 16777619; + ld.global.u32 %r14986, [%rd743+20]; + xor.b32 %r14987, %r14985, %r14986; + xor.b32 %r30192, %r14983, %r14984; + mov.b64 %rd745, {%r30192, %r14987}; + mul.lo.s32 %r14988, %r30188, 16777619; + ld.global.u32 %r14989, [%rd743+24]; + mul.lo.s32 %r14990, %r30189, 16777619; + ld.global.u32 %r14991, [%rd743+28]; + xor.b32 %r14992, %r14990, %r14991; + xor.b32 %r30188, %r14988, %r14989; + mov.b64 %rd746, {%r30188, %r14992}; + mul.lo.s32 %r14993, %r30186, 16777619; + ld.global.u32 %r14994, [%rd743+32]; + mul.lo.s32 %r14995, %r30187, 16777619; + ld.global.u32 %r14996, [%rd743+36]; + xor.b32 %r14997, %r14995, %r14996; + xor.b32 %r30186, %r14993, %r14994; + mov.b64 %rd747, {%r30186, %r14997}; + mul.lo.s32 %r14998, %r30182, 16777619; + ld.global.u32 %r14999, [%rd743+40]; + xor.b32 %r30182, %r14998, %r14999; + mul.lo.s32 %r15000, %r30183, 16777619; + ld.global.u32 %r15001, [%rd743+44]; + xor.b32 %r30183, %r15000, %r15001; + mul.lo.s32 %r15002, %r30194, 16777619; + ld.global.u32 %r15003, [%rd743+48]; + mul.lo.s32 %r15004, %r30195, 16777619; + ld.global.u32 %r15005, [%rd743+52]; + xor.b32 %r15006, %r15004, %r15005; + xor.b32 %r30194, %r15002, %r15003; + mov.b64 %rd748, {%r30194, %r15006}; + mul.lo.s32 %r15007, %r30190, 16777619; + ld.global.u32 %r15008, [%rd743+56]; + mul.lo.s32 %r15009, %r30191, 16777619; + ld.global.u32 %r15010, [%rd743+60]; + xor.b32 %r15011, %r15009, %r15010; + xor.b32 %r30190, %r15007, %r15008; + mov.b64 %rd749, {%r30190, %r15011}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.v2.u32 [%rd3+32], {%r30196, %r14982}; + st.local.v2.u32 [%rd3+40], {%r30192, %r14987}; + st.local.v2.u32 [%rd3+48], {%r30188, %r14992}; + st.local.v2.u32 [%rd3+56], {%r30186, %r14997}; + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + st.local.v2.u32 [%rd3+72], {%r30194, %r15006}; + st.local.v2.u32 [%rd3+80], {%r30190, %r15011}; + add.s64 %rd750, %rd151, %rd738; + xor.b32 %r15012, %r1874, %r30147; + mul.lo.s32 %r15013, %r15012, 16777619; + ld.local.u32 %r15014, [%rd750]; + xor.b32 %r15015, %r15013, %r15014; + mul.wide.u32 %rd751, %r15015, -954391867; + shr.u64 %rd752, %rd751, 32; + cvt.u32.u64 %r15016, %rd752; + sub.s32 %r15017, %r15015, %r15016; + shr.u32 %r15018, %r15017, 1; + add.s32 %r15019, %r15018, %r15016; + shr.u32 %r15020, %r15019, 20; + mul.lo.s32 %r15021, %r15020, 1179641; + sub.s32 %r15022, %r15015, %r15021; + mul.wide.u32 %rd753, %r15022, 64; + add.s64 %rd754, %rd471, 
%rd753; + mul.lo.s32 %r15023, %r30235, 16777619; + ld.global.u32 %r15024, [%rd754]; + xor.b32 %r30235, %r15023, %r15024; + mul.lo.s32 %r15025, %r30236, 16777619; + ld.global.u32 %r15026, [%rd754+4]; + xor.b32 %r30236, %r15025, %r15026; + mul.lo.s32 %r15027, %r30247, 16777619; + ld.global.u32 %r15028, [%rd754+8]; + mul.lo.s32 %r15029, %r30248, 16777619; + ld.global.u32 %r15030, [%rd754+12]; + xor.b32 %r15031, %r15029, %r15030; + xor.b32 %r30247, %r15027, %r15028; + mov.b64 %rd755, {%r30247, %r15031}; + mul.lo.s32 %r15032, %r30243, 16777619; + ld.global.u32 %r15033, [%rd754+16]; + mul.lo.s32 %r15034, %r30244, 16777619; + ld.global.u32 %r15035, [%rd754+20]; + xor.b32 %r15036, %r15034, %r15035; + xor.b32 %r30243, %r15032, %r15033; + mov.b64 %rd756, {%r30243, %r15036}; + mul.lo.s32 %r15037, %r30239, 16777619; + ld.global.u32 %r15038, [%rd754+24]; + mul.lo.s32 %r15039, %r30240, 16777619; + ld.global.u32 %r15040, [%rd754+28]; + xor.b32 %r15041, %r15039, %r15040; + xor.b32 %r30239, %r15037, %r15038; + mov.b64 %rd757, {%r30239, %r15041}; + mul.lo.s32 %r15042, %r30237, 16777619; + ld.global.u32 %r15043, [%rd754+32]; + mul.lo.s32 %r15044, %r30238, 16777619; + ld.global.u32 %r15045, [%rd754+36]; + xor.b32 %r15046, %r15044, %r15045; + xor.b32 %r30237, %r15042, %r15043; + mov.b64 %rd758, {%r30237, %r15046}; + mul.lo.s32 %r15047, %r30233, 16777619; + ld.global.u32 %r15048, [%rd754+40]; + xor.b32 %r30233, %r15047, %r15048; + mul.lo.s32 %r15049, %r30234, 16777619; + ld.global.u32 %r15050, [%rd754+44]; + xor.b32 %r30234, %r15049, %r15050; + mul.lo.s32 %r15051, %r30245, 16777619; + ld.global.u32 %r15052, [%rd754+48]; + mul.lo.s32 %r15053, %r30246, 16777619; + ld.global.u32 %r15054, [%rd754+52]; + xor.b32 %r15055, %r15053, %r15054; + xor.b32 %r30245, %r15051, %r15052; + mov.b64 %rd759, {%r30245, %r15055}; + mul.lo.s32 %r15056, %r30241, 16777619; + ld.global.u32 %r15057, [%rd754+56]; + mul.lo.s32 %r15058, %r30242, 16777619; + ld.global.u32 %r15059, [%rd754+60]; + xor.b32 %r15060, %r15058, %r15059; + xor.b32 %r30241, %r15056, %r15057; + mov.b64 %rd760, {%r30241, %r15060}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + st.local.v2.u32 [%rd149+32], {%r30247, %r15031}; + st.local.v2.u32 [%rd149+40], {%r30243, %r15036}; + st.local.v2.u32 [%rd149+48], {%r30239, %r15041}; + st.local.v2.u32 [%rd149+56], {%r30237, %r15046}; + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + st.local.v2.u32 [%rd149+72], {%r30245, %r15055}; + st.local.v2.u32 [%rd149+80], {%r30241, %r15060}; + add.s32 %r30147, %r30147, 1; + setp.lt.u32 %p32, %r30147, 512; + shr.u64 %rd761, %rd744, 32; + cvt.u32.u64 %r30197, %rd761; + shr.u64 %rd762, %rd745, 32; + cvt.u32.u64 %r30193, %rd762; + shr.u64 %rd763, %rd746, 32; + cvt.u32.u64 %r30189, %rd763; + shr.u64 %rd764, %rd747, 32; + cvt.u32.u64 %r30187, %rd764; + shr.u64 %rd765, %rd748, 32; + cvt.u32.u64 %r30195, %rd765; + shr.u64 %rd766, %rd749, 32; + cvt.u32.u64 %r30191, %rd766; + shr.u64 %rd767, %rd755, 32; + cvt.u32.u64 %r30248, %rd767; + shr.u64 %rd768, %rd756, 32; + cvt.u32.u64 %r30244, %rd768; + shr.u64 %rd769, %rd757, 32; + cvt.u32.u64 %r30240, %rd769; + shr.u64 %rd770, %rd758, 32; + cvt.u32.u64 %r30238, %rd770; + shr.u64 %rd771, %rd759, 32; + cvt.u32.u64 %r30246, %rd771; + shr.u64 %rd772, %rd760, 32; + cvt.u32.u64 %r30242, %rd772; + @%p32 bra $L__BB2_52; + + mov.u32 %r30148, 0; + st.local.v2.u32 [%rd3+96], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+104], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+112], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+120], {%r30148, %r30148}; + 
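+ // The 512-iteration loop above ($L__BB2_52) mixes each state word as (v * 16777619) ^ data — an FNV-1-style combine (16777619 = 0x01000193, the 32-bit FNV prime) over 64-byte items selected by reduction modulo 1179641.
+ // The zero stores here clear the upper sponge lanes; together with the {0x00000001, 0x80000000} pair written to [%rd3+88] just below, this is consistent with Keccak pad10*1 at a 72-byte rate, applied before the permutation loop at $L__BB2_54.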
st.local.v2.u32 [%rd3+128], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+136], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+144], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+152], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+160], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+168], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+176], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+184], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+192], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+200], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+208], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+216], {%r30148, %r30148}; + mov.u32 %r30163, -2147483648; + mov.u32 %r15075, 1; + st.local.v2.u32 [%rd3+88], {%r15075, %r30163}; + mov.u32 %r30149, %r30148; + mov.u32 %r30150, %r30148; + mov.u32 %r30151, %r30148; + mov.u32 %r30152, %r30148; + mov.u32 %r30153, %r30148; + mov.u32 %r30154, %r30148; + mov.u32 %r30155, %r30148; + mov.u32 %r30156, %r30148; + mov.u32 %r30157, %r30148; + mov.u32 %r30158, %r30148; + mov.u32 %r30159, %r30148; + mov.u32 %r30160, %r30148; + mov.u32 %r30161, %r30148; + mov.u32 %r30162, %r15075; + mov.u32 %r30164, %r30148; + mov.u32 %r30165, %r30148; + mov.u32 %r30166, %r30148; + mov.u32 %r30167, %r30148; + mov.u32 %r30168, %r30148; + mov.u32 %r30169, %r30148; + mov.u32 %r30170, %r30148; + mov.u32 %r30171, %r30148; + mov.u32 %r30172, %r30148; + mov.u32 %r30173, %r30148; + mov.u32 %r30174, %r30148; + mov.u32 %r30175, %r30148; + mov.u32 %r30176, %r30148; + mov.u32 %r30177, %r30148; + mov.u32 %r30178, %r30148; + mov.u32 %r30179, %r30148; + mov.u32 %r30180, %r30148; + mov.u32 %r30181, %r30148; + mov.u32 %r30198, %r30148; + +$L__BB2_54: + // begin inline asm + // xor5 + lop3.b32 %r15102, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15102, %r15102, %r30178, %r30176, 0x96; + lop3.b32 %r15103, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15103, %r15103, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15114, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15114, %r15114, %r30172, %r30170, 0x96; + lop3.b32 %r15115, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15115, %r15115, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15126, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15126, %r15126, %r30166, %r30164, 0x96; + lop3.b32 %r15127, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15127, %r15127, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15138, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15138, %r15138, %r30158, %r30156, 0x96; + lop3.b32 %r15139, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15139, %r15139, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15150, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15150, %r15150, %r30150, %r30148, 0x96; + lop3.b32 %r15151, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15151, %r15151, %r30151, %r30149, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15162, %r15115, %r15114, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15166, %r15114, %r15115, %r15075; + // end inline asm + xor.b32 %r15596, %r15162, %r15150; + xor.b32 %r15597, %r15166, %r15151; + xor.b32 %r15429, %r30184, %r15596; + xor.b32 %r15432, %r30185, %r15597; + xor.b32 %r15336, %r30182, %r15596; + xor.b32 %r15335, %r30183, %r15597; + xor.b32 %r15383, %r30180, %r15596; + xor.b32 %r15384, %r30181, %r15597; + xor.b32 %r15288, %r30178, %r15596; + xor.b32 %r15287, %r30179, %r15597; + xor.b32 %r15239, %r30176, 
%r15596; + xor.b32 %r15240, %r30177, %r15597; + // begin inline asm + shf.l.wrap.b32 %r15170, %r15127, %r15126, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15174, %r15126, %r15127, %r15075; + // end inline asm + xor.b32 %r15598, %r15170, %r15102; + xor.b32 %r15599, %r15174, %r15103; + xor.b32 %r15391, %r30196, %r15598; + xor.b32 %r15392, %r30197, %r15599; + xor.b32 %r15208, %r30194, %r15598; + xor.b32 %r15207, %r30195, %r15599; + xor.b32 %r15367, %r30174, %r15598; + xor.b32 %r15368, %r30175, %r15599; + xor.b32 %r15328, %r30172, %r15598; + xor.b32 %r15327, %r30173, %r15599; + xor.b32 %r15311, %r30170, %r15598; + xor.b32 %r15312, %r30171, %r15599; + // begin inline asm + shf.l.wrap.b32 %r15178, %r15139, %r15138, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15182, %r15138, %r15139, %r15075; + // end inline asm + xor.b32 %r15600, %r15178, %r15114; + xor.b32 %r15601, %r15182, %r15115; + xor.b32 %r15248, %r30192, %r15600; + xor.b32 %r15247, %r30193, %r15601; + xor.b32 %r15375, %r30190, %r15600; + xor.b32 %r15376, %r30191, %r15601; + xor.b32 %r15256, %r30168, %r15600; + xor.b32 %r15255, %r30169, %r15601; + xor.b32 %r15359, %r30166, %r15600; + xor.b32 %r15360, %r30167, %r15601; + xor.b32 %r15224, %r30164, %r15600; + xor.b32 %r15223, %r30165, %r15601; + // begin inline asm + shf.l.wrap.b32 %r15186, %r15151, %r15150, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15190, %r15150, %r15151, %r15075; + // end inline asm + xor.b32 %r15602, %r15186, %r15126; + xor.b32 %r15603, %r15190, %r15127; + xor.b32 %r15343, %r30188, %r15602; + xor.b32 %r15344, %r30189, %r15603; + xor.b32 %r15320, %r30162, %r15602; + xor.b32 %r15319, %r30163, %r15603; + xor.b32 %r15263, %r30160, %r15602; + xor.b32 %r15264, %r30161, %r15603; + xor.b32 %r15351, %r30158, %r15602; + xor.b32 %r15352, %r30159, %r15603; + xor.b32 %r15280, %r30156, %r15602; + xor.b32 %r15279, %r30157, %r15603; + // begin inline asm + shf.l.wrap.b32 %r15194, %r15103, %r15102, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15198, %r15102, %r15103, %r15075; + // end inline asm + xor.b32 %r15604, %r15194, %r15138; + xor.b32 %r15605, %r15198, %r15139; + xor.b32 %r15295, %r30186, %r15604; + xor.b32 %r15296, %r30187, %r15605; + xor.b32 %r15215, %r30154, %r15604; + xor.b32 %r15216, %r30155, %r15605; + xor.b32 %r15232, %r30152, %r15604; + xor.b32 %r15231, %r30153, %r15605; + xor.b32 %r15271, %r30150, %r15604; + xor.b32 %r15272, %r30151, %r15605; + xor.b32 %r15303, %r30148, %r15604; + xor.b32 %r15304, %r30149, %r15605; + mov.u32 %r15209, 44; + // begin inline asm + shf.l.wrap.b32 %r15202, %r15208, %r15207, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15206, %r15207, %r15208, %r15209; + // end inline asm + mov.u32 %r15217, 20; + // begin inline asm + shf.l.wrap.b32 %r15210, %r15216, %r15215, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15214, %r15215, %r15216, %r15217; + // end inline asm + mov.u32 %r15225, 61; + // begin inline asm + shf.l.wrap.b32 %r15218, %r15224, %r15223, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15222, %r15223, %r15224, %r15225; + // end inline asm + mov.u32 %r15233, 39; + // begin inline asm + shf.l.wrap.b32 %r15226, %r15232, %r15231, %r15233; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15230, %r15231, %r15232, %r15233; + // end inline asm + mov.u32 %r15241, 18; + // begin inline asm + shf.l.wrap.b32 %r15234, %r15240, %r15239, %r15241; + // end inline asm + 
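+ // The rotation amounts loaded in this stretch (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3) are the Keccak rho offsets; the remaining offset, 1, is applied via %r15075.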
// begin inline asm + shf.l.wrap.b32 %r15238, %r15239, %r15240, %r15241; + // end inline asm + mov.u32 %r15249, 62; + // begin inline asm + shf.l.wrap.b32 %r15242, %r15248, %r15247, %r15249; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15246, %r15247, %r15248, %r15249; + // end inline asm + mov.u32 %r15257, 43; + // begin inline asm + shf.l.wrap.b32 %r15250, %r15256, %r15255, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15254, %r15255, %r15256, %r15257; + // end inline asm + mov.u32 %r15265, 25; + // begin inline asm + shf.l.wrap.b32 %r15258, %r15264, %r15263, %r15265; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15262, %r15263, %r15264, %r15265; + // end inline asm + mov.u32 %r15273, 8; + // begin inline asm + shf.l.wrap.b32 %r15266, %r15272, %r15271, %r15273; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15270, %r15271, %r15272, %r15273; + // end inline asm + mov.u32 %r15281, 56; + // begin inline asm + shf.l.wrap.b32 %r15274, %r15280, %r15279, %r15281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15278, %r15279, %r15280, %r15281; + // end inline asm + mov.u32 %r15289, 41; + // begin inline asm + shf.l.wrap.b32 %r15282, %r15288, %r15287, %r15289; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15286, %r15287, %r15288, %r15289; + // end inline asm + mov.u32 %r15297, 27; + // begin inline asm + shf.l.wrap.b32 %r15290, %r15296, %r15295, %r15297; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15294, %r15295, %r15296, %r15297; + // end inline asm + mov.u32 %r15305, 14; + // begin inline asm + shf.l.wrap.b32 %r15298, %r15304, %r15303, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15302, %r15303, %r15304, %r15305; + // end inline asm + mov.u32 %r15313, 2; + // begin inline asm + shf.l.wrap.b32 %r15306, %r15312, %r15311, %r15313; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15310, %r15311, %r15312, %r15313; + // end inline asm + mov.u32 %r15321, 55; + // begin inline asm + shf.l.wrap.b32 %r15314, %r15320, %r15319, %r15321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15318, %r15319, %r15320, %r15321; + // end inline asm + mov.u32 %r15329, 45; + // begin inline asm + shf.l.wrap.b32 %r15322, %r15328, %r15327, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15326, %r15327, %r15328, %r15329; + // end inline asm + mov.u32 %r15337, 36; + // begin inline asm + shf.l.wrap.b32 %r15330, %r15336, %r15335, %r15337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15334, %r15335, %r15336, %r15337; + // end inline asm + mov.u32 %r15345, 28; + // begin inline asm + shf.l.wrap.b32 %r15338, %r15344, %r15343, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15342, %r15343, %r15344, %r15345; + // end inline asm + mov.u32 %r15353, 21; + // begin inline asm + shf.l.wrap.b32 %r15346, %r15352, %r15351, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15350, %r15351, %r15352, %r15353; + // end inline asm + mov.u32 %r15361, 15; + // begin inline asm + shf.l.wrap.b32 %r15354, %r15360, %r15359, %r15361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15358, %r15359, %r15360, %r15361; + // end inline asm + mov.u32 %r15369, 10; + // begin inline asm + shf.l.wrap.b32 %r15362, %r15368, %r15367, %r15369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15366, %r15367, %r15368, %r15369; + // end inline asm + mov.u32 %r15377, 6; + // begin inline asm + 
shf.l.wrap.b32 %r15370, %r15376, %r15375, %r15377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15374, %r15375, %r15376, %r15377; + // end inline asm + mov.u32 %r15385, 3; + // begin inline asm + shf.l.wrap.b32 %r15378, %r15384, %r15383, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15382, %r15383, %r15384, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15386, %r15392, %r15391, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15390, %r15391, %r15392, %r15075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15394, %r15429, %r15202, %r15250, 0xD2; + lop3.b32 %r15395, %r15432, %r15206, %r15254, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r15202, %r15250, %r15346, 0xD2; + lop3.b32 %r30197, %r15206, %r15254, %r15350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30192, %r15250, %r15346, %r15298, 0xD2; + lop3.b32 %r30193, %r15254, %r15350, %r15302, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30188, %r15346, %r15298, %r15429, 0xD2; + lop3.b32 %r30189, %r15350, %r15302, %r15432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30186, %r15298, %r15429, %r15202, 0xD2; + lop3.b32 %r30187, %r15302, %r15432, %r15206, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30182, %r15338, %r15210, %r15378, 0xD2; + lop3.b32 %r30183, %r15342, %r15214, %r15382, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30194, %r15210, %r15378, %r15322, 0xD2; + lop3.b32 %r30195, %r15214, %r15382, %r15326, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30190, %r15378, %r15322, %r15218, 0xD2; + lop3.b32 %r30191, %r15382, %r15326, %r15222, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30162, %r15322, %r15218, %r15338, 0xD2; + lop3.b32 %r30163, %r15326, %r15222, %r15342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30162, %r30163}; + // begin inline asm + // chi + lop3.b32 %r30154, %r15218, %r15338, %r15210, 0xD2; + lop3.b32 %r30155, %r15222, %r15342, %r15214, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30154, %r30155}; + // begin inline asm + // chi + lop3.b32 %r30180, %r15386, %r15370, %r15258, 0xD2; + lop3.b32 %r30181, %r15390, %r15374, %r15262, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30180, %r30181}; + // begin inline asm + // chi + lop3.b32 %r30174, %r15370, %r15258, %r15266, 0xD2; + lop3.b32 %r30175, %r15374, %r15262, %r15270, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30174, %r30175}; + // begin inline asm + // chi + lop3.b32 %r30168, %r15258, %r15266, %r15234, 0xD2; + lop3.b32 %r30169, %r15262, %r15270, %r15238, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30168, %r30169}; + // begin inline asm + // chi + lop3.b32 %r30160, %r15266, %r15234, %r15386, 0xD2; + lop3.b32 %r30161, %r15270, %r15238, %r15390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30160, %r30161}; + // begin inline asm + // chi + lop3.b32 %r30152, %r15234, %r15386, %r15370, 0xD2; + lop3.b32 %r30153, %r15238, %r15390, %r15374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30152, %r30153}; + // begin inline asm + // chi + lop3.b32 %r30178, %r15290, %r15330, %r15362, 0xD2; + lop3.b32 %r30179, %r15294, %r15334, %r15366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30178, %r30179}; + // begin inline asm + // chi + lop3.b32 %r30172, %r15330, %r15362, 
%r15354, 0xD2; + lop3.b32 %r30173, %r15334, %r15366, %r15358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30172, %r30173}; + // begin inline asm + // chi + lop3.b32 %r30166, %r15362, %r15354, %r15274, 0xD2; + lop3.b32 %r30167, %r15366, %r15358, %r15278, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30166, %r30167}; + // begin inline asm + // chi + lop3.b32 %r30158, %r15354, %r15274, %r15290, 0xD2; + lop3.b32 %r30159, %r15358, %r15278, %r15294, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30158, %r30159}; + // begin inline asm + // chi + lop3.b32 %r30150, %r15274, %r15290, %r15330, 0xD2; + lop3.b32 %r30151, %r15278, %r15294, %r15334, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30150, %r30151}; + // begin inline asm + // chi + lop3.b32 %r30176, %r15242, %r15314, %r15226, 0xD2; + lop3.b32 %r30177, %r15246, %r15318, %r15230, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30176, %r30177}; + // begin inline asm + // chi + lop3.b32 %r30170, %r15314, %r15226, %r15282, 0xD2; + lop3.b32 %r30171, %r15318, %r15230, %r15286, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30170, %r30171}; + // begin inline asm + // chi + lop3.b32 %r30164, %r15226, %r15282, %r15306, 0xD2; + lop3.b32 %r30165, %r15230, %r15286, %r15310, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30164, %r30165}; + // begin inline asm + // chi + lop3.b32 %r30156, %r15282, %r15306, %r15242, 0xD2; + lop3.b32 %r30157, %r15286, %r15310, %r15246, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30156, %r30157}; + // begin inline asm + // chi + lop3.b32 %r30148, %r15306, %r15242, %r15314, 0xD2; + lop3.b32 %r30149, %r15310, %r15246, %r15318, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30148, %r30149}; + mul.wide.s32 %rd774, %r30198, 8; + add.s64 %rd773, %rd706, %rd774; + // begin inline asm + ld.global.nc.v2.u32 {%r15594,%r15595}, [%rd773]; + // end inline asm + xor.b32 %r30184, %r15394, %r15594; + xor.b32 %r30185, %r15395, %r15595; + add.s32 %r30198, %r30198, 1; + setp.lt.u32 %p33, %r30198, 23; + @%p33 bra $L__BB2_54; + + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + // begin inline asm + // xor5 + lop3.b32 %r15606, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15606, %r15606, %r30178, %r30176, 0x96; + lop3.b32 %r15607, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15607, %r15607, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15618, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15618, %r15618, %r30172, %r30170, 0x96; + lop3.b32 %r15619, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15619, %r15619, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15630, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15630, %r15630, %r30166, %r30164, 0x96; + lop3.b32 %r15631, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15631, %r15631, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15642, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15642, %r15642, %r30158, %r30156, 0x96; + lop3.b32 %r15643, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15643, %r15643, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + 
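+ // lop3.b32 with LUT 0x96 is a three-input XOR, so each xor5 block folds five lanes into one column parity C[x] (Keccak theta).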
lop3.b32 %r15654, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15654, %r15654, %r30150, %r30148, 0x96; + lop3.b32 %r15655, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15655, %r15655, %r30151, %r30149, 0x96; + // end inline asm + mov.u32 %r15858, 1; + // begin inline asm + shf.l.wrap.b32 %r15666, %r15619, %r15618, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15670, %r15618, %r15619, %r15858; + // end inline asm + xor.b32 %r15885, %r15666, %r15654; + xor.b32 %r15886, %r15670, %r15655; + xor.b32 %r15813, %r30184, %r15885; + xor.b32 %r15816, %r30185, %r15886; + xor.b32 %r15776, %r30181, %r15886; + xor.b32 %r15775, %r30180, %r15885; + st.local.v2.u32 [%rd3+104], {%r15775, %r15776}; + // begin inline asm + shf.l.wrap.b32 %r15674, %r15631, %r15630, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15678, %r15630, %r15631, %r15858; + // end inline asm + xor.b32 %r15887, %r15674, %r15606; + xor.b32 %r15888, %r15678, %r15607; + xor.b32 %r15712, %r30194, %r15887; + xor.b32 %r15711, %r30195, %r15888; + xor.b32 %r15751, %r30173, %r15888; + xor.b32 %r15752, %r30172, %r15887; + st.local.v2.u32 [%rd3+152], {%r15752, %r15751}; + // begin inline asm + shf.l.wrap.b32 %r15682, %r15643, %r15642, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15686, %r15642, %r15643, %r15858; + // end inline asm + xor.b32 %r15889, %r15682, %r15618; + xor.b32 %r15890, %r15686, %r15619; + xor.b32 %r15735, %r30169, %r15890; + xor.b32 %r15736, %r30168, %r15889; + st.local.v2.u32 [%rd3+120], {%r15736, %r15735}; + xor.b32 %r15727, %r30165, %r15890; + xor.b32 %r15728, %r30164, %r15889; + st.local.v2.u32 [%rd3+200], {%r15728, %r15727}; + // begin inline asm + shf.l.wrap.b32 %r15690, %r15655, %r15654, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15694, %r15654, %r15655, %r15858; + // end inline asm + xor.b32 %r15891, %r15690, %r15630; + xor.b32 %r15892, %r15694, %r15631; + xor.b32 %r15759, %r30188, %r15891; + xor.b32 %r15760, %r30189, %r15892; + xor.b32 %r15768, %r30159, %r15892; + xor.b32 %r15767, %r30158, %r15891; + st.local.v2.u32 [%rd3+168], {%r15767, %r15768}; + // begin inline asm + shf.l.wrap.b32 %r15698, %r15607, %r15606, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15702, %r15606, %r15607, %r15858; + // end inline asm + xor.b32 %r15893, %r15698, %r15642; + xor.b32 %r15894, %r15702, %r15643; + xor.b32 %r15719, %r30154, %r15893; + xor.b32 %r15720, %r30155, %r15894; + xor.b32 %r15744, %r30149, %r15894; + xor.b32 %r15743, %r30148, %r15893; + st.local.v2.u32 [%rd3+216], {%r15743, %r15744}; + // begin inline asm + shf.l.wrap.b32 %r15706, %r15712, %r15711, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15710, %r15711, %r15712, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15714, %r15720, %r15719, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15718, %r15719, %r15720, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15726, %r15727, %r15728, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15722, %r15728, %r15727, %r15225; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r15722, %r15726}; + // begin inline asm + shf.l.wrap.b32 %r15730, %r15736, %r15735, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15734, %r15735, %r15736, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15738, %r15744, %r15743, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r15742, %r15743, %r15744, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15750, %r15751, %r15752, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15746, %r15752, %r15751, %r15329; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r15746, %r15750}; + // begin inline asm + shf.l.wrap.b32 %r15754, %r15760, %r15759, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15758, %r15759, %r15760, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15762, %r15768, %r15767, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15766, %r15767, %r15768, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15770, %r15776, %r15775, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15774, %r15775, %r15776, %r15385; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15778, %r15813, %r15706, %r15730, 0xD2; + lop3.b32 %r15779, %r15816, %r15710, %r15734, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15786, %r15706, %r15730, %r15762, 0xD2; + lop3.b32 %r15787, %r15710, %r15734, %r15766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r15786, %r15787}; + // begin inline asm + // chi + lop3.b32 %r15794, %r15730, %r15762, %r15738, 0xD2; + lop3.b32 %r15795, %r15734, %r15766, %r15742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r15794, %r15795}; + // begin inline asm + // chi + lop3.b32 %r15802, %r15762, %r15738, %r15813, 0xD2; + lop3.b32 %r15803, %r15766, %r15742, %r15816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r15802, %r15803}; + // begin inline asm + // chi + lop3.b32 %r15810, %r15738, %r15813, %r15706, 0xD2; + lop3.b32 %r15811, %r15742, %r15816, %r15710, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r15810, %r15811}; + // begin inline asm + // chi + lop3.b32 %r15818, %r15754, %r15714, %r15770, 0xD2; + lop3.b32 %r15819, %r15758, %r15718, %r15774, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r15818, %r15819}; + // begin inline asm + // chi + lop3.b32 %r15826, %r15714, %r15770, %r15746, 0xD2; + lop3.b32 %r15827, %r15718, %r15774, %r15750, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r15826, %r15827}; + // begin inline asm + // chi + lop3.b32 %r15834, %r15770, %r15746, %r15722, 0xD2; + lop3.b32 %r15835, %r15774, %r15750, %r15726, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r15834, %r15835}; + // begin inline asm + ld.global.nc.v2.u32 {%r15842,%r15843}, [%rd707]; + // end inline asm + xor.b32 %r15895, %r15779, %r15843; + xor.b32 %r15896, %r15778, %r15842; + mov.b64 %rd1265, {%r15896, %r15895}; + mov.b64 %rd1266, {%r15786, %r15787}; + mov.b64 %rd1267, {%r15794, %r15795}; + mov.b64 %rd156, {%r15802, %r15803}; + mov.b64 %rd1268, {%r15810, %r15811}; + mov.b64 %rd158, {%r15818, %r15819}; + mov.b64 %rd159, {%r15826, %r15827}; + mov.b64 %rd160, {%r15834, %r15835}; + mov.u32 %r30199, 0; + st.local.v2.u32 [%rd3+24], {%r15896, %r15895}; + st.local.v2.u32 [%rd149+96], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+104], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+112], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+120], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+128], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+136], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+144], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+152], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+160], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+168], {%r30199, %r30199}; + st.local.v2.u32 
[%rd149+176], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+184], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+192], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+200], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+208], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+216], {%r30199, %r30199}; + mov.u32 %r30214, -2147483648; + st.local.v2.u32 [%rd149+88], {%r15858, %r30214}; + mov.u32 %r30200, %r30199; + mov.u32 %r30201, %r30199; + mov.u32 %r30202, %r30199; + mov.u32 %r30203, %r30199; + mov.u32 %r30204, %r30199; + mov.u32 %r30205, %r30199; + mov.u32 %r30206, %r30199; + mov.u32 %r30207, %r30199; + mov.u32 %r30208, %r30199; + mov.u32 %r30209, %r30199; + mov.u32 %r30210, %r30199; + mov.u32 %r30211, %r30199; + mov.u32 %r30212, %r30199; + mov.u32 %r30213, %r15858; + mov.u32 %r30215, %r30199; + mov.u32 %r30216, %r30199; + mov.u32 %r30217, %r30199; + mov.u32 %r30218, %r30199; + mov.u32 %r30219, %r30199; + mov.u32 %r30220, %r30199; + mov.u32 %r30221, %r30199; + mov.u32 %r30222, %r30199; + mov.u32 %r30223, %r30199; + mov.u32 %r30224, %r30199; + mov.u32 %r30225, %r30199; + mov.u32 %r30226, %r30199; + mov.u32 %r30227, %r30199; + mov.u32 %r30228, %r30199; + mov.u32 %r30229, %r30199; + mov.u32 %r30230, %r30199; + mov.u32 %r30231, %r30199; + mov.u32 %r30232, %r30199; + mov.u32 %r30249, %r30199; + +$L__BB2_56: + // begin inline asm + // xor5 + lop3.b32 %r15897, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r15897, %r15897, %r30229, %r30227, 0x96; + lop3.b32 %r15898, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r15898, %r15898, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15909, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r15909, %r15909, %r30223, %r30221, 0x96; + lop3.b32 %r15910, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r15910, %r15910, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15921, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r15921, %r15921, %r30217, %r30215, 0x96; + lop3.b32 %r15922, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r15922, %r15922, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15933, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r15933, %r15933, %r30209, %r30207, 0x96; + lop3.b32 %r15934, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r15934, %r15934, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15945, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r15945, %r15945, %r30201, %r30199, 0x96; + lop3.b32 %r15946, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r15946, %r15946, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15957, %r15910, %r15909, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15961, %r15909, %r15910, %r15858; + // end inline asm + xor.b32 %r16391, %r15957, %r15945; + xor.b32 %r16392, %r15961, %r15946; + xor.b32 %r16224, %r30235, %r16391; + xor.b32 %r16227, %r30236, %r16392; + xor.b32 %r16131, %r30233, %r16391; + xor.b32 %r16130, %r30234, %r16392; + xor.b32 %r16178, %r30231, %r16391; + xor.b32 %r16179, %r30232, %r16392; + xor.b32 %r16083, %r30229, %r16391; + xor.b32 %r16082, %r30230, %r16392; + xor.b32 %r16034, %r30227, %r16391; + xor.b32 %r16035, %r30228, %r16392; + // begin inline asm + shf.l.wrap.b32 %r15965, %r15922, %r15921, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15969, %r15921, %r15922, %r15858; + // end inline asm + xor.b32 %r16393, %r15965, %r15897; + xor.b32 %r16394, %r15969, %r15898; + xor.b32 %r16186, 
%r30247, %r16393; + xor.b32 %r16187, %r30248, %r16394; + xor.b32 %r16003, %r30245, %r16393; + xor.b32 %r16002, %r30246, %r16394; + xor.b32 %r16162, %r30225, %r16393; + xor.b32 %r16163, %r30226, %r16394; + xor.b32 %r16123, %r30223, %r16393; + xor.b32 %r16122, %r30224, %r16394; + xor.b32 %r16106, %r30221, %r16393; + xor.b32 %r16107, %r30222, %r16394; + // begin inline asm + shf.l.wrap.b32 %r15973, %r15934, %r15933, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15977, %r15933, %r15934, %r15858; + // end inline asm + xor.b32 %r16395, %r15973, %r15909; + xor.b32 %r16396, %r15977, %r15910; + xor.b32 %r16043, %r30243, %r16395; + xor.b32 %r16042, %r30244, %r16396; + xor.b32 %r16170, %r30241, %r16395; + xor.b32 %r16171, %r30242, %r16396; + xor.b32 %r16051, %r30219, %r16395; + xor.b32 %r16050, %r30220, %r16396; + xor.b32 %r16154, %r30217, %r16395; + xor.b32 %r16155, %r30218, %r16396; + xor.b32 %r16019, %r30215, %r16395; + xor.b32 %r16018, %r30216, %r16396; + // begin inline asm + shf.l.wrap.b32 %r15981, %r15946, %r15945, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15985, %r15945, %r15946, %r15858; + // end inline asm + xor.b32 %r16397, %r15981, %r15921; + xor.b32 %r16398, %r15985, %r15922; + xor.b32 %r16138, %r30239, %r16397; + xor.b32 %r16139, %r30240, %r16398; + xor.b32 %r16115, %r30213, %r16397; + xor.b32 %r16114, %r30214, %r16398; + xor.b32 %r16058, %r30211, %r16397; + xor.b32 %r16059, %r30212, %r16398; + xor.b32 %r16146, %r30209, %r16397; + xor.b32 %r16147, %r30210, %r16398; + xor.b32 %r16075, %r30207, %r16397; + xor.b32 %r16074, %r30208, %r16398; + // begin inline asm + shf.l.wrap.b32 %r15989, %r15898, %r15897, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15993, %r15897, %r15898, %r15858; + // end inline asm + xor.b32 %r16399, %r15989, %r15933; + xor.b32 %r16400, %r15993, %r15934; + xor.b32 %r16090, %r30237, %r16399; + xor.b32 %r16091, %r30238, %r16400; + xor.b32 %r16010, %r30205, %r16399; + xor.b32 %r16011, %r30206, %r16400; + xor.b32 %r16027, %r30203, %r16399; + xor.b32 %r16026, %r30204, %r16400; + xor.b32 %r16066, %r30201, %r16399; + xor.b32 %r16067, %r30202, %r16400; + xor.b32 %r16098, %r30199, %r16399; + xor.b32 %r16099, %r30200, %r16400; + mov.u32 %r16004, 44; + // begin inline asm + shf.l.wrap.b32 %r15997, %r16003, %r16002, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16001, %r16002, %r16003, %r16004; + // end inline asm + mov.u32 %r16012, 20; + // begin inline asm + shf.l.wrap.b32 %r16005, %r16011, %r16010, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16009, %r16010, %r16011, %r16012; + // end inline asm + mov.u32 %r16020, 61; + // begin inline asm + shf.l.wrap.b32 %r16013, %r16019, %r16018, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16017, %r16018, %r16019, %r16020; + // end inline asm + mov.u32 %r16028, 39; + // begin inline asm + shf.l.wrap.b32 %r16021, %r16027, %r16026, %r16028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16025, %r16026, %r16027, %r16028; + // end inline asm + mov.u32 %r16036, 18; + // begin inline asm + shf.l.wrap.b32 %r16029, %r16035, %r16034, %r16036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16033, %r16034, %r16035, %r16036; + // end inline asm + mov.u32 %r16044, 62; + // begin inline asm + shf.l.wrap.b32 %r16037, %r16043, %r16042, %r16044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16041, %r16042, %r16043, %r16044; + // end inline asm + mov.u32 %r16052, 43; 
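+ // This $L__BB2_56 loop repeats the same round function on the second state buffer at [%rd149]; the kernel appears to carry two Keccak sponge states side by side.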
+ // begin inline asm + shf.l.wrap.b32 %r16045, %r16051, %r16050, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16049, %r16050, %r16051, %r16052; + // end inline asm + mov.u32 %r16060, 25; + // begin inline asm + shf.l.wrap.b32 %r16053, %r16059, %r16058, %r16060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16057, %r16058, %r16059, %r16060; + // end inline asm + mov.u32 %r16068, 8; + // begin inline asm + shf.l.wrap.b32 %r16061, %r16067, %r16066, %r16068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16065, %r16066, %r16067, %r16068; + // end inline asm + mov.u32 %r16076, 56; + // begin inline asm + shf.l.wrap.b32 %r16069, %r16075, %r16074, %r16076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16073, %r16074, %r16075, %r16076; + // end inline asm + mov.u32 %r16084, 41; + // begin inline asm + shf.l.wrap.b32 %r16077, %r16083, %r16082, %r16084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16081, %r16082, %r16083, %r16084; + // end inline asm + mov.u32 %r16092, 27; + // begin inline asm + shf.l.wrap.b32 %r16085, %r16091, %r16090, %r16092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16089, %r16090, %r16091, %r16092; + // end inline asm + mov.u32 %r16100, 14; + // begin inline asm + shf.l.wrap.b32 %r16093, %r16099, %r16098, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16097, %r16098, %r16099, %r16100; + // end inline asm + mov.u32 %r16108, 2; + // begin inline asm + shf.l.wrap.b32 %r16101, %r16107, %r16106, %r16108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16105, %r16106, %r16107, %r16108; + // end inline asm + mov.u32 %r16116, 55; + // begin inline asm + shf.l.wrap.b32 %r16109, %r16115, %r16114, %r16116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16113, %r16114, %r16115, %r16116; + // end inline asm + mov.u32 %r16124, 45; + // begin inline asm + shf.l.wrap.b32 %r16117, %r16123, %r16122, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16121, %r16122, %r16123, %r16124; + // end inline asm + mov.u32 %r16132, 36; + // begin inline asm + shf.l.wrap.b32 %r16125, %r16131, %r16130, %r16132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16129, %r16130, %r16131, %r16132; + // end inline asm + mov.u32 %r16140, 28; + // begin inline asm + shf.l.wrap.b32 %r16133, %r16139, %r16138, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16137, %r16138, %r16139, %r16140; + // end inline asm + mov.u32 %r16148, 21; + // begin inline asm + shf.l.wrap.b32 %r16141, %r16147, %r16146, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16145, %r16146, %r16147, %r16148; + // end inline asm + mov.u32 %r16156, 15; + // begin inline asm + shf.l.wrap.b32 %r16149, %r16155, %r16154, %r16156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16153, %r16154, %r16155, %r16156; + // end inline asm + mov.u32 %r16164, 10; + // begin inline asm + shf.l.wrap.b32 %r16157, %r16163, %r16162, %r16164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16161, %r16162, %r16163, %r16164; + // end inline asm + mov.u32 %r16172, 6; + // begin inline asm + shf.l.wrap.b32 %r16165, %r16171, %r16170, %r16172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16169, %r16170, %r16171, %r16172; + // end inline asm + mov.u32 %r16180, 3; + // begin inline asm + shf.l.wrap.b32 %r16173, %r16179, %r16178, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16177, 
%r16178, %r16179, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16181, %r16187, %r16186, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16185, %r16186, %r16187, %r15858; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16189, %r16224, %r15997, %r16045, 0xD2; + lop3.b32 %r16190, %r16227, %r16001, %r16049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r15997, %r16045, %r16141, 0xD2; + lop3.b32 %r30248, %r16001, %r16049, %r16145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30243, %r16045, %r16141, %r16093, 0xD2; + lop3.b32 %r30244, %r16049, %r16145, %r16097, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30239, %r16141, %r16093, %r16224, 0xD2; + lop3.b32 %r30240, %r16145, %r16097, %r16227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30237, %r16093, %r16224, %r15997, 0xD2; + lop3.b32 %r30238, %r16097, %r16227, %r16001, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30233, %r16133, %r16005, %r16173, 0xD2; + lop3.b32 %r30234, %r16137, %r16009, %r16177, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30245, %r16005, %r16173, %r16117, 0xD2; + lop3.b32 %r30246, %r16009, %r16177, %r16121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30241, %r16173, %r16117, %r16013, 0xD2; + lop3.b32 %r30242, %r16177, %r16121, %r16017, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30213, %r16117, %r16013, %r16133, 0xD2; + lop3.b32 %r30214, %r16121, %r16017, %r16137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30213, %r30214}; + // begin inline asm + // chi + lop3.b32 %r30205, %r16013, %r16133, %r16005, 0xD2; + lop3.b32 %r30206, %r16017, %r16137, %r16009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30205, %r30206}; + // begin inline asm + // chi + lop3.b32 %r30231, %r16181, %r16165, %r16053, 0xD2; + lop3.b32 %r30232, %r16185, %r16169, %r16057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30231, %r30232}; + // begin inline asm + // chi + lop3.b32 %r30225, %r16165, %r16053, %r16061, 0xD2; + lop3.b32 %r30226, %r16169, %r16057, %r16065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30225, %r30226}; + // begin inline asm + // chi + lop3.b32 %r30219, %r16053, %r16061, %r16029, 0xD2; + lop3.b32 %r30220, %r16057, %r16065, %r16033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30219, %r30220}; + // begin inline asm + // chi + lop3.b32 %r30211, %r16061, %r16029, %r16181, 0xD2; + lop3.b32 %r30212, %r16065, %r16033, %r16185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+128], {%r30211, %r30212}; + // begin inline asm + // chi + lop3.b32 %r30203, %r16029, %r16181, %r16165, 0xD2; + lop3.b32 %r30204, %r16033, %r16185, %r16169, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30203, %r30204}; + // begin inline asm + // chi + lop3.b32 %r30229, %r16085, %r16125, %r16157, 0xD2; + lop3.b32 %r30230, %r16089, %r16129, %r16161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30229, %r30230}; + // begin inline asm + // chi + lop3.b32 %r30223, %r16125, %r16157, %r16149, 0xD2; + lop3.b32 %r30224, %r16129, %r16161, %r16153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30223, %r30224}; + // begin inline asm + // chi + lop3.b32 %r30217, %r16157, %r16149, %r16069, 0xD2; + lop3.b32 %r30218, %r16161, %r16153, %r16073, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd149+160], {%r30217, %r30218}; + // begin inline asm + // chi + lop3.b32 %r30209, %r16149, %r16069, %r16085, 0xD2; + lop3.b32 %r30210, %r16153, %r16073, %r16089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30209, %r30210}; + // begin inline asm + // chi + lop3.b32 %r30201, %r16069, %r16085, %r16125, 0xD2; + lop3.b32 %r30202, %r16073, %r16089, %r16129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30201, %r30202}; + // begin inline asm + // chi + lop3.b32 %r30227, %r16037, %r16109, %r16021, 0xD2; + lop3.b32 %r30228, %r16041, %r16113, %r16025, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30227, %r30228}; + // begin inline asm + // chi + lop3.b32 %r30221, %r16109, %r16021, %r16077, 0xD2; + lop3.b32 %r30222, %r16113, %r16025, %r16081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30221, %r30222}; + // begin inline asm + // chi + lop3.b32 %r30215, %r16021, %r16077, %r16101, 0xD2; + lop3.b32 %r30216, %r16025, %r16081, %r16105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30215, %r30216}; + // begin inline asm + // chi + lop3.b32 %r30207, %r16077, %r16101, %r16037, 0xD2; + lop3.b32 %r30208, %r16081, %r16105, %r16041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30207, %r30208}; + // begin inline asm + // chi + lop3.b32 %r30199, %r16101, %r16037, %r16109, 0xD2; + lop3.b32 %r30200, %r16105, %r16041, %r16113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30199, %r30200}; + mul.wide.s32 %rd781, %r30249, 8; + add.s64 %rd780, %rd706, %rd781; + // begin inline asm + ld.global.nc.v2.u32 {%r16389,%r16390}, [%rd780]; + // end inline asm + xor.b32 %r30235, %r16189, %r16389; + xor.b32 %r30236, %r16190, %r16390; + add.s32 %r30249, %r30249, 1; + setp.lt.u32 %p34, %r30249, 23; + @%p34 bra $L__BB2_56; + + mov.u32 %r16500, 1; + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + // begin inline asm + // xor5 + lop3.b32 %r16401, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r16401, %r16401, %r30229, %r30227, 0x96; + lop3.b32 %r16402, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r16402, %r16402, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16413, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r16413, %r16413, %r30223, %r30221, 0x96; + lop3.b32 %r16414, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r16414, %r16414, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16425, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r16425, %r16425, %r30217, %r30215, 0x96; + lop3.b32 %r16426, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r16426, %r16426, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16437, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r16437, %r16437, %r30209, %r30207, 0x96; + lop3.b32 %r16438, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r16438, %r16438, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16449, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r16449, %r16449, %r30201, %r30199, 0x96; + lop3.b32 %r16450, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r16450, %r16450, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + 
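+ // theta D-values: D[x] = C[x-1] ^ rotl64(C[x+1], 1); the rotate-by-1 is the funnel-shift pair below (%r16500 = 1).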
shf.l.wrap.b32 %r16461, %r16414, %r16413, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16465, %r16413, %r16414, %r16500; + // end inline asm + xor.b32 %r16639, %r16461, %r16449; + xor.b32 %r16640, %r16465, %r16450; + xor.b32 %r16608, %r30235, %r16639; + xor.b32 %r16611, %r30236, %r16640; + xor.b32 %r16571, %r30232, %r16640; + xor.b32 %r16570, %r30231, %r16639; + st.local.v2.u32 [%rd149+104], {%r16570, %r16571}; + // begin inline asm + shf.l.wrap.b32 %r16469, %r16426, %r16425, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16473, %r16425, %r16426, %r16500; + // end inline asm + xor.b32 %r16641, %r16469, %r16401; + xor.b32 %r16642, %r16473, %r16402; + xor.b32 %r16507, %r30245, %r16641; + xor.b32 %r16506, %r30246, %r16642; + xor.b32 %r16546, %r30224, %r16642; + xor.b32 %r16547, %r30223, %r16641; + st.local.v2.u32 [%rd149+152], {%r16547, %r16546}; + // begin inline asm + shf.l.wrap.b32 %r16477, %r16438, %r16437, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16481, %r16437, %r16438, %r16500; + // end inline asm + xor.b32 %r16643, %r16477, %r16413; + xor.b32 %r16644, %r16481, %r16414; + xor.b32 %r16530, %r30220, %r16644; + xor.b32 %r16531, %r30219, %r16643; + st.local.v2.u32 [%rd149+120], {%r16531, %r16530}; + xor.b32 %r16522, %r30216, %r16644; + xor.b32 %r16523, %r30215, %r16643; + st.local.v2.u32 [%rd149+200], {%r16523, %r16522}; + // begin inline asm + shf.l.wrap.b32 %r16485, %r16450, %r16449, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16489, %r16449, %r16450, %r16500; + // end inline asm + xor.b32 %r16645, %r16485, %r16425; + xor.b32 %r16646, %r16489, %r16426; + xor.b32 %r16554, %r30239, %r16645; + xor.b32 %r16555, %r30240, %r16646; + xor.b32 %r16563, %r30210, %r16646; + xor.b32 %r16562, %r30209, %r16645; + st.local.v2.u32 [%rd149+168], {%r16562, %r16563}; + // begin inline asm + shf.l.wrap.b32 %r16493, %r16402, %r16401, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16497, %r16401, %r16402, %r16500; + // end inline asm + xor.b32 %r16647, %r16493, %r16437; + xor.b32 %r16648, %r16497, %r16438; + xor.b32 %r16514, %r30205, %r16647; + xor.b32 %r16515, %r30206, %r16648; + xor.b32 %r16539, %r30200, %r16648; + xor.b32 %r16538, %r30199, %r16647; + st.local.v2.u32 [%rd149+216], {%r16538, %r16539}; + // begin inline asm + shf.l.wrap.b32 %r16501, %r16507, %r16506, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16505, %r16506, %r16507, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16509, %r16515, %r16514, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16513, %r16514, %r16515, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16521, %r16522, %r16523, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16517, %r16523, %r16522, %r16020; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r16517, %r16521}; + // begin inline asm + shf.l.wrap.b32 %r16525, %r16531, %r16530, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16529, %r16530, %r16531, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16533, %r16539, %r16538, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16537, %r16538, %r16539, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16545, %r16546, %r16547, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16541, %r16547, %r16546, %r16124; + // end inline asm + 
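+ // Final unrolled round for this state (the loop ran 23 iterations; this is round 24 of Keccak-f[1600]): chi is the lop3 LUT 0xD2 pattern, a ^ (~b & c), and iota is the closing XOR of lane 0 with the round constant loaded from [%rd707].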
st.local.v2.u32 [%rd149+88], {%r16541, %r16545}; + // begin inline asm + shf.l.wrap.b32 %r16549, %r16555, %r16554, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16553, %r16554, %r16555, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16557, %r16563, %r16562, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16561, %r16562, %r16563, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16565, %r16571, %r16570, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16569, %r16570, %r16571, %r16180; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16573, %r16608, %r16501, %r16525, 0xD2; + lop3.b32 %r16574, %r16611, %r16505, %r16529, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16581, %r16501, %r16525, %r16557, 0xD2; + lop3.b32 %r16582, %r16505, %r16529, %r16561, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r16581, %r16582}; + // begin inline asm + // chi + lop3.b32 %r16589, %r16525, %r16557, %r16533, 0xD2; + lop3.b32 %r16590, %r16529, %r16561, %r16537, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r16589, %r16590}; + // begin inline asm + // chi + lop3.b32 %r16597, %r16557, %r16533, %r16608, 0xD2; + lop3.b32 %r16598, %r16561, %r16537, %r16611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r16597, %r16598}; + // begin inline asm + // chi + lop3.b32 %r16605, %r16533, %r16608, %r16501, 0xD2; + lop3.b32 %r16606, %r16537, %r16611, %r16505, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r16605, %r16606}; + // begin inline asm + // chi + lop3.b32 %r16613, %r16549, %r16509, %r16565, 0xD2; + lop3.b32 %r16614, %r16553, %r16513, %r16569, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r16613, %r16614}; + // begin inline asm + // chi + lop3.b32 %r16621, %r16509, %r16565, %r16541, 0xD2; + lop3.b32 %r16622, %r16513, %r16569, %r16545, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r16621, %r16622}; + // begin inline asm + // chi + lop3.b32 %r16629, %r16565, %r16541, %r16517, 0xD2; + lop3.b32 %r16630, %r16569, %r16545, %r16521, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r16629, %r16630}; + // begin inline asm + ld.global.nc.v2.u32 {%r16637,%r16638}, [%rd707]; + // end inline asm + xor.b32 %r16649, %r16574, %r16638; + xor.b32 %r16650, %r16573, %r16637; + st.local.v2.u32 [%rd149+24], {%r16650, %r16649}; + st.global.u64 [%rd130], %rd1265; + st.global.u64 [%rd130+8], %rd1266; + st.global.u64 [%rd130+16], %rd1267; + st.global.u64 [%rd130+24], %rd156; + st.global.u64 [%rd130+32], %rd1268; + st.global.u64 [%rd130+40], %rd158; + st.global.u64 [%rd130+48], %rd159; + st.global.u64 [%rd130+56], %rd160; + st.global.v2.u32 [%rd130+64], {%r16650, %r16649}; + st.global.v2.u32 [%rd130+72], {%r16581, %r16582}; + st.global.v2.u32 [%rd130+80], {%r16589, %r16590}; + st.global.v2.u32 [%rd130+88], {%r16597, %r16598}; + st.global.v2.u32 [%rd130+96], {%r16605, %r16606}; + st.global.v2.u32 [%rd130+104], {%r16613, %r16614}; + st.global.v2.u32 [%rd130+112], {%r16621, %r16622}; + st.global.v2.u32 [%rd130+120], {%r16629, %r16630}; + +$L__BB2_69: + shl.b32 %r3326, %r29, 1; + mul.wide.u32 %rd887, %r3326, -954391867; + shr.u64 %rd888, %rd887, 32; + cvt.u32.u64 %r19935, %rd888; + sub.s32 %r19936, %r3326, %r19935; + shr.u32 %r19937, %r19936, 1; + add.s32 %r19938, %r19937, %r19935; + shr.u32 %r19939, %r19938, 20; + mul.lo.s32 %r19940, %r19939, 1179641; + sub.s32 %r19941, %r3326, %r19940; + mul.wide.u32 %rd890, %r19941, 
64; + add.s64 %rd222, %rd471, %rd890; + or.b32 %r3327, %r3326, 1; + mul.wide.u32 %rd891, %r3327, -954391867; + shr.u64 %rd892, %rd891, 32; + cvt.u32.u64 %r19942, %rd892; + sub.s32 %r19943, %r3327, %r19942; + shr.u32 %r19944, %r19943, 1; + add.s32 %r19945, %r19944, %r19942; + shr.u32 %r19946, %r19945, 20; + mul.lo.s32 %r19947, %r19946, 1179641; + sub.s32 %r19948, %r3327, %r19947; + mul.wide.u32 %rd893, %r19948, 64; + add.s64 %rd223, %rd471, %rd893; + @%p16 bra $L__BB2_83; + + cvta.to.global.u64 %rd894, %rd353; + mul.wide.u32 %rd895, %r29, 128; + add.s64 %rd224, %rd894, %rd895; + ld.global.u64 %rd1269, [%rd224]; + setp.eq.s64 %p41, %rd1269, 0; + @%p41 bra $L__BB2_72; + + ld.global.u64 %rd1272, [%rd224+32]; + ld.global.u64 %rd1271, [%rd224+16]; + ld.global.u64 %rd1270, [%rd224+8]; + bra.uni $L__BB2_94; + +$L__BB2_83: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd1011, 1179641; + st.local.u64 [%rd3+8], %rd1011; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd1012, [%rd222]; + ld.global.u64 %rd1013, [%rd222+8]; + ld.global.u64 %rd1014, [%rd222+16]; + ld.global.u64 %rd1015, [%rd222+24]; + ld.global.u64 %rd1016, [%rd222+32]; + ld.global.u64 %rd1017, [%rd222+40]; + ld.global.u64 %rd1018, [%rd222+48]; + ld.global.u64 %rd1019, [%rd222+56]; + st.local.u64 [%rd3+24], %rd1012; + st.local.u64 [%rd3+32], %rd1013; + st.local.u64 [%rd3+40], %rd1014; + st.local.u64 [%rd3+48], %rd1015; + st.local.u64 [%rd3+56], %rd1016; + st.local.u64 [%rd3+64], %rd1017; + st.local.u64 [%rd3+72], %rd1018; + st.local.u64 [%rd3+80], %rd1019; + cvt.u32.u64 %r23275, %rd1012; + xor.b32 %r23276, %r3326, %r23275; + st.local.u32 [%rd3+24], %r23276; + mov.u32 %r30724, 0; + st.local.v2.u32 [%rd3+96], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+104], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+112], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+120], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+128], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+136], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+144], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+152], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+160], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+168], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+176], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+184], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+192], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+200], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+208], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+216], {%r30724, %r30724}; + mov.u32 %r30739, -2147483648; + mov.u32 %r23248, 1; + st.local.v2.u32 [%rd3+88], {%r23248, %r30739}; + ld.local.v2.u32 {%r30760, %r30761}, [%rd3+24]; + mov.b64 {%r30758, %r30759}, %rd1017; + shr.u64 %rd1020, %rd1013, 32; + cvt.u32.u64 %r30772, %rd1013; + cvt.u32.u64 %r30773, %rd1020; + shr.u64 %rd1021, %rd1018, 32; + cvt.u32.u64 %r30770, %rd1018; + cvt.u32.u64 %r30771, %rd1021; + shr.u64 %rd1022, %rd1014, 32; + cvt.u32.u64 %r30768, %rd1014; + cvt.u32.u64 %r30769, %rd1022; + shr.u64 %rd1023, %rd1019, 32; + cvt.u32.u64 %r30766, %rd1019; + cvt.u32.u64 %r30767, %rd1023; + shr.u64 %rd1024, %rd1015, 32; + cvt.u32.u64 %r30764, %rd1015; + cvt.u32.u64 %r30765, %rd1024; + shr.u64 %rd1025, %rd1016, 32; + cvt.u32.u64 %r30762, %rd1016; + cvt.u32.u64 %r30763, %rd1025; + mov.u32 %r30725, %r30724; + mov.u32 %r30726, %r30724; + mov.u32 %r30727, %r30724; + mov.u32 %r30728, %r30724; + mov.u32 %r30729, %r30724; + mov.u32 %r30730, %r30724; + mov.u32 %r30731, %r30724; + mov.u32 %r30732, %r30724; + mov.u32 %r30733, %r30724; + mov.u32 %r30734, %r30724; + mov.u32 %r30735, %r30724; + mov.u32 %r30736, 
%r30724; + mov.u32 %r30737, %r30724; + mov.u32 %r30738, %r23248; + mov.u32 %r30740, %r30724; + mov.u32 %r30741, %r30724; + mov.u32 %r30742, %r30724; + mov.u32 %r30743, %r30724; + mov.u32 %r30744, %r30724; + mov.u32 %r30745, %r30724; + mov.u32 %r30746, %r30724; + mov.u32 %r30747, %r30724; + mov.u32 %r30748, %r30724; + mov.u32 %r30749, %r30724; + mov.u32 %r30750, %r30724; + mov.u32 %r30751, %r30724; + mov.u32 %r30752, %r30724; + mov.u32 %r30753, %r30724; + mov.u32 %r30754, %r30724; + mov.u32 %r30755, %r30724; + mov.u32 %r30756, %r30724; + mov.u32 %r30757, %r30724; + mov.u32 %r30774, %r30724; + +$L__BB2_84: + // begin inline asm + // xor5 + lop3.b32 %r23279, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23279, %r23279, %r30754, %r30752, 0x96; + lop3.b32 %r23280, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23280, %r23280, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23291, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23291, %r23291, %r30748, %r30746, 0x96; + lop3.b32 %r23292, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23292, %r23292, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23303, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23303, %r23303, %r30742, %r30740, 0x96; + lop3.b32 %r23304, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23304, %r23304, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23315, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23315, %r23315, %r30734, %r30732, 0x96; + lop3.b32 %r23316, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23316, %r23316, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23327, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23327, %r23327, %r30726, %r30724, 0x96; + lop3.b32 %r23328, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23328, %r23328, %r30727, %r30725, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23339, %r23292, %r23291, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23343, %r23291, %r23292, %r23248; + // end inline asm + xor.b32 %r23773, %r23339, %r23327; + xor.b32 %r23774, %r23343, %r23328; + xor.b32 %r23606, %r30760, %r23773; + xor.b32 %r23609, %r30761, %r23774; + xor.b32 %r23513, %r30758, %r23773; + xor.b32 %r23512, %r30759, %r23774; + xor.b32 %r23560, %r30756, %r23773; + xor.b32 %r23561, %r30757, %r23774; + xor.b32 %r23465, %r30754, %r23773; + xor.b32 %r23464, %r30755, %r23774; + xor.b32 %r23416, %r30752, %r23773; + xor.b32 %r23417, %r30753, %r23774; + // begin inline asm + shf.l.wrap.b32 %r23347, %r23304, %r23303, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23351, %r23303, %r23304, %r23248; + // end inline asm + xor.b32 %r23775, %r23347, %r23279; + xor.b32 %r23776, %r23351, %r23280; + xor.b32 %r23568, %r30772, %r23775; + xor.b32 %r23569, %r30773, %r23776; + xor.b32 %r23385, %r30770, %r23775; + xor.b32 %r23384, %r30771, %r23776; + xor.b32 %r23544, %r30750, %r23775; + xor.b32 %r23545, %r30751, %r23776; + xor.b32 %r23505, %r30748, %r23775; + xor.b32 %r23504, %r30749, %r23776; + xor.b32 %r23488, %r30746, %r23775; + xor.b32 %r23489, %r30747, %r23776; + // begin inline asm + shf.l.wrap.b32 %r23355, %r23316, %r23315, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23359, %r23315, %r23316, %r23248; + // end inline asm + xor.b32 %r23777, %r23355, %r23291; + xor.b32 %r23778, %r23359, %r23292; + xor.b32 %r23425, %r30768, %r23777; + xor.b32 %r23424, %r30769, %r23778; + xor.b32 
%r23552, %r30766, %r23777; + xor.b32 %r23553, %r30767, %r23778; + xor.b32 %r23433, %r30744, %r23777; + xor.b32 %r23432, %r30745, %r23778; + xor.b32 %r23536, %r30742, %r23777; + xor.b32 %r23537, %r30743, %r23778; + xor.b32 %r23401, %r30740, %r23777; + xor.b32 %r23400, %r30741, %r23778; + // begin inline asm + shf.l.wrap.b32 %r23363, %r23328, %r23327, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23367, %r23327, %r23328, %r23248; + // end inline asm + xor.b32 %r23779, %r23363, %r23303; + xor.b32 %r23780, %r23367, %r23304; + xor.b32 %r23520, %r30764, %r23779; + xor.b32 %r23521, %r30765, %r23780; + xor.b32 %r23497, %r30738, %r23779; + xor.b32 %r23496, %r30739, %r23780; + xor.b32 %r23440, %r30736, %r23779; + xor.b32 %r23441, %r30737, %r23780; + xor.b32 %r23528, %r30734, %r23779; + xor.b32 %r23529, %r30735, %r23780; + xor.b32 %r23457, %r30732, %r23779; + xor.b32 %r23456, %r30733, %r23780; + // begin inline asm + shf.l.wrap.b32 %r23371, %r23280, %r23279, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23375, %r23279, %r23280, %r23248; + // end inline asm + xor.b32 %r23781, %r23371, %r23315; + xor.b32 %r23782, %r23375, %r23316; + xor.b32 %r23472, %r30762, %r23781; + xor.b32 %r23473, %r30763, %r23782; + xor.b32 %r23392, %r30730, %r23781; + xor.b32 %r23393, %r30731, %r23782; + xor.b32 %r23409, %r30728, %r23781; + xor.b32 %r23408, %r30729, %r23782; + xor.b32 %r23448, %r30726, %r23781; + xor.b32 %r23449, %r30727, %r23782; + xor.b32 %r23480, %r30724, %r23781; + xor.b32 %r23481, %r30725, %r23782; + mov.u32 %r23386, 44; + // begin inline asm + shf.l.wrap.b32 %r23379, %r23385, %r23384, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23383, %r23384, %r23385, %r23386; + // end inline asm + mov.u32 %r23394, 20; + // begin inline asm + shf.l.wrap.b32 %r23387, %r23393, %r23392, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23391, %r23392, %r23393, %r23394; + // end inline asm + mov.u32 %r23402, 61; + // begin inline asm + shf.l.wrap.b32 %r23395, %r23401, %r23400, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23399, %r23400, %r23401, %r23402; + // end inline asm + mov.u32 %r23410, 39; + // begin inline asm + shf.l.wrap.b32 %r23403, %r23409, %r23408, %r23410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23407, %r23408, %r23409, %r23410; + // end inline asm + mov.u32 %r23418, 18; + // begin inline asm + shf.l.wrap.b32 %r23411, %r23417, %r23416, %r23418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23415, %r23416, %r23417, %r23418; + // end inline asm + mov.u32 %r23426, 62; + // begin inline asm + shf.l.wrap.b32 %r23419, %r23425, %r23424, %r23426; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23423, %r23424, %r23425, %r23426; + // end inline asm + mov.u32 %r23434, 43; + // begin inline asm + shf.l.wrap.b32 %r23427, %r23433, %r23432, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23431, %r23432, %r23433, %r23434; + // end inline asm + mov.u32 %r23442, 25; + // begin inline asm + shf.l.wrap.b32 %r23435, %r23441, %r23440, %r23442; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23439, %r23440, %r23441, %r23442; + // end inline asm + mov.u32 %r23450, 8; + // begin inline asm + shf.l.wrap.b32 %r23443, %r23449, %r23448, %r23450; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23447, %r23448, %r23449, %r23450; + // end inline asm + mov.u32 %r23458, 56; + // begin inline asm + shf.l.wrap.b32 %r23451, %r23457, 
%r23456, %r23458; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23455, %r23456, %r23457, %r23458; + // end inline asm + mov.u32 %r23466, 41; + // begin inline asm + shf.l.wrap.b32 %r23459, %r23465, %r23464, %r23466; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23463, %r23464, %r23465, %r23466; + // end inline asm + mov.u32 %r23474, 27; + // begin inline asm + shf.l.wrap.b32 %r23467, %r23473, %r23472, %r23474; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23471, %r23472, %r23473, %r23474; + // end inline asm + mov.u32 %r23482, 14; + // begin inline asm + shf.l.wrap.b32 %r23475, %r23481, %r23480, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23479, %r23480, %r23481, %r23482; + // end inline asm + mov.u32 %r23490, 2; + // begin inline asm + shf.l.wrap.b32 %r23483, %r23489, %r23488, %r23490; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23487, %r23488, %r23489, %r23490; + // end inline asm + mov.u32 %r23498, 55; + // begin inline asm + shf.l.wrap.b32 %r23491, %r23497, %r23496, %r23498; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23495, %r23496, %r23497, %r23498; + // end inline asm + mov.u32 %r23506, 45; + // begin inline asm + shf.l.wrap.b32 %r23499, %r23505, %r23504, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23503, %r23504, %r23505, %r23506; + // end inline asm + mov.u32 %r23514, 36; + // begin inline asm + shf.l.wrap.b32 %r23507, %r23513, %r23512, %r23514; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23511, %r23512, %r23513, %r23514; + // end inline asm + mov.u32 %r23522, 28; + // begin inline asm + shf.l.wrap.b32 %r23515, %r23521, %r23520, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23519, %r23520, %r23521, %r23522; + // end inline asm + mov.u32 %r23530, 21; + // begin inline asm + shf.l.wrap.b32 %r23523, %r23529, %r23528, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23527, %r23528, %r23529, %r23530; + // end inline asm + mov.u32 %r23538, 15; + // begin inline asm + shf.l.wrap.b32 %r23531, %r23537, %r23536, %r23538; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23535, %r23536, %r23537, %r23538; + // end inline asm + mov.u32 %r23546, 10; + // begin inline asm + shf.l.wrap.b32 %r23539, %r23545, %r23544, %r23546; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23543, %r23544, %r23545, %r23546; + // end inline asm + mov.u32 %r23554, 6; + // begin inline asm + shf.l.wrap.b32 %r23547, %r23553, %r23552, %r23554; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23551, %r23552, %r23553, %r23554; + // end inline asm + mov.u32 %r23562, 3; + // begin inline asm + shf.l.wrap.b32 %r23555, %r23561, %r23560, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23559, %r23560, %r23561, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23563, %r23569, %r23568, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23567, %r23568, %r23569, %r23248; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23571, %r23606, %r23379, %r23427, 0xD2; + lop3.b32 %r23572, %r23609, %r23383, %r23431, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30772, %r23379, %r23427, %r23523, 0xD2; + lop3.b32 %r30773, %r23383, %r23431, %r23527, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30768, %r23427, %r23523, %r23475, 0xD2; + lop3.b32 %r30769, %r23431, %r23527, %r23479, 0xD2; + // end 
inline asm + // begin inline asm + // chi + lop3.b32 %r30764, %r23523, %r23475, %r23606, 0xD2; + lop3.b32 %r30765, %r23527, %r23479, %r23609, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30762, %r23475, %r23606, %r23379, 0xD2; + lop3.b32 %r30763, %r23479, %r23609, %r23383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30758, %r23515, %r23387, %r23555, 0xD2; + lop3.b32 %r30759, %r23519, %r23391, %r23559, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30770, %r23387, %r23555, %r23499, 0xD2; + lop3.b32 %r30771, %r23391, %r23559, %r23503, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30766, %r23555, %r23499, %r23395, 0xD2; + lop3.b32 %r30767, %r23559, %r23503, %r23399, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30738, %r23499, %r23395, %r23515, 0xD2; + lop3.b32 %r30739, %r23503, %r23399, %r23519, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30738, %r30739}; + // begin inline asm + // chi + lop3.b32 %r30730, %r23395, %r23515, %r23387, 0xD2; + lop3.b32 %r30731, %r23399, %r23519, %r23391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30730, %r30731}; + // begin inline asm + // chi + lop3.b32 %r30756, %r23563, %r23547, %r23435, 0xD2; + lop3.b32 %r30757, %r23567, %r23551, %r23439, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30756, %r30757}; + // begin inline asm + // chi + lop3.b32 %r30750, %r23547, %r23435, %r23443, 0xD2; + lop3.b32 %r30751, %r23551, %r23439, %r23447, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30750, %r30751}; + // begin inline asm + // chi + lop3.b32 %r30744, %r23435, %r23443, %r23411, 0xD2; + lop3.b32 %r30745, %r23439, %r23447, %r23415, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30744, %r30745}; + // begin inline asm + // chi + lop3.b32 %r30736, %r23443, %r23411, %r23563, 0xD2; + lop3.b32 %r30737, %r23447, %r23415, %r23567, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30736, %r30737}; + // begin inline asm + // chi + lop3.b32 %r30728, %r23411, %r23563, %r23547, 0xD2; + lop3.b32 %r30729, %r23415, %r23567, %r23551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30728, %r30729}; + // begin inline asm + // chi + lop3.b32 %r30754, %r23467, %r23507, %r23539, 0xD2; + lop3.b32 %r30755, %r23471, %r23511, %r23543, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30754, %r30755}; + // begin inline asm + // chi + lop3.b32 %r30748, %r23507, %r23539, %r23531, 0xD2; + lop3.b32 %r30749, %r23511, %r23543, %r23535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30748, %r30749}; + // begin inline asm + // chi + lop3.b32 %r30742, %r23539, %r23531, %r23451, 0xD2; + lop3.b32 %r30743, %r23543, %r23535, %r23455, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30742, %r30743}; + // begin inline asm + // chi + lop3.b32 %r30734, %r23531, %r23451, %r23467, 0xD2; + lop3.b32 %r30735, %r23535, %r23455, %r23471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30734, %r30735}; + // begin inline asm + // chi + lop3.b32 %r30726, %r23451, %r23467, %r23507, 0xD2; + lop3.b32 %r30727, %r23455, %r23471, %r23511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30726, %r30727}; + // begin inline asm + // chi + lop3.b32 %r30752, %r23419, %r23491, %r23403, 0xD2; + lop3.b32 %r30753, %r23423, %r23495, %r23407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30752, %r30753}; + // begin inline asm + // chi + lop3.b32 %r30746, %r23491, %r23403, 
%r23459, 0xD2; + lop3.b32 %r30747, %r23495, %r23407, %r23463, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30746, %r30747}; + // begin inline asm + // chi + lop3.b32 %r30740, %r23403, %r23459, %r23483, 0xD2; + lop3.b32 %r30741, %r23407, %r23463, %r23487, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30740, %r30741}; + // begin inline asm + // chi + lop3.b32 %r30732, %r23459, %r23483, %r23419, 0xD2; + lop3.b32 %r30733, %r23463, %r23487, %r23423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30732, %r30733}; + // begin inline asm + // chi + lop3.b32 %r30724, %r23483, %r23419, %r23491, 0xD2; + lop3.b32 %r30725, %r23487, %r23423, %r23495, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30724, %r30725}; + mul.wide.s32 %rd1027, %r30774, 8; + mov.u64 %rd1028, keccak_round_constants; + cvta.const.u64 %rd1029, %rd1028; + add.s64 %rd1026, %rd1029, %rd1027; + // begin inline asm + ld.global.nc.v2.u32 {%r23771,%r23772}, [%rd1026]; + // end inline asm + xor.b32 %r30760, %r23571, %r23771; + xor.b32 %r30761, %r23572, %r23772; + add.s32 %r30774, %r30774, 1; + setp.lt.u32 %p47, %r30774, 23; + @%p47 bra $L__BB2_84; + + add.u64 %rd272, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30772, %r30773}; + st.local.v2.u32 [%rd3+72], {%r30770, %r30771}; + st.local.v2.u32 [%rd3+40], {%r30768, %r30769}; + st.local.v2.u32 [%rd3+80], {%r30766, %r30767}; + st.local.v2.u32 [%rd3+48], {%r30764, %r30765}; + st.local.v2.u32 [%rd3+56], {%r30762, %r30763}; + st.local.v2.u32 [%rd3+24], {%r30760, %r30761}; + // begin inline asm + // xor5 + lop3.b32 %r23783, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23783, %r23783, %r30754, %r30752, 0x96; + lop3.b32 %r23784, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23784, %r23784, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23795, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23795, %r23795, %r30748, %r30746, 0x96; + lop3.b32 %r23796, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23796, %r23796, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23807, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23807, %r23807, %r30742, %r30740, 0x96; + lop3.b32 %r23808, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23808, %r23808, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23819, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23819, %r23819, %r30734, %r30732, 0x96; + lop3.b32 %r23820, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23820, %r23820, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23831, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23831, %r23831, %r30726, %r30724, 0x96; + lop3.b32 %r23832, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23832, %r23832, %r30727, %r30725, 0x96; + // end inline asm + mov.u32 %r24035, 1; + // begin inline asm + shf.l.wrap.b32 %r23843, %r23796, %r23795, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23847, %r23795, %r23796, %r24035; + // end inline asm + xor.b32 %r24062, %r23843, %r23831; + xor.b32 %r24063, %r23847, %r23832; + xor.b32 %r23990, %r30760, %r24062; + xor.b32 %r23993, %r30761, %r24063; + xor.b32 %r23953, %r30757, %r24063; + xor.b32 %r23952, %r30756, %r24062; + st.local.v2.u32 [%rd3+104], {%r23952, %r23953}; + // begin inline asm + shf.l.wrap.b32 %r23851, %r23808, %r23807, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23855, %r23807, %r23808, %r24035; + // end inline asm + xor.b32 %r24064, 
%r23851, %r23783; + xor.b32 %r24065, %r23855, %r23784; + xor.b32 %r23889, %r30770, %r24064; + xor.b32 %r23888, %r30771, %r24065; + xor.b32 %r23928, %r30749, %r24065; + xor.b32 %r23929, %r30748, %r24064; + st.local.v2.u32 [%rd3+152], {%r23929, %r23928}; + // begin inline asm + shf.l.wrap.b32 %r23859, %r23820, %r23819, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23863, %r23819, %r23820, %r24035; + // end inline asm + xor.b32 %r24066, %r23859, %r23795; + xor.b32 %r24067, %r23863, %r23796; + xor.b32 %r23912, %r30745, %r24067; + xor.b32 %r23913, %r30744, %r24066; + st.local.v2.u32 [%rd3+120], {%r23913, %r23912}; + xor.b32 %r23904, %r30741, %r24067; + xor.b32 %r23905, %r30740, %r24066; + st.local.v2.u32 [%rd3+200], {%r23905, %r23904}; + // begin inline asm + shf.l.wrap.b32 %r23867, %r23832, %r23831, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23871, %r23831, %r23832, %r24035; + // end inline asm + xor.b32 %r24068, %r23867, %r23807; + xor.b32 %r24069, %r23871, %r23808; + xor.b32 %r23936, %r30764, %r24068; + xor.b32 %r23937, %r30765, %r24069; + xor.b32 %r23945, %r30735, %r24069; + xor.b32 %r23944, %r30734, %r24068; + st.local.v2.u32 [%rd3+168], {%r23944, %r23945}; + // begin inline asm + shf.l.wrap.b32 %r23875, %r23784, %r23783, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23879, %r23783, %r23784, %r24035; + // end inline asm + xor.b32 %r24070, %r23875, %r23819; + xor.b32 %r24071, %r23879, %r23820; + xor.b32 %r23896, %r30730, %r24070; + xor.b32 %r23897, %r30731, %r24071; + xor.b32 %r23921, %r30725, %r24071; + xor.b32 %r23920, %r30724, %r24070; + st.local.v2.u32 [%rd3+216], {%r23920, %r23921}; + // begin inline asm + shf.l.wrap.b32 %r23883, %r23889, %r23888, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23887, %r23888, %r23889, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23891, %r23897, %r23896, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23895, %r23896, %r23897, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23903, %r23904, %r23905, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23899, %r23905, %r23904, %r23402; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r23899, %r23903}; + // begin inline asm + shf.l.wrap.b32 %r23907, %r23913, %r23912, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23911, %r23912, %r23913, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23915, %r23921, %r23920, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23919, %r23920, %r23921, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23927, %r23928, %r23929, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23923, %r23929, %r23928, %r23506; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r23923, %r23927}; + // begin inline asm + shf.l.wrap.b32 %r23931, %r23937, %r23936, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23935, %r23936, %r23937, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23939, %r23945, %r23944, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23943, %r23944, %r23945, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23947, %r23953, %r23952, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23951, %r23952, %r23953, %r23562; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23955, 
%r23990, %r23883, %r23907, 0xD2; + lop3.b32 %r23956, %r23993, %r23887, %r23911, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r23883, %r23907, %r23939, 0xD2; + lop3.b32 %r30908, %r23887, %r23911, %r23943, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30903, %r23907, %r23939, %r23915, 0xD2; + lop3.b32 %r30904, %r23911, %r23943, %r23919, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + // begin inline asm + // chi + lop3.b32 %r30899, %r23939, %r23915, %r23990, 0xD2; + lop3.b32 %r30900, %r23943, %r23919, %r23993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + // begin inline asm + // chi + lop3.b32 %r30897, %r23915, %r23990, %r23883, 0xD2; + lop3.b32 %r30898, %r23919, %r23993, %r23887, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + // begin inline asm + // chi + lop3.b32 %r30893, %r23931, %r23891, %r23947, 0xD2; + lop3.b32 %r30894, %r23935, %r23895, %r23951, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + // begin inline asm + // chi + lop3.b32 %r30905, %r23891, %r23947, %r23923, 0xD2; + lop3.b32 %r30906, %r23895, %r23951, %r23927, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30901, %r23947, %r23923, %r23899, 0xD2; + lop3.b32 %r30902, %r23951, %r23927, %r23903, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + add.s64 %rd1030, %rd1029, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24019,%r24020}, [%rd1030]; + // end inline asm + xor.b32 %r30895, %r23955, %r24019; + xor.b32 %r30896, %r23956, %r24020; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.u64 [%rd272], %rd354; + mov.u64 %rd1034, 1179641; + st.local.u64 [%rd272+8], %rd1034; + st.local.u32 [%rd272+16], %r3327; + ld.global.u64 %rd1035, [%rd223]; + ld.global.u64 %rd1036, [%rd223+8]; + ld.global.u64 %rd1037, [%rd223+16]; + ld.global.u64 %rd1038, [%rd223+24]; + ld.global.u64 %rd1039, [%rd223+32]; + ld.global.u64 %rd1040, [%rd223+40]; + ld.global.u64 %rd1041, [%rd223+48]; + ld.global.u64 %rd1042, [%rd223+56]; + st.local.u64 [%rd272+32], %rd1036; + st.local.u64 [%rd272+40], %rd1037; + st.local.u64 [%rd272+48], %rd1038; + st.local.u64 [%rd272+56], %rd1039; + st.local.u64 [%rd272+64], %rd1040; + st.local.u64 [%rd272+72], %rd1041; + st.local.u64 [%rd272+80], %rd1042; + cvt.u32.u64 %r24072, %rd1035; + xor.b32 %r24073, %r3327, %r24072; + st.local.u64 [%rd272+24], %rd1035; + st.local.u32 [%rd272+24], %r24073; + mov.u32 %r30775, 0; + st.local.v2.u32 [%rd272+96], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+104], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+112], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+120], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+128], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+136], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+144], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+152], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+160], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+168], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+176], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+184], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+192], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+200], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+208], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+216], {%r30775, %r30775}; + mov.u32 %r30790, -2147483648; + st.local.v2.u32 [%rd272+88], 
{%r24035, %r30790}; + ld.local.v2.u32 {%r30811, %r30812}, [%rd272+24]; + mov.b64 {%r30809, %r30810}, %rd1040; + shr.u64 %rd1043, %rd1036, 32; + cvt.u32.u64 %r30823, %rd1036; + cvt.u32.u64 %r30824, %rd1043; + shr.u64 %rd1044, %rd1041, 32; + cvt.u32.u64 %r30821, %rd1041; + cvt.u32.u64 %r30822, %rd1044; + shr.u64 %rd1045, %rd1037, 32; + cvt.u32.u64 %r30819, %rd1037; + cvt.u32.u64 %r30820, %rd1045; + shr.u64 %rd1046, %rd1042, 32; + cvt.u32.u64 %r30817, %rd1042; + cvt.u32.u64 %r30818, %rd1046; + shr.u64 %rd1047, %rd1038, 32; + cvt.u32.u64 %r30815, %rd1038; + cvt.u32.u64 %r30816, %rd1047; + shr.u64 %rd1048, %rd1039, 32; + cvt.u32.u64 %r30813, %rd1039; + cvt.u32.u64 %r30814, %rd1048; + mov.u32 %r30776, %r30775; + mov.u32 %r30777, %r30775; + mov.u32 %r30778, %r30775; + mov.u32 %r30779, %r30775; + mov.u32 %r30780, %r30775; + mov.u32 %r30781, %r30775; + mov.u32 %r30782, %r30775; + mov.u32 %r30783, %r30775; + mov.u32 %r30784, %r30775; + mov.u32 %r30785, %r30775; + mov.u32 %r30786, %r30775; + mov.u32 %r30787, %r30775; + mov.u32 %r30788, %r30775; + mov.u32 %r30789, %r24035; + mov.u32 %r30791, %r30775; + mov.u32 %r30792, %r30775; + mov.u32 %r30793, %r30775; + mov.u32 %r30794, %r30775; + mov.u32 %r30795, %r30775; + mov.u32 %r30796, %r30775; + mov.u32 %r30797, %r30775; + mov.u32 %r30798, %r30775; + mov.u32 %r30799, %r30775; + mov.u32 %r30800, %r30775; + mov.u32 %r30801, %r30775; + mov.u32 %r30802, %r30775; + mov.u32 %r30803, %r30775; + mov.u32 %r30804, %r30775; + mov.u32 %r30805, %r30775; + mov.u32 %r30806, %r30775; + mov.u32 %r30807, %r30775; + mov.u32 %r30808, %r30775; + mov.u32 %r30825, %r30775; + +$L__BB2_86: + // begin inline asm + // xor5 + lop3.b32 %r24076, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24076, %r24076, %r30805, %r30803, 0x96; + lop3.b32 %r24077, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24077, %r24077, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24088, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24088, %r24088, %r30799, %r30797, 0x96; + lop3.b32 %r24089, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24089, %r24089, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24100, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24100, %r24100, %r30793, %r30791, 0x96; + lop3.b32 %r24101, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24101, %r24101, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24112, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24112, %r24112, %r30785, %r30783, 0x96; + lop3.b32 %r24113, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24113, %r24113, %r30786, %r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24124, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24124, %r24124, %r30777, %r30775, 0x96; + lop3.b32 %r24125, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24125, %r24125, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24136, %r24089, %r24088, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24140, %r24088, %r24089, %r24035; + // end inline asm + xor.b32 %r24570, %r24136, %r24124; + xor.b32 %r24571, %r24140, %r24125; + xor.b32 %r24403, %r30811, %r24570; + xor.b32 %r24406, %r30812, %r24571; + xor.b32 %r24310, %r30809, %r24570; + xor.b32 %r24309, %r30810, %r24571; + xor.b32 %r24357, %r30807, %r24570; + xor.b32 %r24358, %r30808, %r24571; + xor.b32 %r24262, %r30805, %r24570; + xor.b32 %r24261, %r30806, %r24571; + xor.b32 %r24213, %r30803, %r24570; + xor.b32 
%r24214, %r30804, %r24571; + // begin inline asm + shf.l.wrap.b32 %r24144, %r24101, %r24100, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24148, %r24100, %r24101, %r24035; + // end inline asm + xor.b32 %r24572, %r24144, %r24076; + xor.b32 %r24573, %r24148, %r24077; + xor.b32 %r24365, %r30823, %r24572; + xor.b32 %r24366, %r30824, %r24573; + xor.b32 %r24182, %r30821, %r24572; + xor.b32 %r24181, %r30822, %r24573; + xor.b32 %r24341, %r30801, %r24572; + xor.b32 %r24342, %r30802, %r24573; + xor.b32 %r24302, %r30799, %r24572; + xor.b32 %r24301, %r30800, %r24573; + xor.b32 %r24285, %r30797, %r24572; + xor.b32 %r24286, %r30798, %r24573; + // begin inline asm + shf.l.wrap.b32 %r24152, %r24113, %r24112, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24156, %r24112, %r24113, %r24035; + // end inline asm + xor.b32 %r24574, %r24152, %r24088; + xor.b32 %r24575, %r24156, %r24089; + xor.b32 %r24222, %r30819, %r24574; + xor.b32 %r24221, %r30820, %r24575; + xor.b32 %r24349, %r30817, %r24574; + xor.b32 %r24350, %r30818, %r24575; + xor.b32 %r24230, %r30795, %r24574; + xor.b32 %r24229, %r30796, %r24575; + xor.b32 %r24333, %r30793, %r24574; + xor.b32 %r24334, %r30794, %r24575; + xor.b32 %r24198, %r30791, %r24574; + xor.b32 %r24197, %r30792, %r24575; + // begin inline asm + shf.l.wrap.b32 %r24160, %r24125, %r24124, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24164, %r24124, %r24125, %r24035; + // end inline asm + xor.b32 %r24576, %r24160, %r24100; + xor.b32 %r24577, %r24164, %r24101; + xor.b32 %r24317, %r30815, %r24576; + xor.b32 %r24318, %r30816, %r24577; + xor.b32 %r24294, %r30789, %r24576; + xor.b32 %r24293, %r30790, %r24577; + xor.b32 %r24237, %r30787, %r24576; + xor.b32 %r24238, %r30788, %r24577; + xor.b32 %r24325, %r30785, %r24576; + xor.b32 %r24326, %r30786, %r24577; + xor.b32 %r24254, %r30783, %r24576; + xor.b32 %r24253, %r30784, %r24577; + // begin inline asm + shf.l.wrap.b32 %r24168, %r24077, %r24076, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24172, %r24076, %r24077, %r24035; + // end inline asm + xor.b32 %r24578, %r24168, %r24112; + xor.b32 %r24579, %r24172, %r24113; + xor.b32 %r24269, %r30813, %r24578; + xor.b32 %r24270, %r30814, %r24579; + xor.b32 %r24189, %r30781, %r24578; + xor.b32 %r24190, %r30782, %r24579; + xor.b32 %r24206, %r30779, %r24578; + xor.b32 %r24205, %r30780, %r24579; + xor.b32 %r24245, %r30777, %r24578; + xor.b32 %r24246, %r30778, %r24579; + xor.b32 %r24277, %r30775, %r24578; + xor.b32 %r24278, %r30776, %r24579; + mov.u32 %r24183, 44; + // begin inline asm + shf.l.wrap.b32 %r24176, %r24182, %r24181, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24180, %r24181, %r24182, %r24183; + // end inline asm + mov.u32 %r24191, 20; + // begin inline asm + shf.l.wrap.b32 %r24184, %r24190, %r24189, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24188, %r24189, %r24190, %r24191; + // end inline asm + mov.u32 %r24199, 61; + // begin inline asm + shf.l.wrap.b32 %r24192, %r24198, %r24197, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24196, %r24197, %r24198, %r24199; + // end inline asm + mov.u32 %r24207, 39; + // begin inline asm + shf.l.wrap.b32 %r24200, %r24206, %r24205, %r24207; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24204, %r24205, %r24206, %r24207; + // end inline asm + mov.u32 %r24215, 18; + // begin inline asm + shf.l.wrap.b32 %r24208, %r24214, %r24213, %r24215; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r24212, %r24213, %r24214, %r24215; + // end inline asm + mov.u32 %r24223, 62; + // begin inline asm + shf.l.wrap.b32 %r24216, %r24222, %r24221, %r24223; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24220, %r24221, %r24222, %r24223; + // end inline asm + mov.u32 %r24231, 43; + // begin inline asm + shf.l.wrap.b32 %r24224, %r24230, %r24229, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24228, %r24229, %r24230, %r24231; + // end inline asm + mov.u32 %r24239, 25; + // begin inline asm + shf.l.wrap.b32 %r24232, %r24238, %r24237, %r24239; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24236, %r24237, %r24238, %r24239; + // end inline asm + mov.u32 %r24247, 8; + // begin inline asm + shf.l.wrap.b32 %r24240, %r24246, %r24245, %r24247; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24244, %r24245, %r24246, %r24247; + // end inline asm + mov.u32 %r24255, 56; + // begin inline asm + shf.l.wrap.b32 %r24248, %r24254, %r24253, %r24255; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24252, %r24253, %r24254, %r24255; + // end inline asm + mov.u32 %r24263, 41; + // begin inline asm + shf.l.wrap.b32 %r24256, %r24262, %r24261, %r24263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24260, %r24261, %r24262, %r24263; + // end inline asm + mov.u32 %r24271, 27; + // begin inline asm + shf.l.wrap.b32 %r24264, %r24270, %r24269, %r24271; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24268, %r24269, %r24270, %r24271; + // end inline asm + mov.u32 %r24279, 14; + // begin inline asm + shf.l.wrap.b32 %r24272, %r24278, %r24277, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24276, %r24277, %r24278, %r24279; + // end inline asm + mov.u32 %r24287, 2; + // begin inline asm + shf.l.wrap.b32 %r24280, %r24286, %r24285, %r24287; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24284, %r24285, %r24286, %r24287; + // end inline asm + mov.u32 %r24295, 55; + // begin inline asm + shf.l.wrap.b32 %r24288, %r24294, %r24293, %r24295; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24292, %r24293, %r24294, %r24295; + // end inline asm + mov.u32 %r24303, 45; + // begin inline asm + shf.l.wrap.b32 %r24296, %r24302, %r24301, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24300, %r24301, %r24302, %r24303; + // end inline asm + mov.u32 %r24311, 36; + // begin inline asm + shf.l.wrap.b32 %r24304, %r24310, %r24309, %r24311; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24308, %r24309, %r24310, %r24311; + // end inline asm + mov.u32 %r24319, 28; + // begin inline asm + shf.l.wrap.b32 %r24312, %r24318, %r24317, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24316, %r24317, %r24318, %r24319; + // end inline asm + mov.u32 %r24327, 21; + // begin inline asm + shf.l.wrap.b32 %r24320, %r24326, %r24325, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24324, %r24325, %r24326, %r24327; + // end inline asm + mov.u32 %r24335, 15; + // begin inline asm + shf.l.wrap.b32 %r24328, %r24334, %r24333, %r24335; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24332, %r24333, %r24334, %r24335; + // end inline asm + mov.u32 %r24343, 10; + // begin inline asm + shf.l.wrap.b32 %r24336, %r24342, %r24341, %r24343; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24340, %r24341, %r24342, %r24343; + // end inline asm + mov.u32 %r24351, 6; + // begin inline asm + shf.l.wrap.b32 %r24344, 
%r24350, %r24349, %r24351; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24348, %r24349, %r24350, %r24351; + // end inline asm + mov.u32 %r24359, 3; + // begin inline asm + shf.l.wrap.b32 %r24352, %r24358, %r24357, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24356, %r24357, %r24358, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24360, %r24366, %r24365, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24364, %r24365, %r24366, %r24035; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24368, %r24403, %r24176, %r24224, 0xD2; + lop3.b32 %r24369, %r24406, %r24180, %r24228, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30823, %r24176, %r24224, %r24320, 0xD2; + lop3.b32 %r30824, %r24180, %r24228, %r24324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30819, %r24224, %r24320, %r24272, 0xD2; + lop3.b32 %r30820, %r24228, %r24324, %r24276, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30815, %r24320, %r24272, %r24403, 0xD2; + lop3.b32 %r30816, %r24324, %r24276, %r24406, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30813, %r24272, %r24403, %r24176, 0xD2; + lop3.b32 %r30814, %r24276, %r24406, %r24180, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30809, %r24312, %r24184, %r24352, 0xD2; + lop3.b32 %r30810, %r24316, %r24188, %r24356, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30821, %r24184, %r24352, %r24296, 0xD2; + lop3.b32 %r30822, %r24188, %r24356, %r24300, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30817, %r24352, %r24296, %r24192, 0xD2; + lop3.b32 %r30818, %r24356, %r24300, %r24196, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30789, %r24296, %r24192, %r24312, 0xD2; + lop3.b32 %r30790, %r24300, %r24196, %r24316, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30789, %r30790}; + // begin inline asm + // chi + lop3.b32 %r30781, %r24192, %r24312, %r24184, 0xD2; + lop3.b32 %r30782, %r24196, %r24316, %r24188, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30781, %r30782}; + // begin inline asm + // chi + lop3.b32 %r30807, %r24360, %r24344, %r24232, 0xD2; + lop3.b32 %r30808, %r24364, %r24348, %r24236, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30807, %r30808}; + // begin inline asm + // chi + lop3.b32 %r30801, %r24344, %r24232, %r24240, 0xD2; + lop3.b32 %r30802, %r24348, %r24236, %r24244, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30801, %r30802}; + // begin inline asm + // chi + lop3.b32 %r30795, %r24232, %r24240, %r24208, 0xD2; + lop3.b32 %r30796, %r24236, %r24244, %r24212, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30795, %r30796}; + // begin inline asm + // chi + lop3.b32 %r30787, %r24240, %r24208, %r24360, 0xD2; + lop3.b32 %r30788, %r24244, %r24212, %r24364, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30787, %r30788}; + // begin inline asm + // chi + lop3.b32 %r30779, %r24208, %r24360, %r24344, 0xD2; + lop3.b32 %r30780, %r24212, %r24364, %r24348, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30779, %r30780}; + // begin inline asm + // chi + lop3.b32 %r30805, %r24264, %r24304, %r24336, 0xD2; + lop3.b32 %r30806, %r24268, %r24308, %r24340, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30805, %r30806}; + // begin inline asm + // chi + lop3.b32 %r30799, %r24304, %r24336, %r24328, 0xD2; 
+ lop3.b32 %r30800, %r24308, %r24340, %r24332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30799, %r30800}; + // begin inline asm + // chi + lop3.b32 %r30793, %r24336, %r24328, %r24248, 0xD2; + lop3.b32 %r30794, %r24340, %r24332, %r24252, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30793, %r30794}; + // begin inline asm + // chi + lop3.b32 %r30785, %r24328, %r24248, %r24264, 0xD2; + lop3.b32 %r30786, %r24332, %r24252, %r24268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30785, %r30786}; + // begin inline asm + // chi + lop3.b32 %r30777, %r24248, %r24264, %r24304, 0xD2; + lop3.b32 %r30778, %r24252, %r24268, %r24308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30777, %r30778}; + // begin inline asm + // chi + lop3.b32 %r30803, %r24216, %r24288, %r24200, 0xD2; + lop3.b32 %r30804, %r24220, %r24292, %r24204, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+184], {%r30803, %r30804}; + // begin inline asm + // chi + lop3.b32 %r30797, %r24288, %r24200, %r24256, 0xD2; + lop3.b32 %r30798, %r24292, %r24204, %r24260, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30797, %r30798}; + // begin inline asm + // chi + lop3.b32 %r30791, %r24200, %r24256, %r24280, 0xD2; + lop3.b32 %r30792, %r24204, %r24260, %r24284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30791, %r30792}; + // begin inline asm + // chi + lop3.b32 %r30783, %r24256, %r24280, %r24216, 0xD2; + lop3.b32 %r30784, %r24260, %r24284, %r24220, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30783, %r30784}; + // begin inline asm + // chi + lop3.b32 %r30775, %r24280, %r24216, %r24288, 0xD2; + lop3.b32 %r30776, %r24284, %r24220, %r24292, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30775, %r30776}; + mul.wide.s32 %rd1050, %r30825, 8; + add.s64 %rd1049, %rd1029, %rd1050; + // begin inline asm + ld.global.nc.v2.u32 {%r24568,%r24569}, [%rd1049]; + // end inline asm + xor.b32 %r30811, %r24368, %r24568; + xor.b32 %r30812, %r24369, %r24569; + add.s32 %r30825, %r30825, 1; + setp.lt.u32 %p48, %r30825, 23; + @%p48 bra $L__BB2_86; + + mov.u32 %r30858, 0; + mov.u32 %r24679, 1; + st.local.v2.u32 [%rd272+32], {%r30823, %r30824}; + st.local.v2.u32 [%rd272+72], {%r30821, %r30822}; + st.local.v2.u32 [%rd272+40], {%r30819, %r30820}; + st.local.v2.u32 [%rd272+80], {%r30817, %r30818}; + st.local.v2.u32 [%rd272+48], {%r30815, %r30816}; + st.local.v2.u32 [%rd272+56], {%r30813, %r30814}; + st.local.v2.u32 [%rd272+24], {%r30811, %r30812}; + // begin inline asm + // xor5 + lop3.b32 %r24580, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24580, %r24580, %r30805, %r30803, 0x96; + lop3.b32 %r24581, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24581, %r24581, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24592, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24592, %r24592, %r30799, %r30797, 0x96; + lop3.b32 %r24593, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24593, %r24593, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24604, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24604, %r24604, %r30793, %r30791, 0x96; + lop3.b32 %r24605, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24605, %r24605, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24616, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24616, %r24616, %r30785, %r30783, 0x96; + lop3.b32 %r24617, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24617, %r24617, %r30786, 
%r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24628, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24628, %r24628, %r30777, %r30775, 0x96; + lop3.b32 %r24629, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24629, %r24629, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24640, %r24593, %r24592, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24644, %r24592, %r24593, %r24679; + // end inline asm + xor.b32 %r24819, %r24640, %r24628; + xor.b32 %r24820, %r24644, %r24629; + xor.b32 %r24787, %r30811, %r24819; + xor.b32 %r24790, %r30812, %r24820; + xor.b32 %r24750, %r30808, %r24820; + xor.b32 %r24749, %r30807, %r24819; + st.local.v2.u32 [%rd272+104], {%r24749, %r24750}; + // begin inline asm + shf.l.wrap.b32 %r24648, %r24605, %r24604, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24652, %r24604, %r24605, %r24679; + // end inline asm + xor.b32 %r24821, %r24648, %r24580; + xor.b32 %r24822, %r24652, %r24581; + xor.b32 %r24686, %r30821, %r24821; + xor.b32 %r24685, %r30822, %r24822; + xor.b32 %r24725, %r30800, %r24822; + xor.b32 %r24726, %r30799, %r24821; + st.local.v2.u32 [%rd272+152], {%r24726, %r24725}; + // begin inline asm + shf.l.wrap.b32 %r24656, %r24617, %r24616, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24660, %r24616, %r24617, %r24679; + // end inline asm + xor.b32 %r24823, %r24656, %r24592; + xor.b32 %r24824, %r24660, %r24593; + xor.b32 %r24709, %r30796, %r24824; + xor.b32 %r24710, %r30795, %r24823; + st.local.v2.u32 [%rd272+120], {%r24710, %r24709}; + xor.b32 %r24701, %r30792, %r24824; + xor.b32 %r24702, %r30791, %r24823; + st.local.v2.u32 [%rd272+200], {%r24702, %r24701}; + // begin inline asm + shf.l.wrap.b32 %r24664, %r24629, %r24628, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24668, %r24628, %r24629, %r24679; + // end inline asm + xor.b32 %r24825, %r24664, %r24604; + xor.b32 %r24826, %r24668, %r24605; + xor.b32 %r24733, %r30815, %r24825; + xor.b32 %r24734, %r30816, %r24826; + xor.b32 %r24742, %r30786, %r24826; + xor.b32 %r24741, %r30785, %r24825; + st.local.v2.u32 [%rd272+168], {%r24741, %r24742}; + // begin inline asm + shf.l.wrap.b32 %r24672, %r24581, %r24580, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24676, %r24580, %r24581, %r24679; + // end inline asm + xor.b32 %r24827, %r24672, %r24616; + xor.b32 %r24828, %r24676, %r24617; + xor.b32 %r24693, %r30781, %r24827; + xor.b32 %r24694, %r30782, %r24828; + xor.b32 %r24718, %r30776, %r24828; + xor.b32 %r24717, %r30775, %r24827; + st.local.v2.u32 [%rd272+216], {%r24717, %r24718}; + // begin inline asm + shf.l.wrap.b32 %r24680, %r24686, %r24685, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24684, %r24685, %r24686, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24688, %r24694, %r24693, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24692, %r24693, %r24694, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24700, %r24701, %r24702, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24696, %r24702, %r24701, %r24199; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r24696, %r24700}; + // begin inline asm + shf.l.wrap.b32 %r24704, %r24710, %r24709, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24708, %r24709, %r24710, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24712, %r24718, %r24717, 
%r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24716, %r24717, %r24718, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24724, %r24725, %r24726, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24720, %r24726, %r24725, %r24303; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r24720, %r24724}; + // begin inline asm + shf.l.wrap.b32 %r24728, %r24734, %r24733, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24732, %r24733, %r24734, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24736, %r24742, %r24741, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24740, %r24741, %r24742, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24744, %r24750, %r24749, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24748, %r24749, %r24750, %r24359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24752, %r24787, %r24680, %r24704, 0xD2; + lop3.b32 %r24753, %r24790, %r24684, %r24708, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r24680, %r24704, %r24736, 0xD2; + lop3.b32 %r30959, %r24684, %r24708, %r24740, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30954, %r24704, %r24736, %r24712, 0xD2; + lop3.b32 %r30955, %r24708, %r24740, %r24716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + // begin inline asm + // chi + lop3.b32 %r30950, %r24736, %r24712, %r24787, 0xD2; + lop3.b32 %r30951, %r24740, %r24716, %r24790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + // begin inline asm + // chi + lop3.b32 %r30948, %r24712, %r24787, %r24680, 0xD2; + lop3.b32 %r30949, %r24716, %r24790, %r24684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + // begin inline asm + // chi + lop3.b32 %r30944, %r24728, %r24688, %r24744, 0xD2; + lop3.b32 %r30945, %r24732, %r24692, %r24748, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + // begin inline asm + // chi + lop3.b32 %r30956, %r24688, %r24744, %r24720, 0xD2; + lop3.b32 %r30957, %r24692, %r24748, %r24724, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30952, %r24744, %r24720, %r24696, 0xD2; + lop3.b32 %r30953, %r24748, %r24724, %r24700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + // begin inline asm + ld.global.nc.v2.u32 {%r24816,%r24817}, [%rd1030]; + // end inline asm + xor.b32 %r30946, %r24752, %r24816; + xor.b32 %r30947, %r24753, %r24817; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + add.s64 %rd275, %rd272, 24; + add.s64 %rd276, %rd3, 24; + +$L__BB2_88: + shl.b32 %r24829, %r30858, 2; + cvt.u64.u32 %rd1058, %r24829; + and.b64 %rd1059, %rd1058, 60; + add.s64 %rd1060, %rd276, %rd1059; + xor.b32 %r24830, %r3326, %r30858; + mul.lo.s32 %r24831, %r24830, 16777619; + ld.local.u32 %r24832, [%rd1060]; + xor.b32 %r24833, %r24831, %r24832; + mul.wide.u32 %rd1061, %r24833, -954391867; + shr.u64 %rd1062, %rd1061, 32; + cvt.u32.u64 %r24834, %rd1062; + sub.s32 %r24835, %r24833, %r24834; + shr.u32 %r24836, %r24835, 1; + add.s32 %r24837, %r24836, %r24834; + shr.u32 %r24838, %r24837, 20; + mul.lo.s32 %r24839, %r24838, 1179641; + sub.s32 %r24840, %r24833, %r24839; + mul.wide.u32 %rd1063, %r24840, 64; + add.s64 %rd1064, %rd471, %rd1063; + mul.lo.s32 %r24841, %r30895, 
16777619; + ld.global.u32 %r24842, [%rd1064]; + xor.b32 %r30895, %r24841, %r24842; + mul.lo.s32 %r24843, %r30896, 16777619; + ld.global.u32 %r24844, [%rd1064+4]; + xor.b32 %r30896, %r24843, %r24844; + mul.lo.s32 %r24845, %r30907, 16777619; + ld.global.u32 %r24846, [%rd1064+8]; + mul.lo.s32 %r24847, %r30908, 16777619; + ld.global.u32 %r24848, [%rd1064+12]; + xor.b32 %r24849, %r24847, %r24848; + xor.b32 %r30907, %r24845, %r24846; + mov.b64 %rd1065, {%r30907, %r24849}; + mul.lo.s32 %r24850, %r30903, 16777619; + ld.global.u32 %r24851, [%rd1064+16]; + mul.lo.s32 %r24852, %r30904, 16777619; + ld.global.u32 %r24853, [%rd1064+20]; + xor.b32 %r24854, %r24852, %r24853; + xor.b32 %r30903, %r24850, %r24851; + mov.b64 %rd1066, {%r30903, %r24854}; + mul.lo.s32 %r24855, %r30899, 16777619; + ld.global.u32 %r24856, [%rd1064+24]; + mul.lo.s32 %r24857, %r30900, 16777619; + ld.global.u32 %r24858, [%rd1064+28]; + xor.b32 %r24859, %r24857, %r24858; + xor.b32 %r30899, %r24855, %r24856; + mov.b64 %rd1067, {%r30899, %r24859}; + mul.lo.s32 %r24860, %r30897, 16777619; + ld.global.u32 %r24861, [%rd1064+32]; + mul.lo.s32 %r24862, %r30898, 16777619; + ld.global.u32 %r24863, [%rd1064+36]; + xor.b32 %r24864, %r24862, %r24863; + xor.b32 %r30897, %r24860, %r24861; + mov.b64 %rd1068, {%r30897, %r24864}; + mul.lo.s32 %r24865, %r30893, 16777619; + ld.global.u32 %r24866, [%rd1064+40]; + xor.b32 %r30893, %r24865, %r24866; + mul.lo.s32 %r24867, %r30894, 16777619; + ld.global.u32 %r24868, [%rd1064+44]; + xor.b32 %r30894, %r24867, %r24868; + mul.lo.s32 %r24869, %r30905, 16777619; + ld.global.u32 %r24870, [%rd1064+48]; + mul.lo.s32 %r24871, %r30906, 16777619; + ld.global.u32 %r24872, [%rd1064+52]; + xor.b32 %r24873, %r24871, %r24872; + xor.b32 %r30905, %r24869, %r24870; + mov.b64 %rd1069, {%r30905, %r24873}; + mul.lo.s32 %r24874, %r30901, 16777619; + ld.global.u32 %r24875, [%rd1064+56]; + mul.lo.s32 %r24876, %r30902, 16777619; + ld.global.u32 %r24877, [%rd1064+60]; + xor.b32 %r24878, %r24876, %r24877; + xor.b32 %r30901, %r24874, %r24875; + mov.b64 %rd1070, {%r30901, %r24878}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.v2.u32 [%rd3+32], {%r30907, %r24849}; + st.local.v2.u32 [%rd3+40], {%r30903, %r24854}; + st.local.v2.u32 [%rd3+48], {%r30899, %r24859}; + st.local.v2.u32 [%rd3+56], {%r30897, %r24864}; + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + st.local.v2.u32 [%rd3+72], {%r30905, %r24873}; + st.local.v2.u32 [%rd3+80], {%r30901, %r24878}; + add.s64 %rd1071, %rd275, %rd1059; + xor.b32 %r24879, %r3327, %r30858; + mul.lo.s32 %r24880, %r24879, 16777619; + ld.local.u32 %r24881, [%rd1071]; + xor.b32 %r24882, %r24880, %r24881; + mul.wide.u32 %rd1072, %r24882, -954391867; + shr.u64 %rd1073, %rd1072, 32; + cvt.u32.u64 %r24883, %rd1073; + sub.s32 %r24884, %r24882, %r24883; + shr.u32 %r24885, %r24884, 1; + add.s32 %r24886, %r24885, %r24883; + shr.u32 %r24887, %r24886, 20; + mul.lo.s32 %r24888, %r24887, 1179641; + sub.s32 %r24889, %r24882, %r24888; + mul.wide.u32 %rd1074, %r24889, 64; + add.s64 %rd1075, %rd471, %rd1074; + mul.lo.s32 %r24890, %r30946, 16777619; + ld.global.u32 %r24891, [%rd1075]; + xor.b32 %r30946, %r24890, %r24891; + mul.lo.s32 %r24892, %r30947, 16777619; + ld.global.u32 %r24893, [%rd1075+4]; + xor.b32 %r30947, %r24892, %r24893; + mul.lo.s32 %r24894, %r30958, 16777619; + ld.global.u32 %r24895, [%rd1075+8]; + mul.lo.s32 %r24896, %r30959, 16777619; + ld.global.u32 %r24897, [%rd1075+12]; + xor.b32 %r24898, %r24896, %r24897; + xor.b32 %r30958, %r24894, %r24895; + mov.b64 %rd1076, {%r30958, %r24898}; + 
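// NOTE: inside $L__BB2_88 — a 512-iteration merge loop (see setp.lt.u32 ..., 512 below) that appears to fold 64-byte parent items into two interleaved Keccak states with an FNV-1a-style mix: multiply by the FNV prime 16777619 (0x01000193), then XOR with the loaded word; the parent index is reduced modulo 1179641 via the magic-constant division (mul.wide.u32 by -954391867) computed above. +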
mul.lo.s32 %r24899, %r30954, 16777619; + ld.global.u32 %r24900, [%rd1075+16]; + mul.lo.s32 %r24901, %r30955, 16777619; + ld.global.u32 %r24902, [%rd1075+20]; + xor.b32 %r24903, %r24901, %r24902; + xor.b32 %r30954, %r24899, %r24900; + mov.b64 %rd1077, {%r30954, %r24903}; + mul.lo.s32 %r24904, %r30950, 16777619; + ld.global.u32 %r24905, [%rd1075+24]; + mul.lo.s32 %r24906, %r30951, 16777619; + ld.global.u32 %r24907, [%rd1075+28]; + xor.b32 %r24908, %r24906, %r24907; + xor.b32 %r30950, %r24904, %r24905; + mov.b64 %rd1078, {%r30950, %r24908}; + mul.lo.s32 %r24909, %r30948, 16777619; + ld.global.u32 %r24910, [%rd1075+32]; + mul.lo.s32 %r24911, %r30949, 16777619; + ld.global.u32 %r24912, [%rd1075+36]; + xor.b32 %r24913, %r24911, %r24912; + xor.b32 %r30948, %r24909, %r24910; + mov.b64 %rd1079, {%r30948, %r24913}; + mul.lo.s32 %r24914, %r30944, 16777619; + ld.global.u32 %r24915, [%rd1075+40]; + xor.b32 %r30944, %r24914, %r24915; + mul.lo.s32 %r24916, %r30945, 16777619; + ld.global.u32 %r24917, [%rd1075+44]; + xor.b32 %r30945, %r24916, %r24917; + mul.lo.s32 %r24918, %r30956, 16777619; + ld.global.u32 %r24919, [%rd1075+48]; + mul.lo.s32 %r24920, %r30957, 16777619; + ld.global.u32 %r24921, [%rd1075+52]; + xor.b32 %r24922, %r24920, %r24921; + xor.b32 %r30956, %r24918, %r24919; + mov.b64 %rd1080, {%r30956, %r24922}; + mul.lo.s32 %r24923, %r30952, 16777619; + ld.global.u32 %r24924, [%rd1075+56]; + mul.lo.s32 %r24925, %r30953, 16777619; + ld.global.u32 %r24926, [%rd1075+60]; + xor.b32 %r24927, %r24925, %r24926; + xor.b32 %r30952, %r24923, %r24924; + mov.b64 %rd1081, {%r30952, %r24927}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + st.local.v2.u32 [%rd272+32], {%r30958, %r24898}; + st.local.v2.u32 [%rd272+40], {%r30954, %r24903}; + st.local.v2.u32 [%rd272+48], {%r30950, %r24908}; + st.local.v2.u32 [%rd272+56], {%r30948, %r24913}; + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + st.local.v2.u32 [%rd272+72], {%r30956, %r24922}; + st.local.v2.u32 [%rd272+80], {%r30952, %r24927}; + add.s32 %r30858, %r30858, 1; + setp.lt.u32 %p49, %r30858, 512; + shr.u64 %rd1082, %rd1065, 32; + cvt.u32.u64 %r30908, %rd1082; + shr.u64 %rd1083, %rd1066, 32; + cvt.u32.u64 %r30904, %rd1083; + shr.u64 %rd1084, %rd1067, 32; + cvt.u32.u64 %r30900, %rd1084; + shr.u64 %rd1085, %rd1068, 32; + cvt.u32.u64 %r30898, %rd1085; + shr.u64 %rd1086, %rd1069, 32; + cvt.u32.u64 %r30906, %rd1086; + shr.u64 %rd1087, %rd1070, 32; + cvt.u32.u64 %r30902, %rd1087; + shr.u64 %rd1088, %rd1076, 32; + cvt.u32.u64 %r30959, %rd1088; + shr.u64 %rd1089, %rd1077, 32; + cvt.u32.u64 %r30955, %rd1089; + shr.u64 %rd1090, %rd1078, 32; + cvt.u32.u64 %r30951, %rd1090; + shr.u64 %rd1091, %rd1079, 32; + cvt.u32.u64 %r30949, %rd1091; + shr.u64 %rd1092, %rd1080, 32; + cvt.u32.u64 %r30957, %rd1092; + shr.u64 %rd1093, %rd1081, 32; + cvt.u32.u64 %r30953, %rd1093; + @%p49 bra $L__BB2_88; + + mov.u32 %r30859, 0; + st.local.v2.u32 [%rd3+96], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+104], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+112], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+120], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+128], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+136], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+144], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+152], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+160], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+168], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+176], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+184], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+192], {%r30859, %r30859}; + st.local.v2.u32 
[%rd3+200], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+208], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+216], {%r30859, %r30859}; + mov.u32 %r30874, -2147483648; + mov.u32 %r24942, 1; + st.local.v2.u32 [%rd3+88], {%r24942, %r30874}; + mov.u32 %r30860, %r30859; + mov.u32 %r30861, %r30859; + mov.u32 %r30862, %r30859; + mov.u32 %r30863, %r30859; + mov.u32 %r30864, %r30859; + mov.u32 %r30865, %r30859; + mov.u32 %r30866, %r30859; + mov.u32 %r30867, %r30859; + mov.u32 %r30868, %r30859; + mov.u32 %r30869, %r30859; + mov.u32 %r30870, %r30859; + mov.u32 %r30871, %r30859; + mov.u32 %r30872, %r30859; + mov.u32 %r30873, %r24942; + mov.u32 %r30875, %r30859; + mov.u32 %r30876, %r30859; + mov.u32 %r30877, %r30859; + mov.u32 %r30878, %r30859; + mov.u32 %r30879, %r30859; + mov.u32 %r30880, %r30859; + mov.u32 %r30881, %r30859; + mov.u32 %r30882, %r30859; + mov.u32 %r30883, %r30859; + mov.u32 %r30884, %r30859; + mov.u32 %r30885, %r30859; + mov.u32 %r30886, %r30859; + mov.u32 %r30887, %r30859; + mov.u32 %r30888, %r30859; + mov.u32 %r30889, %r30859; + mov.u32 %r30890, %r30859; + mov.u32 %r30891, %r30859; + mov.u32 %r30892, %r30859; + mov.u32 %r30909, %r30859; + +$L__BB2_90: + // begin inline asm + // xor5 + lop3.b32 %r24969, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r24969, %r24969, %r30889, %r30887, 0x96; + lop3.b32 %r24970, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r24970, %r24970, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24981, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r24981, %r24981, %r30883, %r30881, 0x96; + lop3.b32 %r24982, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r24982, %r24982, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24993, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r24993, %r24993, %r30877, %r30875, 0x96; + lop3.b32 %r24994, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r24994, %r24994, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25005, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25005, %r25005, %r30869, %r30867, 0x96; + lop3.b32 %r25006, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25006, %r25006, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25017, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25017, %r25017, %r30861, %r30859, 0x96; + lop3.b32 %r25018, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25018, %r25018, %r30862, %r30860, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25029, %r24982, %r24981, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25033, %r24981, %r24982, %r24942; + // end inline asm + xor.b32 %r25463, %r25029, %r25017; + xor.b32 %r25464, %r25033, %r25018; + xor.b32 %r25296, %r30895, %r25463; + xor.b32 %r25299, %r30896, %r25464; + xor.b32 %r25203, %r30893, %r25463; + xor.b32 %r25202, %r30894, %r25464; + xor.b32 %r25250, %r30891, %r25463; + xor.b32 %r25251, %r30892, %r25464; + xor.b32 %r25155, %r30889, %r25463; + xor.b32 %r25154, %r30890, %r25464; + xor.b32 %r25106, %r30887, %r25463; + xor.b32 %r25107, %r30888, %r25464; + // begin inline asm + shf.l.wrap.b32 %r25037, %r24994, %r24993, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25041, %r24993, %r24994, %r24942; + // end inline asm + xor.b32 %r25465, %r25037, %r24969; + xor.b32 %r25466, %r25041, %r24970; + xor.b32 %r25258, %r30907, %r25465; + xor.b32 %r25259, %r30908, %r25466; + xor.b32 %r25075, %r30905, %r25465; + xor.b32 %r25074, %r30906, %r25466; + xor.b32 
%r25234, %r30885, %r25465; + xor.b32 %r25235, %r30886, %r25466; + xor.b32 %r25195, %r30883, %r25465; + xor.b32 %r25194, %r30884, %r25466; + xor.b32 %r25178, %r30881, %r25465; + xor.b32 %r25179, %r30882, %r25466; + // begin inline asm + shf.l.wrap.b32 %r25045, %r25006, %r25005, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25049, %r25005, %r25006, %r24942; + // end inline asm + xor.b32 %r25467, %r25045, %r24981; + xor.b32 %r25468, %r25049, %r24982; + xor.b32 %r25115, %r30903, %r25467; + xor.b32 %r25114, %r30904, %r25468; + xor.b32 %r25242, %r30901, %r25467; + xor.b32 %r25243, %r30902, %r25468; + xor.b32 %r25123, %r30879, %r25467; + xor.b32 %r25122, %r30880, %r25468; + xor.b32 %r25226, %r30877, %r25467; + xor.b32 %r25227, %r30878, %r25468; + xor.b32 %r25091, %r30875, %r25467; + xor.b32 %r25090, %r30876, %r25468; + // begin inline asm + shf.l.wrap.b32 %r25053, %r25018, %r25017, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25057, %r25017, %r25018, %r24942; + // end inline asm + xor.b32 %r25469, %r25053, %r24993; + xor.b32 %r25470, %r25057, %r24994; + xor.b32 %r25210, %r30899, %r25469; + xor.b32 %r25211, %r30900, %r25470; + xor.b32 %r25187, %r30873, %r25469; + xor.b32 %r25186, %r30874, %r25470; + xor.b32 %r25130, %r30871, %r25469; + xor.b32 %r25131, %r30872, %r25470; + xor.b32 %r25218, %r30869, %r25469; + xor.b32 %r25219, %r30870, %r25470; + xor.b32 %r25147, %r30867, %r25469; + xor.b32 %r25146, %r30868, %r25470; + // begin inline asm + shf.l.wrap.b32 %r25061, %r24970, %r24969, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25065, %r24969, %r24970, %r24942; + // end inline asm + xor.b32 %r25471, %r25061, %r25005; + xor.b32 %r25472, %r25065, %r25006; + xor.b32 %r25162, %r30897, %r25471; + xor.b32 %r25163, %r30898, %r25472; + xor.b32 %r25082, %r30865, %r25471; + xor.b32 %r25083, %r30866, %r25472; + xor.b32 %r25099, %r30863, %r25471; + xor.b32 %r25098, %r30864, %r25472; + xor.b32 %r25138, %r30861, %r25471; + xor.b32 %r25139, %r30862, %r25472; + xor.b32 %r25170, %r30859, %r25471; + xor.b32 %r25171, %r30860, %r25472; + mov.u32 %r25076, 44; + // begin inline asm + shf.l.wrap.b32 %r25069, %r25075, %r25074, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25073, %r25074, %r25075, %r25076; + // end inline asm + mov.u32 %r25084, 20; + // begin inline asm + shf.l.wrap.b32 %r25077, %r25083, %r25082, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25081, %r25082, %r25083, %r25084; + // end inline asm + mov.u32 %r25092, 61; + // begin inline asm + shf.l.wrap.b32 %r25085, %r25091, %r25090, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25089, %r25090, %r25091, %r25092; + // end inline asm + mov.u32 %r25100, 39; + // begin inline asm + shf.l.wrap.b32 %r25093, %r25099, %r25098, %r25100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25097, %r25098, %r25099, %r25100; + // end inline asm + mov.u32 %r25108, 18; + // begin inline asm + shf.l.wrap.b32 %r25101, %r25107, %r25106, %r25108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25105, %r25106, %r25107, %r25108; + // end inline asm + mov.u32 %r25116, 62; + // begin inline asm + shf.l.wrap.b32 %r25109, %r25115, %r25114, %r25116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25113, %r25114, %r25115, %r25116; + // end inline asm + mov.u32 %r25124, 43; + // begin inline asm + shf.l.wrap.b32 %r25117, %r25123, %r25122, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r25121, %r25122, %r25123, %r25124; + // end inline asm + mov.u32 %r25132, 25; + // begin inline asm + shf.l.wrap.b32 %r25125, %r25131, %r25130, %r25132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25129, %r25130, %r25131, %r25132; + // end inline asm + mov.u32 %r25140, 8; + // begin inline asm + shf.l.wrap.b32 %r25133, %r25139, %r25138, %r25140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25137, %r25138, %r25139, %r25140; + // end inline asm + mov.u32 %r25148, 56; + // begin inline asm + shf.l.wrap.b32 %r25141, %r25147, %r25146, %r25148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25145, %r25146, %r25147, %r25148; + // end inline asm + mov.u32 %r25156, 41; + // begin inline asm + shf.l.wrap.b32 %r25149, %r25155, %r25154, %r25156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25153, %r25154, %r25155, %r25156; + // end inline asm + mov.u32 %r25164, 27; + // begin inline asm + shf.l.wrap.b32 %r25157, %r25163, %r25162, %r25164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25161, %r25162, %r25163, %r25164; + // end inline asm + mov.u32 %r25172, 14; + // begin inline asm + shf.l.wrap.b32 %r25165, %r25171, %r25170, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25169, %r25170, %r25171, %r25172; + // end inline asm + mov.u32 %r25180, 2; + // begin inline asm + shf.l.wrap.b32 %r25173, %r25179, %r25178, %r25180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25177, %r25178, %r25179, %r25180; + // end inline asm + mov.u32 %r25188, 55; + // begin inline asm + shf.l.wrap.b32 %r25181, %r25187, %r25186, %r25188; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25185, %r25186, %r25187, %r25188; + // end inline asm + mov.u32 %r25196, 45; + // begin inline asm + shf.l.wrap.b32 %r25189, %r25195, %r25194, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25193, %r25194, %r25195, %r25196; + // end inline asm + mov.u32 %r25204, 36; + // begin inline asm + shf.l.wrap.b32 %r25197, %r25203, %r25202, %r25204; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25201, %r25202, %r25203, %r25204; + // end inline asm + mov.u32 %r25212, 28; + // begin inline asm + shf.l.wrap.b32 %r25205, %r25211, %r25210, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25209, %r25210, %r25211, %r25212; + // end inline asm + mov.u32 %r25220, 21; + // begin inline asm + shf.l.wrap.b32 %r25213, %r25219, %r25218, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25217, %r25218, %r25219, %r25220; + // end inline asm + mov.u32 %r25228, 15; + // begin inline asm + shf.l.wrap.b32 %r25221, %r25227, %r25226, %r25228; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25225, %r25226, %r25227, %r25228; + // end inline asm + mov.u32 %r25236, 10; + // begin inline asm + shf.l.wrap.b32 %r25229, %r25235, %r25234, %r25236; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25233, %r25234, %r25235, %r25236; + // end inline asm + mov.u32 %r25244, 6; + // begin inline asm + shf.l.wrap.b32 %r25237, %r25243, %r25242, %r25244; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25241, %r25242, %r25243, %r25244; + // end inline asm + mov.u32 %r25252, 3; + // begin inline asm + shf.l.wrap.b32 %r25245, %r25251, %r25250, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25249, %r25250, %r25251, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25253, %r25259, %r25258, %r24942; + // end inline asm 
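+ // NOTE (editor): the run of shf.l.wrap.b32 pairs above is the Keccak rho/pi step. Each 64-bit
+ // lane is kept as two 32-bit registers {lo, hi}, so a rotate-left by r becomes two funnel
+ // shifts; in CUDA-C terms (a sketch of the idiom, not the generated source), for r < 32:
+ //   hi' = __funnelshift_l(lo, hi, r);  lo' = __funnelshift_l(hi, lo, r);
+ // (for r >= 32 the compiler swaps the operand order instead). The rotation amounts used here
+ // (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1)
+ // are the standard Keccak-f[1600] rho offsets. The lop3.b32 ... 0xD2 blocks that follow are
+ // the chi step, out = a ^ (~b & c) per 32-bit half (immLut: 0xF0 ^ (~0xCC & 0xAA) = 0xD2).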
+ // begin inline asm + shf.l.wrap.b32 %r25257, %r25258, %r25259, %r24942; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25261, %r25296, %r25069, %r25117, 0xD2; + lop3.b32 %r25262, %r25299, %r25073, %r25121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r25069, %r25117, %r25213, 0xD2; + lop3.b32 %r30908, %r25073, %r25121, %r25217, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30903, %r25117, %r25213, %r25165, 0xD2; + lop3.b32 %r30904, %r25121, %r25217, %r25169, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30899, %r25213, %r25165, %r25296, 0xD2; + lop3.b32 %r30900, %r25217, %r25169, %r25299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30897, %r25165, %r25296, %r25069, 0xD2; + lop3.b32 %r30898, %r25169, %r25299, %r25073, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30893, %r25205, %r25077, %r25245, 0xD2; + lop3.b32 %r30894, %r25209, %r25081, %r25249, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30905, %r25077, %r25245, %r25189, 0xD2; + lop3.b32 %r30906, %r25081, %r25249, %r25193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30901, %r25245, %r25189, %r25085, 0xD2; + lop3.b32 %r30902, %r25249, %r25193, %r25089, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30873, %r25189, %r25085, %r25205, 0xD2; + lop3.b32 %r30874, %r25193, %r25089, %r25209, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30873, %r30874}; + // begin inline asm + // chi + lop3.b32 %r30865, %r25085, %r25205, %r25077, 0xD2; + lop3.b32 %r30866, %r25089, %r25209, %r25081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30865, %r30866}; + // begin inline asm + // chi + lop3.b32 %r30891, %r25253, %r25237, %r25125, 0xD2; + lop3.b32 %r30892, %r25257, %r25241, %r25129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30891, %r30892}; + // begin inline asm + // chi + lop3.b32 %r30885, %r25237, %r25125, %r25133, 0xD2; + lop3.b32 %r30886, %r25241, %r25129, %r25137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30885, %r30886}; + // begin inline asm + // chi + lop3.b32 %r30879, %r25125, %r25133, %r25101, 0xD2; + lop3.b32 %r30880, %r25129, %r25137, %r25105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30879, %r30880}; + // begin inline asm + // chi + lop3.b32 %r30871, %r25133, %r25101, %r25253, 0xD2; + lop3.b32 %r30872, %r25137, %r25105, %r25257, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30871, %r30872}; + // begin inline asm + // chi + lop3.b32 %r30863, %r25101, %r25253, %r25237, 0xD2; + lop3.b32 %r30864, %r25105, %r25257, %r25241, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30863, %r30864}; + // begin inline asm + // chi + lop3.b32 %r30889, %r25157, %r25197, %r25229, 0xD2; + lop3.b32 %r30890, %r25161, %r25201, %r25233, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30889, %r30890}; + // begin inline asm + // chi + lop3.b32 %r30883, %r25197, %r25229, %r25221, 0xD2; + lop3.b32 %r30884, %r25201, %r25233, %r25225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30883, %r30884}; + // begin inline asm + // chi + lop3.b32 %r30877, %r25229, %r25221, %r25141, 0xD2; + lop3.b32 %r30878, %r25233, %r25225, %r25145, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30877, %r30878}; + // begin inline asm + // chi + lop3.b32 %r30869, %r25221, %r25141, %r25157, 0xD2; + lop3.b32 %r30870, %r25225, %r25145, %r25161, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30869, %r30870}; + // begin inline asm + // chi + lop3.b32 %r30861, %r25141, %r25157, %r25197, 0xD2; + lop3.b32 %r30862, %r25145, %r25161, %r25201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30861, %r30862}; + // begin inline asm + // chi + lop3.b32 %r30887, %r25109, %r25181, %r25093, 0xD2; + lop3.b32 %r30888, %r25113, %r25185, %r25097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30887, %r30888}; + // begin inline asm + // chi + lop3.b32 %r30881, %r25181, %r25093, %r25149, 0xD2; + lop3.b32 %r30882, %r25185, %r25097, %r25153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30881, %r30882}; + // begin inline asm + // chi + lop3.b32 %r30875, %r25093, %r25149, %r25173, 0xD2; + lop3.b32 %r30876, %r25097, %r25153, %r25177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30875, %r30876}; + // begin inline asm + // chi + lop3.b32 %r30867, %r25149, %r25173, %r25109, 0xD2; + lop3.b32 %r30868, %r25153, %r25177, %r25113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30867, %r30868}; + // begin inline asm + // chi + lop3.b32 %r30859, %r25173, %r25109, %r25181, 0xD2; + lop3.b32 %r30860, %r25177, %r25113, %r25185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30859, %r30860}; + mul.wide.s32 %rd1097, %r30909, 8; + add.s64 %rd1096, %rd1029, %rd1097; + // begin inline asm + ld.global.nc.v2.u32 {%r25461,%r25462}, [%rd1096]; + // end inline asm + xor.b32 %r30895, %r25261, %r25461; + xor.b32 %r30896, %r25262, %r25462; + add.s32 %r30909, %r30909, 1; + setp.lt.u32 %p50, %r30909, 23; + @%p50 bra $L__BB2_90; + + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + // begin inline asm + // xor5 + lop3.b32 %r25473, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r25473, %r25473, %r30889, %r30887, 0x96; + lop3.b32 %r25474, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r25474, %r25474, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25485, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r25485, %r25485, %r30883, %r30881, 0x96; + lop3.b32 %r25486, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r25486, %r25486, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25497, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r25497, %r25497, %r30877, %r30875, 0x96; + lop3.b32 %r25498, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r25498, %r25498, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25509, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25509, %r25509, %r30869, %r30867, 0x96; + lop3.b32 %r25510, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25510, %r25510, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25521, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25521, %r25521, %r30861, %r30859, 0x96; + lop3.b32 %r25522, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25522, %r25522, %r30862, %r30860, 0x96; + // end inline asm + mov.u32 %r25725, 1; + // begin inline asm + shf.l.wrap.b32 %r25533, %r25486, %r25485, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25537, %r25485, %r25486, %r25725; + // end inline asm + xor.b32 %r25752, %r25533, 
%r25521; + xor.b32 %r25753, %r25537, %r25522; + xor.b32 %r25680, %r30895, %r25752; + xor.b32 %r25683, %r30896, %r25753; + xor.b32 %r25643, %r30892, %r25753; + xor.b32 %r25642, %r30891, %r25752; + st.local.v2.u32 [%rd3+104], {%r25642, %r25643}; + // begin inline asm + shf.l.wrap.b32 %r25541, %r25498, %r25497, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25545, %r25497, %r25498, %r25725; + // end inline asm + xor.b32 %r25754, %r25541, %r25473; + xor.b32 %r25755, %r25545, %r25474; + xor.b32 %r25579, %r30905, %r25754; + xor.b32 %r25578, %r30906, %r25755; + xor.b32 %r25618, %r30884, %r25755; + xor.b32 %r25619, %r30883, %r25754; + st.local.v2.u32 [%rd3+152], {%r25619, %r25618}; + // begin inline asm + shf.l.wrap.b32 %r25549, %r25510, %r25509, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25553, %r25509, %r25510, %r25725; + // end inline asm + xor.b32 %r25756, %r25549, %r25485; + xor.b32 %r25757, %r25553, %r25486; + xor.b32 %r25602, %r30880, %r25757; + xor.b32 %r25603, %r30879, %r25756; + st.local.v2.u32 [%rd3+120], {%r25603, %r25602}; + xor.b32 %r25594, %r30876, %r25757; + xor.b32 %r25595, %r30875, %r25756; + st.local.v2.u32 [%rd3+200], {%r25595, %r25594}; + // begin inline asm + shf.l.wrap.b32 %r25557, %r25522, %r25521, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25561, %r25521, %r25522, %r25725; + // end inline asm + xor.b32 %r25758, %r25557, %r25497; + xor.b32 %r25759, %r25561, %r25498; + xor.b32 %r25626, %r30899, %r25758; + xor.b32 %r25627, %r30900, %r25759; + xor.b32 %r25635, %r30870, %r25759; + xor.b32 %r25634, %r30869, %r25758; + st.local.v2.u32 [%rd3+168], {%r25634, %r25635}; + // begin inline asm + shf.l.wrap.b32 %r25565, %r25474, %r25473, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25569, %r25473, %r25474, %r25725; + // end inline asm + xor.b32 %r25760, %r25565, %r25509; + xor.b32 %r25761, %r25569, %r25510; + xor.b32 %r25586, %r30865, %r25760; + xor.b32 %r25587, %r30866, %r25761; + xor.b32 %r25611, %r30860, %r25761; + xor.b32 %r25610, %r30859, %r25760; + st.local.v2.u32 [%rd3+216], {%r25610, %r25611}; + // begin inline asm + shf.l.wrap.b32 %r25573, %r25579, %r25578, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25577, %r25578, %r25579, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25581, %r25587, %r25586, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25585, %r25586, %r25587, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25593, %r25594, %r25595, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25589, %r25595, %r25594, %r25092; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r25589, %r25593}; + // begin inline asm + shf.l.wrap.b32 %r25597, %r25603, %r25602, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25601, %r25602, %r25603, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25605, %r25611, %r25610, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25609, %r25610, %r25611, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25617, %r25618, %r25619, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25613, %r25619, %r25618, %r25196; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r25613, %r25617}; + // begin inline asm + shf.l.wrap.b32 %r25621, %r25627, %r25626, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25625, %r25626, %r25627, %r25212; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r25629, %r25635, %r25634, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25633, %r25634, %r25635, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25637, %r25643, %r25642, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25641, %r25642, %r25643, %r25252; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25645, %r25680, %r25573, %r25597, 0xD2; + lop3.b32 %r25646, %r25683, %r25577, %r25601, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25653, %r25573, %r25597, %r25629, 0xD2; + lop3.b32 %r25654, %r25577, %r25601, %r25633, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r25653, %r25654}; + // begin inline asm + // chi + lop3.b32 %r25661, %r25597, %r25629, %r25605, 0xD2; + lop3.b32 %r25662, %r25601, %r25633, %r25609, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r25661, %r25662}; + // begin inline asm + // chi + lop3.b32 %r25669, %r25629, %r25605, %r25680, 0xD2; + lop3.b32 %r25670, %r25633, %r25609, %r25683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r25669, %r25670}; + // begin inline asm + // chi + lop3.b32 %r25677, %r25605, %r25680, %r25573, 0xD2; + lop3.b32 %r25678, %r25609, %r25683, %r25577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r25677, %r25678}; + // begin inline asm + // chi + lop3.b32 %r25685, %r25621, %r25581, %r25637, 0xD2; + lop3.b32 %r25686, %r25625, %r25585, %r25641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r25685, %r25686}; + // begin inline asm + // chi + lop3.b32 %r25693, %r25581, %r25637, %r25613, 0xD2; + lop3.b32 %r25694, %r25585, %r25641, %r25617, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r25693, %r25694}; + // begin inline asm + // chi + lop3.b32 %r25701, %r25637, %r25613, %r25589, 0xD2; + lop3.b32 %r25702, %r25641, %r25617, %r25593, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r25701, %r25702}; + // begin inline asm + ld.global.nc.v2.u32 {%r25709,%r25710}, [%rd1030]; + // end inline asm + xor.b32 %r25762, %r25646, %r25710; + xor.b32 %r25763, %r25645, %r25709; + mov.b64 %rd1269, {%r25763, %r25762}; + mov.b64 %rd1270, {%r25653, %r25654}; + mov.b64 %rd1271, {%r25661, %r25662}; + mov.b64 %rd1272, {%r25677, %r25678}; + mov.u32 %r30910, 0; + st.local.v2.u32 [%rd3+24], {%r25763, %r25762}; + st.local.v2.u32 [%rd272+96], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+104], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+112], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+120], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+128], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+136], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+144], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+152], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+160], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+168], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+176], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+184], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+192], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+200], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+208], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+216], {%r30910, %r30910}; + mov.u32 %r30925, -2147483648; + st.local.v2.u32 [%rd272+88], {%r25725, %r30925}; + mov.u32 %r30911, %r30910; + mov.u32 %r30912, %r30910; + mov.u32 %r30913, %r30910; + mov.u32 %r30914, %r30910; + mov.u32 %r30915, %r30910; + mov.u32 %r30916, %r30910; + mov.u32 %r30917, %r30910; + mov.u32 %r30918, %r30910; + mov.u32 %r30919, %r30910; 
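+ // NOTE (editor): the mov.u32 run around this point re-zeroes the 25-lane sponge state for the
+ // next permutation. The lane stored as {%r25725, %r30925} = {1, 0x80000000} looks like the
+ // pre-applied pad10*1 padding word 0x8000000000000001 for a fixed-length absorb (rate lane 8,
+ // consistent with keccak-512 over a 64-byte input). The $L__BB2_92 loop below runs rounds
+ // 0..22 (setp.lt.u32 ... 23); the 24th round is unrolled after the loop so its chi outputs
+ // can be stored directly.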
+ mov.u32 %r30920, %r30910; + mov.u32 %r30921, %r30910; + mov.u32 %r30922, %r30910; + mov.u32 %r30923, %r30910; + mov.u32 %r30924, %r25725; + mov.u32 %r30926, %r30910; + mov.u32 %r30927, %r30910; + mov.u32 %r30928, %r30910; + mov.u32 %r30929, %r30910; + mov.u32 %r30930, %r30910; + mov.u32 %r30931, %r30910; + mov.u32 %r30932, %r30910; + mov.u32 %r30933, %r30910; + mov.u32 %r30934, %r30910; + mov.u32 %r30935, %r30910; + mov.u32 %r30936, %r30910; + mov.u32 %r30937, %r30910; + mov.u32 %r30938, %r30910; + mov.u32 %r30939, %r30910; + mov.u32 %r30940, %r30910; + mov.u32 %r30941, %r30910; + mov.u32 %r30942, %r30910; + mov.u32 %r30943, %r30910; + mov.u32 %r30960, %r30910; + +$L__BB2_92: + // begin inline asm + // xor5 + lop3.b32 %r25764, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r25764, %r25764, %r30940, %r30938, 0x96; + lop3.b32 %r25765, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r25765, %r25765, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25776, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r25776, %r25776, %r30934, %r30932, 0x96; + lop3.b32 %r25777, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r25777, %r25777, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25788, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r25788, %r25788, %r30928, %r30926, 0x96; + lop3.b32 %r25789, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r25789, %r25789, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25800, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r25800, %r25800, %r30920, %r30918, 0x96; + lop3.b32 %r25801, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r25801, %r25801, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25812, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r25812, %r25812, %r30912, %r30910, 0x96; + lop3.b32 %r25813, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r25813, %r25813, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25824, %r25777, %r25776, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25828, %r25776, %r25777, %r25725; + // end inline asm + xor.b32 %r26258, %r25824, %r25812; + xor.b32 %r26259, %r25828, %r25813; + xor.b32 %r26091, %r30946, %r26258; + xor.b32 %r26094, %r30947, %r26259; + xor.b32 %r25998, %r30944, %r26258; + xor.b32 %r25997, %r30945, %r26259; + xor.b32 %r26045, %r30942, %r26258; + xor.b32 %r26046, %r30943, %r26259; + xor.b32 %r25950, %r30940, %r26258; + xor.b32 %r25949, %r30941, %r26259; + xor.b32 %r25901, %r30938, %r26258; + xor.b32 %r25902, %r30939, %r26259; + // begin inline asm + shf.l.wrap.b32 %r25832, %r25789, %r25788, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25836, %r25788, %r25789, %r25725; + // end inline asm + xor.b32 %r26260, %r25832, %r25764; + xor.b32 %r26261, %r25836, %r25765; + xor.b32 %r26053, %r30958, %r26260; + xor.b32 %r26054, %r30959, %r26261; + xor.b32 %r25870, %r30956, %r26260; + xor.b32 %r25869, %r30957, %r26261; + xor.b32 %r26029, %r30936, %r26260; + xor.b32 %r26030, %r30937, %r26261; + xor.b32 %r25990, %r30934, %r26260; + xor.b32 %r25989, %r30935, %r26261; + xor.b32 %r25973, %r30932, %r26260; + xor.b32 %r25974, %r30933, %r26261; + // begin inline asm + shf.l.wrap.b32 %r25840, %r25801, %r25800, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25844, %r25800, %r25801, %r25725; + // end inline asm + xor.b32 %r26262, %r25840, %r25776; + xor.b32 %r26263, %r25844, %r25777; + xor.b32 
%r25910, %r30954, %r26262; + xor.b32 %r25909, %r30955, %r26263; + xor.b32 %r26037, %r30952, %r26262; + xor.b32 %r26038, %r30953, %r26263; + xor.b32 %r25918, %r30930, %r26262; + xor.b32 %r25917, %r30931, %r26263; + xor.b32 %r26021, %r30928, %r26262; + xor.b32 %r26022, %r30929, %r26263; + xor.b32 %r25886, %r30926, %r26262; + xor.b32 %r25885, %r30927, %r26263; + // begin inline asm + shf.l.wrap.b32 %r25848, %r25813, %r25812, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25852, %r25812, %r25813, %r25725; + // end inline asm + xor.b32 %r26264, %r25848, %r25788; + xor.b32 %r26265, %r25852, %r25789; + xor.b32 %r26005, %r30950, %r26264; + xor.b32 %r26006, %r30951, %r26265; + xor.b32 %r25982, %r30924, %r26264; + xor.b32 %r25981, %r30925, %r26265; + xor.b32 %r25925, %r30922, %r26264; + xor.b32 %r25926, %r30923, %r26265; + xor.b32 %r26013, %r30920, %r26264; + xor.b32 %r26014, %r30921, %r26265; + xor.b32 %r25942, %r30918, %r26264; + xor.b32 %r25941, %r30919, %r26265; + // begin inline asm + shf.l.wrap.b32 %r25856, %r25765, %r25764, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25860, %r25764, %r25765, %r25725; + // end inline asm + xor.b32 %r26266, %r25856, %r25800; + xor.b32 %r26267, %r25860, %r25801; + xor.b32 %r25957, %r30948, %r26266; + xor.b32 %r25958, %r30949, %r26267; + xor.b32 %r25877, %r30916, %r26266; + xor.b32 %r25878, %r30917, %r26267; + xor.b32 %r25894, %r30914, %r26266; + xor.b32 %r25893, %r30915, %r26267; + xor.b32 %r25933, %r30912, %r26266; + xor.b32 %r25934, %r30913, %r26267; + xor.b32 %r25965, %r30910, %r26266; + xor.b32 %r25966, %r30911, %r26267; + mov.u32 %r25871, 44; + // begin inline asm + shf.l.wrap.b32 %r25864, %r25870, %r25869, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25868, %r25869, %r25870, %r25871; + // end inline asm + mov.u32 %r25879, 20; + // begin inline asm + shf.l.wrap.b32 %r25872, %r25878, %r25877, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25876, %r25877, %r25878, %r25879; + // end inline asm + mov.u32 %r25887, 61; + // begin inline asm + shf.l.wrap.b32 %r25880, %r25886, %r25885, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25884, %r25885, %r25886, %r25887; + // end inline asm + mov.u32 %r25895, 39; + // begin inline asm + shf.l.wrap.b32 %r25888, %r25894, %r25893, %r25895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25892, %r25893, %r25894, %r25895; + // end inline asm + mov.u32 %r25903, 18; + // begin inline asm + shf.l.wrap.b32 %r25896, %r25902, %r25901, %r25903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25900, %r25901, %r25902, %r25903; + // end inline asm + mov.u32 %r25911, 62; + // begin inline asm + shf.l.wrap.b32 %r25904, %r25910, %r25909, %r25911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25908, %r25909, %r25910, %r25911; + // end inline asm + mov.u32 %r25919, 43; + // begin inline asm + shf.l.wrap.b32 %r25912, %r25918, %r25917, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25916, %r25917, %r25918, %r25919; + // end inline asm + mov.u32 %r25927, 25; + // begin inline asm + shf.l.wrap.b32 %r25920, %r25926, %r25925, %r25927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25924, %r25925, %r25926, %r25927; + // end inline asm + mov.u32 %r25935, 8; + // begin inline asm + shf.l.wrap.b32 %r25928, %r25934, %r25933, %r25935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25932, %r25933, %r25934, %r25935; + // end inline asm + mov.u32 
%r25943, 56; + // begin inline asm + shf.l.wrap.b32 %r25936, %r25942, %r25941, %r25943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25940, %r25941, %r25942, %r25943; + // end inline asm + mov.u32 %r25951, 41; + // begin inline asm + shf.l.wrap.b32 %r25944, %r25950, %r25949, %r25951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25948, %r25949, %r25950, %r25951; + // end inline asm + mov.u32 %r25959, 27; + // begin inline asm + shf.l.wrap.b32 %r25952, %r25958, %r25957, %r25959; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25956, %r25957, %r25958, %r25959; + // end inline asm + mov.u32 %r25967, 14; + // begin inline asm + shf.l.wrap.b32 %r25960, %r25966, %r25965, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25964, %r25965, %r25966, %r25967; + // end inline asm + mov.u32 %r25975, 2; + // begin inline asm + shf.l.wrap.b32 %r25968, %r25974, %r25973, %r25975; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25972, %r25973, %r25974, %r25975; + // end inline asm + mov.u32 %r25983, 55; + // begin inline asm + shf.l.wrap.b32 %r25976, %r25982, %r25981, %r25983; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25980, %r25981, %r25982, %r25983; + // end inline asm + mov.u32 %r25991, 45; + // begin inline asm + shf.l.wrap.b32 %r25984, %r25990, %r25989, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25988, %r25989, %r25990, %r25991; + // end inline asm + mov.u32 %r25999, 36; + // begin inline asm + shf.l.wrap.b32 %r25992, %r25998, %r25997, %r25999; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25996, %r25997, %r25998, %r25999; + // end inline asm + mov.u32 %r26007, 28; + // begin inline asm + shf.l.wrap.b32 %r26000, %r26006, %r26005, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26004, %r26005, %r26006, %r26007; + // end inline asm + mov.u32 %r26015, 21; + // begin inline asm + shf.l.wrap.b32 %r26008, %r26014, %r26013, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26012, %r26013, %r26014, %r26015; + // end inline asm + mov.u32 %r26023, 15; + // begin inline asm + shf.l.wrap.b32 %r26016, %r26022, %r26021, %r26023; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26020, %r26021, %r26022, %r26023; + // end inline asm + mov.u32 %r26031, 10; + // begin inline asm + shf.l.wrap.b32 %r26024, %r26030, %r26029, %r26031; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26028, %r26029, %r26030, %r26031; + // end inline asm + mov.u32 %r26039, 6; + // begin inline asm + shf.l.wrap.b32 %r26032, %r26038, %r26037, %r26039; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26036, %r26037, %r26038, %r26039; + // end inline asm + mov.u32 %r26047, 3; + // begin inline asm + shf.l.wrap.b32 %r26040, %r26046, %r26045, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26044, %r26045, %r26046, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26048, %r26054, %r26053, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26052, %r26053, %r26054, %r25725; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26056, %r26091, %r25864, %r25912, 0xD2; + lop3.b32 %r26057, %r26094, %r25868, %r25916, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r25864, %r25912, %r26008, 0xD2; + lop3.b32 %r30959, %r25868, %r25916, %r26012, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30954, %r25912, %r26008, %r25960, 
0xD2; + lop3.b32 %r30955, %r25916, %r26012, %r25964, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30950, %r26008, %r25960, %r26091, 0xD2; + lop3.b32 %r30951, %r26012, %r25964, %r26094, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30948, %r25960, %r26091, %r25864, 0xD2; + lop3.b32 %r30949, %r25964, %r26094, %r25868, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30944, %r26000, %r25872, %r26040, 0xD2; + lop3.b32 %r30945, %r26004, %r25876, %r26044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30956, %r25872, %r26040, %r25984, 0xD2; + lop3.b32 %r30957, %r25876, %r26044, %r25988, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30952, %r26040, %r25984, %r25880, 0xD2; + lop3.b32 %r30953, %r26044, %r25988, %r25884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30924, %r25984, %r25880, %r26000, 0xD2; + lop3.b32 %r30925, %r25988, %r25884, %r26004, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30924, %r30925}; + // begin inline asm + // chi + lop3.b32 %r30916, %r25880, %r26000, %r25872, 0xD2; + lop3.b32 %r30917, %r25884, %r26004, %r25876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30916, %r30917}; + // begin inline asm + // chi + lop3.b32 %r30942, %r26048, %r26032, %r25920, 0xD2; + lop3.b32 %r30943, %r26052, %r26036, %r25924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30942, %r30943}; + // begin inline asm + // chi + lop3.b32 %r30936, %r26032, %r25920, %r25928, 0xD2; + lop3.b32 %r30937, %r26036, %r25924, %r25932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30936, %r30937}; + // begin inline asm + // chi + lop3.b32 %r30930, %r25920, %r25928, %r25896, 0xD2; + lop3.b32 %r30931, %r25924, %r25932, %r25900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30930, %r30931}; + // begin inline asm + // chi + lop3.b32 %r30922, %r25928, %r25896, %r26048, 0xD2; + lop3.b32 %r30923, %r25932, %r25900, %r26052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30922, %r30923}; + // begin inline asm + // chi + lop3.b32 %r30914, %r25896, %r26048, %r26032, 0xD2; + lop3.b32 %r30915, %r25900, %r26052, %r26036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30914, %r30915}; + // begin inline asm + // chi + lop3.b32 %r30940, %r25952, %r25992, %r26024, 0xD2; + lop3.b32 %r30941, %r25956, %r25996, %r26028, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30940, %r30941}; + // begin inline asm + // chi + lop3.b32 %r30934, %r25992, %r26024, %r26016, 0xD2; + lop3.b32 %r30935, %r25996, %r26028, %r26020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30934, %r30935}; + // begin inline asm + // chi + lop3.b32 %r30928, %r26024, %r26016, %r25936, 0xD2; + lop3.b32 %r30929, %r26028, %r26020, %r25940, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30928, %r30929}; + // begin inline asm + // chi + lop3.b32 %r30920, %r26016, %r25936, %r25952, 0xD2; + lop3.b32 %r30921, %r26020, %r25940, %r25956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30920, %r30921}; + // begin inline asm + // chi + lop3.b32 %r30912, %r25936, %r25952, %r25992, 0xD2; + lop3.b32 %r30913, %r25940, %r25956, %r25996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30912, %r30913}; + // begin inline asm + // chi + lop3.b32 %r30938, %r25904, %r25976, %r25888, 0xD2; + lop3.b32 %r30939, %r25908, %r25980, %r25892, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd272+184], {%r30938, %r30939}; + // begin inline asm + // chi + lop3.b32 %r30932, %r25976, %r25888, %r25944, 0xD2; + lop3.b32 %r30933, %r25980, %r25892, %r25948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30932, %r30933}; + // begin inline asm + // chi + lop3.b32 %r30926, %r25888, %r25944, %r25968, 0xD2; + lop3.b32 %r30927, %r25892, %r25948, %r25972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30926, %r30927}; + // begin inline asm + // chi + lop3.b32 %r30918, %r25944, %r25968, %r25904, 0xD2; + lop3.b32 %r30919, %r25948, %r25972, %r25908, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30918, %r30919}; + // begin inline asm + // chi + lop3.b32 %r30910, %r25968, %r25904, %r25976, 0xD2; + lop3.b32 %r30911, %r25972, %r25908, %r25980, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30910, %r30911}; + mul.wide.s32 %rd1108, %r30960, 8; + add.s64 %rd1107, %rd1029, %rd1108; + // begin inline asm + ld.global.nc.v2.u32 {%r26256,%r26257}, [%rd1107]; + // end inline asm + xor.b32 %r30946, %r26056, %r26256; + xor.b32 %r30947, %r26057, %r26257; + add.s32 %r30960, %r30960, 1; + setp.lt.u32 %p51, %r30960, 23; + @%p51 bra $L__BB2_92; + + mov.u32 %r26367, 1; + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + // begin inline asm + // xor5 + lop3.b32 %r26268, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r26268, %r26268, %r30940, %r30938, 0x96; + lop3.b32 %r26269, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r26269, %r26269, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26280, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r26280, %r26280, %r30934, %r30932, 0x96; + lop3.b32 %r26281, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r26281, %r26281, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26292, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r26292, %r26292, %r30928, %r30926, 0x96; + lop3.b32 %r26293, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r26293, %r26293, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26304, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r26304, %r26304, %r30920, %r30918, 0x96; + lop3.b32 %r26305, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r26305, %r26305, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26316, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r26316, %r26316, %r30912, %r30910, 0x96; + lop3.b32 %r26317, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r26317, %r26317, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26328, %r26281, %r26280, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26332, %r26280, %r26281, %r26367; + // end inline asm + xor.b32 %r26506, %r26328, %r26316; + xor.b32 %r26507, %r26332, %r26317; + xor.b32 %r26475, %r30946, %r26506; + xor.b32 %r26478, %r30947, %r26507; + xor.b32 %r26438, %r30943, %r26507; + xor.b32 %r26437, %r30942, %r26506; + st.local.v2.u32 [%rd272+104], {%r26437, %r26438}; + // begin inline asm + shf.l.wrap.b32 %r26336, %r26293, %r26292, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26340, %r26292, %r26293, %r26367; + // end inline asm 
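+ // NOTE (editor): final (24th) round of this permutation, unrolled. The lop3.b32 ... 0x96
+ // groups above compute the theta column parities C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^
+ // A[x,4] (0x96 is the three-input XOR truth table, 0xF0 ^ 0xCC ^ 0xAA), and each
+ // shf.l.wrap.b32-by-1 plus xor that follows forms D[x] = C[x-1] ^ rotl64(C[x+1], 1) on the
+ // 32-bit lane halves. The round then finishes with iota: an xor against the round constant
+ // loaded from [%rd1030].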
+ xor.b32 %r26508, %r26336, %r26268; + xor.b32 %r26509, %r26340, %r26269; + xor.b32 %r26374, %r30956, %r26508; + xor.b32 %r26373, %r30957, %r26509; + xor.b32 %r26413, %r30935, %r26509; + xor.b32 %r26414, %r30934, %r26508; + st.local.v2.u32 [%rd272+152], {%r26414, %r26413}; + // begin inline asm + shf.l.wrap.b32 %r26344, %r26305, %r26304, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26348, %r26304, %r26305, %r26367; + // end inline asm + xor.b32 %r26510, %r26344, %r26280; + xor.b32 %r26511, %r26348, %r26281; + xor.b32 %r26397, %r30931, %r26511; + xor.b32 %r26398, %r30930, %r26510; + st.local.v2.u32 [%rd272+120], {%r26398, %r26397}; + xor.b32 %r26389, %r30927, %r26511; + xor.b32 %r26390, %r30926, %r26510; + st.local.v2.u32 [%rd272+200], {%r26390, %r26389}; + // begin inline asm + shf.l.wrap.b32 %r26352, %r26317, %r26316, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26356, %r26316, %r26317, %r26367; + // end inline asm + xor.b32 %r26512, %r26352, %r26292; + xor.b32 %r26513, %r26356, %r26293; + xor.b32 %r26421, %r30950, %r26512; + xor.b32 %r26422, %r30951, %r26513; + xor.b32 %r26430, %r30921, %r26513; + xor.b32 %r26429, %r30920, %r26512; + st.local.v2.u32 [%rd272+168], {%r26429, %r26430}; + // begin inline asm + shf.l.wrap.b32 %r26360, %r26269, %r26268, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26364, %r26268, %r26269, %r26367; + // end inline asm + xor.b32 %r26514, %r26360, %r26304; + xor.b32 %r26515, %r26364, %r26305; + xor.b32 %r26381, %r30916, %r26514; + xor.b32 %r26382, %r30917, %r26515; + xor.b32 %r26406, %r30911, %r26515; + xor.b32 %r26405, %r30910, %r26514; + st.local.v2.u32 [%rd272+216], {%r26405, %r26406}; + // begin inline asm + shf.l.wrap.b32 %r26368, %r26374, %r26373, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26372, %r26373, %r26374, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26376, %r26382, %r26381, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26380, %r26381, %r26382, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26388, %r26389, %r26390, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26384, %r26390, %r26389, %r25887; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r26384, %r26388}; + // begin inline asm + shf.l.wrap.b32 %r26392, %r26398, %r26397, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26396, %r26397, %r26398, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26400, %r26406, %r26405, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26404, %r26405, %r26406, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26412, %r26413, %r26414, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26408, %r26414, %r26413, %r25991; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r26408, %r26412}; + // begin inline asm + shf.l.wrap.b32 %r26416, %r26422, %r26421, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26420, %r26421, %r26422, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26424, %r26430, %r26429, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26428, %r26429, %r26430, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26432, %r26438, %r26437, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26436, %r26437, %r26438, %r26047; + // end inline asm + // begin inline asm + 
// chi + lop3.b32 %r26440, %r26475, %r26368, %r26392, 0xD2; + lop3.b32 %r26441, %r26478, %r26372, %r26396, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26448, %r26368, %r26392, %r26424, 0xD2; + lop3.b32 %r26449, %r26372, %r26396, %r26428, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r26448, %r26449}; + // begin inline asm + // chi + lop3.b32 %r26456, %r26392, %r26424, %r26400, 0xD2; + lop3.b32 %r26457, %r26396, %r26428, %r26404, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r26456, %r26457}; + // begin inline asm + // chi + lop3.b32 %r26464, %r26424, %r26400, %r26475, 0xD2; + lop3.b32 %r26465, %r26428, %r26404, %r26478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r26464, %r26465}; + // begin inline asm + // chi + lop3.b32 %r26472, %r26400, %r26475, %r26368, 0xD2; + lop3.b32 %r26473, %r26404, %r26478, %r26372, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r26472, %r26473}; + // begin inline asm + // chi + lop3.b32 %r26480, %r26416, %r26376, %r26432, 0xD2; + lop3.b32 %r26481, %r26420, %r26380, %r26436, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r26480, %r26481}; + // begin inline asm + // chi + lop3.b32 %r26488, %r26376, %r26432, %r26408, 0xD2; + lop3.b32 %r26489, %r26380, %r26436, %r26412, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r26488, %r26489}; + // begin inline asm + // chi + lop3.b32 %r26496, %r26432, %r26408, %r26384, 0xD2; + lop3.b32 %r26497, %r26436, %r26412, %r26388, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r26496, %r26497}; + // begin inline asm + ld.global.nc.v2.u32 {%r26504,%r26505}, [%rd1030]; + // end inline asm + xor.b32 %r26516, %r26441, %r26505; + xor.b32 %r26517, %r26440, %r26504; + st.local.v2.u32 [%rd272+24], {%r26517, %r26516}; + bra.uni $L__BB2_94; + +$L__BB2_72: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd898, 1179641; + st.local.u64 [%rd3+8], %rd898; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd899, [%rd222]; + ld.global.u64 %rd900, [%rd222+8]; + ld.global.u64 %rd901, [%rd222+16]; + ld.global.u64 %rd902, [%rd222+24]; + ld.global.u64 %rd903, [%rd222+32]; + ld.global.u64 %rd904, [%rd222+40]; + ld.global.u64 %rd905, [%rd222+48]; + ld.global.u64 %rd906, [%rd222+56]; + st.local.u64 [%rd3+24], %rd899; + st.local.u64 [%rd3+32], %rd900; + st.local.u64 [%rd3+40], %rd901; + st.local.u64 [%rd3+48], %rd902; + st.local.u64 [%rd3+56], %rd903; + st.local.u64 [%rd3+64], %rd904; + st.local.u64 [%rd3+72], %rd905; + st.local.u64 [%rd3+80], %rd906; + cvt.u32.u64 %r19990, %rd899; + xor.b32 %r19991, %r3326, %r19990; + st.local.u32 [%rd3+24], %r19991; + mov.u32 %r30487, 0; + st.local.v2.u32 [%rd3+96], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+104], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+112], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+120], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+128], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+136], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+144], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+152], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+160], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+168], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+176], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+184], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+192], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+200], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+208], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+216], {%r30487, %r30487}; + mov.u32 %r30502, -2147483648; + mov.u32 %r19963, 1; + st.local.v2.u32 [%rd3+88], {%r19963, 
%r30502}; + ld.local.v2.u32 {%r30523, %r30524}, [%rd3+24]; + mov.b64 {%r30521, %r30522}, %rd904; + shr.u64 %rd907, %rd900, 32; + cvt.u32.u64 %r30535, %rd900; + cvt.u32.u64 %r30536, %rd907; + shr.u64 %rd908, %rd905, 32; + cvt.u32.u64 %r30533, %rd905; + cvt.u32.u64 %r30534, %rd908; + shr.u64 %rd909, %rd901, 32; + cvt.u32.u64 %r30531, %rd901; + cvt.u32.u64 %r30532, %rd909; + shr.u64 %rd910, %rd906, 32; + cvt.u32.u64 %r30529, %rd906; + cvt.u32.u64 %r30530, %rd910; + shr.u64 %rd911, %rd902, 32; + cvt.u32.u64 %r30527, %rd902; + cvt.u32.u64 %r30528, %rd911; + shr.u64 %rd912, %rd903, 32; + cvt.u32.u64 %r30525, %rd903; + cvt.u32.u64 %r30526, %rd912; + mov.u32 %r30488, %r30487; + mov.u32 %r30489, %r30487; + mov.u32 %r30490, %r30487; + mov.u32 %r30491, %r30487; + mov.u32 %r30492, %r30487; + mov.u32 %r30493, %r30487; + mov.u32 %r30494, %r30487; + mov.u32 %r30495, %r30487; + mov.u32 %r30496, %r30487; + mov.u32 %r30497, %r30487; + mov.u32 %r30498, %r30487; + mov.u32 %r30499, %r30487; + mov.u32 %r30500, %r30487; + mov.u32 %r30501, %r19963; + mov.u32 %r30503, %r30487; + mov.u32 %r30504, %r30487; + mov.u32 %r30505, %r30487; + mov.u32 %r30506, %r30487; + mov.u32 %r30507, %r30487; + mov.u32 %r30508, %r30487; + mov.u32 %r30509, %r30487; + mov.u32 %r30510, %r30487; + mov.u32 %r30511, %r30487; + mov.u32 %r30512, %r30487; + mov.u32 %r30513, %r30487; + mov.u32 %r30514, %r30487; + mov.u32 %r30515, %r30487; + mov.u32 %r30516, %r30487; + mov.u32 %r30517, %r30487; + mov.u32 %r30518, %r30487; + mov.u32 %r30519, %r30487; + mov.u32 %r30520, %r30487; + mov.u32 %r30537, %r30487; + +$L__BB2_73: + // begin inline asm + // xor5 + lop3.b32 %r19994, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r19994, %r19994, %r30517, %r30515, 0x96; + lop3.b32 %r19995, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r19995, %r19995, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20006, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20006, %r20006, %r30511, %r30509, 0x96; + lop3.b32 %r20007, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20007, %r20007, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20018, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20018, %r20018, %r30505, %r30503, 0x96; + lop3.b32 %r20019, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20019, %r20019, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20030, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20030, %r20030, %r30497, %r30495, 0x96; + lop3.b32 %r20031, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20031, %r20031, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20042, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20042, %r20042, %r30489, %r30487, 0x96; + lop3.b32 %r20043, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20043, %r20043, %r30490, %r30488, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20054, %r20007, %r20006, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20058, %r20006, %r20007, %r19963; + // end inline asm + xor.b32 %r20488, %r20054, %r20042; + xor.b32 %r20489, %r20058, %r20043; + xor.b32 %r20321, %r30523, %r20488; + xor.b32 %r20324, %r30524, %r20489; + xor.b32 %r20228, %r30521, %r20488; + xor.b32 %r20227, %r30522, %r20489; + xor.b32 %r20275, %r30519, %r20488; + xor.b32 %r20276, %r30520, %r20489; + xor.b32 %r20180, %r30517, %r20488; + xor.b32 %r20179, %r30518, %r20489; + xor.b32 %r20131, %r30515, %r20488; + xor.b32 %r20132, %r30516, %r20489; + // begin 
inline asm + shf.l.wrap.b32 %r20062, %r20019, %r20018, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20066, %r20018, %r20019, %r19963; + // end inline asm + xor.b32 %r20490, %r20062, %r19994; + xor.b32 %r20491, %r20066, %r19995; + xor.b32 %r20283, %r30535, %r20490; + xor.b32 %r20284, %r30536, %r20491; + xor.b32 %r20100, %r30533, %r20490; + xor.b32 %r20099, %r30534, %r20491; + xor.b32 %r20259, %r30513, %r20490; + xor.b32 %r20260, %r30514, %r20491; + xor.b32 %r20220, %r30511, %r20490; + xor.b32 %r20219, %r30512, %r20491; + xor.b32 %r20203, %r30509, %r20490; + xor.b32 %r20204, %r30510, %r20491; + // begin inline asm + shf.l.wrap.b32 %r20070, %r20031, %r20030, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20074, %r20030, %r20031, %r19963; + // end inline asm + xor.b32 %r20492, %r20070, %r20006; + xor.b32 %r20493, %r20074, %r20007; + xor.b32 %r20140, %r30531, %r20492; + xor.b32 %r20139, %r30532, %r20493; + xor.b32 %r20267, %r30529, %r20492; + xor.b32 %r20268, %r30530, %r20493; + xor.b32 %r20148, %r30507, %r20492; + xor.b32 %r20147, %r30508, %r20493; + xor.b32 %r20251, %r30505, %r20492; + xor.b32 %r20252, %r30506, %r20493; + xor.b32 %r20116, %r30503, %r20492; + xor.b32 %r20115, %r30504, %r20493; + // begin inline asm + shf.l.wrap.b32 %r20078, %r20043, %r20042, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20082, %r20042, %r20043, %r19963; + // end inline asm + xor.b32 %r20494, %r20078, %r20018; + xor.b32 %r20495, %r20082, %r20019; + xor.b32 %r20235, %r30527, %r20494; + xor.b32 %r20236, %r30528, %r20495; + xor.b32 %r20212, %r30501, %r20494; + xor.b32 %r20211, %r30502, %r20495; + xor.b32 %r20155, %r30499, %r20494; + xor.b32 %r20156, %r30500, %r20495; + xor.b32 %r20243, %r30497, %r20494; + xor.b32 %r20244, %r30498, %r20495; + xor.b32 %r20172, %r30495, %r20494; + xor.b32 %r20171, %r30496, %r20495; + // begin inline asm + shf.l.wrap.b32 %r20086, %r19995, %r19994, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20090, %r19994, %r19995, %r19963; + // end inline asm + xor.b32 %r20496, %r20086, %r20030; + xor.b32 %r20497, %r20090, %r20031; + xor.b32 %r20187, %r30525, %r20496; + xor.b32 %r20188, %r30526, %r20497; + xor.b32 %r20107, %r30493, %r20496; + xor.b32 %r20108, %r30494, %r20497; + xor.b32 %r20124, %r30491, %r20496; + xor.b32 %r20123, %r30492, %r20497; + xor.b32 %r20163, %r30489, %r20496; + xor.b32 %r20164, %r30490, %r20497; + xor.b32 %r20195, %r30487, %r20496; + xor.b32 %r20196, %r30488, %r20497; + mov.u32 %r20101, 44; + // begin inline asm + shf.l.wrap.b32 %r20094, %r20100, %r20099, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20098, %r20099, %r20100, %r20101; + // end inline asm + mov.u32 %r20109, 20; + // begin inline asm + shf.l.wrap.b32 %r20102, %r20108, %r20107, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20106, %r20107, %r20108, %r20109; + // end inline asm + mov.u32 %r20117, 61; + // begin inline asm + shf.l.wrap.b32 %r20110, %r20116, %r20115, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20114, %r20115, %r20116, %r20117; + // end inline asm + mov.u32 %r20125, 39; + // begin inline asm + shf.l.wrap.b32 %r20118, %r20124, %r20123, %r20125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20122, %r20123, %r20124, %r20125; + // end inline asm + mov.u32 %r20133, 18; + // begin inline asm + shf.l.wrap.b32 %r20126, %r20132, %r20131, %r20133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20130, %r20131, 
%r20132, %r20133; + // end inline asm + mov.u32 %r20141, 62; + // begin inline asm + shf.l.wrap.b32 %r20134, %r20140, %r20139, %r20141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20138, %r20139, %r20140, %r20141; + // end inline asm + mov.u32 %r20149, 43; + // begin inline asm + shf.l.wrap.b32 %r20142, %r20148, %r20147, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20146, %r20147, %r20148, %r20149; + // end inline asm + mov.u32 %r20157, 25; + // begin inline asm + shf.l.wrap.b32 %r20150, %r20156, %r20155, %r20157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20154, %r20155, %r20156, %r20157; + // end inline asm + mov.u32 %r20165, 8; + // begin inline asm + shf.l.wrap.b32 %r20158, %r20164, %r20163, %r20165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20162, %r20163, %r20164, %r20165; + // end inline asm + mov.u32 %r20173, 56; + // begin inline asm + shf.l.wrap.b32 %r20166, %r20172, %r20171, %r20173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20170, %r20171, %r20172, %r20173; + // end inline asm + mov.u32 %r20181, 41; + // begin inline asm + shf.l.wrap.b32 %r20174, %r20180, %r20179, %r20181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20178, %r20179, %r20180, %r20181; + // end inline asm + mov.u32 %r20189, 27; + // begin inline asm + shf.l.wrap.b32 %r20182, %r20188, %r20187, %r20189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20186, %r20187, %r20188, %r20189; + // end inline asm + mov.u32 %r20197, 14; + // begin inline asm + shf.l.wrap.b32 %r20190, %r20196, %r20195, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20194, %r20195, %r20196, %r20197; + // end inline asm + mov.u32 %r20205, 2; + // begin inline asm + shf.l.wrap.b32 %r20198, %r20204, %r20203, %r20205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20202, %r20203, %r20204, %r20205; + // end inline asm + mov.u32 %r20213, 55; + // begin inline asm + shf.l.wrap.b32 %r20206, %r20212, %r20211, %r20213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20210, %r20211, %r20212, %r20213; + // end inline asm + mov.u32 %r20221, 45; + // begin inline asm + shf.l.wrap.b32 %r20214, %r20220, %r20219, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20218, %r20219, %r20220, %r20221; + // end inline asm + mov.u32 %r20229, 36; + // begin inline asm + shf.l.wrap.b32 %r20222, %r20228, %r20227, %r20229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20226, %r20227, %r20228, %r20229; + // end inline asm + mov.u32 %r20237, 28; + // begin inline asm + shf.l.wrap.b32 %r20230, %r20236, %r20235, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20234, %r20235, %r20236, %r20237; + // end inline asm + mov.u32 %r20245, 21; + // begin inline asm + shf.l.wrap.b32 %r20238, %r20244, %r20243, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20242, %r20243, %r20244, %r20245; + // end inline asm + mov.u32 %r20253, 15; + // begin inline asm + shf.l.wrap.b32 %r20246, %r20252, %r20251, %r20253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20250, %r20251, %r20252, %r20253; + // end inline asm + mov.u32 %r20261, 10; + // begin inline asm + shf.l.wrap.b32 %r20254, %r20260, %r20259, %r20261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20258, %r20259, %r20260, %r20261; + // end inline asm + mov.u32 %r20269, 6; + // begin inline asm + shf.l.wrap.b32 %r20262, %r20268, %r20267, %r20269; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r20266, %r20267, %r20268, %r20269; + // end inline asm + mov.u32 %r20277, 3; + // begin inline asm + shf.l.wrap.b32 %r20270, %r20276, %r20275, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20274, %r20275, %r20276, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20278, %r20284, %r20283, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20282, %r20283, %r20284, %r19963; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20286, %r20321, %r20094, %r20142, 0xD2; + lop3.b32 %r20287, %r20324, %r20098, %r20146, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30535, %r20094, %r20142, %r20238, 0xD2; + lop3.b32 %r30536, %r20098, %r20146, %r20242, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30531, %r20142, %r20238, %r20190, 0xD2; + lop3.b32 %r30532, %r20146, %r20242, %r20194, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30527, %r20238, %r20190, %r20321, 0xD2; + lop3.b32 %r30528, %r20242, %r20194, %r20324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30525, %r20190, %r20321, %r20094, 0xD2; + lop3.b32 %r30526, %r20194, %r20324, %r20098, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30521, %r20230, %r20102, %r20270, 0xD2; + lop3.b32 %r30522, %r20234, %r20106, %r20274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30533, %r20102, %r20270, %r20214, 0xD2; + lop3.b32 %r30534, %r20106, %r20274, %r20218, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30529, %r20270, %r20214, %r20110, 0xD2; + lop3.b32 %r30530, %r20274, %r20218, %r20114, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30501, %r20214, %r20110, %r20230, 0xD2; + lop3.b32 %r30502, %r20218, %r20114, %r20234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30501, %r30502}; + // begin inline asm + // chi + lop3.b32 %r30493, %r20110, %r20230, %r20102, 0xD2; + lop3.b32 %r30494, %r20114, %r20234, %r20106, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30493, %r30494}; + // begin inline asm + // chi + lop3.b32 %r30519, %r20278, %r20262, %r20150, 0xD2; + lop3.b32 %r30520, %r20282, %r20266, %r20154, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30519, %r30520}; + // begin inline asm + // chi + lop3.b32 %r30513, %r20262, %r20150, %r20158, 0xD2; + lop3.b32 %r30514, %r20266, %r20154, %r20162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30513, %r30514}; + // begin inline asm + // chi + lop3.b32 %r30507, %r20150, %r20158, %r20126, 0xD2; + lop3.b32 %r30508, %r20154, %r20162, %r20130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30507, %r30508}; + // begin inline asm + // chi + lop3.b32 %r30499, %r20158, %r20126, %r20278, 0xD2; + lop3.b32 %r30500, %r20162, %r20130, %r20282, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30499, %r30500}; + // begin inline asm + // chi + lop3.b32 %r30491, %r20126, %r20278, %r20262, 0xD2; + lop3.b32 %r30492, %r20130, %r20282, %r20266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30491, %r30492}; + // begin inline asm + // chi + lop3.b32 %r30517, %r20182, %r20222, %r20254, 0xD2; + lop3.b32 %r30518, %r20186, %r20226, %r20258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30517, %r30518}; + // begin inline asm + // chi + lop3.b32 %r30511, %r20222, %r20254, %r20246, 0xD2; + lop3.b32 %r30512, %r20226, %r20258, %r20250, 0xD2; 
+ // end inline asm + st.local.v2.u32 [%rd3+152], {%r30511, %r30512}; + // begin inline asm + // chi + lop3.b32 %r30505, %r20254, %r20246, %r20166, 0xD2; + lop3.b32 %r30506, %r20258, %r20250, %r20170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30505, %r30506}; + // begin inline asm + // chi + lop3.b32 %r30497, %r20246, %r20166, %r20182, 0xD2; + lop3.b32 %r30498, %r20250, %r20170, %r20186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30497, %r30498}; + // begin inline asm + // chi + lop3.b32 %r30489, %r20166, %r20182, %r20222, 0xD2; + lop3.b32 %r30490, %r20170, %r20186, %r20226, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30489, %r30490}; + // begin inline asm + // chi + lop3.b32 %r30515, %r20134, %r20206, %r20118, 0xD2; + lop3.b32 %r30516, %r20138, %r20210, %r20122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30515, %r30516}; + // begin inline asm + // chi + lop3.b32 %r30509, %r20206, %r20118, %r20174, 0xD2; + lop3.b32 %r30510, %r20210, %r20122, %r20178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30509, %r30510}; + // begin inline asm + // chi + lop3.b32 %r30503, %r20118, %r20174, %r20198, 0xD2; + lop3.b32 %r30504, %r20122, %r20178, %r20202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30503, %r30504}; + // begin inline asm + // chi + lop3.b32 %r30495, %r20174, %r20198, %r20134, 0xD2; + lop3.b32 %r30496, %r20178, %r20202, %r20138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30495, %r30496}; + // begin inline asm + // chi + lop3.b32 %r30487, %r20198, %r20134, %r20206, 0xD2; + lop3.b32 %r30488, %r20202, %r20138, %r20210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30487, %r30488}; + mul.wide.s32 %rd916, %r30537, 8; + mov.u64 %rd917, keccak_round_constants; + cvta.const.u64 %rd918, %rd917; + add.s64 %rd913, %rd918, %rd916; + // begin inline asm + ld.global.nc.v2.u32 {%r20486,%r20487}, [%rd913]; + // end inline asm + xor.b32 %r30523, %r20286, %r20486; + xor.b32 %r30524, %r20287, %r20487; + add.s32 %r30537, %r30537, 1; + setp.lt.u32 %p42, %r30537, 23; + @%p42 bra $L__BB2_73; + + st.local.v2.u32 [%rd3+32], {%r30535, %r30536}; + st.local.v2.u32 [%rd3+72], {%r30533, %r30534}; + st.local.v2.u32 [%rd3+40], {%r30531, %r30532}; + st.local.v2.u32 [%rd3+80], {%r30529, %r30530}; + st.local.v2.u32 [%rd3+48], {%r30527, %r30528}; + st.local.v2.u32 [%rd3+56], {%r30525, %r30526}; + st.local.v2.u32 [%rd3+24], {%r30523, %r30524}; + // begin inline asm + // xor5 + lop3.b32 %r20498, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r20498, %r20498, %r30517, %r30515, 0x96; + lop3.b32 %r20499, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r20499, %r20499, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20510, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20510, %r20510, %r30511, %r30509, 0x96; + lop3.b32 %r20511, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20511, %r20511, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20522, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20522, %r20522, %r30505, %r30503, 0x96; + lop3.b32 %r20523, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20523, %r20523, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20534, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20534, %r20534, %r30497, %r30495, 0x96; + lop3.b32 %r20535, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20535, %r20535, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // 
xor5 + lop3.b32 %r20546, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20546, %r20546, %r30489, %r30487, 0x96; + lop3.b32 %r20547, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20547, %r20547, %r30490, %r30488, 0x96; + // end inline asm + mov.u32 %r20750, 1; + // begin inline asm + shf.l.wrap.b32 %r20558, %r20511, %r20510, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20562, %r20510, %r20511, %r20750; + // end inline asm + xor.b32 %r20777, %r20558, %r20546; + xor.b32 %r20778, %r20562, %r20547; + xor.b32 %r20705, %r30523, %r20777; + xor.b32 %r20708, %r30524, %r20778; + xor.b32 %r20668, %r30520, %r20778; + xor.b32 %r20667, %r30519, %r20777; + st.local.v2.u32 [%rd3+104], {%r20667, %r20668}; + // begin inline asm + shf.l.wrap.b32 %r20566, %r20523, %r20522, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20570, %r20522, %r20523, %r20750; + // end inline asm + xor.b32 %r20779, %r20566, %r20498; + xor.b32 %r20780, %r20570, %r20499; + xor.b32 %r20604, %r30533, %r20779; + xor.b32 %r20603, %r30534, %r20780; + xor.b32 %r20643, %r30512, %r20780; + xor.b32 %r20644, %r30511, %r20779; + st.local.v2.u32 [%rd3+152], {%r20644, %r20643}; + // begin inline asm + shf.l.wrap.b32 %r20574, %r20535, %r20534, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20578, %r20534, %r20535, %r20750; + // end inline asm + xor.b32 %r20781, %r20574, %r20510; + xor.b32 %r20782, %r20578, %r20511; + xor.b32 %r20627, %r30508, %r20782; + xor.b32 %r20628, %r30507, %r20781; + st.local.v2.u32 [%rd3+120], {%r20628, %r20627}; + xor.b32 %r20619, %r30504, %r20782; + xor.b32 %r20620, %r30503, %r20781; + st.local.v2.u32 [%rd3+200], {%r20620, %r20619}; + // begin inline asm + shf.l.wrap.b32 %r20582, %r20547, %r20546, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20586, %r20546, %r20547, %r20750; + // end inline asm + xor.b32 %r20783, %r20582, %r20522; + xor.b32 %r20784, %r20586, %r20523; + xor.b32 %r20651, %r30527, %r20783; + xor.b32 %r20652, %r30528, %r20784; + xor.b32 %r20660, %r30498, %r20784; + xor.b32 %r20659, %r30497, %r20783; + st.local.v2.u32 [%rd3+168], {%r20659, %r20660}; + // begin inline asm + shf.l.wrap.b32 %r20590, %r20499, %r20498, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20594, %r20498, %r20499, %r20750; + // end inline asm + xor.b32 %r20785, %r20590, %r20534; + xor.b32 %r20786, %r20594, %r20535; + xor.b32 %r20611, %r30493, %r20785; + xor.b32 %r20612, %r30494, %r20786; + xor.b32 %r20636, %r30488, %r20786; + xor.b32 %r20635, %r30487, %r20785; + st.local.v2.u32 [%rd3+216], {%r20635, %r20636}; + // begin inline asm + shf.l.wrap.b32 %r20598, %r20604, %r20603, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20602, %r20603, %r20604, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20606, %r20612, %r20611, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20610, %r20611, %r20612, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20618, %r20619, %r20620, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20614, %r20620, %r20619, %r20117; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r20614, %r20618}; + // begin inline asm + shf.l.wrap.b32 %r20622, %r20628, %r20627, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20626, %r20627, %r20628, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20630, %r20636, %r20635, %r20197; + // end inline asm + // begin inline asm + 
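+ // theta step: the chained lop3.b32 ... 0x96 ops (three-input XOR) fold each
+ // five-lane column into a parity word C[x]; the shf.l.wrap.b32 pairs with
+ // shift count 1 (%r20750) build a 64-bit rotate-left-by-1 out of two 32-bit
+ // funnel shifts, giving D[x] = C[x-1] ^ rotl64(C[x+1], 1). The earlier rounds
+ // ran in the $L__BB2_73 loop above (while the round counter is < 23); this
+ // tail is the unrolled final round, whose iota constant is read from offset
+ // 184 (index 23) of keccak_round_constants.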
shf.l.wrap.b32 %r20634, %r20635, %r20636, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20642, %r20643, %r20644, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20638, %r20644, %r20643, %r20221; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r20638, %r20642}; + // begin inline asm + shf.l.wrap.b32 %r20646, %r20652, %r20651, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20650, %r20651, %r20652, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20654, %r20660, %r20659, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20658, %r20659, %r20660, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20662, %r20668, %r20667, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20666, %r20667, %r20668, %r20277; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20670, %r20705, %r20598, %r20622, 0xD2; + lop3.b32 %r20671, %r20708, %r20602, %r20626, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r20598, %r20622, %r20654, 0xD2; + lop3.b32 %r30671, %r20602, %r20626, %r20658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30666, %r20622, %r20654, %r20630, 0xD2; + lop3.b32 %r30667, %r20626, %r20658, %r20634, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + // begin inline asm + // chi + lop3.b32 %r30662, %r20654, %r20630, %r20705, 0xD2; + lop3.b32 %r30663, %r20658, %r20634, %r20708, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + // begin inline asm + // chi + lop3.b32 %r30660, %r20630, %r20705, %r20598, 0xD2; + lop3.b32 %r30661, %r20634, %r20708, %r20602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + // begin inline asm + // chi + lop3.b32 %r30656, %r20646, %r20606, %r20662, 0xD2; + lop3.b32 %r30657, %r20650, %r20610, %r20666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + // begin inline asm + // chi + lop3.b32 %r30668, %r20606, %r20662, %r20638, 0xD2; + lop3.b32 %r30669, %r20610, %r20666, %r20642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30664, %r20662, %r20638, %r20614, 0xD2; + lop3.b32 %r30665, %r20666, %r20642, %r20618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + add.s64 %rd919, %rd918, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20734,%r20735}, [%rd919]; + // end inline asm + xor.b32 %r30658, %r20670, %r20734; + xor.b32 %r30659, %r20671, %r20735; + add.u64 %rd925, %SPL, 1912; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.u64 [%rd925], %rd354; + mov.u64 %rd926, 1179641; + st.local.u64 [%rd925+8], %rd926; + add.s32 %r20787, %r3326, 1; + st.local.u32 [%rd925+16], %r20787; + ld.global.u64 %rd927, [%rd223]; + ld.global.u64 %rd928, [%rd223+8]; + ld.global.u64 %rd929, [%rd223+16]; + ld.global.u64 %rd930, [%rd223+24]; + ld.global.u64 %rd931, [%rd223+32]; + ld.global.u64 %rd932, [%rd223+40]; + ld.global.u64 %rd933, [%rd223+48]; + ld.global.u64 %rd934, [%rd223+56]; + st.local.u64 [%rd925+32], %rd928; + st.local.u64 [%rd925+40], %rd929; + st.local.u64 [%rd925+48], %rd930; + st.local.u64 [%rd925+56], %rd931; + st.local.u64 [%rd925+64], %rd932; + st.local.u64 [%rd925+72], %rd933; + st.local.u64 [%rd925+80], %rd934; + cvt.u32.u64 %r20788, %rd927; + xor.b32 %r20789, %r20787, %r20788; + st.local.u64 
[%rd925+24], %rd927; + st.local.u32 [%rd925+24], %r20789; + mov.u32 %r30538, 0; + st.local.v2.u32 [%rd925+96], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+104], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+112], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+120], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+128], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+136], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+144], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+152], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+160], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+168], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+176], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+184], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+192], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+200], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+208], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+216], {%r30538, %r30538}; + mov.u32 %r30553, -2147483648; + st.local.v2.u32 [%rd925+88], {%r20750, %r30553}; + ld.local.v2.u32 {%r30574, %r30575}, [%rd925+24]; + mov.b64 {%r30572, %r30573}, %rd932; + shr.u64 %rd935, %rd928, 32; + cvt.u32.u64 %r30586, %rd928; + cvt.u32.u64 %r30587, %rd935; + shr.u64 %rd936, %rd933, 32; + cvt.u32.u64 %r30584, %rd933; + cvt.u32.u64 %r30585, %rd936; + shr.u64 %rd937, %rd929, 32; + cvt.u32.u64 %r30582, %rd929; + cvt.u32.u64 %r30583, %rd937; + shr.u64 %rd938, %rd934, 32; + cvt.u32.u64 %r30580, %rd934; + cvt.u32.u64 %r30581, %rd938; + shr.u64 %rd939, %rd930, 32; + cvt.u32.u64 %r30578, %rd930; + cvt.u32.u64 %r30579, %rd939; + shr.u64 %rd940, %rd931, 32; + cvt.u32.u64 %r30576, %rd931; + cvt.u32.u64 %r30577, %rd940; + mov.u32 %r30539, %r30538; + mov.u32 %r30540, %r30538; + mov.u32 %r30541, %r30538; + mov.u32 %r30542, %r30538; + mov.u32 %r30543, %r30538; + mov.u32 %r30544, %r30538; + mov.u32 %r30545, %r30538; + mov.u32 %r30546, %r30538; + mov.u32 %r30547, %r30538; + mov.u32 %r30548, %r30538; + mov.u32 %r30549, %r30538; + mov.u32 %r30550, %r30538; + mov.u32 %r30551, %r30538; + mov.u32 %r30552, %r20750; + mov.u32 %r30554, %r30538; + mov.u32 %r30555, %r30538; + mov.u32 %r30556, %r30538; + mov.u32 %r30557, %r30538; + mov.u32 %r30558, %r30538; + mov.u32 %r30559, %r30538; + mov.u32 %r30560, %r30538; + mov.u32 %r30561, %r30538; + mov.u32 %r30562, %r30538; + mov.u32 %r30563, %r30538; + mov.u32 %r30564, %r30538; + mov.u32 %r30565, %r30538; + mov.u32 %r30566, %r30538; + mov.u32 %r30567, %r30538; + mov.u32 %r30568, %r30538; + mov.u32 %r30569, %r30538; + mov.u32 %r30570, %r30538; + mov.u32 %r30571, %r30538; + mov.u32 %r30588, %r30538; + +$L__BB2_75: + // begin inline asm + // xor5 + lop3.b32 %r20792, %r30574, %r30572, %r30570, 0x96; + lop3.b32 %r20792, %r20792, %r30568, %r30566, 0x96; + lop3.b32 %r20793, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r20793, %r20793, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20804, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r20804, %r20804, %r30562, %r30560, 0x96; + lop3.b32 %r20805, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r20805, %r20805, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20816, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r20816, %r20816, %r30556, %r30554, 0x96; + lop3.b32 %r20817, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r20817, %r20817, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20828, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r20828, %r20828, %r30548, %r30546, 0x96; + lop3.b32 %r20829, %r30579, %r30553, %r30551, 
0x96; + lop3.b32 %r20829, %r20829, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20840, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r20840, %r20840, %r30540, %r30538, 0x96; + lop3.b32 %r20841, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r20841, %r20841, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20852, %r20805, %r20804, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20856, %r20804, %r20805, %r20750; + // end inline asm + xor.b32 %r21286, %r20852, %r20840; + xor.b32 %r21287, %r20856, %r20841; + xor.b32 %r21119, %r30574, %r21286; + xor.b32 %r21122, %r30575, %r21287; + xor.b32 %r21026, %r30572, %r21286; + xor.b32 %r21025, %r30573, %r21287; + xor.b32 %r21073, %r30570, %r21286; + xor.b32 %r21074, %r30571, %r21287; + xor.b32 %r20978, %r30568, %r21286; + xor.b32 %r20977, %r30569, %r21287; + xor.b32 %r20929, %r30566, %r21286; + xor.b32 %r20930, %r30567, %r21287; + // begin inline asm + shf.l.wrap.b32 %r20860, %r20817, %r20816, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20864, %r20816, %r20817, %r20750; + // end inline asm + xor.b32 %r21288, %r20860, %r20792; + xor.b32 %r21289, %r20864, %r20793; + xor.b32 %r21081, %r30586, %r21288; + xor.b32 %r21082, %r30587, %r21289; + xor.b32 %r20898, %r30584, %r21288; + xor.b32 %r20897, %r30585, %r21289; + xor.b32 %r21057, %r30564, %r21288; + xor.b32 %r21058, %r30565, %r21289; + xor.b32 %r21018, %r30562, %r21288; + xor.b32 %r21017, %r30563, %r21289; + xor.b32 %r21001, %r30560, %r21288; + xor.b32 %r21002, %r30561, %r21289; + // begin inline asm + shf.l.wrap.b32 %r20868, %r20829, %r20828, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20872, %r20828, %r20829, %r20750; + // end inline asm + xor.b32 %r21290, %r20868, %r20804; + xor.b32 %r21291, %r20872, %r20805; + xor.b32 %r20938, %r30582, %r21290; + xor.b32 %r20937, %r30583, %r21291; + xor.b32 %r21065, %r30580, %r21290; + xor.b32 %r21066, %r30581, %r21291; + xor.b32 %r20946, %r30558, %r21290; + xor.b32 %r20945, %r30559, %r21291; + xor.b32 %r21049, %r30556, %r21290; + xor.b32 %r21050, %r30557, %r21291; + xor.b32 %r20914, %r30554, %r21290; + xor.b32 %r20913, %r30555, %r21291; + // begin inline asm + shf.l.wrap.b32 %r20876, %r20841, %r20840, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20880, %r20840, %r20841, %r20750; + // end inline asm + xor.b32 %r21292, %r20876, %r20816; + xor.b32 %r21293, %r20880, %r20817; + xor.b32 %r21033, %r30578, %r21292; + xor.b32 %r21034, %r30579, %r21293; + xor.b32 %r21010, %r30552, %r21292; + xor.b32 %r21009, %r30553, %r21293; + xor.b32 %r20953, %r30550, %r21292; + xor.b32 %r20954, %r30551, %r21293; + xor.b32 %r21041, %r30548, %r21292; + xor.b32 %r21042, %r30549, %r21293; + xor.b32 %r20970, %r30546, %r21292; + xor.b32 %r20969, %r30547, %r21293; + // begin inline asm + shf.l.wrap.b32 %r20884, %r20793, %r20792, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20888, %r20792, %r20793, %r20750; + // end inline asm + xor.b32 %r21294, %r20884, %r20828; + xor.b32 %r21295, %r20888, %r20829; + xor.b32 %r20985, %r30576, %r21294; + xor.b32 %r20986, %r30577, %r21295; + xor.b32 %r20905, %r30544, %r21294; + xor.b32 %r20906, %r30545, %r21295; + xor.b32 %r20922, %r30542, %r21294; + xor.b32 %r20921, %r30543, %r21295; + xor.b32 %r20961, %r30540, %r21294; + xor.b32 %r20962, %r30541, %r21295; + xor.b32 %r20993, %r30538, %r21294; + xor.b32 %r20994, %r30539, %r21295; + mov.u32 %r20899, 44; + // begin inline 
asm + shf.l.wrap.b32 %r20892, %r20898, %r20897, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20896, %r20897, %r20898, %r20899; + // end inline asm + mov.u32 %r20907, 20; + // begin inline asm + shf.l.wrap.b32 %r20900, %r20906, %r20905, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20904, %r20905, %r20906, %r20907; + // end inline asm + mov.u32 %r20915, 61; + // begin inline asm + shf.l.wrap.b32 %r20908, %r20914, %r20913, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20912, %r20913, %r20914, %r20915; + // end inline asm + mov.u32 %r20923, 39; + // begin inline asm + shf.l.wrap.b32 %r20916, %r20922, %r20921, %r20923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20920, %r20921, %r20922, %r20923; + // end inline asm + mov.u32 %r20931, 18; + // begin inline asm + shf.l.wrap.b32 %r20924, %r20930, %r20929, %r20931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20928, %r20929, %r20930, %r20931; + // end inline asm + mov.u32 %r20939, 62; + // begin inline asm + shf.l.wrap.b32 %r20932, %r20938, %r20937, %r20939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20936, %r20937, %r20938, %r20939; + // end inline asm + mov.u32 %r20947, 43; + // begin inline asm + shf.l.wrap.b32 %r20940, %r20946, %r20945, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20944, %r20945, %r20946, %r20947; + // end inline asm + mov.u32 %r20955, 25; + // begin inline asm + shf.l.wrap.b32 %r20948, %r20954, %r20953, %r20955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20952, %r20953, %r20954, %r20955; + // end inline asm + mov.u32 %r20963, 8; + // begin inline asm + shf.l.wrap.b32 %r20956, %r20962, %r20961, %r20963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20960, %r20961, %r20962, %r20963; + // end inline asm + mov.u32 %r20971, 56; + // begin inline asm + shf.l.wrap.b32 %r20964, %r20970, %r20969, %r20971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20968, %r20969, %r20970, %r20971; + // end inline asm + mov.u32 %r20979, 41; + // begin inline asm + shf.l.wrap.b32 %r20972, %r20978, %r20977, %r20979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20976, %r20977, %r20978, %r20979; + // end inline asm + mov.u32 %r20987, 27; + // begin inline asm + shf.l.wrap.b32 %r20980, %r20986, %r20985, %r20987; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20984, %r20985, %r20986, %r20987; + // end inline asm + mov.u32 %r20995, 14; + // begin inline asm + shf.l.wrap.b32 %r20988, %r20994, %r20993, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20992, %r20993, %r20994, %r20995; + // end inline asm + mov.u32 %r21003, 2; + // begin inline asm + shf.l.wrap.b32 %r20996, %r21002, %r21001, %r21003; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21000, %r21001, %r21002, %r21003; + // end inline asm + mov.u32 %r21011, 55; + // begin inline asm + shf.l.wrap.b32 %r21004, %r21010, %r21009, %r21011; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21008, %r21009, %r21010, %r21011; + // end inline asm + mov.u32 %r21019, 45; + // begin inline asm + shf.l.wrap.b32 %r21012, %r21018, %r21017, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21016, %r21017, %r21018, %r21019; + // end inline asm + mov.u32 %r21027, 36; + // begin inline asm + shf.l.wrap.b32 %r21020, %r21026, %r21025, %r21027; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21024, %r21025, %r21026, 
%r21027; + // end inline asm + mov.u32 %r21035, 28; + // begin inline asm + shf.l.wrap.b32 %r21028, %r21034, %r21033, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21032, %r21033, %r21034, %r21035; + // end inline asm + mov.u32 %r21043, 21; + // begin inline asm + shf.l.wrap.b32 %r21036, %r21042, %r21041, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21040, %r21041, %r21042, %r21043; + // end inline asm + mov.u32 %r21051, 15; + // begin inline asm + shf.l.wrap.b32 %r21044, %r21050, %r21049, %r21051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21048, %r21049, %r21050, %r21051; + // end inline asm + mov.u32 %r21059, 10; + // begin inline asm + shf.l.wrap.b32 %r21052, %r21058, %r21057, %r21059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21056, %r21057, %r21058, %r21059; + // end inline asm + mov.u32 %r21067, 6; + // begin inline asm + shf.l.wrap.b32 %r21060, %r21066, %r21065, %r21067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21064, %r21065, %r21066, %r21067; + // end inline asm + mov.u32 %r21075, 3; + // begin inline asm + shf.l.wrap.b32 %r21068, %r21074, %r21073, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21072, %r21073, %r21074, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21076, %r21082, %r21081, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21080, %r21081, %r21082, %r20750; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21084, %r21119, %r20892, %r20940, 0xD2; + lop3.b32 %r21085, %r21122, %r20896, %r20944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30586, %r20892, %r20940, %r21036, 0xD2; + lop3.b32 %r30587, %r20896, %r20944, %r21040, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30582, %r20940, %r21036, %r20988, 0xD2; + lop3.b32 %r30583, %r20944, %r21040, %r20992, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30578, %r21036, %r20988, %r21119, 0xD2; + lop3.b32 %r30579, %r21040, %r20992, %r21122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30576, %r20988, %r21119, %r20892, 0xD2; + lop3.b32 %r30577, %r20992, %r21122, %r20896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30572, %r21028, %r20900, %r21068, 0xD2; + lop3.b32 %r30573, %r21032, %r20904, %r21072, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30584, %r20900, %r21068, %r21012, 0xD2; + lop3.b32 %r30585, %r20904, %r21072, %r21016, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30580, %r21068, %r21012, %r20908, 0xD2; + lop3.b32 %r30581, %r21072, %r21016, %r20912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30552, %r21012, %r20908, %r21028, 0xD2; + lop3.b32 %r30553, %r21016, %r20912, %r21032, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30552, %r30553}; + // begin inline asm + // chi + lop3.b32 %r30544, %r20908, %r21028, %r20900, 0xD2; + lop3.b32 %r30545, %r20912, %r21032, %r20904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30544, %r30545}; + // begin inline asm + // chi + lop3.b32 %r30570, %r21076, %r21060, %r20948, 0xD2; + lop3.b32 %r30571, %r21080, %r21064, %r20952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30570, %r30571}; + // begin inline asm + // chi + lop3.b32 %r30564, %r21060, %r20948, %r20956, 0xD2; + lop3.b32 %r30565, %r21064, %r20952, %r20960, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd925+112], {%r30564, %r30565}; + // begin inline asm + // chi + lop3.b32 %r30558, %r20948, %r20956, %r20924, 0xD2; + lop3.b32 %r30559, %r20952, %r20960, %r20928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30558, %r30559}; + // begin inline asm + // chi + lop3.b32 %r30550, %r20956, %r20924, %r21076, 0xD2; + lop3.b32 %r30551, %r20960, %r20928, %r21080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30550, %r30551}; + // begin inline asm + // chi + lop3.b32 %r30542, %r20924, %r21076, %r21060, 0xD2; + lop3.b32 %r30543, %r20928, %r21080, %r21064, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30542, %r30543}; + // begin inline asm + // chi + lop3.b32 %r30568, %r20980, %r21020, %r21052, 0xD2; + lop3.b32 %r30569, %r20984, %r21024, %r21056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30568, %r30569}; + // begin inline asm + // chi + lop3.b32 %r30562, %r21020, %r21052, %r21044, 0xD2; + lop3.b32 %r30563, %r21024, %r21056, %r21048, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30562, %r30563}; + // begin inline asm + // chi + lop3.b32 %r30556, %r21052, %r21044, %r20964, 0xD2; + lop3.b32 %r30557, %r21056, %r21048, %r20968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30556, %r30557}; + // begin inline asm + // chi + lop3.b32 %r30548, %r21044, %r20964, %r20980, 0xD2; + lop3.b32 %r30549, %r21048, %r20968, %r20984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30548, %r30549}; + // begin inline asm + // chi + lop3.b32 %r30540, %r20964, %r20980, %r21020, 0xD2; + lop3.b32 %r30541, %r20968, %r20984, %r21024, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30540, %r30541}; + // begin inline asm + // chi + lop3.b32 %r30566, %r20932, %r21004, %r20916, 0xD2; + lop3.b32 %r30567, %r20936, %r21008, %r20920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30566, %r30567}; + // begin inline asm + // chi + lop3.b32 %r30560, %r21004, %r20916, %r20972, 0xD2; + lop3.b32 %r30561, %r21008, %r20920, %r20976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30560, %r30561}; + // begin inline asm + // chi + lop3.b32 %r30554, %r20916, %r20972, %r20996, 0xD2; + lop3.b32 %r30555, %r20920, %r20976, %r21000, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30554, %r30555}; + // begin inline asm + // chi + lop3.b32 %r30546, %r20972, %r20996, %r20932, 0xD2; + lop3.b32 %r30547, %r20976, %r21000, %r20936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30546, %r30547}; + // begin inline asm + // chi + lop3.b32 %r30538, %r20996, %r20932, %r21004, 0xD2; + lop3.b32 %r30539, %r21000, %r20936, %r21008, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30538, %r30539}; + mul.wide.s32 %rd944, %r30588, 8; + add.s64 %rd941, %rd918, %rd944; + // begin inline asm + ld.global.nc.v2.u32 {%r21284,%r21285}, [%rd941]; + // end inline asm + xor.b32 %r30574, %r21084, %r21284; + xor.b32 %r30575, %r21085, %r21285; + add.s32 %r30588, %r30588, 1; + setp.lt.u32 %p43, %r30588, 23; + @%p43 bra $L__BB2_75; + + mov.u32 %r30621, 0; + mov.u32 %r21395, 1; + st.local.v2.u32 [%rd925+32], {%r30586, %r30587}; + st.local.v2.u32 [%rd925+72], {%r30584, %r30585}; + st.local.v2.u32 [%rd925+40], {%r30582, %r30583}; + st.local.v2.u32 [%rd925+80], {%r30580, %r30581}; + st.local.v2.u32 [%rd925+48], {%r30578, %r30579}; + st.local.v2.u32 [%rd925+56], {%r30576, %r30577}; + st.local.v2.u32 [%rd925+24], {%r30574, %r30575}; + // begin inline asm + // xor5 + lop3.b32 %r21296, %r30574, %r30572, 
%r30570, 0x96; + lop3.b32 %r21296, %r21296, %r30568, %r30566, 0x96; + lop3.b32 %r21297, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r21297, %r21297, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21308, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r21308, %r21308, %r30562, %r30560, 0x96; + lop3.b32 %r21309, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r21309, %r21309, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21320, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r21320, %r21320, %r30556, %r30554, 0x96; + lop3.b32 %r21321, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r21321, %r21321, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21332, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r21332, %r21332, %r30548, %r30546, 0x96; + lop3.b32 %r21333, %r30579, %r30553, %r30551, 0x96; + lop3.b32 %r21333, %r21333, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21344, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r21344, %r21344, %r30540, %r30538, 0x96; + lop3.b32 %r21345, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r21345, %r21345, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21356, %r21309, %r21308, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21360, %r21308, %r21309, %r21395; + // end inline asm + xor.b32 %r21535, %r21356, %r21344; + xor.b32 %r21536, %r21360, %r21345; + xor.b32 %r21503, %r30574, %r21535; + xor.b32 %r21506, %r30575, %r21536; + xor.b32 %r21466, %r30571, %r21536; + xor.b32 %r21465, %r30570, %r21535; + st.local.v2.u32 [%rd925+104], {%r21465, %r21466}; + // begin inline asm + shf.l.wrap.b32 %r21364, %r21321, %r21320, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21368, %r21320, %r21321, %r21395; + // end inline asm + xor.b32 %r21537, %r21364, %r21296; + xor.b32 %r21538, %r21368, %r21297; + xor.b32 %r21402, %r30584, %r21537; + xor.b32 %r21401, %r30585, %r21538; + xor.b32 %r21441, %r30563, %r21538; + xor.b32 %r21442, %r30562, %r21537; + st.local.v2.u32 [%rd925+152], {%r21442, %r21441}; + // begin inline asm + shf.l.wrap.b32 %r21372, %r21333, %r21332, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21376, %r21332, %r21333, %r21395; + // end inline asm + xor.b32 %r21539, %r21372, %r21308; + xor.b32 %r21540, %r21376, %r21309; + xor.b32 %r21425, %r30559, %r21540; + xor.b32 %r21426, %r30558, %r21539; + st.local.v2.u32 [%rd925+120], {%r21426, %r21425}; + xor.b32 %r21417, %r30555, %r21540; + xor.b32 %r21418, %r30554, %r21539; + st.local.v2.u32 [%rd925+200], {%r21418, %r21417}; + // begin inline asm + shf.l.wrap.b32 %r21380, %r21345, %r21344, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21384, %r21344, %r21345, %r21395; + // end inline asm + xor.b32 %r21541, %r21380, %r21320; + xor.b32 %r21542, %r21384, %r21321; + xor.b32 %r21449, %r30578, %r21541; + xor.b32 %r21450, %r30579, %r21542; + xor.b32 %r21458, %r30549, %r21542; + xor.b32 %r21457, %r30548, %r21541; + st.local.v2.u32 [%rd925+168], {%r21457, %r21458}; + // begin inline asm + shf.l.wrap.b32 %r21388, %r21297, %r21296, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21392, %r21296, %r21297, %r21395; + // end inline asm + xor.b32 %r21543, %r21388, %r21332; + xor.b32 %r21544, %r21392, %r21333; + xor.b32 %r21409, %r30544, %r21543; + xor.b32 %r21410, %r30545, %r21544; + xor.b32 %r21434, %r30539, %r21544; + xor.b32 %r21433, 
%r30538, %r21543; + st.local.v2.u32 [%rd925+216], {%r21433, %r21434}; + // begin inline asm + shf.l.wrap.b32 %r21396, %r21402, %r21401, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21400, %r21401, %r21402, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21404, %r21410, %r21409, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21408, %r21409, %r21410, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21416, %r21417, %r21418, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21412, %r21418, %r21417, %r20915; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r21412, %r21416}; + // begin inline asm + shf.l.wrap.b32 %r21420, %r21426, %r21425, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21424, %r21425, %r21426, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21428, %r21434, %r21433, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21432, %r21433, %r21434, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21440, %r21441, %r21442, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21436, %r21442, %r21441, %r21019; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r21436, %r21440}; + // begin inline asm + shf.l.wrap.b32 %r21444, %r21450, %r21449, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21448, %r21449, %r21450, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21452, %r21458, %r21457, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21456, %r21457, %r21458, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21460, %r21466, %r21465, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21464, %r21465, %r21466, %r21075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21468, %r21503, %r21396, %r21420, 0xD2; + lop3.b32 %r21469, %r21506, %r21400, %r21424, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r21396, %r21420, %r21452, 0xD2; + lop3.b32 %r30722, %r21400, %r21424, %r21456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30717, %r21420, %r21452, %r21428, 0xD2; + lop3.b32 %r30718, %r21424, %r21456, %r21432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + // begin inline asm + // chi + lop3.b32 %r30713, %r21452, %r21428, %r21503, 0xD2; + lop3.b32 %r30714, %r21456, %r21432, %r21506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + // begin inline asm + // chi + lop3.b32 %r30711, %r21428, %r21503, %r21396, 0xD2; + lop3.b32 %r30712, %r21432, %r21506, %r21400, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + // begin inline asm + // chi + lop3.b32 %r30707, %r21444, %r21404, %r21460, 0xD2; + lop3.b32 %r30708, %r21448, %r21408, %r21464, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + // begin inline asm + // chi + lop3.b32 %r30719, %r21404, %r21460, %r21436, 0xD2; + lop3.b32 %r30720, %r21408, %r21464, %r21440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30715, %r21460, %r21436, %r21412, 0xD2; + lop3.b32 %r30716, %r21464, %r21440, %r21416, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + // begin inline asm + ld.global.nc.v2.u32 
{%r21532,%r21533}, [%rd919]; + // end inline asm + xor.b32 %r30709, %r21468, %r21532; + xor.b32 %r30710, %r21469, %r21533; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + add.s64 %rd244, %rd925, 24; + add.s64 %rd245, %rd3, 24; + +$L__BB2_77: + shl.b32 %r21545, %r30621, 2; + cvt.u64.u32 %rd952, %r21545; + and.b64 %rd953, %rd952, 60; + add.s64 %rd954, %rd245, %rd953; + xor.b32 %r21546, %r3326, %r30621; + mul.lo.s32 %r21547, %r21546, 16777619; + ld.local.u32 %r21548, [%rd954]; + xor.b32 %r21549, %r21547, %r21548; + mul.wide.u32 %rd955, %r21549, -954391867; + shr.u64 %rd956, %rd955, 32; + cvt.u32.u64 %r21550, %rd956; + sub.s32 %r21551, %r21549, %r21550; + shr.u32 %r21552, %r21551, 1; + add.s32 %r21553, %r21552, %r21550; + shr.u32 %r21554, %r21553, 20; + mul.lo.s32 %r21555, %r21554, 1179641; + sub.s32 %r21556, %r21549, %r21555; + mul.wide.u32 %rd957, %r21556, 64; + add.s64 %rd958, %rd471, %rd957; + mul.lo.s32 %r21557, %r30658, 16777619; + ld.global.u32 %r21558, [%rd958]; + xor.b32 %r30658, %r21557, %r21558; + mul.lo.s32 %r21559, %r30659, 16777619; + ld.global.u32 %r21560, [%rd958+4]; + xor.b32 %r30659, %r21559, %r21560; + mul.lo.s32 %r21561, %r30670, 16777619; + ld.global.u32 %r21562, [%rd958+8]; + mul.lo.s32 %r21563, %r30671, 16777619; + ld.global.u32 %r21564, [%rd958+12]; + xor.b32 %r21565, %r21563, %r21564; + xor.b32 %r30670, %r21561, %r21562; + mov.b64 %rd959, {%r30670, %r21565}; + mul.lo.s32 %r21566, %r30666, 16777619; + ld.global.u32 %r21567, [%rd958+16]; + mul.lo.s32 %r21568, %r30667, 16777619; + ld.global.u32 %r21569, [%rd958+20]; + xor.b32 %r21570, %r21568, %r21569; + xor.b32 %r30666, %r21566, %r21567; + mov.b64 %rd960, {%r30666, %r21570}; + mul.lo.s32 %r21571, %r30662, 16777619; + ld.global.u32 %r21572, [%rd958+24]; + mul.lo.s32 %r21573, %r30663, 16777619; + ld.global.u32 %r21574, [%rd958+28]; + xor.b32 %r21575, %r21573, %r21574; + xor.b32 %r30662, %r21571, %r21572; + mov.b64 %rd961, {%r30662, %r21575}; + mul.lo.s32 %r21576, %r30660, 16777619; + ld.global.u32 %r21577, [%rd958+32]; + mul.lo.s32 %r21578, %r30661, 16777619; + ld.global.u32 %r21579, [%rd958+36]; + xor.b32 %r21580, %r21578, %r21579; + xor.b32 %r30660, %r21576, %r21577; + mov.b64 %rd962, {%r30660, %r21580}; + mul.lo.s32 %r21581, %r30656, 16777619; + ld.global.u32 %r21582, [%rd958+40]; + xor.b32 %r30656, %r21581, %r21582; + mul.lo.s32 %r21583, %r30657, 16777619; + ld.global.u32 %r21584, [%rd958+44]; + xor.b32 %r30657, %r21583, %r21584; + mul.lo.s32 %r21585, %r30668, 16777619; + ld.global.u32 %r21586, [%rd958+48]; + mul.lo.s32 %r21587, %r30669, 16777619; + ld.global.u32 %r21588, [%rd958+52]; + xor.b32 %r21589, %r21587, %r21588; + xor.b32 %r30668, %r21585, %r21586; + mov.b64 %rd963, {%r30668, %r21589}; + mul.lo.s32 %r21590, %r30664, 16777619; + ld.global.u32 %r21591, [%rd958+56]; + mul.lo.s32 %r21592, %r30665, 16777619; + ld.global.u32 %r21593, [%rd958+60]; + xor.b32 %r21594, %r21592, %r21593; + xor.b32 %r30664, %r21590, %r21591; + mov.b64 %rd964, {%r30664, %r21594}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.v2.u32 [%rd3+32], {%r30670, %r21565}; + st.local.v2.u32 [%rd3+40], {%r30666, %r21570}; + st.local.v2.u32 [%rd3+48], {%r30662, %r21575}; + st.local.v2.u32 [%rd3+56], {%r30660, %r21580}; + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + st.local.v2.u32 [%rd3+72], {%r30668, %r21589}; + st.local.v2.u32 [%rd3+80], {%r30664, %r21594}; + add.s64 %rd965, %rd244, %rd953; + xor.b32 %r21595, %r20787, %r30621; + mul.lo.s32 %r21596, %r21595, 16777619; + ld.local.u32 %r21597, [%rd965]; + xor.b32 %r21598, %r21596, 
%r21597; + mul.wide.u32 %rd966, %r21598, -954391867; + shr.u64 %rd967, %rd966, 32; + cvt.u32.u64 %r21599, %rd967; + sub.s32 %r21600, %r21598, %r21599; + shr.u32 %r21601, %r21600, 1; + add.s32 %r21602, %r21601, %r21599; + shr.u32 %r21603, %r21602, 20; + mul.lo.s32 %r21604, %r21603, 1179641; + sub.s32 %r21605, %r21598, %r21604; + mul.wide.u32 %rd968, %r21605, 64; + add.s64 %rd969, %rd471, %rd968; + mul.lo.s32 %r21606, %r30709, 16777619; + ld.global.u32 %r21607, [%rd969]; + xor.b32 %r30709, %r21606, %r21607; + mul.lo.s32 %r21608, %r30710, 16777619; + ld.global.u32 %r21609, [%rd969+4]; + xor.b32 %r30710, %r21608, %r21609; + mul.lo.s32 %r21610, %r30721, 16777619; + ld.global.u32 %r21611, [%rd969+8]; + mul.lo.s32 %r21612, %r30722, 16777619; + ld.global.u32 %r21613, [%rd969+12]; + xor.b32 %r21614, %r21612, %r21613; + xor.b32 %r30721, %r21610, %r21611; + mov.b64 %rd970, {%r30721, %r21614}; + mul.lo.s32 %r21615, %r30717, 16777619; + ld.global.u32 %r21616, [%rd969+16]; + mul.lo.s32 %r21617, %r30718, 16777619; + ld.global.u32 %r21618, [%rd969+20]; + xor.b32 %r21619, %r21617, %r21618; + xor.b32 %r30717, %r21615, %r21616; + mov.b64 %rd971, {%r30717, %r21619}; + mul.lo.s32 %r21620, %r30713, 16777619; + ld.global.u32 %r21621, [%rd969+24]; + mul.lo.s32 %r21622, %r30714, 16777619; + ld.global.u32 %r21623, [%rd969+28]; + xor.b32 %r21624, %r21622, %r21623; + xor.b32 %r30713, %r21620, %r21621; + mov.b64 %rd972, {%r30713, %r21624}; + mul.lo.s32 %r21625, %r30711, 16777619; + ld.global.u32 %r21626, [%rd969+32]; + mul.lo.s32 %r21627, %r30712, 16777619; + ld.global.u32 %r21628, [%rd969+36]; + xor.b32 %r21629, %r21627, %r21628; + xor.b32 %r30711, %r21625, %r21626; + mov.b64 %rd973, {%r30711, %r21629}; + mul.lo.s32 %r21630, %r30707, 16777619; + ld.global.u32 %r21631, [%rd969+40]; + xor.b32 %r30707, %r21630, %r21631; + mul.lo.s32 %r21632, %r30708, 16777619; + ld.global.u32 %r21633, [%rd969+44]; + xor.b32 %r30708, %r21632, %r21633; + mul.lo.s32 %r21634, %r30719, 16777619; + ld.global.u32 %r21635, [%rd969+48]; + mul.lo.s32 %r21636, %r30720, 16777619; + ld.global.u32 %r21637, [%rd969+52]; + xor.b32 %r21638, %r21636, %r21637; + xor.b32 %r30719, %r21634, %r21635; + mov.b64 %rd974, {%r30719, %r21638}; + mul.lo.s32 %r21639, %r30715, 16777619; + ld.global.u32 %r21640, [%rd969+56]; + mul.lo.s32 %r21641, %r30716, 16777619; + ld.global.u32 %r21642, [%rd969+60]; + xor.b32 %r21643, %r21641, %r21642; + xor.b32 %r30715, %r21639, %r21640; + mov.b64 %rd975, {%r30715, %r21643}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + st.local.v2.u32 [%rd925+32], {%r30721, %r21614}; + st.local.v2.u32 [%rd925+40], {%r30717, %r21619}; + st.local.v2.u32 [%rd925+48], {%r30713, %r21624}; + st.local.v2.u32 [%rd925+56], {%r30711, %r21629}; + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + st.local.v2.u32 [%rd925+72], {%r30719, %r21638}; + st.local.v2.u32 [%rd925+80], {%r30715, %r21643}; + add.s32 %r30621, %r30621, 1; + setp.lt.u32 %p44, %r30621, 512; + shr.u64 %rd976, %rd959, 32; + cvt.u32.u64 %r30671, %rd976; + shr.u64 %rd977, %rd960, 32; + cvt.u32.u64 %r30667, %rd977; + shr.u64 %rd978, %rd961, 32; + cvt.u32.u64 %r30663, %rd978; + shr.u64 %rd979, %rd962, 32; + cvt.u32.u64 %r30661, %rd979; + shr.u64 %rd980, %rd963, 32; + cvt.u32.u64 %r30669, %rd980; + shr.u64 %rd981, %rd964, 32; + cvt.u32.u64 %r30665, %rd981; + shr.u64 %rd982, %rd970, 32; + cvt.u32.u64 %r30722, %rd982; + shr.u64 %rd983, %rd971, 32; + cvt.u32.u64 %r30718, %rd983; + shr.u64 %rd984, %rd972, 32; + cvt.u32.u64 %r30714, %rd984; + shr.u64 %rd985, %rd973, 32; + cvt.u32.u64 %r30712, 
%rd985; + shr.u64 %rd986, %rd974, 32; + cvt.u32.u64 %r30720, %rd986; + shr.u64 %rd987, %rd975, 32; + cvt.u32.u64 %r30716, %rd987; + @%p44 bra $L__BB2_77; + + mov.u32 %r30622, 0; + st.local.v2.u32 [%rd3+96], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+104], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+112], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+120], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+128], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+136], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+144], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+152], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+160], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+168], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+176], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+184], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+192], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+200], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+208], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+216], {%r30622, %r30622}; + mov.u32 %r30637, -2147483648; + mov.u32 %r21658, 1; + st.local.v2.u32 [%rd3+88], {%r21658, %r30637}; + mov.u32 %r30623, %r30622; + mov.u32 %r30624, %r30622; + mov.u32 %r30625, %r30622; + mov.u32 %r30626, %r30622; + mov.u32 %r30627, %r30622; + mov.u32 %r30628, %r30622; + mov.u32 %r30629, %r30622; + mov.u32 %r30630, %r30622; + mov.u32 %r30631, %r30622; + mov.u32 %r30632, %r30622; + mov.u32 %r30633, %r30622; + mov.u32 %r30634, %r30622; + mov.u32 %r30635, %r30622; + mov.u32 %r30636, %r21658; + mov.u32 %r30638, %r30622; + mov.u32 %r30639, %r30622; + mov.u32 %r30640, %r30622; + mov.u32 %r30641, %r30622; + mov.u32 %r30642, %r30622; + mov.u32 %r30643, %r30622; + mov.u32 %r30644, %r30622; + mov.u32 %r30645, %r30622; + mov.u32 %r30646, %r30622; + mov.u32 %r30647, %r30622; + mov.u32 %r30648, %r30622; + mov.u32 %r30649, %r30622; + mov.u32 %r30650, %r30622; + mov.u32 %r30651, %r30622; + mov.u32 %r30652, %r30622; + mov.u32 %r30653, %r30622; + mov.u32 %r30654, %r30622; + mov.u32 %r30655, %r30622; + mov.u32 %r30672, %r30622; + +$L__BB2_79: + // begin inline asm + // xor5 + lop3.b32 %r21685, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r21685, %r21685, %r30652, %r30650, 0x96; + lop3.b32 %r21686, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r21686, %r21686, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21697, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r21697, %r21697, %r30646, %r30644, 0x96; + lop3.b32 %r21698, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r21698, %r21698, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21709, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r21709, %r21709, %r30640, %r30638, 0x96; + lop3.b32 %r21710, %r30667, %r30665, %r30643, 0x96; + lop3.b32 %r21710, %r21710, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21721, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r21721, %r21721, %r30632, %r30630, 0x96; + lop3.b32 %r21722, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r21722, %r21722, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21733, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r21733, %r21733, %r30624, %r30622, 0x96; + lop3.b32 %r21734, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r21734, %r21734, %r30625, %r30623, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21745, %r21698, %r21697, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21749, %r21697, %r21698, %r21658; + // end inline asm + xor.b32 %r22179, 
%r21745, %r21733; + xor.b32 %r22180, %r21749, %r21734; + xor.b32 %r22012, %r30658, %r22179; + xor.b32 %r22015, %r30659, %r22180; + xor.b32 %r21919, %r30656, %r22179; + xor.b32 %r21918, %r30657, %r22180; + xor.b32 %r21966, %r30654, %r22179; + xor.b32 %r21967, %r30655, %r22180; + xor.b32 %r21871, %r30652, %r22179; + xor.b32 %r21870, %r30653, %r22180; + xor.b32 %r21822, %r30650, %r22179; + xor.b32 %r21823, %r30651, %r22180; + // begin inline asm + shf.l.wrap.b32 %r21753, %r21710, %r21709, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21757, %r21709, %r21710, %r21658; + // end inline asm + xor.b32 %r22181, %r21753, %r21685; + xor.b32 %r22182, %r21757, %r21686; + xor.b32 %r21974, %r30670, %r22181; + xor.b32 %r21975, %r30671, %r22182; + xor.b32 %r21791, %r30668, %r22181; + xor.b32 %r21790, %r30669, %r22182; + xor.b32 %r21950, %r30648, %r22181; + xor.b32 %r21951, %r30649, %r22182; + xor.b32 %r21911, %r30646, %r22181; + xor.b32 %r21910, %r30647, %r22182; + xor.b32 %r21894, %r30644, %r22181; + xor.b32 %r21895, %r30645, %r22182; + // begin inline asm + shf.l.wrap.b32 %r21761, %r21722, %r21721, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21765, %r21721, %r21722, %r21658; + // end inline asm + xor.b32 %r22183, %r21761, %r21697; + xor.b32 %r22184, %r21765, %r21698; + xor.b32 %r21831, %r30666, %r22183; + xor.b32 %r21830, %r30667, %r22184; + xor.b32 %r21958, %r30664, %r22183; + xor.b32 %r21959, %r30665, %r22184; + xor.b32 %r21839, %r30642, %r22183; + xor.b32 %r21838, %r30643, %r22184; + xor.b32 %r21942, %r30640, %r22183; + xor.b32 %r21943, %r30641, %r22184; + xor.b32 %r21807, %r30638, %r22183; + xor.b32 %r21806, %r30639, %r22184; + // begin inline asm + shf.l.wrap.b32 %r21769, %r21734, %r21733, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21773, %r21733, %r21734, %r21658; + // end inline asm + xor.b32 %r22185, %r21769, %r21709; + xor.b32 %r22186, %r21773, %r21710; + xor.b32 %r21926, %r30662, %r22185; + xor.b32 %r21927, %r30663, %r22186; + xor.b32 %r21903, %r30636, %r22185; + xor.b32 %r21902, %r30637, %r22186; + xor.b32 %r21846, %r30634, %r22185; + xor.b32 %r21847, %r30635, %r22186; + xor.b32 %r21934, %r30632, %r22185; + xor.b32 %r21935, %r30633, %r22186; + xor.b32 %r21863, %r30630, %r22185; + xor.b32 %r21862, %r30631, %r22186; + // begin inline asm + shf.l.wrap.b32 %r21777, %r21686, %r21685, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21781, %r21685, %r21686, %r21658; + // end inline asm + xor.b32 %r22187, %r21777, %r21721; + xor.b32 %r22188, %r21781, %r21722; + xor.b32 %r21878, %r30660, %r22187; + xor.b32 %r21879, %r30661, %r22188; + xor.b32 %r21798, %r30628, %r22187; + xor.b32 %r21799, %r30629, %r22188; + xor.b32 %r21815, %r30626, %r22187; + xor.b32 %r21814, %r30627, %r22188; + xor.b32 %r21854, %r30624, %r22187; + xor.b32 %r21855, %r30625, %r22188; + xor.b32 %r21886, %r30622, %r22187; + xor.b32 %r21887, %r30623, %r22188; + mov.u32 %r21792, 44; + // begin inline asm + shf.l.wrap.b32 %r21785, %r21791, %r21790, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21789, %r21790, %r21791, %r21792; + // end inline asm + mov.u32 %r21800, 20; + // begin inline asm + shf.l.wrap.b32 %r21793, %r21799, %r21798, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21797, %r21798, %r21799, %r21800; + // end inline asm + mov.u32 %r21808, 61; + // begin inline asm + shf.l.wrap.b32 %r21801, %r21807, %r21806, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21805, 
%r21806, %r21807, %r21808; + // end inline asm + mov.u32 %r21816, 39; + // begin inline asm + shf.l.wrap.b32 %r21809, %r21815, %r21814, %r21816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21813, %r21814, %r21815, %r21816; + // end inline asm + mov.u32 %r21824, 18; + // begin inline asm + shf.l.wrap.b32 %r21817, %r21823, %r21822, %r21824; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21821, %r21822, %r21823, %r21824; + // end inline asm + mov.u32 %r21832, 62; + // begin inline asm + shf.l.wrap.b32 %r21825, %r21831, %r21830, %r21832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21829, %r21830, %r21831, %r21832; + // end inline asm + mov.u32 %r21840, 43; + // begin inline asm + shf.l.wrap.b32 %r21833, %r21839, %r21838, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21837, %r21838, %r21839, %r21840; + // end inline asm + mov.u32 %r21848, 25; + // begin inline asm + shf.l.wrap.b32 %r21841, %r21847, %r21846, %r21848; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21845, %r21846, %r21847, %r21848; + // end inline asm + mov.u32 %r21856, 8; + // begin inline asm + shf.l.wrap.b32 %r21849, %r21855, %r21854, %r21856; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21853, %r21854, %r21855, %r21856; + // end inline asm + mov.u32 %r21864, 56; + // begin inline asm + shf.l.wrap.b32 %r21857, %r21863, %r21862, %r21864; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21861, %r21862, %r21863, %r21864; + // end inline asm + mov.u32 %r21872, 41; + // begin inline asm + shf.l.wrap.b32 %r21865, %r21871, %r21870, %r21872; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21869, %r21870, %r21871, %r21872; + // end inline asm + mov.u32 %r21880, 27; + // begin inline asm + shf.l.wrap.b32 %r21873, %r21879, %r21878, %r21880; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21877, %r21878, %r21879, %r21880; + // end inline asm + mov.u32 %r21888, 14; + // begin inline asm + shf.l.wrap.b32 %r21881, %r21887, %r21886, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21885, %r21886, %r21887, %r21888; + // end inline asm + mov.u32 %r21896, 2; + // begin inline asm + shf.l.wrap.b32 %r21889, %r21895, %r21894, %r21896; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21893, %r21894, %r21895, %r21896; + // end inline asm + mov.u32 %r21904, 55; + // begin inline asm + shf.l.wrap.b32 %r21897, %r21903, %r21902, %r21904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21901, %r21902, %r21903, %r21904; + // end inline asm + mov.u32 %r21912, 45; + // begin inline asm + shf.l.wrap.b32 %r21905, %r21911, %r21910, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21909, %r21910, %r21911, %r21912; + // end inline asm + mov.u32 %r21920, 36; + // begin inline asm + shf.l.wrap.b32 %r21913, %r21919, %r21918, %r21920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21917, %r21918, %r21919, %r21920; + // end inline asm + mov.u32 %r21928, 28; + // begin inline asm + shf.l.wrap.b32 %r21921, %r21927, %r21926, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21925, %r21926, %r21927, %r21928; + // end inline asm + mov.u32 %r21936, 21; + // begin inline asm + shf.l.wrap.b32 %r21929, %r21935, %r21934, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21933, %r21934, %r21935, %r21936; + // end inline asm + mov.u32 %r21944, 15; + // begin inline asm + shf.l.wrap.b32 %r21937, %r21943, %r21942, %r21944; + 
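+ // rho/pi step: the immediates fed to these shf.l.wrap.b32 pairs (44, 20, 61,
+ // 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3,
+ // plus the reused 1) are the Keccak rho rotation offsets; offsets of 32 or
+ // more are handled by swapping which 32-bit half receives each funnel-shift
+ // result, since shf.l.wrap only shifts by the count mod 32.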
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r21941, %r21942, %r21943, %r21944; + // end inline asm + mov.u32 %r21952, 10; + // begin inline asm + shf.l.wrap.b32 %r21945, %r21951, %r21950, %r21952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21949, %r21950, %r21951, %r21952; + // end inline asm + mov.u32 %r21960, 6; + // begin inline asm + shf.l.wrap.b32 %r21953, %r21959, %r21958, %r21960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21957, %r21958, %r21959, %r21960; + // end inline asm + mov.u32 %r21968, 3; + // begin inline asm + shf.l.wrap.b32 %r21961, %r21967, %r21966, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21965, %r21966, %r21967, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21969, %r21975, %r21974, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21973, %r21974, %r21975, %r21658; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21977, %r22012, %r21785, %r21833, 0xD2; + lop3.b32 %r21978, %r22015, %r21789, %r21837, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r21785, %r21833, %r21929, 0xD2; + lop3.b32 %r30671, %r21789, %r21837, %r21933, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30666, %r21833, %r21929, %r21881, 0xD2; + lop3.b32 %r30667, %r21837, %r21933, %r21885, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30662, %r21929, %r21881, %r22012, 0xD2; + lop3.b32 %r30663, %r21933, %r21885, %r22015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30660, %r21881, %r22012, %r21785, 0xD2; + lop3.b32 %r30661, %r21885, %r22015, %r21789, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30656, %r21921, %r21793, %r21961, 0xD2; + lop3.b32 %r30657, %r21925, %r21797, %r21965, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30668, %r21793, %r21961, %r21905, 0xD2; + lop3.b32 %r30669, %r21797, %r21965, %r21909, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30664, %r21961, %r21905, %r21801, 0xD2; + lop3.b32 %r30665, %r21965, %r21909, %r21805, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30636, %r21905, %r21801, %r21921, 0xD2; + lop3.b32 %r30637, %r21909, %r21805, %r21925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30636, %r30637}; + // begin inline asm + // chi + lop3.b32 %r30628, %r21801, %r21921, %r21793, 0xD2; + lop3.b32 %r30629, %r21805, %r21925, %r21797, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30628, %r30629}; + // begin inline asm + // chi + lop3.b32 %r30654, %r21969, %r21953, %r21841, 0xD2; + lop3.b32 %r30655, %r21973, %r21957, %r21845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30654, %r30655}; + // begin inline asm + // chi + lop3.b32 %r30648, %r21953, %r21841, %r21849, 0xD2; + lop3.b32 %r30649, %r21957, %r21845, %r21853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30648, %r30649}; + // begin inline asm + // chi + lop3.b32 %r30642, %r21841, %r21849, %r21817, 0xD2; + lop3.b32 %r30643, %r21845, %r21853, %r21821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30642, %r30643}; + // begin inline asm + // chi + lop3.b32 %r30634, %r21849, %r21817, %r21969, 0xD2; + lop3.b32 %r30635, %r21853, %r21821, %r21973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30634, %r30635}; + // begin inline asm + // chi + lop3.b32 %r30626, %r21817, %r21969, %r21953, 0xD2; + lop3.b32 %r30627, %r21821, 
%r21973, %r21957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30626, %r30627}; + // begin inline asm + // chi + lop3.b32 %r30652, %r21873, %r21913, %r21945, 0xD2; + lop3.b32 %r30653, %r21877, %r21917, %r21949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30652, %r30653}; + // begin inline asm + // chi + lop3.b32 %r30646, %r21913, %r21945, %r21937, 0xD2; + lop3.b32 %r30647, %r21917, %r21949, %r21941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30646, %r30647}; + // begin inline asm + // chi + lop3.b32 %r30640, %r21945, %r21937, %r21857, 0xD2; + lop3.b32 %r30641, %r21949, %r21941, %r21861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30640, %r30641}; + // begin inline asm + // chi + lop3.b32 %r30632, %r21937, %r21857, %r21873, 0xD2; + lop3.b32 %r30633, %r21941, %r21861, %r21877, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30632, %r30633}; + // begin inline asm + // chi + lop3.b32 %r30624, %r21857, %r21873, %r21913, 0xD2; + lop3.b32 %r30625, %r21861, %r21877, %r21917, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30624, %r30625}; + // begin inline asm + // chi + lop3.b32 %r30650, %r21825, %r21897, %r21809, 0xD2; + lop3.b32 %r30651, %r21829, %r21901, %r21813, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30650, %r30651}; + // begin inline asm + // chi + lop3.b32 %r30644, %r21897, %r21809, %r21865, 0xD2; + lop3.b32 %r30645, %r21901, %r21813, %r21869, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30644, %r30645}; + // begin inline asm + // chi + lop3.b32 %r30638, %r21809, %r21865, %r21889, 0xD2; + lop3.b32 %r30639, %r21813, %r21869, %r21893, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30638, %r30639}; + // begin inline asm + // chi + lop3.b32 %r30630, %r21865, %r21889, %r21825, 0xD2; + lop3.b32 %r30631, %r21869, %r21893, %r21829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30630, %r30631}; + // begin inline asm + // chi + lop3.b32 %r30622, %r21889, %r21825, %r21897, 0xD2; + lop3.b32 %r30623, %r21893, %r21829, %r21901, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30622, %r30623}; + mul.wide.s32 %rd991, %r30672, 8; + add.s64 %rd990, %rd918, %rd991; + // begin inline asm + ld.global.nc.v2.u32 {%r22177,%r22178}, [%rd990]; + // end inline asm + xor.b32 %r30658, %r21977, %r22177; + xor.b32 %r30659, %r21978, %r22178; + add.s32 %r30672, %r30672, 1; + setp.lt.u32 %p45, %r30672, 23; + @%p45 bra $L__BB2_79; + + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + // begin inline asm + // xor5 + lop3.b32 %r22189, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r22189, %r22189, %r30652, %r30650, 0x96; + lop3.b32 %r22190, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r22190, %r22190, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22201, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r22201, %r22201, %r30646, %r30644, 0x96; + lop3.b32 %r22202, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r22202, %r22202, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22213, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r22213, %r22213, %r30640, %r30638, 0x96; + lop3.b32 %r22214, %r30667, %r30665, %r30643, 0x96; + 
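+ // context: the $L__BB2_77 loop above performed 512 merge steps per sponge
+ // state, consistent with a FishHash-style dataset mix: the index is an
+ // FNV-style product (prime 16777619) of the step counter XORed with a state
+ // word, reduced mod 1179641 by a multiply-high reciprocal (mul.wide.u32 /
+ // shr / mul.lo / sub), and the 64-byte item at %rd471 + 64*index is folded
+ // in word-by-word as state = (state * 16777619) ^ item_word.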
lop3.b32 %r22214, %r22214, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22225, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r22225, %r22225, %r30632, %r30630, 0x96; + lop3.b32 %r22226, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r22226, %r22226, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22237, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r22237, %r22237, %r30624, %r30622, 0x96; + lop3.b32 %r22238, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r22238, %r22238, %r30625, %r30623, 0x96; + // end inline asm + mov.u32 %r22441, 1; + // begin inline asm + shf.l.wrap.b32 %r22249, %r22202, %r22201, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22253, %r22201, %r22202, %r22441; + // end inline asm + xor.b32 %r22468, %r22249, %r22237; + xor.b32 %r22469, %r22253, %r22238; + xor.b32 %r22396, %r30658, %r22468; + xor.b32 %r22399, %r30659, %r22469; + xor.b32 %r22359, %r30655, %r22469; + xor.b32 %r22358, %r30654, %r22468; + st.local.v2.u32 [%rd3+104], {%r22358, %r22359}; + // begin inline asm + shf.l.wrap.b32 %r22257, %r22214, %r22213, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22261, %r22213, %r22214, %r22441; + // end inline asm + xor.b32 %r22470, %r22257, %r22189; + xor.b32 %r22471, %r22261, %r22190; + xor.b32 %r22295, %r30668, %r22470; + xor.b32 %r22294, %r30669, %r22471; + xor.b32 %r22334, %r30647, %r22471; + xor.b32 %r22335, %r30646, %r22470; + st.local.v2.u32 [%rd3+152], {%r22335, %r22334}; + // begin inline asm + shf.l.wrap.b32 %r22265, %r22226, %r22225, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22269, %r22225, %r22226, %r22441; + // end inline asm + xor.b32 %r22472, %r22265, %r22201; + xor.b32 %r22473, %r22269, %r22202; + xor.b32 %r22318, %r30643, %r22473; + xor.b32 %r22319, %r30642, %r22472; + st.local.v2.u32 [%rd3+120], {%r22319, %r22318}; + xor.b32 %r22310, %r30639, %r22473; + xor.b32 %r22311, %r30638, %r22472; + st.local.v2.u32 [%rd3+200], {%r22311, %r22310}; + // begin inline asm + shf.l.wrap.b32 %r22273, %r22238, %r22237, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22277, %r22237, %r22238, %r22441; + // end inline asm + xor.b32 %r22474, %r22273, %r22213; + xor.b32 %r22475, %r22277, %r22214; + xor.b32 %r22342, %r30662, %r22474; + xor.b32 %r22343, %r30663, %r22475; + xor.b32 %r22351, %r30633, %r22475; + xor.b32 %r22350, %r30632, %r22474; + st.local.v2.u32 [%rd3+168], {%r22350, %r22351}; + // begin inline asm + shf.l.wrap.b32 %r22281, %r22190, %r22189, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22285, %r22189, %r22190, %r22441; + // end inline asm + xor.b32 %r22476, %r22281, %r22225; + xor.b32 %r22477, %r22285, %r22226; + xor.b32 %r22302, %r30628, %r22476; + xor.b32 %r22303, %r30629, %r22477; + xor.b32 %r22327, %r30623, %r22477; + xor.b32 %r22326, %r30622, %r22476; + st.local.v2.u32 [%rd3+216], {%r22326, %r22327}; + // begin inline asm + shf.l.wrap.b32 %r22289, %r22295, %r22294, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22293, %r22294, %r22295, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22297, %r22303, %r22302, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22301, %r22302, %r22303, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22309, %r22310, %r22311, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22305, %r22311, %r22310, %r21808; + // end inline asm + 
st.local.v2.u32 [%rd3+96], {%r22305, %r22309}; + // begin inline asm + shf.l.wrap.b32 %r22313, %r22319, %r22318, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22317, %r22318, %r22319, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22321, %r22327, %r22326, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22325, %r22326, %r22327, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22333, %r22334, %r22335, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22329, %r22335, %r22334, %r21912; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r22329, %r22333}; + // begin inline asm + shf.l.wrap.b32 %r22337, %r22343, %r22342, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22341, %r22342, %r22343, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22345, %r22351, %r22350, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22349, %r22350, %r22351, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22353, %r22359, %r22358, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22357, %r22358, %r22359, %r21968; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22361, %r22396, %r22289, %r22313, 0xD2; + lop3.b32 %r22362, %r22399, %r22293, %r22317, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22369, %r22289, %r22313, %r22345, 0xD2; + lop3.b32 %r22370, %r22293, %r22317, %r22349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r22369, %r22370}; + // begin inline asm + // chi + lop3.b32 %r22377, %r22313, %r22345, %r22321, 0xD2; + lop3.b32 %r22378, %r22317, %r22349, %r22325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r22377, %r22378}; + // begin inline asm + // chi + lop3.b32 %r22385, %r22345, %r22321, %r22396, 0xD2; + lop3.b32 %r22386, %r22349, %r22325, %r22399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r22385, %r22386}; + // begin inline asm + // chi + lop3.b32 %r22393, %r22321, %r22396, %r22289, 0xD2; + lop3.b32 %r22394, %r22325, %r22399, %r22293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r22393, %r22394}; + // begin inline asm + // chi + lop3.b32 %r22401, %r22337, %r22297, %r22353, 0xD2; + lop3.b32 %r22402, %r22341, %r22301, %r22357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r22401, %r22402}; + // begin inline asm + // chi + lop3.b32 %r22409, %r22297, %r22353, %r22329, 0xD2; + lop3.b32 %r22410, %r22301, %r22357, %r22333, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r22409, %r22410}; + // begin inline asm + // chi + lop3.b32 %r22417, %r22353, %r22329, %r22305, 0xD2; + lop3.b32 %r22418, %r22357, %r22333, %r22309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r22417, %r22418}; + // begin inline asm + ld.global.nc.v2.u32 {%r22425,%r22426}, [%rd919]; + // end inline asm + xor.b32 %r22478, %r22362, %r22426; + xor.b32 %r22479, %r22361, %r22425; + mov.b64 %rd1269, {%r22479, %r22478}; + mov.b64 %rd1270, {%r22369, %r22370}; + mov.b64 %rd1271, {%r22377, %r22378}; + mov.b64 %rd250, {%r22385, %r22386}; + mov.b64 %rd1272, {%r22393, %r22394}; + mov.b64 %rd252, {%r22401, %r22402}; + mov.b64 %rd253, {%r22409, %r22410}; + mov.b64 %rd254, {%r22417, %r22418}; + mov.u32 %r30673, 0; + st.local.v2.u32 [%rd3+24], {%r22479, %r22478}; + st.local.v2.u32 [%rd925+96], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+104], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+112], {%r30673, %r30673}; + st.local.v2.u32 
[%rd925+120], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+128], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+136], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+144], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+152], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+160], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+168], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+176], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+184], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+192], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+200], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+208], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+216], {%r30673, %r30673}; + mov.u32 %r30688, -2147483648; + st.local.v2.u32 [%rd925+88], {%r22441, %r30688}; + mov.u32 %r30674, %r30673; + mov.u32 %r30675, %r30673; + mov.u32 %r30676, %r30673; + mov.u32 %r30677, %r30673; + mov.u32 %r30678, %r30673; + mov.u32 %r30679, %r30673; + mov.u32 %r30680, %r30673; + mov.u32 %r30681, %r30673; + mov.u32 %r30682, %r30673; + mov.u32 %r30683, %r30673; + mov.u32 %r30684, %r30673; + mov.u32 %r30685, %r30673; + mov.u32 %r30686, %r30673; + mov.u32 %r30687, %r22441; + mov.u32 %r30689, %r30673; + mov.u32 %r30690, %r30673; + mov.u32 %r30691, %r30673; + mov.u32 %r30692, %r30673; + mov.u32 %r30693, %r30673; + mov.u32 %r30694, %r30673; + mov.u32 %r30695, %r30673; + mov.u32 %r30696, %r30673; + mov.u32 %r30697, %r30673; + mov.u32 %r30698, %r30673; + mov.u32 %r30699, %r30673; + mov.u32 %r30700, %r30673; + mov.u32 %r30701, %r30673; + mov.u32 %r30702, %r30673; + mov.u32 %r30703, %r30673; + mov.u32 %r30704, %r30673; + mov.u32 %r30705, %r30673; + mov.u32 %r30706, %r30673; + mov.u32 %r30723, %r30673; + +$L__BB2_81: + // begin inline asm + // xor5 + lop3.b32 %r22480, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22480, %r22480, %r30703, %r30701, 0x96; + lop3.b32 %r22481, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22481, %r22481, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22492, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22492, %r22492, %r30697, %r30695, 0x96; + lop3.b32 %r22493, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22493, %r22493, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22504, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r22504, %r22504, %r30691, %r30689, 0x96; + lop3.b32 %r22505, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r22505, %r22505, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22516, %r30713, %r30687, %r30685, 0x96; + lop3.b32 %r22516, %r22516, %r30683, %r30681, 0x96; + lop3.b32 %r22517, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r22517, %r22517, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22528, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r22528, %r22528, %r30675, %r30673, 0x96; + lop3.b32 %r22529, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r22529, %r22529, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22540, %r22493, %r22492, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22544, %r22492, %r22493, %r22441; + // end inline asm + xor.b32 %r22974, %r22540, %r22528; + xor.b32 %r22975, %r22544, %r22529; + xor.b32 %r22807, %r30709, %r22974; + xor.b32 %r22810, %r30710, %r22975; + xor.b32 %r22714, %r30707, %r22974; + xor.b32 %r22713, %r30708, %r22975; + xor.b32 %r22761, %r30705, %r22974; + xor.b32 %r22762, %r30706, %r22975; + xor.b32 %r22666, %r30703, %r22974; + xor.b32 %r22665, %r30704, 
%r22975; + xor.b32 %r22617, %r30701, %r22974; + xor.b32 %r22618, %r30702, %r22975; + // begin inline asm + shf.l.wrap.b32 %r22548, %r22505, %r22504, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22552, %r22504, %r22505, %r22441; + // end inline asm + xor.b32 %r22976, %r22548, %r22480; + xor.b32 %r22977, %r22552, %r22481; + xor.b32 %r22769, %r30721, %r22976; + xor.b32 %r22770, %r30722, %r22977; + xor.b32 %r22586, %r30719, %r22976; + xor.b32 %r22585, %r30720, %r22977; + xor.b32 %r22745, %r30699, %r22976; + xor.b32 %r22746, %r30700, %r22977; + xor.b32 %r22706, %r30697, %r22976; + xor.b32 %r22705, %r30698, %r22977; + xor.b32 %r22689, %r30695, %r22976; + xor.b32 %r22690, %r30696, %r22977; + // begin inline asm + shf.l.wrap.b32 %r22556, %r22517, %r22516, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22560, %r22516, %r22517, %r22441; + // end inline asm + xor.b32 %r22978, %r22556, %r22492; + xor.b32 %r22979, %r22560, %r22493; + xor.b32 %r22626, %r30717, %r22978; + xor.b32 %r22625, %r30718, %r22979; + xor.b32 %r22753, %r30715, %r22978; + xor.b32 %r22754, %r30716, %r22979; + xor.b32 %r22634, %r30693, %r22978; + xor.b32 %r22633, %r30694, %r22979; + xor.b32 %r22737, %r30691, %r22978; + xor.b32 %r22738, %r30692, %r22979; + xor.b32 %r22602, %r30689, %r22978; + xor.b32 %r22601, %r30690, %r22979; + // begin inline asm + shf.l.wrap.b32 %r22564, %r22529, %r22528, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22568, %r22528, %r22529, %r22441; + // end inline asm + xor.b32 %r22980, %r22564, %r22504; + xor.b32 %r22981, %r22568, %r22505; + xor.b32 %r22721, %r30713, %r22980; + xor.b32 %r22722, %r30714, %r22981; + xor.b32 %r22698, %r30687, %r22980; + xor.b32 %r22697, %r30688, %r22981; + xor.b32 %r22641, %r30685, %r22980; + xor.b32 %r22642, %r30686, %r22981; + xor.b32 %r22729, %r30683, %r22980; + xor.b32 %r22730, %r30684, %r22981; + xor.b32 %r22658, %r30681, %r22980; + xor.b32 %r22657, %r30682, %r22981; + // begin inline asm + shf.l.wrap.b32 %r22572, %r22481, %r22480, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22576, %r22480, %r22481, %r22441; + // end inline asm + xor.b32 %r22982, %r22572, %r22516; + xor.b32 %r22983, %r22576, %r22517; + xor.b32 %r22673, %r30711, %r22982; + xor.b32 %r22674, %r30712, %r22983; + xor.b32 %r22593, %r30679, %r22982; + xor.b32 %r22594, %r30680, %r22983; + xor.b32 %r22610, %r30677, %r22982; + xor.b32 %r22609, %r30678, %r22983; + xor.b32 %r22649, %r30675, %r22982; + xor.b32 %r22650, %r30676, %r22983; + xor.b32 %r22681, %r30673, %r22982; + xor.b32 %r22682, %r30674, %r22983; + mov.u32 %r22587, 44; + // begin inline asm + shf.l.wrap.b32 %r22580, %r22586, %r22585, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22584, %r22585, %r22586, %r22587; + // end inline asm + mov.u32 %r22595, 20; + // begin inline asm + shf.l.wrap.b32 %r22588, %r22594, %r22593, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22592, %r22593, %r22594, %r22595; + // end inline asm + mov.u32 %r22603, 61; + // begin inline asm + shf.l.wrap.b32 %r22596, %r22602, %r22601, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22600, %r22601, %r22602, %r22603; + // end inline asm + mov.u32 %r22611, 39; + // begin inline asm + shf.l.wrap.b32 %r22604, %r22610, %r22609, %r22611; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22608, %r22609, %r22610, %r22611; + // end inline asm + mov.u32 %r22619, 18; + // begin inline asm + shf.l.wrap.b32 %r22612, %r22618, 
%r22617, %r22619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22616, %r22617, %r22618, %r22619; + // end inline asm + mov.u32 %r22627, 62; + // begin inline asm + shf.l.wrap.b32 %r22620, %r22626, %r22625, %r22627; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22624, %r22625, %r22626, %r22627; + // end inline asm + mov.u32 %r22635, 43; + // begin inline asm + shf.l.wrap.b32 %r22628, %r22634, %r22633, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22632, %r22633, %r22634, %r22635; + // end inline asm + mov.u32 %r22643, 25; + // begin inline asm + shf.l.wrap.b32 %r22636, %r22642, %r22641, %r22643; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22640, %r22641, %r22642, %r22643; + // end inline asm + mov.u32 %r22651, 8; + // begin inline asm + shf.l.wrap.b32 %r22644, %r22650, %r22649, %r22651; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22648, %r22649, %r22650, %r22651; + // end inline asm + mov.u32 %r22659, 56; + // begin inline asm + shf.l.wrap.b32 %r22652, %r22658, %r22657, %r22659; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22656, %r22657, %r22658, %r22659; + // end inline asm + mov.u32 %r22667, 41; + // begin inline asm + shf.l.wrap.b32 %r22660, %r22666, %r22665, %r22667; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22664, %r22665, %r22666, %r22667; + // end inline asm + mov.u32 %r22675, 27; + // begin inline asm + shf.l.wrap.b32 %r22668, %r22674, %r22673, %r22675; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22672, %r22673, %r22674, %r22675; + // end inline asm + mov.u32 %r22683, 14; + // begin inline asm + shf.l.wrap.b32 %r22676, %r22682, %r22681, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22680, %r22681, %r22682, %r22683; + // end inline asm + mov.u32 %r22691, 2; + // begin inline asm + shf.l.wrap.b32 %r22684, %r22690, %r22689, %r22691; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22688, %r22689, %r22690, %r22691; + // end inline asm + mov.u32 %r22699, 55; + // begin inline asm + shf.l.wrap.b32 %r22692, %r22698, %r22697, %r22699; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22696, %r22697, %r22698, %r22699; + // end inline asm + mov.u32 %r22707, 45; + // begin inline asm + shf.l.wrap.b32 %r22700, %r22706, %r22705, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22704, %r22705, %r22706, %r22707; + // end inline asm + mov.u32 %r22715, 36; + // begin inline asm + shf.l.wrap.b32 %r22708, %r22714, %r22713, %r22715; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22712, %r22713, %r22714, %r22715; + // end inline asm + mov.u32 %r22723, 28; + // begin inline asm + shf.l.wrap.b32 %r22716, %r22722, %r22721, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22720, %r22721, %r22722, %r22723; + // end inline asm + mov.u32 %r22731, 21; + // begin inline asm + shf.l.wrap.b32 %r22724, %r22730, %r22729, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22728, %r22729, %r22730, %r22731; + // end inline asm + mov.u32 %r22739, 15; + // begin inline asm + shf.l.wrap.b32 %r22732, %r22738, %r22737, %r22739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22736, %r22737, %r22738, %r22739; + // end inline asm + mov.u32 %r22747, 10; + // begin inline asm + shf.l.wrap.b32 %r22740, %r22746, %r22745, %r22747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22744, %r22745, %r22746, %r22747; + // end inline asm + mov.u32 
%r22755, 6; + // begin inline asm + shf.l.wrap.b32 %r22748, %r22754, %r22753, %r22755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22752, %r22753, %r22754, %r22755; + // end inline asm + mov.u32 %r22763, 3; + // begin inline asm + shf.l.wrap.b32 %r22756, %r22762, %r22761, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22760, %r22761, %r22762, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22764, %r22770, %r22769, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22768, %r22769, %r22770, %r22441; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22772, %r22807, %r22580, %r22628, 0xD2; + lop3.b32 %r22773, %r22810, %r22584, %r22632, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r22580, %r22628, %r22724, 0xD2; + lop3.b32 %r30722, %r22584, %r22632, %r22728, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30717, %r22628, %r22724, %r22676, 0xD2; + lop3.b32 %r30718, %r22632, %r22728, %r22680, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30713, %r22724, %r22676, %r22807, 0xD2; + lop3.b32 %r30714, %r22728, %r22680, %r22810, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30711, %r22676, %r22807, %r22580, 0xD2; + lop3.b32 %r30712, %r22680, %r22810, %r22584, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30707, %r22716, %r22588, %r22756, 0xD2; + lop3.b32 %r30708, %r22720, %r22592, %r22760, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30719, %r22588, %r22756, %r22700, 0xD2; + lop3.b32 %r30720, %r22592, %r22760, %r22704, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30715, %r22756, %r22700, %r22596, 0xD2; + lop3.b32 %r30716, %r22760, %r22704, %r22600, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30687, %r22700, %r22596, %r22716, 0xD2; + lop3.b32 %r30688, %r22704, %r22600, %r22720, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30687, %r30688}; + // begin inline asm + // chi + lop3.b32 %r30679, %r22596, %r22716, %r22588, 0xD2; + lop3.b32 %r30680, %r22600, %r22720, %r22592, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30679, %r30680}; + // begin inline asm + // chi + lop3.b32 %r30705, %r22764, %r22748, %r22636, 0xD2; + lop3.b32 %r30706, %r22768, %r22752, %r22640, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30705, %r30706}; + // begin inline asm + // chi + lop3.b32 %r30699, %r22748, %r22636, %r22644, 0xD2; + lop3.b32 %r30700, %r22752, %r22640, %r22648, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+112], {%r30699, %r30700}; + // begin inline asm + // chi + lop3.b32 %r30693, %r22636, %r22644, %r22612, 0xD2; + lop3.b32 %r30694, %r22640, %r22648, %r22616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30693, %r30694}; + // begin inline asm + // chi + lop3.b32 %r30685, %r22644, %r22612, %r22764, 0xD2; + lop3.b32 %r30686, %r22648, %r22616, %r22768, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30685, %r30686}; + // begin inline asm + // chi + lop3.b32 %r30677, %r22612, %r22764, %r22748, 0xD2; + lop3.b32 %r30678, %r22616, %r22768, %r22752, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30677, %r30678}; + // begin inline asm + // chi + lop3.b32 %r30703, %r22668, %r22708, %r22740, 0xD2; + lop3.b32 %r30704, %r22672, %r22712, %r22744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30703, %r30704}; + // begin inline asm + 
// chi + lop3.b32 %r30697, %r22708, %r22740, %r22732, 0xD2; + lop3.b32 %r30698, %r22712, %r22744, %r22736, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30697, %r30698}; + // begin inline asm + // chi + lop3.b32 %r30691, %r22740, %r22732, %r22652, 0xD2; + lop3.b32 %r30692, %r22744, %r22736, %r22656, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30691, %r30692}; + // begin inline asm + // chi + lop3.b32 %r30683, %r22732, %r22652, %r22668, 0xD2; + lop3.b32 %r30684, %r22736, %r22656, %r22672, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30683, %r30684}; + // begin inline asm + // chi + lop3.b32 %r30675, %r22652, %r22668, %r22708, 0xD2; + lop3.b32 %r30676, %r22656, %r22672, %r22712, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30675, %r30676}; + // begin inline asm + // chi + lop3.b32 %r30701, %r22620, %r22692, %r22604, 0xD2; + lop3.b32 %r30702, %r22624, %r22696, %r22608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30701, %r30702}; + // begin inline asm + // chi + lop3.b32 %r30695, %r22692, %r22604, %r22660, 0xD2; + lop3.b32 %r30696, %r22696, %r22608, %r22664, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30695, %r30696}; + // begin inline asm + // chi + lop3.b32 %r30689, %r22604, %r22660, %r22684, 0xD2; + lop3.b32 %r30690, %r22608, %r22664, %r22688, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30689, %r30690}; + // begin inline asm + // chi + lop3.b32 %r30681, %r22660, %r22684, %r22620, 0xD2; + lop3.b32 %r30682, %r22664, %r22688, %r22624, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30681, %r30682}; + // begin inline asm + // chi + lop3.b32 %r30673, %r22684, %r22620, %r22692, 0xD2; + lop3.b32 %r30674, %r22688, %r22624, %r22696, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30673, %r30674}; + mul.wide.s32 %rd1002, %r30723, 8; + add.s64 %rd1001, %rd918, %rd1002; + // begin inline asm + ld.global.nc.v2.u32 {%r22972,%r22973}, [%rd1001]; + // end inline asm + xor.b32 %r30709, %r22772, %r22972; + xor.b32 %r30710, %r22773, %r22973; + add.s32 %r30723, %r30723, 1; + setp.lt.u32 %p46, %r30723, 23; + @%p46 bra $L__BB2_81; -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd455, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; + mov.u32 %r23083, 1; + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + // begin inline asm + // xor5 + lop3.b32 %r22984, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22984, %r22984, %r30703, %r30701, 0x96; + lop3.b32 %r22985, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22985, %r22985, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22996, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22996, %r22996, %r30697, %r30695, 0x96; + lop3.b32 %r22997, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22997, %r22997, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23008, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r23008, %r23008, %r30691, %r30689, 0x96; + lop3.b32 %r23009, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r23009, %r23009, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23020, %r30713, %r30687, 
%r30685, 0x96; + lop3.b32 %r23020, %r23020, %r30683, %r30681, 0x96; + lop3.b32 %r23021, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r23021, %r23021, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23032, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r23032, %r23032, %r30675, %r30673, 0x96; + lop3.b32 %r23033, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r23033, %r23033, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23044, %r22997, %r22996, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23048, %r22996, %r22997, %r23083; + // end inline asm + xor.b32 %r23222, %r23044, %r23032; + xor.b32 %r23223, %r23048, %r23033; + xor.b32 %r23191, %r30709, %r23222; + xor.b32 %r23194, %r30710, %r23223; + xor.b32 %r23154, %r30706, %r23223; + xor.b32 %r23153, %r30705, %r23222; + st.local.v2.u32 [%rd925+104], {%r23153, %r23154}; + // begin inline asm + shf.l.wrap.b32 %r23052, %r23009, %r23008, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23056, %r23008, %r23009, %r23083; + // end inline asm + xor.b32 %r23224, %r23052, %r22984; + xor.b32 %r23225, %r23056, %r22985; + xor.b32 %r23090, %r30719, %r23224; + xor.b32 %r23089, %r30720, %r23225; + xor.b32 %r23129, %r30698, %r23225; + xor.b32 %r23130, %r30697, %r23224; + st.local.v2.u32 [%rd925+152], {%r23130, %r23129}; + // begin inline asm + shf.l.wrap.b32 %r23060, %r23021, %r23020, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23064, %r23020, %r23021, %r23083; + // end inline asm + xor.b32 %r23226, %r23060, %r22996; + xor.b32 %r23227, %r23064, %r22997; + xor.b32 %r23113, %r30694, %r23227; + xor.b32 %r23114, %r30693, %r23226; + st.local.v2.u32 [%rd925+120], {%r23114, %r23113}; + xor.b32 %r23105, %r30690, %r23227; + xor.b32 %r23106, %r30689, %r23226; + st.local.v2.u32 [%rd925+200], {%r23106, %r23105}; + // begin inline asm + shf.l.wrap.b32 %r23068, %r23033, %r23032, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23072, %r23032, %r23033, %r23083; + // end inline asm + xor.b32 %r23228, %r23068, %r23008; + xor.b32 %r23229, %r23072, %r23009; + xor.b32 %r23137, %r30713, %r23228; + xor.b32 %r23138, %r30714, %r23229; + xor.b32 %r23146, %r30684, %r23229; + xor.b32 %r23145, %r30683, %r23228; + st.local.v2.u32 [%rd925+168], {%r23145, %r23146}; + // begin inline asm + shf.l.wrap.b32 %r23076, %r22985, %r22984, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23080, %r22984, %r22985, %r23083; + // end inline asm + xor.b32 %r23230, %r23076, %r23020; + xor.b32 %r23231, %r23080, %r23021; + xor.b32 %r23097, %r30679, %r23230; + xor.b32 %r23098, %r30680, %r23231; + xor.b32 %r23122, %r30674, %r23231; + xor.b32 %r23121, %r30673, %r23230; + st.local.v2.u32 [%rd925+216], {%r23121, %r23122}; + // begin inline asm + shf.l.wrap.b32 %r23084, %r23090, %r23089, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23088, %r23089, %r23090, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23092, %r23098, %r23097, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23096, %r23097, %r23098, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23104, %r23105, %r23106, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23100, %r23106, %r23105, %r22603; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r23100, %r23104}; + // begin inline asm + shf.l.wrap.b32 %r23108, %r23114, %r23113, %r22635; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r23112, %r23113, %r23114, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23116, %r23122, %r23121, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23120, %r23121, %r23122, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23128, %r23129, %r23130, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23124, %r23130, %r23129, %r22707; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r23124, %r23128}; + // begin inline asm + shf.l.wrap.b32 %r23132, %r23138, %r23137, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23136, %r23137, %r23138, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23140, %r23146, %r23145, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23144, %r23145, %r23146, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23148, %r23154, %r23153, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23152, %r23153, %r23154, %r22763; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23156, %r23191, %r23084, %r23108, 0xD2; + lop3.b32 %r23157, %r23194, %r23088, %r23112, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23164, %r23084, %r23108, %r23140, 0xD2; + lop3.b32 %r23165, %r23088, %r23112, %r23144, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r23164, %r23165}; + // begin inline asm + // chi + lop3.b32 %r23172, %r23108, %r23140, %r23116, 0xD2; + lop3.b32 %r23173, %r23112, %r23144, %r23120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r23172, %r23173}; + // begin inline asm + // chi + lop3.b32 %r23180, %r23140, %r23116, %r23191, 0xD2; + lop3.b32 %r23181, %r23144, %r23120, %r23194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r23180, %r23181}; + // begin inline asm + // chi + lop3.b32 %r23188, %r23116, %r23191, %r23084, 0xD2; + lop3.b32 %r23189, %r23120, %r23194, %r23088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r23188, %r23189}; + // begin inline asm + // chi + lop3.b32 %r23196, %r23132, %r23092, %r23148, 0xD2; + lop3.b32 %r23197, %r23136, %r23096, %r23152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r23196, %r23197}; + // begin inline asm + // chi + lop3.b32 %r23204, %r23092, %r23148, %r23124, 0xD2; + lop3.b32 %r23205, %r23096, %r23152, %r23128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r23204, %r23205}; + // begin inline asm + // chi + lop3.b32 %r23212, %r23148, %r23124, %r23100, 0xD2; + lop3.b32 %r23213, %r23152, %r23128, %r23104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r23212, %r23213}; + // begin inline asm + ld.global.nc.v2.u32 {%r23220,%r23221}, [%rd919]; + // end inline asm + xor.b32 %r23232, %r23157, %r23221; + xor.b32 %r23233, %r23156, %r23220; + st.local.v2.u32 [%rd925+24], {%r23233, %r23232}; + st.global.u64 [%rd224], %rd1269; + st.global.u64 [%rd224+8], %rd1270; + st.global.u64 [%rd224+16], %rd1271; + st.global.u64 [%rd224+24], %rd250; + st.global.u64 [%rd224+32], %rd1272; + st.global.u64 [%rd224+40], %rd252; + st.global.u64 [%rd224+48], %rd253; + st.global.u64 [%rd224+56], %rd254; + st.global.v2.u32 [%rd224+64], {%r23233, %r23232}; + st.global.v2.u32 [%rd224+72], {%r23164, %r23165}; + st.global.v2.u32 [%rd224+80], {%r23172, %r23173}; + st.global.v2.u32 [%rd224+88], {%r23180, %r23181}; + st.global.v2.u32 [%rd224+96], {%r23188, %r23189}; + st.global.v2.u32 [%rd224+104], {%r23196, %r23197}; + st.global.v2.u32 
[%rd224+112], {%r23204, %r23205}; + st.global.v2.u32 [%rd224+120], {%r23212, %r23213}; -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd460, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; +$L__BB2_94: + mul.lo.s32 %r26518, %r20, 16777619; + mov.b64 {%r26519, %r26520}, %rd1265; + mul.lo.s32 %r26521, %r19, 16777619; + xor.b32 %r26522, %r26518, %r26519; + xor.b32 %r26523, %r26521, %r26520; + mov.b64 %rd1116, {%r26522, %r26523}; + mov.b64 {%r26524, %r26525}, %rd1269; + xor.b32 %r26526, %r26525, %r19; + xor.b32 %r26527, %r26524, %r20; + mov.b64 %rd1117, {%r26527, %r26526}; + mov.b64 {%r26528, %r26529}, %rd1259; + mul.lo.s32 %r26530, %r26528, 16777619; + mov.b64 {%r26531, %r26532}, %rd1266; + mul.lo.s32 %r26533, %r26529, 16777619; + xor.b32 %r26534, %r26533, %r26532; + xor.b32 %r26535, %r26530, %r26531; + mov.b64 %rd1118, {%r26535, %r26534}; + mov.b64 {%r26536, %r26537}, %rd1270; + xor.b32 %r26538, %r26537, %r26529; + xor.b32 %r26539, %r26536, %r26528; + mov.b64 %rd1119, {%r26539, %r26538}; + mul.lo.s32 %r26540, %r24, 16777619; + mov.b64 {%r26541, %r26542}, %rd1267; + mul.lo.s32 %r26543, %r23, 16777619; + xor.b32 %r26544, %r26543, %r26542; + xor.b32 %r26545, %r26540, %r26541; + mov.b64 %rd1120, {%r26545, %r26544}; + mov.b64 {%r26546, %r26547}, %rd1271; + xor.b32 %r26548, %r26547, %r23; + xor.b32 %r26549, %r26546, %r24; + mov.b64 %rd1121, {%r26549, %r26548}; + mul.lo.s32 %r26550, %r28, 16777619; + mov.b64 {%r26551, %r26552}, %rd1268; + mul.lo.s32 %r26553, %r27, 16777619; + xor.b32 %r26554, %r26553, %r26552; + xor.b32 %r26555, %r26550, %r26551; + mov.b64 %rd1122, {%r26555, %r26554}; + mov.b64 {%r26556, %r26557}, %rd1272; + xor.b32 %r26558, %r26557, %r27; + xor.b32 %r26559, %r26556, %r28; + mov.b64 %rd1123, {%r26559, %r26558}; + mul.lo.s64 %rd1124, %rd1261, %rd1116; + add.s64 %rd1260, %rd1124, %rd1117; + mul.lo.s64 %rd1125, %rd1262, %rd1118; + add.s64 %rd1259, %rd1125, %rd1119; + mul.lo.s64 %rd1126, %rd1263, %rd1120; + add.s64 %rd1258, %rd1126, %rd1121; + mul.lo.s64 %rd1127, %rd1264, %rd1122; + add.s64 %rd1257, %rd1127, %rd1123; + add.s32 %r29538, %r29538, 1; + setp.lt.u32 %p52, %r29538, 32; + @%p52 bra $L__BB2_19; -$L__BB0_16: - ld.const.u64 %rd430, [target]; - setp.lt.u64 %p16, %rd73, %rd430; - bra.uni $L__BB0_17; + add.u64 %rd1250, %SPL, 2016; + add.u64 %rd1242, %SP, 2016; + add.u64 %rd1241, %SP, 0; + mov.u64 %rd1128, 0; + mov.b64 {%r26560, %r26561}, %rd1260; + mul.lo.s32 %r26562, %r26560, 16777619; + xor.b32 %r26563, %r26562, %r26561; + mul.lo.s32 %r26564, %r26563, 16777619; + mov.b64 {%r26565, %r26566}, %rd1259; + xor.b32 %r26567, %r26564, %r26565; + mul.lo.s32 %r26568, %r26567, 16777619; + xor.b32 %r26569, %r26568, %r26566; + mov.b32 {%rs498, %rs499}, %r26569; + mov.u32 %r26570, 0; + st.local.v4.u32 [%rd1250+32], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+48], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+64], {%r26570, %r26570, %r26570, %r26570}; + cvt.u64.u16 %rd1131, %rs498; + and.b64 %rd1132, %rd1131, 255; + or.b64 %rd1133, %rd26, %rd1132; + st.local.v2.u64 [%rd1250], {%rd1133, %rd23}; + st.local.v2.u64 [%rd1250+16], {%rd24, %rd25}; + mov.u32 %r26571, -1150833019; + mov.u32 %r26572, 1779033703; + st.local.v2.u32 [%rd3], {%r26572, %r26571}; + mov.u32 %r26573, -1521486534; + mov.u32 %r26574, 1013904242; + st.local.v2.u32 [%rd3+8], {%r26574, %r26573}; + mov.u32 %r26575, -1694144372; + mov.u32 %r26576, 1359893119; + st.local.v2.u32 [%rd3+16], {%r26576, %r26575}; + mov.u32 %r26577, 1541459225; + mov.u32 %r26578, 528734635; + 
st.local.v2.u32 [%rd3+24], {%r26578, %r26577}; + st.local.v2.u32 [%rd3+32], {%r26572, %r26571}; + st.local.v2.u32 [%rd3+40], {%r26574, %r26573}; + st.local.v2.u32 [%rd3+48], {%r26576, %r26575}; + st.local.v2.u32 [%rd3+56], {%r26578, %r26577}; + st.local.u64 [%rd3+64], %rd1128; + st.local.v2.u32 [%rd3+72], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+80], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+88], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+96], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+104], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+112], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+120], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+128], {%r26570, %r26570}; + mov.u16 %rs500, 0; + st.local.v2.u8 [%rd3+136], {%rs500, %rs500}; + st.local.u8 [%rd3+138], %rs500; + st.local.u8 [%rd3+144], %rs500; + { // callseq 13, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1241; + .param .b64 param1; + st.param.b64 [param1+0], %rd1242; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 13 + ld.local.u8 %rd1275, [%rd3+144]; + setp.eq.s64 %p53, %rd1275, 0; + @%p53 bra $L__BB2_103; -$L__BB0_11: - setp.lt.u64 %p16, %rd450, %rd75; - bra.uni $L__BB0_17; + ld.local.v2.u8 {%rs864, %rs502}, [%rd3+136]; + cvt.u32.u16 %r26579, %rs502; + mul.wide.u32 %rd1135, %r26579, 64; + cvt.u64.u16 %rd1136, %rs864; + neg.s64 %rd1137, %rd1136; + setp.eq.s64 %p54, %rd1135, %rd1137; + @%p54 bra $L__BB2_98; + bra.uni $L__BB2_97; -$L__BB0_13: - setp.lt.u64 %p16, %rd455, %rd76; - bra.uni $L__BB0_17; +$L__BB2_98: + add.s64 %rd1275, %rd1275, -2; + shl.b64 %rd1139, %rd1275, 5; + add.s64 %rd1142, %rd3, %rd1139; + ld.local.u8 %rs667, [%rd3+138]; + mov.u64 %rd1276, 0; + or.b16 %rs734, %rs667, 4; + ld.local.v2.u32 {%r30976, %r30975}, [%rd3]; + ld.local.v2.u32 {%r30974, %r30973}, [%rd3+8]; + ld.local.v2.u32 {%r30972, %r30971}, [%rd3+16]; + ld.local.v2.u32 {%r30970, %r30969}, [%rd3+24]; + ld.local.u8 %rs800, [%rd1142+145]; + ld.local.u8 %rs801, [%rd1142+146]; + ld.local.u8 %rs802, [%rd1142+147]; + ld.local.u8 %rs803, [%rd1142+148]; + ld.local.u8 %rs804, [%rd1142+149]; + ld.local.u8 %rs805, [%rd1142+150]; + ld.local.u8 %rs806, [%rd1142+151]; + ld.local.u8 %rs807, [%rd1142+152]; + ld.local.u8 %rs808, [%rd1142+153]; + ld.local.u8 %rs809, [%rd1142+154]; + ld.local.u8 %rs810, [%rd1142+155]; + ld.local.u8 %rs811, [%rd1142+156]; + ld.local.u8 %rs812, [%rd1142+157]; + ld.local.u8 %rs813, [%rd1142+158]; + ld.local.u8 %rs814, [%rd1142+159]; + ld.local.u8 %rs815, [%rd1142+160]; + ld.local.u8 %rs816, [%rd1142+161]; + ld.local.u8 %rs817, [%rd1142+162]; + ld.local.u8 %rs818, [%rd1142+163]; + ld.local.u8 %rs819, [%rd1142+164]; + ld.local.u8 %rs820, [%rd1142+165]; + ld.local.u8 %rs821, [%rd1142+166]; + ld.local.u8 %rs822, [%rd1142+167]; + ld.local.u8 %rs823, [%rd1142+168]; + ld.local.u8 %rs824, [%rd1142+169]; + ld.local.u8 %rs825, [%rd1142+170]; + ld.local.u8 %rs826, [%rd1142+171]; + ld.local.u8 %rs827, [%rd1142+172]; + ld.local.u8 %rs828, [%rd1142+173]; + ld.local.u8 %rs829, [%rd1142+174]; + ld.local.u8 %rs830, [%rd1142+175]; + ld.local.u8 %rs831, [%rd1142+176]; + ld.local.u8 %rs832, [%rd1142+177]; + ld.local.u8 %rs833, [%rd1142+178]; + ld.local.u8 %rs834, [%rd1142+179]; + ld.local.u8 %rs835, [%rd1142+180]; + ld.local.u8 %rs836, [%rd1142+181]; + ld.local.u8 %rs837, [%rd1142+182]; + ld.local.u8 %rs838, [%rd1142+183]; + ld.local.u8 %rs839, [%rd1142+184]; + ld.local.u8 %rs840, [%rd1142+185]; + ld.local.u8 %rs841, [%rd1142+186]; + ld.local.u8 %rs842, [%rd1142+187]; + 
ld.local.u8 %rs843, [%rd1142+188]; + ld.local.u8 %rs844, [%rd1142+189]; + ld.local.u8 %rs845, [%rd1142+190]; + ld.local.u8 %rs846, [%rd1142+191]; + ld.local.u8 %rs847, [%rd1142+192]; + ld.local.u8 %rs848, [%rd1142+193]; + ld.local.u8 %rs849, [%rd1142+194]; + ld.local.u8 %rs850, [%rd1142+195]; + ld.local.u8 %rs851, [%rd1142+196]; + ld.local.u8 %rs852, [%rd1142+197]; + ld.local.u8 %rs853, [%rd1142+198]; + ld.local.u8 %rs854, [%rd1142+199]; + ld.local.v4.u16 {%rs855, %rs857, %rs859, %rs861}, [%rd1142+200]; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 %rs860, %rs859, 8; + shr.u16 %rs862, %rs861, 8; + ld.local.u8 %rs863, [%rd1142+208]; + mov.u16 %rs864, 64; + bra.uni $L__BB2_99; -$L__BB0_15: - setp.lt.u64 %p16, %rd460, %rd77; +$L__BB2_103: + ld.local.v4.u8 {%rs570, %rs571, %rs572, %rs573}, [%rd3+136]; + setp.eq.s16 %p58, %rs571, 0; + selp.u16 %rs575, 1, 0, %p58; + ld.local.v2.u32 {%r28596, %r28597}, [%rd3+32]; + ld.local.v2.u32 {%r28600, %r28601}, [%rd3+40]; + ld.local.v2.u32 {%r28604, %r28605}, [%rd3+48]; + ld.local.v2.u32 {%r28608, %r28609}, [%rd3+56]; + ld.local.v4.u16 {%rs576, %rs577, %rs578, %rs579}, [%rd3+72]; + shr.u16 %rs581, %rs576, 8; + shr.u16 %rs583, %rs577, 8; + shr.u16 %rs585, %rs578, 8; + shr.u16 %rs587, %rs579, 8; + ld.local.v4.u16 {%rs588, %rs589, %rs590, %rs591}, [%rd3+80]; + shr.u16 %rs593, %rs588, 8; + shr.u16 %rs595, %rs589, 8; + shr.u16 %rs597, %rs590, 8; + shr.u16 %rs599, %rs591, 8; + ld.local.v4.u16 {%rs600, %rs601, %rs602, %rs603}, [%rd3+88]; + shr.u16 %rs605, %rs600, 8; + shr.u16 %rs607, %rs601, 8; + shr.u16 %rs609, %rs602, 8; + shr.u16 %rs611, %rs603, 8; + ld.local.v4.u16 {%rs612, %rs613, %rs614, %rs615}, [%rd3+96]; + shr.u16 %rs617, %rs612, 8; + shr.u16 %rs619, %rs613, 8; + shr.u16 %rs621, %rs614, 8; + shr.u16 %rs623, %rs615, 8; + ld.local.v4.u16 {%rs624, %rs625, %rs626, %rs627}, [%rd3+104]; + shr.u16 %rs629, %rs624, 8; + shr.u16 %rs631, %rs625, 8; + shr.u16 %rs633, %rs626, 8; + shr.u16 %rs635, %rs627, 8; + ld.local.v4.u16 {%rs636, %rs637, %rs638, %rs639}, [%rd3+112]; + shr.u16 %rs641, %rs636, 8; + shr.u16 %rs643, %rs637, 8; + shr.u16 %rs645, %rs638, 8; + shr.u16 %rs647, %rs639, 8; + ld.local.v4.u16 {%rs648, %rs649, %rs650, %rs651}, [%rd3+120]; + shr.u16 %rs653, %rs648, 8; + shr.u16 %rs655, %rs649, 8; + ld.local.v2.u8 {%rs657, %rs658}, [%rd3+126]; + ld.local.u16 %r28612, [%rd3+132]; + ld.local.v2.u8 {%rs661, %rs662}, [%rd3+134]; + or.b16 %rs665, %rs572, %rs575; + or.b16 %rs666, %rs665, 10; + cvt.u32.u16 %r28613, %rs576; + and.b32 %r28614, %r28613, 255; + cvt.u32.u16 %r28615, %rs581; + prmt.b32 %r28616, %r28615, %r28614, 30212; + cvt.u32.u16 %r28617, %rs577; + prmt.b32 %r28618, %r28617, %r28616, 28756; + cvt.u32.u16 %r28619, %rs583; + prmt.b32 %r28620, %r28619, %r28618, 1620; + cvt.u32.u16 %r28621, %rs578; + and.b32 %r28622, %r28621, 255; + cvt.u32.u16 %r28623, %rs585; + prmt.b32 %r28624, %r28623, %r28622, 30212; + cvt.u32.u16 %r28625, %rs579; + prmt.b32 %r28626, %r28625, %r28624, 28756; + cvt.u32.u16 %r28627, %rs587; + prmt.b32 %r28628, %r28627, %r28626, 1620; + cvt.u32.u16 %r28629, %rs588; + and.b32 %r28630, %r28629, 255; + cvt.u32.u16 %r28631, %rs593; + prmt.b32 %r28632, %r28631, %r28630, 30212; + cvt.u32.u16 %r28633, %rs589; + prmt.b32 %r28634, %r28633, %r28632, 28756; + cvt.u32.u16 %r28635, %rs595; + prmt.b32 %r28636, %r28635, %r28634, 1620; + cvt.u32.u16 %r28637, %rs590; + and.b32 %r28638, %r28637, 255; + cvt.u32.u16 %r28639, %rs597; + prmt.b32 %r28640, %r28639, %r28638, 30212; + cvt.u32.u16 %r28641, %rs591; + prmt.b32 %r28642, %r28641, 
%r28640, 28756; + cvt.u32.u16 %r28643, %rs599; + prmt.b32 %r28644, %r28643, %r28642, 1620; + cvt.u32.u16 %r28645, %rs600; + and.b32 %r28646, %r28645, 255; + cvt.u32.u16 %r28647, %rs605; + prmt.b32 %r28648, %r28647, %r28646, 30212; + cvt.u32.u16 %r28649, %rs601; + prmt.b32 %r28650, %r28649, %r28648, 28756; + cvt.u32.u16 %r28651, %rs607; + prmt.b32 %r28652, %r28651, %r28650, 1620; + cvt.u32.u16 %r28653, %rs602; + and.b32 %r28654, %r28653, 255; + cvt.u32.u16 %r28655, %rs609; + prmt.b32 %r28656, %r28655, %r28654, 30212; + cvt.u32.u16 %r28657, %rs603; + prmt.b32 %r28658, %r28657, %r28656, 28756; + cvt.u32.u16 %r28659, %rs611; + prmt.b32 %r28660, %r28659, %r28658, 1620; + cvt.u32.u16 %r28661, %rs612; + and.b32 %r28662, %r28661, 255; + cvt.u32.u16 %r28663, %rs617; + prmt.b32 %r28664, %r28663, %r28662, 30212; + cvt.u32.u16 %r28665, %rs613; + prmt.b32 %r28666, %r28665, %r28664, 28756; + cvt.u32.u16 %r28667, %rs619; + prmt.b32 %r28668, %r28667, %r28666, 1620; + cvt.u32.u16 %r28669, %rs614; + and.b32 %r28670, %r28669, 255; + cvt.u32.u16 %r28671, %rs621; + prmt.b32 %r28672, %r28671, %r28670, 30212; + cvt.u32.u16 %r28673, %rs615; + prmt.b32 %r28674, %r28673, %r28672, 28756; + cvt.u32.u16 %r28675, %rs623; + prmt.b32 %r28676, %r28675, %r28674, 1620; + cvt.u32.u16 %r28677, %rs624; + and.b32 %r28678, %r28677, 255; + cvt.u32.u16 %r28679, %rs629; + prmt.b32 %r28680, %r28679, %r28678, 30212; + cvt.u32.u16 %r28681, %rs625; + prmt.b32 %r28682, %r28681, %r28680, 28756; + cvt.u32.u16 %r28683, %rs631; + prmt.b32 %r28684, %r28683, %r28682, 1620; + cvt.u32.u16 %r28685, %rs626; + and.b32 %r28686, %r28685, 255; + cvt.u32.u16 %r28687, %rs633; + prmt.b32 %r28688, %r28687, %r28686, 30212; + cvt.u32.u16 %r28689, %rs627; + prmt.b32 %r28690, %r28689, %r28688, 28756; + cvt.u32.u16 %r28691, %rs635; + prmt.b32 %r28692, %r28691, %r28690, 1620; + cvt.u32.u16 %r28693, %rs636; + and.b32 %r28694, %r28693, 255; + cvt.u32.u16 %r28695, %rs641; + prmt.b32 %r28696, %r28695, %r28694, 30212; + cvt.u32.u16 %r28697, %rs637; + prmt.b32 %r28698, %r28697, %r28696, 28756; + cvt.u32.u16 %r28699, %rs643; + prmt.b32 %r28700, %r28699, %r28698, 1620; + cvt.u32.u16 %r28701, %rs638; + and.b32 %r28702, %r28701, 255; + cvt.u32.u16 %r28703, %rs645; + prmt.b32 %r28704, %r28703, %r28702, 30212; + cvt.u32.u16 %r28705, %rs639; + prmt.b32 %r28706, %r28705, %r28704, 28756; + cvt.u32.u16 %r28707, %rs647; + prmt.b32 %r28708, %r28707, %r28706, 1620; + cvt.u32.u16 %r28709, %rs648; + and.b32 %r28710, %r28709, 255; + cvt.u32.u16 %r28711, %rs653; + prmt.b32 %r28712, %r28711, %r28710, 30212; + cvt.u32.u16 %r28713, %rs649; + prmt.b32 %r28714, %r28713, %r28712, 28756; + cvt.u32.u16 %r28715, %rs655; + prmt.b32 %r28716, %r28715, %r28714, 1620; + cvt.u32.u16 %r28717, %rs650; + and.b32 %r28718, %r28717, 255; + ld.local.u8 %r28719, [%rd3+125]; + prmt.b32 %r28720, %r28719, %r28718, 30212; + cvt.u32.u16 %r28721, %rs657; + prmt.b32 %r28722, %r28721, %r28720, 28756; + cvt.u32.u16 %r28723, %rs658; + prmt.b32 %r28724, %r28723, %r28722, 1620; + ld.local.u32 %r28725, [%rd3+128]; + cvt.u32.u16 %r28726, %rs661; + prmt.b32 %r28727, %r28726, %r28612, 28756; + cvt.u32.u16 %r28728, %rs662; + prmt.b32 %r28729, %r28728, %r28727, 1620; + cvt.u32.u16 %r28730, %rs570; + cvt.u32.u16 %r28731, %rs666; + and.b32 %r28732, %r28731, 255; + add.s32 %r28733, %r28604, %r28596; + add.s32 %r28734, %r28733, %r28620; + add.s32 %r28735, %r28628, %r28734; + add.s32 %r28736, %r28605, %r28597; + add.s32 %r28737, %r28736, %r28636; + add.s32 %r28738, %r28644, %r28737; + add.s32 %r28739, %r28608, %r28600; + 
add.s32 %r28740, %r28739, %r28652; + xor.b32 %r28741, %r28740, %r28730; + shr.u32 %r28742, %r28740, 16; + shl.b32 %r28743, %r28741, 16; + or.b32 %r28744, %r28743, %r28742; + add.s32 %r28745, %r28744, 1013904242; + xor.b32 %r28746, %r28745, %r28608; + shf.l.wrap.b32 %r28747, %r28746, %r28746, 20; + add.s32 %r28748, %r28660, %r28740; + add.s32 %r28749, %r28748, %r28747; + xor.b32 %r28750, %r28749, %r28744; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 24; + add.s32 %r28752, %r28751, %r28745; + xor.b32 %r28753, %r28752, %r28747; + shf.l.wrap.b32 %r28754, %r28753, %r28753, 25; + add.s32 %r28755, %r28609, %r28601; + add.s32 %r28756, %r28755, %r28668; + xor.b32 %r28757, %r28756, %r28732; + shr.u32 %r28758, %r28756, 16; + shl.b32 %r28759, %r28757, 16; + or.b32 %r28760, %r28759, %r28758; + add.s32 %r28761, %r28760, -1521486534; + xor.b32 %r28762, %r28761, %r28609; + shf.l.wrap.b32 %r28763, %r28762, %r28762, 20; + add.s32 %r28764, %r28676, %r28756; + add.s32 %r28765, %r28764, %r28763; + xor.b32 %r28766, %r28765, %r28760; + shf.l.wrap.b32 %r28767, %r28766, %r28766, 24; + add.s32 %r28768, %r28767, %r28761; + xor.b32 %r28769, %r28768, %r28763; + shf.l.wrap.b32 %r28770, %r28769, %r28769, 25; + add.s32 %r28771, %r28700, %r28754; + add.s32 %r28772, %r28770, %r28749; + add.s32 %r28773, %r28772, %r28716; + add.s32 %r28774, %r28724, %r28773; + add.s32 %r28775, %r28725, %r28765; + shf.l.wrap.b32 %r28776, %r28734, %r28734, 16; + add.s32 %r28777, %r28776, 1779033703; + xor.b32 %r28778, %r28777, %r28604; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28735, %r28779; + xor.b32 %r28781, %r28780, %r28776; + shf.l.wrap.b32 %r28782, %r28781, %r28781, 24; + add.s32 %r28783, %r28782, %r28777; + xor.b32 %r28784, %r28783, %r28779; + shf.l.wrap.b32 %r28785, %r28784, %r28784, 25; + shf.l.wrap.b32 %r28786, %r28737, %r28737, 16; + add.s32 %r28787, %r28786, -1150833019; + xor.b32 %r28788, %r28787, %r28605; + shf.l.wrap.b32 %r28789, %r28788, %r28788, 20; + add.s32 %r28790, %r28738, %r28789; + xor.b32 %r28791, %r28790, %r28786; + shf.l.wrap.b32 %r28792, %r28791, %r28791, 24; + add.s32 %r28793, %r28792, %r28787; + xor.b32 %r28794, %r28793, %r28789; + shf.l.wrap.b32 %r28795, %r28794, %r28794, 25; + add.s32 %r28796, %r28780, %r28684; + add.s32 %r28797, %r28796, %r28795; + xor.b32 %r28798, %r28797, %r28767; + shf.l.wrap.b32 %r28799, %r28798, %r28798, 16; + add.s32 %r28800, %r28799, %r28752; + xor.b32 %r28801, %r28800, %r28795; + shf.l.wrap.b32 %r28802, %r28801, %r28801, 20; + add.s32 %r28803, %r28797, %r28692; + add.s32 %r28804, %r28803, %r28802; + xor.b32 %r28805, %r28804, %r28799; + shf.l.wrap.b32 %r28806, %r28805, %r28805, 24; + add.s32 %r28807, %r28806, %r28800; + xor.b32 %r28808, %r28807, %r28802; + shf.l.wrap.b32 %r28809, %r28808, %r28808, 25; + add.s32 %r28810, %r28771, %r28790; + xor.b32 %r28811, %r28782, %r28810; + shf.l.wrap.b32 %r28812, %r28811, %r28811, 16; + add.s32 %r28813, %r28812, %r28768; + xor.b32 %r28814, %r28813, %r28754; + shf.l.wrap.b32 %r28815, %r28814, %r28814, 20; + add.s32 %r28816, %r28810, %r28708; + add.s32 %r28817, %r28816, %r28815; + xor.b32 %r28818, %r28817, %r28812; + shf.l.wrap.b32 %r28819, %r28818, %r28818, 24; + add.s32 %r28820, %r28819, %r28813; + xor.b32 %r28821, %r28820, %r28815; + shf.l.wrap.b32 %r28822, %r28821, %r28821, 25; + xor.b32 %r28823, %r28792, %r28773; + shf.l.wrap.b32 %r28824, %r28823, %r28823, 16; + add.s32 %r28825, %r28824, %r28783; + xor.b32 %r28826, %r28825, %r28770; + shf.l.wrap.b32 %r28827, %r28826, %r28826, 20; + add.s32 %r28828, %r28774, %r28827; + 
xor.b32 %r28829, %r28828, %r28824; + shf.l.wrap.b32 %r28830, %r28829, %r28829, 24; + add.s32 %r28831, %r28830, %r28825; + xor.b32 %r28832, %r28831, %r28827; + shf.l.wrap.b32 %r28833, %r28832, %r28832, 25; + add.s32 %r28834, %r28775, %r28785; + xor.b32 %r28835, %r28834, %r28751; + shf.l.wrap.b32 %r28836, %r28835, %r28835, 16; + add.s32 %r28837, %r28836, %r28793; + xor.b32 %r28838, %r28837, %r28785; + shf.l.wrap.b32 %r28839, %r28838, %r28838, 20; + add.s32 %r28840, %r28834, %r28729; + add.s32 %r28841, %r28840, %r28839; + xor.b32 %r28842, %r28841, %r28836; + shf.l.wrap.b32 %r28843, %r28842, %r28842, 24; + add.s32 %r28844, %r28843, %r28837; + xor.b32 %r28845, %r28844, %r28839; + shf.l.wrap.b32 %r28846, %r28845, %r28845, 25; + add.s32 %r28847, %r28804, %r28636; + add.s32 %r28848, %r28847, %r28846; + xor.b32 %r28849, %r28848, %r28819; + shf.l.wrap.b32 %r28850, %r28849, %r28849, 16; + add.s32 %r28851, %r28850, %r28831; + xor.b32 %r28852, %r28851, %r28846; + shf.l.wrap.b32 %r28853, %r28852, %r28852, 20; + add.s32 %r28854, %r28848, %r28668; + add.s32 %r28855, %r28854, %r28853; + xor.b32 %r28856, %r28855, %r28850; + shf.l.wrap.b32 %r28857, %r28856, %r28856, 24; + add.s32 %r28858, %r28857, %r28851; + xor.b32 %r28859, %r28858, %r28853; + shf.l.wrap.b32 %r28860, %r28859, %r28859, 25; + add.s32 %r28861, %r28817, %r28644; + add.s32 %r28862, %r28861, %r28809; + xor.b32 %r28863, %r28862, %r28830; + shf.l.wrap.b32 %r28864, %r28863, %r28863, 16; + add.s32 %r28865, %r28864, %r28844; + xor.b32 %r28866, %r28865, %r28809; + shf.l.wrap.b32 %r28867, %r28866, %r28866, 20; + add.s32 %r28868, %r28862, %r28700; + add.s32 %r28869, %r28868, %r28867; + xor.b32 %r28870, %r28869, %r28864; + shf.l.wrap.b32 %r28871, %r28870, %r28870, 24; + add.s32 %r28872, %r28871, %r28865; + xor.b32 %r28873, %r28872, %r28867; + shf.l.wrap.b32 %r28874, %r28873, %r28873, 25; + add.s32 %r28875, %r28828, %r28676; + add.s32 %r28876, %r28875, %r28822; + xor.b32 %r28877, %r28843, %r28876; + shf.l.wrap.b32 %r28878, %r28877, %r28877, 16; + add.s32 %r28879, %r28878, %r28807; + xor.b32 %r28880, %r28879, %r28822; + shf.l.wrap.b32 %r28881, %r28880, %r28880, 20; + add.s32 %r28882, %r28876, %r28620; + add.s32 %r28883, %r28882, %r28881; + xor.b32 %r28884, %r28883, %r28878; + shf.l.wrap.b32 %r28885, %r28884, %r28884, 24; + add.s32 %r28886, %r28885, %r28879; + xor.b32 %r28887, %r28886, %r28881; + shf.l.wrap.b32 %r28888, %r28887, %r28887, 25; + add.s32 %r28889, %r28841, %r28652; + add.s32 %r28890, %r28889, %r28833; + xor.b32 %r28891, %r28806, %r28890; + shf.l.wrap.b32 %r28892, %r28891, %r28891, 16; + add.s32 %r28893, %r28892, %r28820; + xor.b32 %r28894, %r28893, %r28833; + shf.l.wrap.b32 %r28895, %r28894, %r28894, 20; + add.s32 %r28896, %r28890, %r28724; + add.s32 %r28897, %r28896, %r28895; + xor.b32 %r28898, %r28897, %r28892; + shf.l.wrap.b32 %r28899, %r28898, %r28898, 24; + add.s32 %r28900, %r28899, %r28893; + xor.b32 %r28901, %r28900, %r28895; + shf.l.wrap.b32 %r28902, %r28901, %r28901, 25; + add.s32 %r28903, %r28855, %r28628; + add.s32 %r28904, %r28903, %r28874; + xor.b32 %r28905, %r28904, %r28899; + shf.l.wrap.b32 %r28906, %r28905, %r28905, 16; + add.s32 %r28907, %r28906, %r28886; + xor.b32 %r28908, %r28907, %r28874; + shf.l.wrap.b32 %r28909, %r28908, %r28908, 20; + add.s32 %r28910, %r28904, %r28708; + add.s32 %r28911, %r28910, %r28909; + xor.b32 %r28912, %r28911, %r28906; + shf.l.wrap.b32 %r28913, %r28912, %r28912, 24; + add.s32 %r28914, %r28913, %r28907; + xor.b32 %r28915, %r28914, %r28909; + shf.l.wrap.b32 %r28916, %r28915, %r28915, 25; + add.s32 
%r28917, %r28888, %r28716; + add.s32 %r28918, %r28917, %r28869; + xor.b32 %r28919, %r28857, %r28918; + shf.l.wrap.b32 %r28920, %r28919, %r28919, 16; + add.s32 %r28921, %r28920, %r28900; + xor.b32 %r28922, %r28921, %r28888; + shf.l.wrap.b32 %r28923, %r28922, %r28922, 20; + add.s32 %r28924, %r28918, %r28660; + add.s32 %r28925, %r28924, %r28923; + xor.b32 %r28926, %r28925, %r28920; + shf.l.wrap.b32 %r28927, %r28926, %r28926, 24; + add.s32 %r28928, %r28927, %r28921; + xor.b32 %r28929, %r28928, %r28923; + shf.l.wrap.b32 %r28930, %r28929, %r28929, 25; + add.s32 %r28931, %r28883, %r28692; + add.s32 %r28932, %r28931, %r28902; + xor.b32 %r28933, %r28871, %r28932; + shf.l.wrap.b32 %r28934, %r28933, %r28933, 16; + add.s32 %r28935, %r28934, %r28858; + xor.b32 %r28936, %r28935, %r28902; + shf.l.wrap.b32 %r28937, %r28936, %r28936, 20; + add.s32 %r28938, %r28932, %r28725; + add.s32 %r28939, %r28938, %r28937; + xor.b32 %r28940, %r28939, %r28934; + shf.l.wrap.b32 %r28941, %r28940, %r28940, 24; + add.s32 %r28942, %r28941, %r28935; + xor.b32 %r28943, %r28942, %r28937; + shf.l.wrap.b32 %r28944, %r28943, %r28943, 25; + add.s32 %r28945, %r28897, %r28729; + add.s32 %r28946, %r28945, %r28860; + xor.b32 %r28947, %r28946, %r28885; + shf.l.wrap.b32 %r28948, %r28947, %r28947, 16; + add.s32 %r28949, %r28948, %r28872; + xor.b32 %r28950, %r28949, %r28860; + shf.l.wrap.b32 %r28951, %r28950, %r28950, 20; + add.s32 %r28952, %r28946, %r28684; + add.s32 %r28953, %r28952, %r28951; + xor.b32 %r28954, %r28953, %r28948; + shf.l.wrap.b32 %r28955, %r28954, %r28954, 24; + add.s32 %r28956, %r28955, %r28949; + xor.b32 %r28957, %r28956, %r28951; + shf.l.wrap.b32 %r28958, %r28957, %r28957, 25; + add.s32 %r28959, %r28911, %r28644; + add.s32 %r28960, %r28959, %r28958; + xor.b32 %r28961, %r28960, %r28927; + shf.l.wrap.b32 %r28962, %r28961, %r28961, 16; + add.s32 %r28963, %r28962, %r28942; + xor.b32 %r28964, %r28963, %r28958; + shf.l.wrap.b32 %r28965, %r28964, %r28964, 20; + add.s32 %r28966, %r28960, %r28652; + add.s32 %r28967, %r28966, %r28965; + xor.b32 %r28968, %r28967, %r28962; + shf.l.wrap.b32 %r28969, %r28968, %r28968, 24; + add.s32 %r28970, %r28969, %r28963; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 25; + add.s32 %r28973, %r28925, %r28700; + add.s32 %r28974, %r28973, %r28916; + xor.b32 %r28975, %r28974, %r28941; + shf.l.wrap.b32 %r28976, %r28975, %r28975, 16; + add.s32 %r28977, %r28976, %r28956; + xor.b32 %r28978, %r28977, %r28916; + shf.l.wrap.b32 %r28979, %r28978, %r28978, 20; + add.s32 %r28980, %r28974, %r28716; + add.s32 %r28981, %r28980, %r28979; + xor.b32 %r28982, %r28981, %r28976; + shf.l.wrap.b32 %r28983, %r28982, %r28982, 24; + add.s32 %r28984, %r28983, %r28977; + xor.b32 %r28985, %r28984, %r28979; + shf.l.wrap.b32 %r28986, %r28985, %r28985, 25; + add.s32 %r28987, %r28939, %r28724; + add.s32 %r28988, %r28987, %r28930; + xor.b32 %r28989, %r28955, %r28988; + shf.l.wrap.b32 %r28990, %r28989, %r28989, 16; + add.s32 %r28991, %r28990, %r28914; + xor.b32 %r28992, %r28991, %r28930; + shf.l.wrap.b32 %r28993, %r28992, %r28992, 20; + add.s32 %r28994, %r28988, %r28636; + add.s32 %r28995, %r28994, %r28993; + xor.b32 %r28996, %r28995, %r28990; + shf.l.wrap.b32 %r28997, %r28996, %r28996, 24; + add.s32 %r28998, %r28997, %r28991; + xor.b32 %r28999, %r28998, %r28993; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 25; + add.s32 %r29001, %r28953, %r28676; + add.s32 %r29002, %r29001, %r28944; + xor.b32 %r29003, %r28913, %r29002; + shf.l.wrap.b32 %r29004, %r29003, %r29003, 16; + add.s32 %r29005, %r29004, 
%r28928; + xor.b32 %r29006, %r29005, %r28944; + shf.l.wrap.b32 %r29007, %r29006, %r29006, 20; + add.s32 %r29008, %r29002, %r28725; + add.s32 %r29009, %r29008, %r29007; + xor.b32 %r29010, %r29009, %r29004; + shf.l.wrap.b32 %r29011, %r29010, %r29010, 24; + add.s32 %r29012, %r29011, %r29005; + xor.b32 %r29013, %r29012, %r29007; + shf.l.wrap.b32 %r29014, %r29013, %r29013, 25; + add.s32 %r29015, %r28967, %r28668; + add.s32 %r29016, %r29015, %r28986; + xor.b32 %r29017, %r29016, %r29011; + shf.l.wrap.b32 %r29018, %r29017, %r29017, 16; + add.s32 %r29019, %r29018, %r28998; + xor.b32 %r29020, %r29019, %r28986; + shf.l.wrap.b32 %r29021, %r29020, %r29020, 20; + add.s32 %r29022, %r29016, %r28660; + add.s32 %r29023, %r29022, %r29021; + xor.b32 %r29024, %r29023, %r29018; + shf.l.wrap.b32 %r29025, %r29024, %r29024, 24; + add.s32 %r29026, %r29025, %r29019; + xor.b32 %r29027, %r29026, %r29021; + shf.l.wrap.b32 %r29028, %r29027, %r29027, 25; + add.s32 %r29029, %r29000, %r28692; + add.s32 %r29030, %r29029, %r28981; + xor.b32 %r29031, %r28969, %r29030; + shf.l.wrap.b32 %r29032, %r29031, %r29031, 16; + add.s32 %r29033, %r29032, %r29012; + xor.b32 %r29034, %r29033, %r29000; + shf.l.wrap.b32 %r29035, %r29034, %r29034, 20; + add.s32 %r29036, %r29030, %r28620; + add.s32 %r29037, %r29036, %r29035; + xor.b32 %r29038, %r29037, %r29032; + shf.l.wrap.b32 %r29039, %r29038, %r29038, 24; + add.s32 %r29040, %r29039, %r29033; + xor.b32 %r29041, %r29040, %r29035; + shf.l.wrap.b32 %r29042, %r29041, %r29041, 25; + add.s32 %r29043, %r28995, %r28708; + add.s32 %r29044, %r29043, %r29014; + xor.b32 %r29045, %r28983, %r29044; + shf.l.wrap.b32 %r29046, %r29045, %r29045, 16; + add.s32 %r29047, %r29046, %r28970; + xor.b32 %r29048, %r29047, %r29014; + shf.l.wrap.b32 %r29049, %r29048, %r29048, 20; + add.s32 %r29050, %r29044, %r28729; + add.s32 %r29051, %r29050, %r29049; + xor.b32 %r29052, %r29051, %r29046; + shf.l.wrap.b32 %r29053, %r29052, %r29052, 24; + add.s32 %r29054, %r29053, %r29047; + xor.b32 %r29055, %r29054, %r29049; + shf.l.wrap.b32 %r29056, %r29055, %r29055, 25; + add.s32 %r29057, %r29009, %r28684; + add.s32 %r29058, %r29057, %r28972; + xor.b32 %r29059, %r29058, %r28997; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 16; + add.s32 %r29061, %r29060, %r28984; + xor.b32 %r29062, %r29061, %r28972; + shf.l.wrap.b32 %r29063, %r29062, %r29062, 20; + add.s32 %r29064, %r29058, %r28628; + add.s32 %r29065, %r29064, %r29063; + xor.b32 %r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 24; + add.s32 %r29068, %r29067, %r29061; + xor.b32 %r29069, %r29068, %r29063; + shf.l.wrap.b32 %r29070, %r29069, %r29069, 25; + add.s32 %r29071, %r29023, %r28700; + add.s32 %r29072, %r29071, %r29070; + xor.b32 %r29073, %r29072, %r29039; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 16; + add.s32 %r29075, %r29074, %r29054; + xor.b32 %r29076, %r29075, %r29070; + shf.l.wrap.b32 %r29077, %r29076, %r29076, 20; + add.s32 %r29078, %r29072, %r28676; + add.s32 %r29079, %r29078, %r29077; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 24; + add.s32 %r29082, %r29081, %r29075; + xor.b32 %r29083, %r29082, %r29077; + shf.l.wrap.b32 %r29084, %r29083, %r29083, 25; + add.s32 %r29085, %r29037, %r28716; + add.s32 %r29086, %r29085, %r29028; + xor.b32 %r29087, %r29086, %r29053; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 16; + add.s32 %r29089, %r29088, %r29068; + xor.b32 %r29090, %r29089, %r29028; + shf.l.wrap.b32 %r29091, %r29090, %r29090, 20; + add.s32 %r29092, %r29086, %r28692; + add.s32 %r29093, %r29092, %r29091; + xor.b32 
%r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 24; + add.s32 %r29096, %r29095, %r29089; + xor.b32 %r29097, %r29096, %r29091; + shf.l.wrap.b32 %r29098, %r29097, %r29097, 25; + add.s32 %r29099, %r29051, %r28725; + add.s32 %r29100, %r29099, %r29042; + xor.b32 %r29101, %r29067, %r29100; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 16; + add.s32 %r29103, %r29102, %r29026; + xor.b32 %r29104, %r29103, %r29042; + shf.l.wrap.b32 %r29105, %r29104, %r29104, 20; + add.s32 %r29106, %r29100, %r28644; + add.s32 %r29107, %r29106, %r29105; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 24; + add.s32 %r29110, %r29109, %r29103; + xor.b32 %r29111, %r29110, %r29105; + shf.l.wrap.b32 %r29112, %r29111, %r29111, 25; + add.s32 %r29113, %r29065, %r28724; + add.s32 %r29114, %r29113, %r29056; + xor.b32 %r29115, %r29025, %r29114; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 16; + add.s32 %r29117, %r29116, %r29040; + xor.b32 %r29118, %r29117, %r29056; + shf.l.wrap.b32 %r29119, %r29118, %r29118, 20; + add.s32 %r29120, %r29114, %r28729; + add.s32 %r29121, %r29120, %r29119; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 24; + add.s32 %r29124, %r29123, %r29117; + xor.b32 %r29125, %r29124, %r29119; + shf.l.wrap.b32 %r29126, %r29125, %r29125, 25; + add.s32 %r29127, %r29079, %r28652; + add.s32 %r29128, %r29127, %r29098; + xor.b32 %r29129, %r29128, %r29123; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 16; + add.s32 %r29131, %r29130, %r29110; + xor.b32 %r29132, %r29131, %r29098; + shf.l.wrap.b32 %r29133, %r29132, %r29132, 20; + add.s32 %r29134, %r29128, %r28620; + add.s32 %r29135, %r29134, %r29133; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 24; + add.s32 %r29138, %r29137, %r29131; + xor.b32 %r29139, %r29138, %r29133; + shf.l.wrap.b32 %r29140, %r29139, %r29139, 25; + add.s32 %r29141, %r29112, %r28708; + add.s32 %r29142, %r29141, %r29093; + xor.b32 %r29143, %r29081, %r29142; + shf.l.wrap.b32 %r29144, %r29143, %r29143, 16; + add.s32 %r29145, %r29144, %r29124; + xor.b32 %r29146, %r29145, %r29112; + shf.l.wrap.b32 %r29147, %r29146, %r29146, 20; + add.s32 %r29148, %r29142, %r28636; + add.s32 %r29149, %r29148, %r29147; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 24; + add.s32 %r29152, %r29151, %r29145; + xor.b32 %r29153, %r29152, %r29147; + shf.l.wrap.b32 %r29154, %r29153, %r29153, 25; + add.s32 %r29155, %r29107, %r28660; + add.s32 %r29156, %r29155, %r29126; + xor.b32 %r29157, %r29095, %r29156; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 16; + add.s32 %r29159, %r29158, %r29082; + xor.b32 %r29160, %r29159, %r29126; + shf.l.wrap.b32 %r29161, %r29160, %r29160, 20; + add.s32 %r29162, %r29156, %r28684; + add.s32 %r29163, %r29162, %r29161; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 24; + add.s32 %r29166, %r29165, %r29159; + xor.b32 %r29167, %r29166, %r29161; + shf.l.wrap.b32 %r29168, %r29167, %r29167, 25; + add.s32 %r29169, %r29121, %r28628; + add.s32 %r29170, %r29169, %r29084; + xor.b32 %r29171, %r29170, %r29109; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 16; + add.s32 %r29173, %r29172, %r29096; + xor.b32 %r29174, %r29173, %r29084; + shf.l.wrap.b32 %r29175, %r29174, %r29174, 20; + add.s32 %r29176, %r29170, %r28668; + add.s32 %r29177, %r29176, %r29175; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 24; + add.s32 %r29180, %r29179, %r29173; + xor.b32 %r29181, %r29180, %r29175; + shf.l.wrap.b32 %r29182, 
%r29181, %r29181, 25; + add.s32 %r29183, %r29135, %r28716; + add.s32 %r29184, %r29183, %r29182; + xor.b32 %r29185, %r29184, %r29151; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 16; + add.s32 %r29187, %r29186, %r29166; + xor.b32 %r29188, %r29187, %r29182; + shf.l.wrap.b32 %r29189, %r29188, %r29188, 20; + add.s32 %r29190, %r29184, %r28724; + add.s32 %r29191, %r29190, %r29189; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 24; + add.s32 %r29194, %r29193, %r29187; + xor.b32 %r29195, %r29194, %r29189; + shf.l.wrap.b32 %r29196, %r29195, %r29195, 25; + add.s32 %r29197, %r29149, %r28692; + add.s32 %r29198, %r29197, %r29140; + xor.b32 %r29199, %r29198, %r29165; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 16; + add.s32 %r29201, %r29200, %r29180; + xor.b32 %r29202, %r29201, %r29140; + shf.l.wrap.b32 %r29203, %r29202, %r29202, 20; + add.s32 %r29204, %r29198, %r28708; + add.s32 %r29205, %r29204, %r29203; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 24; + add.s32 %r29208, %r29207, %r29201; + xor.b32 %r29209, %r29208, %r29203; + shf.l.wrap.b32 %r29210, %r29209, %r29209, 25; + add.s32 %r29211, %r29163, %r28729; + add.s32 %r29212, %r29211, %r29154; + xor.b32 %r29213, %r29179, %r29212; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 16; + add.s32 %r29215, %r29214, %r29138; + xor.b32 %r29216, %r29215, %r29154; + shf.l.wrap.b32 %r29217, %r29216, %r29216, 20; + add.s32 %r29218, %r29212, %r28700; + add.s32 %r29219, %r29218, %r29217; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 24; + add.s32 %r29222, %r29221, %r29215; + xor.b32 %r29223, %r29222, %r29217; + shf.l.wrap.b32 %r29224, %r29223, %r29223, 25; + add.s32 %r29225, %r29177, %r28725; + add.s32 %r29226, %r29225, %r29168; + xor.b32 %r29227, %r29137, %r29226; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 16; + add.s32 %r29229, %r29228, %r29152; + xor.b32 %r29230, %r29229, %r29168; + shf.l.wrap.b32 %r29231, %r29230, %r29230, 20; + add.s32 %r29232, %r29226, %r28684; + add.s32 %r29233, %r29232, %r29231; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 24; + add.s32 %r29236, %r29235, %r29229; + xor.b32 %r29237, %r29236, %r29231; + shf.l.wrap.b32 %r29238, %r29237, %r29237, 25; + add.s32 %r29239, %r29191, %r28676; + add.s32 %r29240, %r29239, %r29210; + xor.b32 %r29241, %r29240, %r29235; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 16; + add.s32 %r29243, %r29242, %r29222; + xor.b32 %r29244, %r29243, %r29210; + shf.l.wrap.b32 %r29245, %r29244, %r29244, 20; + add.s32 %r29246, %r29240, %r28636; + add.s32 %r29247, %r29246, %r29245; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 24; + add.s32 %r29250, %r29249, %r29243; + xor.b32 %r29251, %r29250, %r29245; + shf.l.wrap.b32 %r29252, %r29251, %r29251, 25; + add.s32 %r29253, %r29224, %r28660; + add.s32 %r29254, %r29253, %r29205; + xor.b32 %r29255, %r29193, %r29254; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 16; + add.s32 %r29257, %r29256, %r29236; + xor.b32 %r29258, %r29257, %r29224; + shf.l.wrap.b32 %r29259, %r29258, %r29258, 20; + add.s32 %r29260, %r29254, %r28644; + add.s32 %r29261, %r29260, %r29259; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 24; + add.s32 %r29264, %r29263, %r29257; + xor.b32 %r29265, %r29264, %r29259; + shf.l.wrap.b32 %r29266, %r29265, %r29265, 25; + add.s32 %r29267, %r29219, %r28620; + add.s32 %r29268, %r29267, %r29238; + xor.b32 %r29269, %r29207, %r29268; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 16; + 
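// Note: the repeating add/xor/rotate pattern above and below matches an unrolled BLAKE3 G quarter-round; the left-rotates by 16, 20, 24 and 25 (shf.l.wrap.b32) are BLAKE3's right-rotates by 16, 12, 8 and 7. +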
add.s32 %r29271, %r29270, %r29194; + xor.b32 %r29272, %r29271, %r29238; + shf.l.wrap.b32 %r29273, %r29272, %r29272, 20; + add.s32 %r29274, %r29268, %r28628; + add.s32 %r29275, %r29274, %r29273; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 24; + add.s32 %r29278, %r29277, %r29271; + xor.b32 %r29279, %r29278, %r29273; + shf.l.wrap.b32 %r29280, %r29279, %r29279, 25; + add.s32 %r29281, %r29233, %r28668; + add.s32 %r29282, %r29281, %r29196; + xor.b32 %r29283, %r29282, %r29221; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 16; + add.s32 %r29285, %r29284, %r29208; + xor.b32 %r29286, %r29285, %r29196; + shf.l.wrap.b32 %r29287, %r29286, %r29286, 20; + add.s32 %r29288, %r29282, %r28652; + add.s32 %r29289, %r29288, %r29287; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 24; + add.s32 %r29292, %r29291, %r29285; + xor.b32 %r29293, %r29292, %r29287; + shf.l.wrap.b32 %r29294, %r29293, %r29293, 25; + add.s32 %r29295, %r29247, %r28692; + add.s32 %r29296, %r29295, %r29294; + xor.b32 %r29297, %r29296, %r29263; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 16; + add.s32 %r29299, %r29298, %r29278; + xor.b32 %r29300, %r29299, %r29294; + shf.l.wrap.b32 %r29301, %r29300, %r29300, 20; + add.s32 %r29302, %r29296, %r28725; + add.s32 %r29303, %r29302, %r29301; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 24; + add.s32 %r29306, %r29305, %r29299; + xor.b32 %r29307, %r29306, %r29301; + shf.l.wrap.b32 %r29308, %r29307, %r29307, 25; + add.s32 %r29309, %r29261, %r28708; + add.s32 %r29310, %r29309, %r29252; + xor.b32 %r29311, %r29310, %r29277; + shf.l.wrap.b32 %r29312, %r29311, %r29311, 16; + add.s32 %r29313, %r29312, %r29292; + xor.b32 %r29314, %r29313, %r29252; + shf.l.wrap.b32 %r29315, %r29314, %r29314, 20; + add.s32 %r29316, %r29310, %r28660; + add.s32 %r29317, %r29316, %r29315; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 24; + add.s32 %r29320, %r29319, %r29313; + xor.b32 %r29321, %r29320, %r29315; + shf.l.wrap.b32 %r29322, %r29321, %r29321, 25; + add.s32 %r29323, %r29275, %r28684; + add.s32 %r29324, %r29323, %r29266; + xor.b32 %r29325, %r29291, %r29324; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 16; + add.s32 %r29327, %r29326, %r29250; + xor.b32 %r29328, %r29327, %r29266; + shf.l.wrap.b32 %r29329, %r29328, %r29328, 20; + add.s32 %r29330, %r29324, %r28716; + add.s32 %r29331, %r29330, %r29329; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 24; + add.s32 %r29334, %r29333, %r29327; + xor.b32 %r29335, %r29334, %r29329; + shf.l.wrap.b32 %r29336, %r29335, %r29335, 25; + add.s32 %r29337, %r29289, %r28729; + add.s32 %r29338, %r29337, %r29280; + xor.b32 %r29339, %r29249, %r29338; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 16; + add.s32 %r29341, %r29340, %r29264; + xor.b32 %r29342, %r29341, %r29280; + shf.l.wrap.b32 %r29343, %r29342, %r29342, 20; + add.s32 %r29344, %r29338, %r28628; + add.s32 %r29345, %r29344, %r29343; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 24; + add.s32 %r29348, %r29347, %r29341; + xor.b32 %r29349, %r29348, %r29343; + shf.l.wrap.b32 %r29350, %r29349, %r29349, 25; + add.s32 %r29351, %r29303, %r28724; + add.s32 %r29352, %r29351, %r29322; + xor.b32 %r29353, %r29352, %r29347; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 16; + add.s32 %r29355, %r29354, %r29334; + xor.b32 %r29356, %r29355, %r29322; + shf.l.wrap.b32 %r29357, %r29356, %r29356, 20; + add.s32 %r29358, %r29352, %r28644; + add.s32 %r29359, 
%r29358, %r29357; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 24; + add.s32 %r29362, %r29361, %r29355; + xor.b32 %r29363, %r29362, %r29357; + shf.l.wrap.b32 %r29364, %r29363, %r29363, 25; + add.s32 %r29365, %r29336, %r28620; + add.s32 %r29366, %r29365, %r29317; + xor.b32 %r29367, %r29305, %r29366; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 16; + add.s32 %r29369, %r29368, %r29348; + xor.b32 %r29370, %r29369, %r29336; + shf.l.wrap.b32 %r29371, %r29370, %r29370, 20; + add.s32 %r29372, %r29366, %r28700; + add.s32 %r29373, %r29372, %r29371; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 24; + add.s32 %r29376, %r29375, %r29369; + xor.b32 %r29377, %r29376, %r29371; + shf.l.wrap.b32 %r29378, %r29377, %r29377, 25; + add.s32 %r29379, %r29331, %r28636; + add.s32 %r29380, %r29379, %r29350; + xor.b32 %r29381, %r29319, %r29380; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 16; + add.s32 %r29383, %r29382, %r29306; + xor.b32 %r29384, %r29383, %r29350; + shf.l.wrap.b32 %r29385, %r29384, %r29384, 20; + add.s32 %r29386, %r29380, %r28668; + add.s32 %r29387, %r29386, %r29385; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 24; + add.s32 %r29390, %r29389, %r29383; + xor.b32 %r29391, %r29390, %r29385; + shf.l.wrap.b32 %r29392, %r29391, %r29391, 25; + add.s32 %r29393, %r29345, %r28652; + add.s32 %r29394, %r29393, %r29308; + xor.b32 %r29395, %r29394, %r29333; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 16; + add.s32 %r29397, %r29396, %r29320; + xor.b32 %r29398, %r29397, %r29308; + shf.l.wrap.b32 %r29399, %r29398, %r29398, 20; + add.s32 %r29400, %r29394, %r28676; + add.s32 %r29401, %r29400, %r29399; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 24; + add.s32 %r29404, %r29403, %r29397; + xor.b32 %r29405, %r29404, %r29399; + shf.l.wrap.b32 %r29406, %r29405, %r29405, 25; + add.s32 %r29407, %r29359, %r28708; + add.s32 %r29408, %r29407, %r29406; + xor.b32 %r29409, %r29408, %r29375; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 16; + add.s32 %r29411, %r29410, %r29390; + xor.b32 %r29412, %r29411, %r29406; + shf.l.wrap.b32 %r29413, %r29412, %r29412, 20; + add.s32 %r29414, %r29408, %r28729; + add.s32 %r29415, %r29414, %r29413; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 24; + add.s32 %r29418, %r29417, %r29411; + xor.b32 %r29419, %r29418, %r29413; + shf.l.wrap.b32 %r29420, %r29419, %r29419, 25; + add.s32 %r29421, %r29373, %r28660; + add.s32 %r29422, %r29421, %r29364; + xor.b32 %r29423, %r29422, %r29389; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 16; + add.s32 %r29425, %r29424, %r29404; + xor.b32 %r29426, %r29425, %r29364; + shf.l.wrap.b32 %r29427, %r29426, %r29426, 20; + add.s32 %r29428, %r29422, %r28620; + add.s32 %r29429, %r29428, %r29427; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 24; + add.s32 %r29432, %r29431, %r29425; + xor.b32 %r29433, %r29432, %r29427; + shf.l.wrap.b32 %r29434, %r29433, %r29433, 25; + add.s32 %r29435, %r29387, %r28628; + add.s32 %r29436, %r29435, %r29378; + xor.b32 %r29437, %r29403, %r29436; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 16; + add.s32 %r29439, %r29438, %r29362; + xor.b32 %r29440, %r29439, %r29378; + shf.l.wrap.b32 %r29441, %r29440, %r29440, 20; + add.s32 %r29442, %r29436, %r28692; + add.s32 %r29443, %r29442, %r29441; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 24; + add.s32 %r29446, %r29445, %r29439; + xor.b32 %r29447, %r29446, %r29441; + 
shf.l.wrap.b32 %r29448, %r29447, %r29447, 25; + add.s32 %r29449, %r29401, %r28684; + add.s32 %r29450, %r29449, %r29392; + xor.b32 %r29451, %r29361, %r29450; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 16; + add.s32 %r29453, %r29452, %r29376; + xor.b32 %r29454, %r29453, %r29392; + shf.l.wrap.b32 %r29455, %r29454, %r29454, 20; + add.s32 %r29456, %r29450, %r28668; + add.s32 %r29457, %r29456, %r29455; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 24; + add.s32 %r29460, %r29459, %r29453; + xor.b32 %r29461, %r29460, %r29455; + shf.l.wrap.b32 %r29462, %r29461, %r29461, 25; + add.s32 %r29463, %r29415, %r28725; + add.s32 %r29464, %r29463, %r29434; + xor.b32 %r29465, %r29464, %r29459; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 16; + add.s32 %r29467, %r29466, %r29446; + xor.b32 %r29468, %r29467, %r29434; + shf.l.wrap.b32 %r29469, %r29468, %r29468, 20; + add.s32 %r29470, %r29464, %r28700; + add.s32 %r29471, %r29470, %r29469; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 24; + add.s32 %r29474, %r29473, %r29467; + xor.b32 %r29475, %r29474, %r29469; + shf.l.wrap.b32 %r29476, %r29475, %r29475, 25; + add.s32 %r29477, %r29448, %r28636; + add.s32 %r29478, %r29477, %r29429; + xor.b32 %r29479, %r29417, %r29478; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 16; + add.s32 %r29481, %r29480, %r29460; + xor.b32 %r29482, %r29481, %r29448; + shf.l.wrap.b32 %r29483, %r29482, %r29482, 20; + add.s32 %r29484, %r29478, %r28716; + add.s32 %r29485, %r29484, %r29483; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 24; + add.s32 %r29488, %r29487, %r29481; + xor.b32 %r29489, %r29488, %r29483; + shf.l.wrap.b32 %r29490, %r29489, %r29489, 25; + add.s32 %r29491, %r29443, %r28644; + add.s32 %r29492, %r29491, %r29462; + xor.b32 %r29493, %r29431, %r29492; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 16; + add.s32 %r29495, %r29494, %r29418; + xor.b32 %r29496, %r29495, %r29462; + shf.l.wrap.b32 %r29497, %r29496, %r29496, 20; + add.s32 %r29498, %r29492, %r28652; + add.s32 %r29499, %r29498, %r29497; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 24; + add.s32 %r29502, %r29501, %r29495; + xor.b32 %r29503, %r29502, %r29497; + shf.l.wrap.b32 %r29504, %r29503, %r29503, 25; + add.s32 %r29505, %r29457, %r28676; + add.s32 %r29506, %r29505, %r29420; + xor.b32 %r29507, %r29506, %r29445; + shf.l.wrap.b32 %r29508, %r29507, %r29507, 16; + add.s32 %r29509, %r29508, %r29432; + xor.b32 %r29510, %r29509, %r29420; + shf.l.wrap.b32 %r29511, %r29510, %r29510, 20; + add.s32 %r29512, %r29506, %r28724; + add.s32 %r29513, %r29512, %r29511; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 24; + add.s32 %r29516, %r29515, %r29509; + xor.b32 %r29517, %r29516, %r29511; + shf.l.wrap.b32 %r29518, %r29517, %r29517, 25; + xor.b32 %r29519, %r29471, %r29502; + cvt.u64.u32 %rd1190, %r29519; + xor.b32 %r29520, %r29516, %r29485; + and.b32 %r29521, %r29520, 255; + cvt.u64.u32 %rd1191, %r29521; + cvt.u64.u32 %rd1192, %r29520; + shl.b64 %rd1193, %rd1192, 32; + and.b64 %rd1194, %rd1193, 280375465082880; + and.b64 %rd1195, %rd1193, 71776119061217280; + shr.u32 %r29522, %r29520, 24; + cvt.u64.u32 %rd1196, %r29522; + shl.b64 %rd1197, %rd1196, 56; + bfi.b64 %rd1198, %rd1191, %rd1190, 32, 32; + or.b64 %rd1199, %rd1198, %rd1194; + or.b64 %rd1200, %rd1199, %rd1195; + or.b64 %rd341, %rd1200, %rd1197; + xor.b32 %r29523, %r29474, %r29499; + cvt.u64.u32 %rd1201, %r29523; + xor.b32 %r29524, %r29513, %r29488; + and.b32 
%r29525, %r29524, 255; + cvt.u64.u32 %rd1202, %r29525; + cvt.u64.u32 %rd1203, %r29524; + shl.b64 %rd1204, %rd1203, 32; + and.b64 %rd1205, %rd1204, 280375465082880; + and.b64 %rd1206, %rd1204, 71776119061217280; + shr.u32 %r29526, %r29524, 24; + cvt.u64.u32 %rd1207, %r29526; + shl.b64 %rd1208, %rd1207, 56; + bfi.b64 %rd1209, %rd1202, %rd1201, 32, 32; + or.b64 %rd1210, %rd1209, %rd1205; + or.b64 %rd1211, %rd1210, %rd1206; + or.b64 %rd345, %rd1211, %rd1208; + xor.b32 %r29527, %r29518, %r29487; + cvt.u64.u32 %rd1212, %r29527; + xor.b32 %r29528, %r29476, %r29501; + and.b32 %r29529, %r29528, 255; + cvt.u64.u32 %rd1213, %r29529; + cvt.u64.u32 %rd1214, %r29528; + shl.b64 %rd1215, %rd1214, 32; + and.b64 %rd1216, %rd1215, 280375465082880; + and.b64 %rd1217, %rd1215, 71776119061217280; + shr.u32 %r29530, %r29528, 24; + cvt.u64.u32 %rd1218, %r29530; + shl.b64 %rd1219, %rd1218, 56; + bfi.b64 %rd1220, %rd1213, %rd1212, 32, 32; + or.b64 %rd1221, %rd1220, %rd1216; + or.b64 %rd1222, %rd1221, %rd1217; + or.b64 %rd1280, %rd1222, %rd1219; + xor.b32 %r29531, %r29515, %r29490; + cvt.u64.u32 %rd1223, %r29531; + xor.b32 %r29532, %r29473, %r29504; + and.b32 %r29533, %r29532, 255; + cvt.u64.u32 %rd1224, %r29533; + cvt.u64.u32 %rd1225, %r29532; + shl.b64 %rd1226, %rd1225, 32; + and.b64 %rd1227, %rd1226, 280375465082880; + and.b64 %rd1228, %rd1226, 71776119061217280; + shr.u32 %r29534, %r29532, 24; + cvt.u64.u32 %rd1229, %r29534; + shl.b64 %rd1230, %rd1229, 56; + bfi.b64 %rd1231, %rd1224, %rd1223, 32, 32; + or.b64 %rd1232, %rd1231, %rd1227; + or.b64 %rd1233, %rd1232, %rd1228; + or.b64 %rd1279, %rd1233, %rd1230; + mov.u64 %rd342, %rd341; + bra.uni $L__BB2_104; -$L__BB0_17: - not.pred %p15, %p16; - @%p15 bra $L__BB0_19; +$L__BB2_97: + setp.eq.s16 %p55, %rs502, 0; + selp.u16 %rs504, 1, 0, %p55; + ld.local.u8 %rs667, [%rd3+138]; + or.b16 %rs505, %rs667, %rs504; + or.b16 %rs734, %rs505, 2; + ld.local.u64 %rd1276, [%rd3+64]; + ld.local.v2.u32 {%r30976, %r30975}, [%rd3+32]; + ld.local.v2.u32 {%r30974, %r30973}, [%rd3+40]; + ld.local.v2.u32 {%r30972, %r30971}, [%rd3+48]; + ld.local.v2.u32 {%r30970, %r30969}, [%rd3+56]; + ld.local.v4.u16 {%rs800, %rs802, %rs804, %rs806}, [%rd3+72]; + shr.u16 %rs801, %rs800, 8; + shr.u16 %rs803, %rs802, 8; + shr.u16 %rs805, %rs804, 8; + shr.u16 %rs807, %rs806, 8; + ld.local.v4.u16 {%rs808, %rs810, %rs812, %rs814}, [%rd3+80]; + shr.u16 %rs809, %rs808, 8; + shr.u16 %rs811, %rs810, 8; + shr.u16 %rs813, %rs812, 8; + shr.u16 %rs815, %rs814, 8; + ld.local.v4.u16 {%rs816, %rs818, %rs820, %rs822}, [%rd3+88]; + shr.u16 %rs817, %rs816, 8; + shr.u16 %rs819, %rs818, 8; + shr.u16 %rs821, %rs820, 8; + shr.u16 %rs823, %rs822, 8; + ld.local.v4.u16 {%rs824, %rs826, %rs828, %rs830}, [%rd3+96]; + shr.u16 %rs825, %rs824, 8; + shr.u16 %rs827, %rs826, 8; + shr.u16 %rs829, %rs828, 8; + shr.u16 %rs831, %rs830, 8; + ld.local.v4.u16 {%rs832, %rs834, %rs836, %rs838}, [%rd3+104]; + shr.u16 %rs833, %rs832, 8; + shr.u16 %rs835, %rs834, 8; + shr.u16 %rs837, %rs836, 8; + shr.u16 %rs839, %rs838, 8; + ld.local.v4.u16 {%rs840, %rs842, %rs844, %rs846}, [%rd3+112]; + shr.u16 %rs841, %rs840, 8; + shr.u16 %rs843, %rs842, 8; + shr.u16 %rs845, %rs844, 8; + shr.u16 %rs847, %rs846, 8; + ld.local.v4.u8 {%rs848, %rs849, %rs850, %rs851}, [%rd3+120]; + ld.local.v2.u8 {%rs852, %rs853}, [%rd3+124]; + ld.local.v2.u8 {%rs854, %rs855}, [%rd3+126]; + ld.local.v4.u8 {%rs856, %rs857, %rs858, %rs859}, [%rd3+128]; + ld.local.v2.u8 {%rs860, %rs861}, [%rd3+132]; + ld.local.v2.u8 {%rs862, %rs863}, [%rd3+134]; - ld.param.u64 %rd438, 
[heavy_hash_param_0]; - ld.param.u64 %rd437, [heavy_hash_param_1]; - and.b64 %rd436, %rd439, %rd438; - or.b64 %rd435, %rd436, %rd437; - ld.param.u64 %rd434, [heavy_hash_param_5]; - cvta.to.global.u64 %rd433, %rd434; - mov.u64 %rd431, 0; - atom.global.cas.b64 %rd432, [%rd433], %rd431, %rd435; +$L__BB2_99: + setp.eq.s64 %p56, %rd1275, 0; + mov.u32 %r30977, %r30976; + mov.u32 %r30978, %r30975; + mov.u32 %r30979, %r30974; + mov.u32 %r30980, %r30973; + mov.u32 %r30981, %r30972; + mov.u32 %r30982, %r30971; + mov.u32 %r30983, %r30970; + mov.u32 %r30984, %r30969; + mov.u16 %rs865, %rs734; + @%p56 bra $L__BB2_102; -$L__BB0_19: + or.b16 %rs865, %rs667, 4; + ld.local.v2.u32 {%r30977, %r30978}, [%rd3]; + ld.local.v2.u32 {%r30979, %r30980}, [%rd3+8]; + ld.local.v2.u32 {%r30981, %r30982}, [%rd3+16]; + ld.local.v2.u32 {%r30983, %r30984}, [%rd3+24]; + mov.u16 %rs768, %rs831; + mov.u16 %rs769, %rs830; + mov.u16 %rs770, %rs829; + mov.u16 %rs771, %rs828; + mov.u16 %rs772, %rs827; + mov.u16 %rs773, %rs826; + mov.u16 %rs774, %rs825; + mov.u16 %rs775, %rs824; + mov.u16 %rs776, %rs823; + mov.u16 %rs777, %rs822; + mov.u16 %rs778, %rs821; + mov.u16 %rs779, %rs820; + mov.u16 %rs780, %rs819; + mov.u16 %rs781, %rs818; + mov.u16 %rs782, %rs817; + mov.u16 %rs783, %rs816; + mov.u16 %rs784, %rs815; + mov.u16 %rs785, %rs814; + mov.u16 %rs786, %rs813; + mov.u16 %rs787, %rs812; + mov.u16 %rs788, %rs811; + mov.u16 %rs789, %rs810; + mov.u16 %rs790, %rs809; + mov.u16 %rs791, %rs808; + mov.u16 %rs792, %rs807; + mov.u16 %rs793, %rs806; + mov.u16 %rs794, %rs805; + mov.u16 %rs795, %rs804; + mov.u16 %rs796, %rs803; + mov.u16 %rs797, %rs802; + mov.u16 %rs798, %rs801; + mov.u16 %rs799, %rs800; + +$L__BB2_101: + add.s64 %rd1275, %rd1275, -1; + shl.b64 %rd1144, %rd1275, 5; + add.s64 %rd1145, %rd3, %rd1144; + ld.local.u8 %rs800, [%rd1145+145]; + mov.u64 %rd1143, 0; + ld.local.u8 %rs801, [%rd1145+146]; + ld.local.u8 %rs802, [%rd1145+147]; + ld.local.u8 %rs803, [%rd1145+148]; + ld.local.u8 %rs804, [%rd1145+149]; + ld.local.u8 %rs805, [%rd1145+150]; + ld.local.u8 %rs806, [%rd1145+151]; + ld.local.u8 %rs807, [%rd1145+152]; + ld.local.u8 %rs808, [%rd1145+153]; + ld.local.u8 %rs809, [%rd1145+154]; + ld.local.u8 %rs810, [%rd1145+155]; + ld.local.u8 %rs811, [%rd1145+156]; + ld.local.u8 %rs812, [%rd1145+157]; + ld.local.u8 %rs813, [%rd1145+158]; + ld.local.u8 %rs814, [%rd1145+159]; + ld.local.u8 %rs815, [%rd1145+160]; + ld.local.u8 %rs816, [%rd1145+161]; + ld.local.u8 %rs817, [%rd1145+162]; + ld.local.u8 %rs818, [%rd1145+163]; + ld.local.u8 %rs819, [%rd1145+164]; + ld.local.u8 %rs820, [%rd1145+165]; + ld.local.u8 %rs821, [%rd1145+166]; + ld.local.u8 %rs822, [%rd1145+167]; + ld.local.u8 %rs823, [%rd1145+168]; + ld.local.u8 %rs824, [%rd1145+169]; + ld.local.u8 %rs825, [%rd1145+170]; + ld.local.u8 %rs826, [%rd1145+171]; + ld.local.u8 %rs827, [%rd1145+172]; + ld.local.u8 %rs828, [%rd1145+173]; + ld.local.u8 %rs829, [%rd1145+174]; + ld.local.u8 %rs830, [%rd1145+175]; + ld.local.u8 %rs831, [%rd1145+176]; + cvt.u32.u16 %r26604, %rs799; + and.b32 %r26605, %r26604, 255; + cvt.u32.u16 %r26606, %rs798; + prmt.b32 %r26607, %r26606, %r26605, 30212; + cvt.u32.u16 %r26608, %rs797; + shl.b32 %r26609, %r26608, 16; + and.b32 %r26610, %r26609, 16711680; + or.b32 %r26611, %r26607, %r26610; + cvt.u32.u16 %r26612, %rs796; + shl.b32 %r26613, %r26612, 24; + or.b32 %r26614, %r26611, %r26613; + cvt.u32.u16 %r26615, %rs795; + and.b32 %r26616, %r26615, 255; + cvt.u32.u16 %r26617, %rs794; + prmt.b32 %r26618, %r26617, %r26616, 30212; + cvt.u32.u16 %r26619, %rs793; + shl.b32 
%r26620, %r26619, 16; + and.b32 %r26621, %r26620, 16711680; + or.b32 %r26622, %r26618, %r26621; + cvt.u32.u16 %r26623, %rs792; + shl.b32 %r26624, %r26623, 24; + or.b32 %r26625, %r26622, %r26624; + cvt.u32.u16 %r26626, %rs791; + and.b32 %r26627, %r26626, 255; + cvt.u32.u16 %r26628, %rs790; + prmt.b32 %r26629, %r26628, %r26627, 30212; + cvt.u32.u16 %r26630, %rs789; + shl.b32 %r26631, %r26630, 16; + and.b32 %r26632, %r26631, 16711680; + or.b32 %r26633, %r26629, %r26632; + cvt.u32.u16 %r26634, %rs788; + shl.b32 %r26635, %r26634, 24; + or.b32 %r26636, %r26633, %r26635; + cvt.u32.u16 %r26637, %rs787; + and.b32 %r26638, %r26637, 255; + cvt.u32.u16 %r26639, %rs786; + prmt.b32 %r26640, %r26639, %r26638, 30212; + cvt.u32.u16 %r26641, %rs785; + shl.b32 %r26642, %r26641, 16; + and.b32 %r26643, %r26642, 16711680; + or.b32 %r26644, %r26640, %r26643; + cvt.u32.u16 %r26645, %rs784; + shl.b32 %r26646, %r26645, 24; + or.b32 %r26647, %r26644, %r26646; + cvt.u32.u16 %r26648, %rs783; + and.b32 %r26649, %r26648, 255; + cvt.u32.u16 %r26650, %rs782; + prmt.b32 %r26651, %r26650, %r26649, 30212; + cvt.u32.u16 %r26652, %rs781; + shl.b32 %r26653, %r26652, 16; + and.b32 %r26654, %r26653, 16711680; + or.b32 %r26655, %r26651, %r26654; + cvt.u32.u16 %r26656, %rs780; + shl.b32 %r26657, %r26656, 24; + or.b32 %r26658, %r26655, %r26657; + cvt.u32.u16 %r26659, %rs779; + and.b32 %r26660, %r26659, 255; + cvt.u32.u16 %r26661, %rs778; + prmt.b32 %r26662, %r26661, %r26660, 30212; + cvt.u32.u16 %r26663, %rs777; + shl.b32 %r26664, %r26663, 16; + and.b32 %r26665, %r26664, 16711680; + or.b32 %r26666, %r26662, %r26665; + cvt.u32.u16 %r26667, %rs776; + shl.b32 %r26668, %r26667, 24; + or.b32 %r26669, %r26666, %r26668; + cvt.u32.u16 %r26670, %rs775; + and.b32 %r26671, %r26670, 255; + cvt.u32.u16 %r26672, %rs774; + prmt.b32 %r26673, %r26672, %r26671, 30212; + cvt.u32.u16 %r26674, %rs773; + shl.b32 %r26675, %r26674, 16; + and.b32 %r26676, %r26675, 16711680; + or.b32 %r26677, %r26673, %r26676; + cvt.u32.u16 %r26678, %rs772; + shl.b32 %r26679, %r26678, 24; + or.b32 %r26680, %r26677, %r26679; + cvt.u32.u16 %r26681, %rs771; + and.b32 %r26682, %r26681, 255; + cvt.u32.u16 %r26683, %rs770; + prmt.b32 %r26684, %r26683, %r26682, 30212; + cvt.u32.u16 %r26685, %rs769; + shl.b32 %r26686, %r26685, 16; + and.b32 %r26687, %r26686, 16711680; + or.b32 %r26688, %r26684, %r26687; + cvt.u32.u16 %r26689, %rs768; + shl.b32 %r26690, %r26689, 24; + or.b32 %r26691, %r26688, %r26690; + cvt.u32.u16 %r26692, %rs832; + and.b32 %r26693, %r26692, 255; + cvt.u32.u16 %r26694, %rs833; + prmt.b32 %r26695, %r26694, %r26693, 30212; + cvt.u32.u16 %r26696, %rs834; + shl.b32 %r26697, %r26696, 16; + and.b32 %r26698, %r26697, 16711680; + or.b32 %r26699, %r26695, %r26698; + cvt.u32.u16 %r26700, %rs835; + shl.b32 %r26701, %r26700, 24; + or.b32 %r26702, %r26699, %r26701; + cvt.u32.u16 %r26703, %rs836; + and.b32 %r26704, %r26703, 255; + cvt.u32.u16 %r26705, %rs837; + prmt.b32 %r26706, %r26705, %r26704, 30212; + cvt.u32.u16 %r26707, %rs838; + shl.b32 %r26708, %r26707, 16; + and.b32 %r26709, %r26708, 16711680; + or.b32 %r26710, %r26706, %r26709; + cvt.u32.u16 %r26711, %rs839; + shl.b32 %r26712, %r26711, 24; + or.b32 %r26713, %r26710, %r26712; + cvt.u32.u16 %r26714, %rs840; + and.b32 %r26715, %r26714, 255; + cvt.u32.u16 %r26716, %rs841; + prmt.b32 %r26717, %r26716, %r26715, 30212; + cvt.u32.u16 %r26718, %rs842; + shl.b32 %r26719, %r26718, 16; + and.b32 %r26720, %r26719, 16711680; + or.b32 %r26721, %r26717, %r26720; + cvt.u32.u16 %r26722, %rs843; + shl.b32 %r26723, %r26722, 24; + or.b32 
%r26724, %r26721, %r26723; + cvt.u32.u16 %r26725, %rs844; + and.b32 %r26726, %r26725, 255; + cvt.u32.u16 %r26727, %rs845; + prmt.b32 %r26728, %r26727, %r26726, 30212; + cvt.u32.u16 %r26729, %rs846; + shl.b32 %r26730, %r26729, 16; + and.b32 %r26731, %r26730, 16711680; + or.b32 %r26732, %r26728, %r26731; + cvt.u32.u16 %r26733, %rs847; + shl.b32 %r26734, %r26733, 24; + or.b32 %r26735, %r26732, %r26734; + cvt.u32.u16 %r26736, %rs848; + and.b32 %r26737, %r26736, 255; + cvt.u32.u16 %r26738, %rs849; + prmt.b32 %r26739, %r26738, %r26737, 30212; + cvt.u32.u16 %r26740, %rs850; + shl.b32 %r26741, %r26740, 16; + and.b32 %r26742, %r26741, 16711680; + or.b32 %r26743, %r26739, %r26742; + cvt.u32.u16 %r26744, %rs851; + shl.b32 %r26745, %r26744, 24; + or.b32 %r26746, %r26743, %r26745; + cvt.u32.u16 %r26747, %rs852; + and.b32 %r26748, %r26747, 255; + cvt.u32.u16 %r26749, %rs853; + prmt.b32 %r26750, %r26749, %r26748, 30212; + cvt.u32.u16 %r26751, %rs854; + shl.b32 %r26752, %r26751, 16; + and.b32 %r26753, %r26752, 16711680; + or.b32 %r26754, %r26750, %r26753; + cvt.u32.u16 %r26755, %rs855; + shl.b32 %r26756, %r26755, 24; + or.b32 %r26757, %r26754, %r26756; + cvt.u32.u16 %r26758, %rs856; + and.b32 %r26759, %r26758, 255; + cvt.u32.u16 %r26760, %rs857; + prmt.b32 %r26761, %r26760, %r26759, 30212; + cvt.u32.u16 %r26762, %rs858; + shl.b32 %r26763, %r26762, 16; + and.b32 %r26764, %r26763, 16711680; + or.b32 %r26765, %r26761, %r26764; + cvt.u32.u16 %r26766, %rs859; + shl.b32 %r26767, %r26766, 24; + or.b32 %r26768, %r26765, %r26767; + cvt.u32.u16 %r26769, %rs860; + and.b32 %r26770, %r26769, 255; + cvt.u32.u16 %r26771, %rs861; + prmt.b32 %r26772, %r26771, %r26770, 30212; + cvt.u32.u16 %r26773, %rs862; + shl.b32 %r26774, %r26773, 16; + and.b32 %r26775, %r26774, 16711680; + or.b32 %r26776, %r26772, %r26775; + cvt.u32.u16 %r26777, %rs863; + shl.b32 %r26778, %r26777, 24; + or.b32 %r26779, %r26776, %r26778; + shr.u64 %rd1146, %rd1276, 32; + cvt.u32.u64 %r26780, %rd1146; + add.s32 %r26781, %r30972, %r30976; + add.s32 %r26782, %r26781, %r26614; + cvt.u32.u64 %r26783, %rd1276; + xor.b32 %r26784, %r26782, %r26783; + shf.l.wrap.b32 %r26785, %r26784, %r26784, 16; + add.s32 %r26786, %r26785, 1779033703; + xor.b32 %r26787, %r26786, %r30972; + shf.l.wrap.b32 %r26788, %r26787, %r26787, 20; + add.s32 %r26789, %r26625, %r26782; + add.s32 %r26790, %r26789, %r26788; + xor.b32 %r26791, %r26790, %r26785; + shf.l.wrap.b32 %r26792, %r26791, %r26791, 24; + add.s32 %r26793, %r26792, %r26786; + xor.b32 %r26794, %r26793, %r26788; + shf.l.wrap.b32 %r26795, %r26794, %r26794, 25; + add.s32 %r26796, %r30971, %r30975; + add.s32 %r26797, %r26796, %r26636; + xor.b32 %r26798, %r26797, %r26780; + shf.l.wrap.b32 %r26799, %r26798, %r26798, 16; + add.s32 %r26800, %r26799, -1150833019; + xor.b32 %r26801, %r26800, %r30971; + shf.l.wrap.b32 %r26802, %r26801, %r26801, 20; + add.s32 %r26803, %r26647, %r26797; + add.s32 %r26804, %r26803, %r26802; + xor.b32 %r26805, %r26804, %r26799; + shf.l.wrap.b32 %r26806, %r26805, %r26805, 24; + add.s32 %r26807, %r26806, %r26800; + xor.b32 %r26808, %r26807, %r26802; + shf.l.wrap.b32 %r26809, %r26808, %r26808, 25; + add.s32 %r26810, %r30970, %r30974; + add.s32 %r26811, %r26810, %r26658; + cvt.u32.u16 %r26812, %rs864; + and.b32 %r26813, %r26812, 255; + xor.b32 %r26814, %r26811, %r26813; + shr.u32 %r26815, %r26811, 16; + shl.b32 %r26816, %r26814, 16; + or.b32 %r26817, %r26816, %r26815; + add.s32 %r26818, %r26817, 1013904242; + xor.b32 %r26819, %r26818, %r30970; + shf.l.wrap.b32 %r26820, %r26819, %r26819, 20; + add.s32 %r26821, 
%r26669, %r26811; + add.s32 %r26822, %r26821, %r26820; + xor.b32 %r26823, %r26822, %r26817; + shf.l.wrap.b32 %r26824, %r26823, %r26823, 24; + add.s32 %r26825, %r26824, %r26818; + xor.b32 %r26826, %r26825, %r26820; + shf.l.wrap.b32 %r26827, %r26826, %r26826, 25; + add.s32 %r26828, %r30969, %r30973; + add.s32 %r26829, %r26828, %r26680; + cvt.u32.u16 %r26830, %rs734; + and.b32 %r26831, %r26830, 255; + xor.b32 %r26832, %r26829, %r26831; + shr.u32 %r26833, %r26829, 16; + shl.b32 %r26834, %r26832, 16; + or.b32 %r26835, %r26834, %r26833; + add.s32 %r26836, %r26835, -1521486534; + xor.b32 %r26837, %r26836, %r30969; + shf.l.wrap.b32 %r26838, %r26837, %r26837, 20; + add.s32 %r26839, %r26691, %r26829; + add.s32 %r26840, %r26839, %r26838; + xor.b32 %r26841, %r26840, %r26835; + shf.l.wrap.b32 %r26842, %r26841, %r26841, 24; + add.s32 %r26843, %r26842, %r26836; + xor.b32 %r26844, %r26843, %r26838; + shf.l.wrap.b32 %r26845, %r26844, %r26844, 25; + add.s32 %r26846, %r26809, %r26790; + add.s32 %r26847, %r26846, %r26702; + xor.b32 %r26848, %r26842, %r26847; + shf.l.wrap.b32 %r26849, %r26848, %r26848, 16; + add.s32 %r26850, %r26849, %r26825; + xor.b32 %r26851, %r26850, %r26809; + shf.l.wrap.b32 %r26852, %r26851, %r26851, 20; + add.s32 %r26853, %r26713, %r26847; + add.s32 %r26854, %r26853, %r26852; + xor.b32 %r26855, %r26854, %r26849; + shf.l.wrap.b32 %r26856, %r26855, %r26855, 24; + add.s32 %r26857, %r26856, %r26850; + xor.b32 %r26858, %r26857, %r26852; + shf.l.wrap.b32 %r26859, %r26858, %r26858, 25; + add.s32 %r26860, %r26724, %r26804; + add.s32 %r26861, %r26860, %r26827; + xor.b32 %r26862, %r26861, %r26792; + shf.l.wrap.b32 %r26863, %r26862, %r26862, 16; + add.s32 %r26864, %r26863, %r26843; + xor.b32 %r26865, %r26864, %r26827; + shf.l.wrap.b32 %r26866, %r26865, %r26865, 20; + add.s32 %r26867, %r26861, %r26735; + add.s32 %r26868, %r26867, %r26866; + xor.b32 %r26869, %r26868, %r26863; + shf.l.wrap.b32 %r26870, %r26869, %r26869, 24; + add.s32 %r26871, %r26870, %r26864; + xor.b32 %r26872, %r26871, %r26866; + shf.l.wrap.b32 %r26873, %r26872, %r26872, 25; + add.s32 %r26874, %r26822, %r26746; + add.s32 %r26875, %r26874, %r26845; + xor.b32 %r26876, %r26875, %r26806; + shf.l.wrap.b32 %r26877, %r26876, %r26876, 16; + add.s32 %r26878, %r26877, %r26793; + xor.b32 %r26879, %r26878, %r26845; + shf.l.wrap.b32 %r26880, %r26879, %r26879, 20; + add.s32 %r26881, %r26875, %r26757; + add.s32 %r26882, %r26881, %r26880; + xor.b32 %r26883, %r26882, %r26877; + shf.l.wrap.b32 %r26884, %r26883, %r26883, 24; + add.s32 %r26885, %r26884, %r26878; + xor.b32 %r26886, %r26885, %r26880; + shf.l.wrap.b32 %r26887, %r26886, %r26886, 25; + add.s32 %r26888, %r26768, %r26795; + add.s32 %r26889, %r26888, %r26840; + xor.b32 %r26890, %r26824, %r26889; + shf.l.wrap.b32 %r26891, %r26890, %r26890, 16; + add.s32 %r26892, %r26891, %r26807; + xor.b32 %r26893, %r26892, %r26795; + shf.l.wrap.b32 %r26894, %r26893, %r26893, 20; + add.s32 %r26895, %r26889, %r26779; + add.s32 %r26896, %r26895, %r26894; + xor.b32 %r26897, %r26896, %r26891; + shf.l.wrap.b32 %r26898, %r26897, %r26897, 24; + add.s32 %r26899, %r26898, %r26892; + xor.b32 %r26900, %r26899, %r26894; + shf.l.wrap.b32 %r26901, %r26900, %r26900, 25; + add.s32 %r26902, %r26854, %r26636; + add.s32 %r26903, %r26902, %r26901; + xor.b32 %r26904, %r26870, %r26903; + shf.l.wrap.b32 %r26905, %r26904, %r26904, 16; + add.s32 %r26906, %r26905, %r26885; + xor.b32 %r26907, %r26906, %r26901; + shf.l.wrap.b32 %r26908, %r26907, %r26907, 20; + add.s32 %r26909, %r26903, %r26680; + add.s32 %r26910, %r26909, %r26908; + 
xor.b32 %r26911, %r26910, %r26905; + shf.l.wrap.b32 %r26912, %r26911, %r26911, 24; + add.s32 %r26913, %r26912, %r26906; + xor.b32 %r26914, %r26913, %r26908; + shf.l.wrap.b32 %r26915, %r26914, %r26914, 25; + add.s32 %r26916, %r26868, %r26647; + add.s32 %r26917, %r26916, %r26859; + xor.b32 %r26918, %r26884, %r26917; + shf.l.wrap.b32 %r26919, %r26918, %r26918, 16; + add.s32 %r26920, %r26919, %r26899; + xor.b32 %r26921, %r26920, %r26859; + shf.l.wrap.b32 %r26922, %r26921, %r26921, 20; + add.s32 %r26923, %r26917, %r26724; + add.s32 %r26924, %r26923, %r26922; + xor.b32 %r26925, %r26924, %r26919; + shf.l.wrap.b32 %r26926, %r26925, %r26925, 24; + add.s32 %r26927, %r26926, %r26920; + xor.b32 %r26928, %r26927, %r26922; + shf.l.wrap.b32 %r26929, %r26928, %r26928, 25; + add.s32 %r26930, %r26882, %r26691; + add.s32 %r26931, %r26930, %r26873; + xor.b32 %r26932, %r26931, %r26898; + shf.l.wrap.b32 %r26933, %r26932, %r26932, 16; + add.s32 %r26934, %r26933, %r26857; + xor.b32 %r26935, %r26934, %r26873; + shf.l.wrap.b32 %r26936, %r26935, %r26935, 20; + add.s32 %r26937, %r26931, %r26614; + add.s32 %r26938, %r26937, %r26936; + xor.b32 %r26939, %r26938, %r26933; + shf.l.wrap.b32 %r26940, %r26939, %r26939, 24; + add.s32 %r26941, %r26940, %r26934; + xor.b32 %r26942, %r26941, %r26936; + shf.l.wrap.b32 %r26943, %r26942, %r26942, 25; + add.s32 %r26944, %r26896, %r26658; + add.s32 %r26945, %r26944, %r26887; + xor.b32 %r26946, %r26945, %r26856; + shf.l.wrap.b32 %r26947, %r26946, %r26946, 16; + add.s32 %r26948, %r26947, %r26871; + xor.b32 %r26949, %r26948, %r26887; + shf.l.wrap.b32 %r26950, %r26949, %r26949, 20; + add.s32 %r26951, %r26945, %r26757; + add.s32 %r26952, %r26951, %r26950; + xor.b32 %r26953, %r26952, %r26947; + shf.l.wrap.b32 %r26954, %r26953, %r26953, 24; + add.s32 %r26955, %r26954, %r26948; + xor.b32 %r26956, %r26955, %r26950; + shf.l.wrap.b32 %r26957, %r26956, %r26956, 25; + add.s32 %r26958, %r26910, %r26625; + add.s32 %r26959, %r26958, %r26929; + xor.b32 %r26960, %r26954, %r26959; + shf.l.wrap.b32 %r26961, %r26960, %r26960, 16; + add.s32 %r26962, %r26961, %r26941; + xor.b32 %r26963, %r26962, %r26929; + shf.l.wrap.b32 %r26964, %r26963, %r26963, 20; + add.s32 %r26965, %r26959, %r26735; + add.s32 %r26966, %r26965, %r26964; + xor.b32 %r26967, %r26966, %r26961; + shf.l.wrap.b32 %r26968, %r26967, %r26967, 24; + add.s32 %r26969, %r26968, %r26962; + xor.b32 %r26970, %r26969, %r26964; + shf.l.wrap.b32 %r26971, %r26970, %r26970, 25; + add.s32 %r26972, %r26924, %r26746; + add.s32 %r26973, %r26972, %r26943; + xor.b32 %r26974, %r26973, %r26912; + shf.l.wrap.b32 %r26975, %r26974, %r26974, 16; + add.s32 %r26976, %r26975, %r26955; + xor.b32 %r26977, %r26976, %r26943; + shf.l.wrap.b32 %r26978, %r26977, %r26977, 20; + add.s32 %r26979, %r26973, %r26669; + add.s32 %r26980, %r26979, %r26978; + xor.b32 %r26981, %r26980, %r26975; + shf.l.wrap.b32 %r26982, %r26981, %r26981, 24; + add.s32 %r26983, %r26982, %r26976; + xor.b32 %r26984, %r26983, %r26978; + shf.l.wrap.b32 %r26985, %r26984, %r26984, 25; + add.s32 %r26986, %r26938, %r26713; + add.s32 %r26987, %r26986, %r26957; + xor.b32 %r26988, %r26987, %r26926; + shf.l.wrap.b32 %r26989, %r26988, %r26988, 16; + add.s32 %r26990, %r26989, %r26913; + xor.b32 %r26991, %r26990, %r26957; + shf.l.wrap.b32 %r26992, %r26991, %r26991, 20; + add.s32 %r26993, %r26987, %r26768; + add.s32 %r26994, %r26993, %r26992; + xor.b32 %r26995, %r26994, %r26989; + shf.l.wrap.b32 %r26996, %r26995, %r26995, 24; + add.s32 %r26997, %r26996, %r26990; + xor.b32 %r26998, %r26997, %r26992; + shf.l.wrap.b32 %r26999, 
%r26998, %r26998, 25; + add.s32 %r27000, %r26915, %r26779; + add.s32 %r27001, %r27000, %r26952; + xor.b32 %r27002, %r26940, %r27001; + shf.l.wrap.b32 %r27003, %r27002, %r27002, 16; + add.s32 %r27004, %r27003, %r26927; + xor.b32 %r27005, %r27004, %r26915; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 20; + add.s32 %r27007, %r27001, %r26702; + add.s32 %r27008, %r27007, %r27006; + xor.b32 %r27009, %r27008, %r27003; + shf.l.wrap.b32 %r27010, %r27009, %r27009, 24; + add.s32 %r27011, %r27010, %r27004; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 25; + add.s32 %r27014, %r26966, %r26647; + add.s32 %r27015, %r27014, %r27013; + xor.b32 %r27016, %r26982, %r27015; + shf.l.wrap.b32 %r27017, %r27016, %r27016, 16; + add.s32 %r27018, %r27017, %r26997; + xor.b32 %r27019, %r27018, %r27013; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 20; + add.s32 %r27021, %r27015, %r26658; + add.s32 %r27022, %r27021, %r27020; + xor.b32 %r27023, %r27022, %r27017; + shf.l.wrap.b32 %r27024, %r27023, %r27023, 24; + add.s32 %r27025, %r27024, %r27018; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 25; + add.s32 %r27028, %r26980, %r26724; + add.s32 %r27029, %r27028, %r26971; + xor.b32 %r27030, %r26996, %r27029; + shf.l.wrap.b32 %r27031, %r27030, %r27030, 16; + add.s32 %r27032, %r27031, %r27011; + xor.b32 %r27033, %r27032, %r26971; + shf.l.wrap.b32 %r27034, %r27033, %r27033, 20; + add.s32 %r27035, %r27029, %r26746; + add.s32 %r27036, %r27035, %r27034; + xor.b32 %r27037, %r27036, %r27031; + shf.l.wrap.b32 %r27038, %r27037, %r27037, 24; + add.s32 %r27039, %r27038, %r27032; + xor.b32 %r27040, %r27039, %r27034; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 25; + add.s32 %r27042, %r26994, %r26757; + add.s32 %r27043, %r27042, %r26985; + xor.b32 %r27044, %r27043, %r27010; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 16; + add.s32 %r27046, %r27045, %r26969; + xor.b32 %r27047, %r27046, %r26985; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 20; + add.s32 %r27049, %r27043, %r26636; + add.s32 %r27050, %r27049, %r27048; + xor.b32 %r27051, %r27050, %r27045; + shf.l.wrap.b32 %r27052, %r27051, %r27051, 24; + add.s32 %r27053, %r27052, %r27046; + xor.b32 %r27054, %r27053, %r27048; + shf.l.wrap.b32 %r27055, %r27054, %r27054, 25; + add.s32 %r27056, %r27008, %r26691; + add.s32 %r27057, %r27056, %r26999; + xor.b32 %r27058, %r27057, %r26968; + shf.l.wrap.b32 %r27059, %r27058, %r27058, 16; + add.s32 %r27060, %r27059, %r26983; + xor.b32 %r27061, %r27060, %r26999; + shf.l.wrap.b32 %r27062, %r27061, %r27061, 20; + add.s32 %r27063, %r27057, %r26768; + add.s32 %r27064, %r27063, %r27062; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 24; + add.s32 %r27067, %r27066, %r27060; + xor.b32 %r27068, %r27067, %r27062; + shf.l.wrap.b32 %r27069, %r27068, %r27068, 25; + add.s32 %r27070, %r27022, %r26680; + add.s32 %r27071, %r27070, %r27041; + xor.b32 %r27072, %r27066, %r27071; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 16; + add.s32 %r27074, %r27073, %r27053; + xor.b32 %r27075, %r27074, %r27041; + shf.l.wrap.b32 %r27076, %r27075, %r27075, 20; + add.s32 %r27077, %r27071, %r26669; + add.s32 %r27078, %r27077, %r27076; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 24; + add.s32 %r27081, %r27080, %r27074; + xor.b32 %r27082, %r27081, %r27076; + shf.l.wrap.b32 %r27083, %r27082, %r27082, 25; + add.s32 %r27084, %r27036, %r26713; + add.s32 %r27085, %r27084, %r27055; + xor.b32 %r27086, %r27085, %r27024; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 16; + 
add.s32 %r27088, %r27087, %r27067; + xor.b32 %r27089, %r27088, %r27055; + shf.l.wrap.b32 %r27090, %r27089, %r27089, 20; + add.s32 %r27091, %r27085, %r26614; + add.s32 %r27092, %r27091, %r27090; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 24; + add.s32 %r27095, %r27094, %r27088; + xor.b32 %r27096, %r27095, %r27090; + shf.l.wrap.b32 %r27097, %r27096, %r27096, 25; + add.s32 %r27098, %r27050, %r26735; + add.s32 %r27099, %r27098, %r27069; + xor.b32 %r27100, %r27099, %r27038; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 16; + add.s32 %r27102, %r27101, %r27025; + xor.b32 %r27103, %r27102, %r27069; + shf.l.wrap.b32 %r27104, %r27103, %r27103, 20; + add.s32 %r27105, %r27099, %r26779; + add.s32 %r27106, %r27105, %r27104; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 24; + add.s32 %r27109, %r27108, %r27102; + xor.b32 %r27110, %r27109, %r27104; + shf.l.wrap.b32 %r27111, %r27110, %r27110, 25; + add.s32 %r27112, %r27027, %r26702; + add.s32 %r27113, %r27112, %r27064; + xor.b32 %r27114, %r27052, %r27113; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 16; + add.s32 %r27116, %r27115, %r27039; + xor.b32 %r27117, %r27116, %r27027; + shf.l.wrap.b32 %r27118, %r27117, %r27117, 20; + add.s32 %r27119, %r27113, %r26625; + add.s32 %r27120, %r27119, %r27118; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 24; + add.s32 %r27123, %r27122, %r27116; + xor.b32 %r27124, %r27123, %r27118; + shf.l.wrap.b32 %r27125, %r27124, %r27124, 25; + add.s32 %r27126, %r27078, %r26724; + add.s32 %r27127, %r27126, %r27125; + xor.b32 %r27128, %r27094, %r27127; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 16; + add.s32 %r27130, %r27129, %r27109; + xor.b32 %r27131, %r27130, %r27125; + shf.l.wrap.b32 %r27132, %r27131, %r27131, 20; + add.s32 %r27133, %r27127, %r26691; + add.s32 %r27134, %r27133, %r27132; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 24; + add.s32 %r27137, %r27136, %r27130; + xor.b32 %r27138, %r27137, %r27132; + shf.l.wrap.b32 %r27139, %r27138, %r27138, 25; + add.s32 %r27140, %r27092, %r26746; + add.s32 %r27141, %r27140, %r27083; + xor.b32 %r27142, %r27108, %r27141; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 16; + add.s32 %r27144, %r27143, %r27123; + xor.b32 %r27145, %r27144, %r27083; + shf.l.wrap.b32 %r27146, %r27145, %r27145, 20; + add.s32 %r27147, %r27141, %r26713; + add.s32 %r27148, %r27147, %r27146; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 24; + add.s32 %r27151, %r27150, %r27144; + xor.b32 %r27152, %r27151, %r27146; + shf.l.wrap.b32 %r27153, %r27152, %r27152, 25; + add.s32 %r27154, %r27106, %r26768; + add.s32 %r27155, %r27154, %r27097; + xor.b32 %r27156, %r27155, %r27122; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 16; + add.s32 %r27158, %r27157, %r27081; + xor.b32 %r27159, %r27158, %r27097; + shf.l.wrap.b32 %r27160, %r27159, %r27159, 20; + add.s32 %r27161, %r27155, %r26647; + add.s32 %r27162, %r27161, %r27160; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 24; + add.s32 %r27165, %r27164, %r27158; + xor.b32 %r27166, %r27165, %r27160; + shf.l.wrap.b32 %r27167, %r27166, %r27166, 25; + add.s32 %r27168, %r27120, %r26757; + add.s32 %r27169, %r27168, %r27111; + xor.b32 %r27170, %r27169, %r27080; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 16; + add.s32 %r27172, %r27171, %r27095; + xor.b32 %r27173, %r27172, %r27111; + shf.l.wrap.b32 %r27174, %r27173, %r27173, 20; + add.s32 %r27175, %r27169, %r26779; + add.s32 %r27176, 
%r27175, %r27174; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 24; + add.s32 %r27179, %r27178, %r27172; + xor.b32 %r27180, %r27179, %r27174; + shf.l.wrap.b32 %r27181, %r27180, %r27180, 25; + add.s32 %r27182, %r27134, %r26658; + add.s32 %r27183, %r27182, %r27153; + xor.b32 %r27184, %r27178, %r27183; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 16; + add.s32 %r27186, %r27185, %r27165; + xor.b32 %r27187, %r27186, %r27153; + shf.l.wrap.b32 %r27188, %r27187, %r27187, 20; + add.s32 %r27189, %r27183, %r26614; + add.s32 %r27190, %r27189, %r27188; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 24; + add.s32 %r27193, %r27192, %r27186; + xor.b32 %r27194, %r27193, %r27188; + shf.l.wrap.b32 %r27195, %r27194, %r27194, 25; + add.s32 %r27196, %r27148, %r26735; + add.s32 %r27197, %r27196, %r27167; + xor.b32 %r27198, %r27197, %r27136; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 16; + add.s32 %r27200, %r27199, %r27179; + xor.b32 %r27201, %r27200, %r27167; + shf.l.wrap.b32 %r27202, %r27201, %r27201, 20; + add.s32 %r27203, %r27197, %r26636; + add.s32 %r27204, %r27203, %r27202; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 24; + add.s32 %r27207, %r27206, %r27200; + xor.b32 %r27208, %r27207, %r27202; + shf.l.wrap.b32 %r27209, %r27208, %r27208, 25; + add.s32 %r27210, %r27162, %r26669; + add.s32 %r27211, %r27210, %r27181; + xor.b32 %r27212, %r27211, %r27150; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 16; + add.s32 %r27214, %r27213, %r27137; + xor.b32 %r27215, %r27214, %r27181; + shf.l.wrap.b32 %r27216, %r27215, %r27215, 20; + add.s32 %r27217, %r27211, %r26702; + add.s32 %r27218, %r27217, %r27216; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 24; + add.s32 %r27221, %r27220, %r27214; + xor.b32 %r27222, %r27221, %r27216; + shf.l.wrap.b32 %r27223, %r27222, %r27222, 25; + add.s32 %r27224, %r27139, %r26625; + add.s32 %r27225, %r27224, %r27176; + xor.b32 %r27226, %r27164, %r27225; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 16; + add.s32 %r27228, %r27227, %r27151; + xor.b32 %r27229, %r27228, %r27139; + shf.l.wrap.b32 %r27230, %r27229, %r27229, 20; + add.s32 %r27231, %r27225, %r26680; + add.s32 %r27232, %r27231, %r27230; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 24; + add.s32 %r27235, %r27234, %r27228; + xor.b32 %r27236, %r27235, %r27230; + shf.l.wrap.b32 %r27237, %r27236, %r27236, 25; + add.s32 %r27238, %r27190, %r26746; + add.s32 %r27239, %r27238, %r27237; + xor.b32 %r27240, %r27206, %r27239; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 16; + add.s32 %r27242, %r27241, %r27221; + xor.b32 %r27243, %r27242, %r27237; + shf.l.wrap.b32 %r27244, %r27243, %r27243, 20; + add.s32 %r27245, %r27239, %r26757; + add.s32 %r27246, %r27245, %r27244; + xor.b32 %r27247, %r27246, %r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 24; + add.s32 %r27249, %r27248, %r27242; + xor.b32 %r27250, %r27249, %r27244; + shf.l.wrap.b32 %r27251, %r27250, %r27250, 25; + add.s32 %r27252, %r27204, %r26713; + add.s32 %r27253, %r27252, %r27195; + xor.b32 %r27254, %r27220, %r27253; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 16; + add.s32 %r27256, %r27255, %r27235; + xor.b32 %r27257, %r27256, %r27195; + shf.l.wrap.b32 %r27258, %r27257, %r27257, 20; + add.s32 %r27259, %r27253, %r26735; + add.s32 %r27260, %r27259, %r27258; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 24; + add.s32 %r27263, %r27262, %r27256; + xor.b32 %r27264, %r27263, %r27258; + 
shf.l.wrap.b32 %r27265, %r27264, %r27264, 25; + add.s32 %r27266, %r27218, %r26779; + add.s32 %r27267, %r27266, %r27209; + xor.b32 %r27268, %r27267, %r27234; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 16; + add.s32 %r27270, %r27269, %r27193; + xor.b32 %r27271, %r27270, %r27209; + shf.l.wrap.b32 %r27272, %r27271, %r27271, 20; + add.s32 %r27273, %r27267, %r26724; + add.s32 %r27274, %r27273, %r27272; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 24; + add.s32 %r27277, %r27276, %r27270; + xor.b32 %r27278, %r27277, %r27272; + shf.l.wrap.b32 %r27279, %r27278, %r27278, 25; + add.s32 %r27280, %r27232, %r26768; + add.s32 %r27281, %r27280, %r27223; + xor.b32 %r27282, %r27281, %r27192; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 16; + add.s32 %r27284, %r27283, %r27207; + xor.b32 %r27285, %r27284, %r27223; + shf.l.wrap.b32 %r27286, %r27285, %r27285, 20; + add.s32 %r27287, %r27281, %r26702; + add.s32 %r27288, %r27287, %r27286; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 24; + add.s32 %r27291, %r27290, %r27284; + xor.b32 %r27292, %r27291, %r27286; + shf.l.wrap.b32 %r27293, %r27292, %r27292, 25; + add.s32 %r27294, %r27246, %r26691; + add.s32 %r27295, %r27294, %r27265; + xor.b32 %r27296, %r27290, %r27295; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 16; + add.s32 %r27298, %r27297, %r27277; + xor.b32 %r27299, %r27298, %r27265; + shf.l.wrap.b32 %r27300, %r27299, %r27299, 20; + add.s32 %r27301, %r27295, %r26636; + add.s32 %r27302, %r27301, %r27300; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 24; + add.s32 %r27305, %r27304, %r27298; + xor.b32 %r27306, %r27305, %r27300; + shf.l.wrap.b32 %r27307, %r27306, %r27306, 25; + add.s32 %r27308, %r27260, %r26669; + add.s32 %r27309, %r27308, %r27279; + xor.b32 %r27310, %r27309, %r27248; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 16; + add.s32 %r27312, %r27311, %r27291; + xor.b32 %r27313, %r27312, %r27279; + shf.l.wrap.b32 %r27314, %r27313, %r27313, 20; + add.s32 %r27315, %r27309, %r26647; + add.s32 %r27316, %r27315, %r27314; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 24; + add.s32 %r27319, %r27318, %r27312; + xor.b32 %r27320, %r27319, %r27314; + shf.l.wrap.b32 %r27321, %r27320, %r27320, 25; + add.s32 %r27322, %r27274, %r26614; + add.s32 %r27323, %r27322, %r27293; + xor.b32 %r27324, %r27323, %r27262; + shf.l.wrap.b32 %r27325, %r27324, %r27324, 16; + add.s32 %r27326, %r27325, %r27249; + xor.b32 %r27327, %r27326, %r27293; + shf.l.wrap.b32 %r27328, %r27327, %r27327, 20; + add.s32 %r27329, %r27323, %r26625; + add.s32 %r27330, %r27329, %r27328; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 24; + add.s32 %r27333, %r27332, %r27326; + xor.b32 %r27334, %r27333, %r27328; + shf.l.wrap.b32 %r27335, %r27334, %r27334, 25; + add.s32 %r27336, %r27251, %r26680; + add.s32 %r27337, %r27336, %r27288; + xor.b32 %r27338, %r27276, %r27337; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 16; + add.s32 %r27340, %r27339, %r27263; + xor.b32 %r27341, %r27340, %r27251; + shf.l.wrap.b32 %r27342, %r27341, %r27341, 20; + add.s32 %r27343, %r27337, %r26658; + add.s32 %r27344, %r27343, %r27342; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 24; + add.s32 %r27347, %r27346, %r27340; + xor.b32 %r27348, %r27347, %r27342; + shf.l.wrap.b32 %r27349, %r27348, %r27348, 25; + add.s32 %r27350, %r27302, %r26713; + add.s32 %r27351, %r27350, %r27349; + xor.b32 %r27352, %r27318, %r27351; + shf.l.wrap.b32 %r27353, 
%r27352, %r27352, 16; + add.s32 %r27354, %r27353, %r27333; + xor.b32 %r27355, %r27354, %r27349; + shf.l.wrap.b32 %r27356, %r27355, %r27355, 20; + add.s32 %r27357, %r27351, %r26768; + add.s32 %r27358, %r27357, %r27356; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 24; + add.s32 %r27361, %r27360, %r27354; + xor.b32 %r27362, %r27361, %r27356; + shf.l.wrap.b32 %r27363, %r27362, %r27362, 25; + add.s32 %r27364, %r27316, %r26735; + add.s32 %r27365, %r27364, %r27307; + xor.b32 %r27366, %r27332, %r27365; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 16; + add.s32 %r27368, %r27367, %r27347; + xor.b32 %r27369, %r27368, %r27307; + shf.l.wrap.b32 %r27370, %r27369, %r27369, 20; + add.s32 %r27371, %r27365, %r26669; + add.s32 %r27372, %r27371, %r27370; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 24; + add.s32 %r27375, %r27374, %r27368; + xor.b32 %r27376, %r27375, %r27370; + shf.l.wrap.b32 %r27377, %r27376, %r27376, 25; + add.s32 %r27378, %r27330, %r26702; + add.s32 %r27379, %r27378, %r27321; + xor.b32 %r27380, %r27379, %r27346; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 16; + add.s32 %r27382, %r27381, %r27305; + xor.b32 %r27383, %r27382, %r27321; + shf.l.wrap.b32 %r27384, %r27383, %r27383, 20; + add.s32 %r27385, %r27379, %r26746; + add.s32 %r27386, %r27385, %r27384; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 24; + add.s32 %r27389, %r27388, %r27382; + xor.b32 %r27390, %r27389, %r27384; + shf.l.wrap.b32 %r27391, %r27390, %r27390, 25; + add.s32 %r27392, %r27344, %r26779; + add.s32 %r27393, %r27392, %r27335; + xor.b32 %r27394, %r27393, %r27304; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 16; + add.s32 %r27396, %r27395, %r27319; + xor.b32 %r27397, %r27396, %r27335; + shf.l.wrap.b32 %r27398, %r27397, %r27397, 20; + add.s32 %r27399, %r27393, %r26625; + add.s32 %r27400, %r27399, %r27398; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 24; + add.s32 %r27403, %r27402, %r27396; + xor.b32 %r27404, %r27403, %r27398; + shf.l.wrap.b32 %r27405, %r27404, %r27404, 25; + add.s32 %r27406, %r27358, %r26757; + add.s32 %r27407, %r27406, %r27377; + xor.b32 %r27408, %r27402, %r27407; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 16; + add.s32 %r27410, %r27409, %r27389; + xor.b32 %r27411, %r27410, %r27377; + shf.l.wrap.b32 %r27412, %r27411, %r27411, 20; + add.s32 %r27413, %r27407, %r26647; + add.s32 %r27414, %r27413, %r27412; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 24; + add.s32 %r27417, %r27416, %r27410; + xor.b32 %r27418, %r27417, %r27412; + shf.l.wrap.b32 %r27419, %r27418, %r27418, 25; + add.s32 %r27420, %r27372, %r26614; + add.s32 %r27421, %r27420, %r27391; + xor.b32 %r27422, %r27421, %r27360; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 16; + add.s32 %r27424, %r27423, %r27403; + xor.b32 %r27425, %r27424, %r27391; + shf.l.wrap.b32 %r27426, %r27425, %r27425, 20; + add.s32 %r27427, %r27421, %r26724; + add.s32 %r27428, %r27427, %r27426; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 24; + add.s32 %r27431, %r27430, %r27424; + xor.b32 %r27432, %r27431, %r27426; + shf.l.wrap.b32 %r27433, %r27432, %r27432, 25; + add.s32 %r27434, %r27386, %r26636; + add.s32 %r27435, %r27434, %r27405; + xor.b32 %r27436, %r27435, %r27374; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 16; + add.s32 %r27438, %r27437, %r27361; + xor.b32 %r27439, %r27438, %r27405; + shf.l.wrap.b32 %r27440, %r27439, %r27439, 20; + add.s32 %r27441, %r27435, %r26680; + 
add.s32 %r27442, %r27441, %r27440; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 24; + add.s32 %r27445, %r27444, %r27438; + xor.b32 %r27446, %r27445, %r27440; + shf.l.wrap.b32 %r27447, %r27446, %r27446, 25; + add.s32 %r27448, %r27363, %r26658; + add.s32 %r27449, %r27448, %r27400; + xor.b32 %r27450, %r27388, %r27449; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 16; + add.s32 %r27452, %r27451, %r27375; + xor.b32 %r27453, %r27452, %r27363; + shf.l.wrap.b32 %r27454, %r27453, %r27453, 20; + add.s32 %r27455, %r27449, %r26691; + add.s32 %r27456, %r27455, %r27454; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 24; + add.s32 %r27459, %r27458, %r27452; + xor.b32 %r27460, %r27459, %r27454; + shf.l.wrap.b32 %r27461, %r27460, %r27460, 25; + add.s32 %r27462, %r27414, %r26735; + add.s32 %r27463, %r27462, %r27461; + xor.b32 %r27464, %r27430, %r27463; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 16; + add.s32 %r27466, %r27465, %r27445; + xor.b32 %r27467, %r27466, %r27461; + shf.l.wrap.b32 %r27468, %r27467, %r27467, 20; + add.s32 %r27469, %r27463, %r26779; + add.s32 %r27470, %r27469, %r27468; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 24; + add.s32 %r27473, %r27472, %r27466; + xor.b32 %r27474, %r27473, %r27468; + shf.l.wrap.b32 %r27475, %r27474, %r27474, 25; + add.s32 %r27476, %r27428, %r26669; + add.s32 %r27477, %r27476, %r27419; + xor.b32 %r27478, %r27444, %r27477; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 16; + add.s32 %r27480, %r27479, %r27459; + xor.b32 %r27481, %r27480, %r27419; + shf.l.wrap.b32 %r27482, %r27481, %r27481, 20; + add.s32 %r27483, %r27477, %r26614; + add.s32 %r27484, %r27483, %r27482; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 24; + add.s32 %r27487, %r27486, %r27480; + xor.b32 %r27488, %r27487, %r27482; + shf.l.wrap.b32 %r27489, %r27488, %r27488, 25; + add.s32 %r27490, %r27442, %r26625; + add.s32 %r27491, %r27490, %r27433; + xor.b32 %r27492, %r27491, %r27458; + shf.l.wrap.b32 %r27493, %r27492, %r27492, 16; + add.s32 %r27494, %r27493, %r27417; + xor.b32 %r27495, %r27494, %r27433; + shf.l.wrap.b32 %r27496, %r27495, %r27495, 20; + add.s32 %r27497, %r27491, %r26713; + add.s32 %r27498, %r27497, %r27496; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 24; + add.s32 %r27501, %r27500, %r27494; + xor.b32 %r27502, %r27501, %r27496; + shf.l.wrap.b32 %r27503, %r27502, %r27502, 25; + add.s32 %r27504, %r27456, %r26702; + add.s32 %r27505, %r27504, %r27447; + xor.b32 %r27506, %r27505, %r27416; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 16; + add.s32 %r27508, %r27507, %r27431; + xor.b32 %r27509, %r27508, %r27447; + shf.l.wrap.b32 %r27510, %r27509, %r27509, 20; + add.s32 %r27511, %r27505, %r26680; + add.s32 %r27512, %r27511, %r27510; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 24; + add.s32 %r27515, %r27514, %r27508; + xor.b32 %r27516, %r27515, %r27510; + shf.l.wrap.b32 %r27517, %r27516, %r27516, 25; + add.s32 %r27518, %r27470, %r26768; + add.s32 %r27519, %r27518, %r27489; + xor.b32 %r27520, %r27514, %r27519; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 16; + add.s32 %r27522, %r27521, %r27501; + xor.b32 %r27523, %r27522, %r27489; + shf.l.wrap.b32 %r27524, %r27523, %r27523, 20; + add.s32 %r27525, %r27519, %r26724; + add.s32 %r27526, %r27525, %r27524; + xor.b32 %r27527, %r27526, %r27521; + shr.u32 %r27528, %r27527, 8; + shf.l.wrap.b32 %r27529, %r27527, %r27527, 24; + add.s32 %r27530, %r27529, 
%r27522; + xor.b32 %r27531, %r27530, %r27524; + shr.u32 %r27532, %r27531, 7; + shf.l.wrap.b32 %r27533, %r27531, %r27531, 25; + add.s32 %r27534, %r27484, %r26636; + add.s32 %r27535, %r27534, %r27503; + xor.b32 %r27536, %r27535, %r27472; + shf.l.wrap.b32 %r27537, %r27536, %r27536, 16; + add.s32 %r27538, %r27537, %r27515; + xor.b32 %r27539, %r27538, %r27503; + shf.l.wrap.b32 %r27540, %r27539, %r27539, 20; + add.s32 %r27541, %r27535, %r26746; + add.s32 %r27542, %r27541, %r27540; + xor.b32 %r27543, %r27542, %r27537; + shr.u32 %r27544, %r27543, 8; + shf.l.wrap.b32 %r27545, %r27543, %r27543, 24; + add.s32 %r27546, %r27545, %r27538; + xor.b32 %r27547, %r27546, %r27540; + shr.u32 %r27548, %r27547, 7; + shf.l.wrap.b32 %r27549, %r27547, %r27547, 25; + add.s32 %r27550, %r27498, %r26647; + add.s32 %r27551, %r27550, %r27517; + xor.b32 %r27552, %r27551, %r27486; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 16; + add.s32 %r27554, %r27553, %r27473; + xor.b32 %r27555, %r27554, %r27517; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 20; + add.s32 %r27557, %r27551, %r26658; + add.s32 %r27558, %r27557, %r27556; + xor.b32 %r27559, %r27558, %r27553; + shr.u32 %r27560, %r27559, 8; + shf.l.wrap.b32 %r27561, %r27559, %r27559, 24; + add.s32 %r27562, %r27561, %r27554; + xor.b32 %r27563, %r27562, %r27556; + shr.u32 %r27564, %r27563, 7; + shf.l.wrap.b32 %r27565, %r27563, %r27563, 25; + add.s32 %r27566, %r27475, %r26691; + add.s32 %r27567, %r27566, %r27512; + xor.b32 %r27568, %r27500, %r27567; + shf.l.wrap.b32 %r27569, %r27568, %r27568, 16; + add.s32 %r27570, %r27569, %r27487; + xor.b32 %r27571, %r27570, %r27475; + shf.l.wrap.b32 %r27572, %r27571, %r27571, 20; + add.s32 %r27573, %r27567, %r26757; + add.s32 %r27574, %r27573, %r27572; + xor.b32 %r27575, %r27574, %r27569; + shr.u32 %r27576, %r27575, 8; + shf.l.wrap.b32 %r27577, %r27575, %r27575, 24; + add.s32 %r27578, %r27577, %r27570; + xor.b32 %r27579, %r27578, %r27572; + shr.u32 %r27580, %r27579, 7; + shf.l.wrap.b32 %r27581, %r27579, %r27579, 25; + xor.b32 %r27582, %r27562, %r27526; + xor.b32 %r27583, %r27542, %r27578; + xor.b32 %r27584, %r27558, %r27530; + xor.b32 %r27585, %r27546, %r27574; + xor.b32 %r27586, %r27545, %r27581; + xor.b32 %r27587, %r27561, %r27533; + xor.b32 %r27588, %r27549, %r27577; + xor.b32 %r27589, %r27565, %r27529; + cvt.u16.u32 %rs553, %r27526; + cvt.u16.u32 %rs554, %r27562; + xor.b16 %rs832, %rs554, %rs553; + shr.u32 %r27590, %r27582, 8; + cvt.u16.u32 %rs833, %r27590; + shr.u32 %r27591, %r27582, 16; + cvt.u16.u32 %rs834, %r27591; + shr.u32 %r27592, %r27582, 24; + cvt.u16.u32 %rs835, %r27592; + cvt.u16.u32 %rs555, %r27578; + cvt.u16.u32 %rs556, %r27542; + xor.b16 %rs836, %rs556, %rs555; + shr.u32 %r27593, %r27583, 8; + cvt.u16.u32 %rs837, %r27593; + shr.u32 %r27594, %r27583, 16; + cvt.u16.u32 %rs838, %r27594; + shr.u32 %r27595, %r27583, 24; + cvt.u16.u32 %rs839, %r27595; + cvt.u16.u32 %rs557, %r27530; + cvt.u16.u32 %rs558, %r27558; + xor.b16 %rs840, %rs558, %rs557; + shr.u32 %r27596, %r27584, 8; + cvt.u16.u32 %rs841, %r27596; + shr.u32 %r27597, %r27584, 16; + cvt.u16.u32 %rs842, %r27597; + shr.u32 %r27598, %r27584, 24; + cvt.u16.u32 %rs843, %r27598; + cvt.u16.u32 %rs559, %r27546; + cvt.u16.u32 %rs560, %r27574; + xor.b16 %rs844, %rs559, %rs560; + shr.u32 %r27599, %r27585, 8; + cvt.u16.u32 %rs845, %r27599; + shr.u32 %r27600, %r27585, 16; + cvt.u16.u32 %rs846, %r27600; + shr.u32 %r27601, %r27585, 24; + cvt.u16.u32 %rs847, %r27601; + cvt.u16.u32 %rs561, %r27580; + cvt.u16.u32 %rs562, %r27544; + xor.b16 %rs848, %rs562, %rs561; + shr.u32 %r27602, %r27586, 
8; + cvt.u16.u32 %rs849, %r27602; + shr.u32 %r27603, %r27586, 16; + cvt.u16.u32 %rs850, %r27603; + shr.u32 %r27604, %r27586, 24; + cvt.u16.u32 %rs851, %r27604; + cvt.u16.u32 %rs563, %r27532; + cvt.u16.u32 %rs564, %r27560; + xor.b16 %rs852, %rs564, %rs563; + shr.u32 %r27605, %r27587, 8; + cvt.u16.u32 %rs853, %r27605; + shr.u32 %r27606, %r27587, 16; + cvt.u16.u32 %rs854, %r27606; + shr.u32 %r27607, %r27587, 24; + cvt.u16.u32 %rs855, %r27607; + cvt.u16.u32 %rs565, %r27548; + cvt.u16.u32 %rs566, %r27576; + xor.b16 %rs856, %rs565, %rs566; + shr.u32 %r27608, %r27588, 8; + cvt.u16.u32 %rs857, %r27608; + shr.u32 %r27609, %r27588, 16; + cvt.u16.u32 %rs858, %r27609; + shr.u32 %r27610, %r27588, 24; + cvt.u16.u32 %rs859, %r27610; + cvt.u16.u32 %rs567, %r27528; + cvt.u16.u32 %rs568, %r27564; + xor.b16 %rs860, %rs568, %rs567; + shr.u32 %r27611, %r27589, 8; + cvt.u16.u32 %rs861, %r27611; + shr.u32 %r27612, %r27589, 16; + cvt.u16.u32 %rs862, %r27612; + shr.u32 %r27613, %r27589, 24; + cvt.u16.u32 %rs863, %r27613; + setp.ne.s64 %p57, %rd1275, 0; + mov.u16 %rs864, 64; + mov.u16 %rs734, %rs865; + mov.u16 %rs768, %rs831; + mov.u16 %rs769, %rs830; + mov.u16 %rs770, %rs829; + mov.u16 %rs771, %rs828; + mov.u16 %rs772, %rs827; + mov.u16 %rs773, %rs826; + mov.u16 %rs774, %rs825; + mov.u16 %rs775, %rs824; + mov.u16 %rs776, %rs823; + mov.u16 %rs777, %rs822; + mov.u16 %rs778, %rs821; + mov.u16 %rs779, %rs820; + mov.u16 %rs780, %rs819; + mov.u16 %rs781, %rs818; + mov.u16 %rs782, %rs817; + mov.u16 %rs783, %rs816; + mov.u16 %rs784, %rs815; + mov.u16 %rs785, %rs814; + mov.u16 %rs786, %rs813; + mov.u16 %rs787, %rs812; + mov.u16 %rs788, %rs811; + mov.u16 %rs789, %rs810; + mov.u16 %rs790, %rs809; + mov.u16 %rs791, %rs808; + mov.u16 %rs792, %rs807; + mov.u16 %rs793, %rs806; + mov.u16 %rs794, %rs805; + mov.u16 %rs795, %rs804; + mov.u16 %rs796, %rs803; + mov.u16 %rs797, %rs802; + mov.u16 %rs798, %rs801; + mov.u16 %rs799, %rs800; + mov.u64 %rd1276, %rd1143; + mov.u32 %r30969, %r30984; + mov.u32 %r30970, %r30983; + mov.u32 %r30971, %r30982; + mov.u32 %r30972, %r30981; + mov.u32 %r30973, %r30980; + mov.u32 %r30974, %r30979; + mov.u32 %r30975, %r30978; + mov.u32 %r30976, %r30977; + @%p57 bra $L__BB2_101; + +$L__BB2_102: + cvt.u32.u16 %r27614, %rs800; + and.b32 %r27615, %r27614, 255; + cvt.u32.u16 %r27616, %rs801; + prmt.b32 %r27617, %r27616, %r27615, 30212; + cvt.u32.u16 %r27618, %rs802; + shl.b32 %r27619, %r27618, 16; + and.b32 %r27620, %r27619, 16711680; + or.b32 %r27621, %r27617, %r27620; + cvt.u32.u16 %r27622, %rs803; + shl.b32 %r27623, %r27622, 24; + or.b32 %r27624, %r27621, %r27623; + cvt.u32.u16 %r27625, %rs804; + and.b32 %r27626, %r27625, 255; + cvt.u32.u16 %r27627, %rs805; + prmt.b32 %r27628, %r27627, %r27626, 30212; + cvt.u32.u16 %r27629, %rs806; + shl.b32 %r27630, %r27629, 16; + and.b32 %r27631, %r27630, 16711680; + or.b32 %r27632, %r27628, %r27631; + cvt.u32.u16 %r27633, %rs807; + shl.b32 %r27634, %r27633, 24; + or.b32 %r27635, %r27632, %r27634; + cvt.u32.u16 %r27636, %rs808; + and.b32 %r27637, %r27636, 255; + cvt.u32.u16 %r27638, %rs809; + prmt.b32 %r27639, %r27638, %r27637, 30212; + cvt.u32.u16 %r27640, %rs810; + shl.b32 %r27641, %r27640, 16; + and.b32 %r27642, %r27641, 16711680; + or.b32 %r27643, %r27639, %r27642; + cvt.u32.u16 %r27644, %rs811; + shl.b32 %r27645, %r27644, 24; + or.b32 %r27646, %r27643, %r27645; + cvt.u32.u16 %r27647, %rs812; + and.b32 %r27648, %r27647, 255; + cvt.u32.u16 %r27649, %rs813; + prmt.b32 %r27650, %r27649, %r27648, 30212; + cvt.u32.u16 %r27651, %rs814; + shl.b32 %r27652, %r27651, 16; + 
and.b32 %r27653, %r27652, 16711680; + or.b32 %r27654, %r27650, %r27653; + cvt.u32.u16 %r27655, %rs815; + shl.b32 %r27656, %r27655, 24; + or.b32 %r27657, %r27654, %r27656; + cvt.u32.u16 %r27658, %rs816; + and.b32 %r27659, %r27658, 255; + cvt.u32.u16 %r27660, %rs817; + prmt.b32 %r27661, %r27660, %r27659, 30212; + cvt.u32.u16 %r27662, %rs818; + shl.b32 %r27663, %r27662, 16; + and.b32 %r27664, %r27663, 16711680; + or.b32 %r27665, %r27661, %r27664; + cvt.u32.u16 %r27666, %rs819; + shl.b32 %r27667, %r27666, 24; + or.b32 %r27668, %r27665, %r27667; + cvt.u32.u16 %r27669, %rs820; + and.b32 %r27670, %r27669, 255; + cvt.u32.u16 %r27671, %rs821; + prmt.b32 %r27672, %r27671, %r27670, 30212; + cvt.u32.u16 %r27673, %rs822; + shl.b32 %r27674, %r27673, 16; + and.b32 %r27675, %r27674, 16711680; + or.b32 %r27676, %r27672, %r27675; + cvt.u32.u16 %r27677, %rs823; + shl.b32 %r27678, %r27677, 24; + or.b32 %r27679, %r27676, %r27678; + cvt.u32.u16 %r27680, %rs824; + and.b32 %r27681, %r27680, 255; + cvt.u32.u16 %r27682, %rs825; + prmt.b32 %r27683, %r27682, %r27681, 30212; + cvt.u32.u16 %r27684, %rs826; + shl.b32 %r27685, %r27684, 16; + and.b32 %r27686, %r27685, 16711680; + or.b32 %r27687, %r27683, %r27686; + cvt.u32.u16 %r27688, %rs827; + shl.b32 %r27689, %r27688, 24; + or.b32 %r27690, %r27687, %r27689; + cvt.u32.u16 %r27691, %rs828; + and.b32 %r27692, %r27691, 255; + cvt.u32.u16 %r27693, %rs829; + prmt.b32 %r27694, %r27693, %r27692, 30212; + cvt.u32.u16 %r27695, %rs830; + shl.b32 %r27696, %r27695, 16; + and.b32 %r27697, %r27696, 16711680; + or.b32 %r27698, %r27694, %r27697; + cvt.u32.u16 %r27699, %rs831; + shl.b32 %r27700, %r27699, 24; + or.b32 %r27701, %r27698, %r27700; + cvt.u32.u16 %r27702, %rs832; + and.b32 %r27703, %r27702, 255; + cvt.u32.u16 %r27704, %rs833; + prmt.b32 %r27705, %r27704, %r27703, 30212; + cvt.u32.u16 %r27706, %rs834; + shl.b32 %r27707, %r27706, 16; + and.b32 %r27708, %r27707, 16711680; + or.b32 %r27709, %r27705, %r27708; + cvt.u32.u16 %r27710, %rs835; + shl.b32 %r27711, %r27710, 24; + or.b32 %r27712, %r27709, %r27711; + cvt.u32.u16 %r27713, %rs836; + and.b32 %r27714, %r27713, 255; + cvt.u32.u16 %r27715, %rs837; + prmt.b32 %r27716, %r27715, %r27714, 30212; + cvt.u32.u16 %r27717, %rs838; + shl.b32 %r27718, %r27717, 16; + and.b32 %r27719, %r27718, 16711680; + or.b32 %r27720, %r27716, %r27719; + cvt.u32.u16 %r27721, %rs839; + shl.b32 %r27722, %r27721, 24; + or.b32 %r27723, %r27720, %r27722; + cvt.u32.u16 %r27724, %rs840; + and.b32 %r27725, %r27724, 255; + cvt.u32.u16 %r27726, %rs841; + prmt.b32 %r27727, %r27726, %r27725, 30212; + cvt.u32.u16 %r27728, %rs842; + shl.b32 %r27729, %r27728, 16; + and.b32 %r27730, %r27729, 16711680; + or.b32 %r27731, %r27727, %r27730; + cvt.u32.u16 %r27732, %rs843; + shl.b32 %r27733, %r27732, 24; + or.b32 %r27734, %r27731, %r27733; + cvt.u32.u16 %r27735, %rs844; + and.b32 %r27736, %r27735, 255; + cvt.u32.u16 %r27737, %rs845; + prmt.b32 %r27738, %r27737, %r27736, 30212; + cvt.u32.u16 %r27739, %rs846; + shl.b32 %r27740, %r27739, 16; + and.b32 %r27741, %r27740, 16711680; + or.b32 %r27742, %r27738, %r27741; + cvt.u32.u16 %r27743, %rs847; + shl.b32 %r27744, %r27743, 24; + or.b32 %r27745, %r27742, %r27744; + cvt.u32.u16 %r27746, %rs848; + and.b32 %r27747, %r27746, 255; + cvt.u32.u16 %r27748, %rs849; + prmt.b32 %r27749, %r27748, %r27747, 30212; + cvt.u32.u16 %r27750, %rs850; + shl.b32 %r27751, %r27750, 16; + and.b32 %r27752, %r27751, 16711680; + or.b32 %r27753, %r27749, %r27752; + cvt.u32.u16 %r27754, %rs851; + shl.b32 %r27755, %r27754, 24; + or.b32 %r27756, %r27753, %r27755; 
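+ // Annotation (added): block $L__BB2_102 repacks the byte-wide u16 state
+ // (%rs800..%rs863) into little-endian 32-bit message words. prmt.b32 with
+ // selector 30212 (0x7604) merges the two operands' low bytes into the low
+ // half-word, and the following shl/and/or chain ORs in bytes 2 and 3.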
+ cvt.u32.u16 %r27757, %rs852; + and.b32 %r27758, %r27757, 255; + cvt.u32.u16 %r27759, %rs853; + prmt.b32 %r27760, %r27759, %r27758, 30212; + cvt.u32.u16 %r27761, %rs854; + shl.b32 %r27762, %r27761, 16; + and.b32 %r27763, %r27762, 16711680; + or.b32 %r27764, %r27760, %r27763; + cvt.u32.u16 %r27765, %rs855; + shl.b32 %r27766, %r27765, 24; + or.b32 %r27767, %r27764, %r27766; + cvt.u32.u16 %r27768, %rs856; + and.b32 %r27769, %r27768, 255; + cvt.u32.u16 %r27770, %rs857; + prmt.b32 %r27771, %r27770, %r27769, 30212; + cvt.u32.u16 %r27772, %rs858; + shl.b32 %r27773, %r27772, 16; + and.b32 %r27774, %r27773, 16711680; + or.b32 %r27775, %r27771, %r27774; + cvt.u32.u16 %r27776, %rs859; + shl.b32 %r27777, %r27776, 24; + or.b32 %r27778, %r27775, %r27777; + cvt.u32.u16 %r27779, %rs860; + and.b32 %r27780, %r27779, 255; + cvt.u32.u16 %r27781, %rs861; + prmt.b32 %r27782, %r27781, %r27780, 30212; + cvt.u32.u16 %r27783, %rs862; + shl.b32 %r27784, %r27783, 16; + and.b32 %r27785, %r27784, 16711680; + or.b32 %r27786, %r27782, %r27785; + cvt.u32.u16 %r27787, %rs863; + shl.b32 %r27788, %r27787, 24; + or.b32 %r27789, %r27786, %r27788; + or.b16 %rs569, %rs865, 8; + cvt.u32.u16 %r27790, %rs569; + and.b32 %r27791, %r27790, 255; + add.s32 %r27792, %r30981, %r30977; + add.s32 %r27793, %r27792, %r27624; + add.s32 %r27794, %r27635, %r27793; + add.s32 %r27795, %r30982, %r30978; + add.s32 %r27796, %r27795, %r27646; + add.s32 %r27797, %r27657, %r27796; + add.s32 %r27798, %r30983, %r30979; + add.s32 %r27799, %r27798, %r27668; + cvt.u32.u16 %r27800, %rs864; + and.b32 %r27801, %r27800, 255; + xor.b32 %r27802, %r27799, %r27801; + shr.u32 %r27803, %r27799, 16; + shl.b32 %r27804, %r27802, 16; + or.b32 %r27805, %r27804, %r27803; + add.s32 %r27806, %r27805, 1013904242; + xor.b32 %r27807, %r27806, %r30983; + shf.l.wrap.b32 %r27808, %r27807, %r27807, 20; + add.s32 %r27809, %r27679, %r27799; + add.s32 %r27810, %r27809, %r27808; + xor.b32 %r27811, %r27810, %r27805; + shf.l.wrap.b32 %r27812, %r27811, %r27811, 24; + add.s32 %r27813, %r27812, %r27806; + xor.b32 %r27814, %r27813, %r27808; + shf.l.wrap.b32 %r27815, %r27814, %r27814, 25; + add.s32 %r27816, %r30984, %r30980; + add.s32 %r27817, %r27816, %r27690; + xor.b32 %r27818, %r27817, %r27791; + shr.u32 %r27819, %r27817, 16; + shl.b32 %r27820, %r27818, 16; + or.b32 %r27821, %r27820, %r27819; + add.s32 %r27822, %r27821, -1521486534; + xor.b32 %r27823, %r27822, %r30984; + shf.l.wrap.b32 %r27824, %r27823, %r27823, 20; + add.s32 %r27825, %r27701, %r27817; + add.s32 %r27826, %r27825, %r27824; + xor.b32 %r27827, %r27826, %r27821; + shf.l.wrap.b32 %r27828, %r27827, %r27827, 24; + add.s32 %r27829, %r27828, %r27822; + xor.b32 %r27830, %r27829, %r27824; + shf.l.wrap.b32 %r27831, %r27830, %r27830, 25; + add.s32 %r27832, %r27815, %r27734; + add.s32 %r27833, %r27810, %r27756; + add.s32 %r27834, %r27833, %r27831; + add.s32 %r27835, %r27834, %r27767; + add.s32 %r27836, %r27826, %r27778; + shf.l.wrap.b32 %r27837, %r27793, %r27793, 16; + add.s32 %r27838, %r27837, 1779033703; + xor.b32 %r27839, %r27838, %r30981; + shf.l.wrap.b32 %r27840, %r27839, %r27839, 20; + add.s32 %r27841, %r27794, %r27840; + xor.b32 %r27842, %r27841, %r27837; + shf.l.wrap.b32 %r27843, %r27842, %r27842, 24; + add.s32 %r27844, %r27843, %r27838; + xor.b32 %r27845, %r27844, %r27840; + shf.l.wrap.b32 %r27846, %r27845, %r27845, 25; + shf.l.wrap.b32 %r27847, %r27796, %r27796, 16; + add.s32 %r27848, %r27847, -1150833019; + xor.b32 %r27849, %r27848, %r30982; + shf.l.wrap.b32 %r27850, %r27849, %r27849, 20; + add.s32 %r27851, %r27797, %r27850; 
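+ // Annotation (added): the 32-bit immediates above are BLAKE3 IV words printed
+ // as signed decimals: 1779033703 = 0x6A09E667, -1150833019 = 0xBB67AE85,
+ // 1013904242 = 0x3C6EF372, -1521486534 = 0xA54FF53A.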
+ xor.b32 %r27852, %r27851, %r27847; + shf.l.wrap.b32 %r27853, %r27852, %r27852, 24; + add.s32 %r27854, %r27853, %r27848; + xor.b32 %r27855, %r27854, %r27850; + shf.l.wrap.b32 %r27856, %r27855, %r27855, 25; + add.s32 %r27857, %r27841, %r27712; + add.s32 %r27858, %r27857, %r27856; + xor.b32 %r27859, %r27858, %r27828; + shf.l.wrap.b32 %r27860, %r27859, %r27859, 16; + add.s32 %r27861, %r27860, %r27813; + xor.b32 %r27862, %r27861, %r27856; + shf.l.wrap.b32 %r27863, %r27862, %r27862, 20; + add.s32 %r27864, %r27858, %r27723; + add.s32 %r27865, %r27864, %r27863; + xor.b32 %r27866, %r27865, %r27860; + shf.l.wrap.b32 %r27867, %r27866, %r27866, 24; + add.s32 %r27868, %r27867, %r27861; + xor.b32 %r27869, %r27868, %r27863; + shf.l.wrap.b32 %r27870, %r27869, %r27869, 25; + add.s32 %r27871, %r27832, %r27851; + xor.b32 %r27872, %r27843, %r27871; + shf.l.wrap.b32 %r27873, %r27872, %r27872, 16; + add.s32 %r27874, %r27873, %r27829; + xor.b32 %r27875, %r27874, %r27815; + shf.l.wrap.b32 %r27876, %r27875, %r27875, 20; + add.s32 %r27877, %r27871, %r27745; + add.s32 %r27878, %r27877, %r27876; + xor.b32 %r27879, %r27878, %r27873; + shf.l.wrap.b32 %r27880, %r27879, %r27879, 24; + add.s32 %r27881, %r27880, %r27874; + xor.b32 %r27882, %r27881, %r27876; + shf.l.wrap.b32 %r27883, %r27882, %r27882, 25; + xor.b32 %r27884, %r27853, %r27834; + shf.l.wrap.b32 %r27885, %r27884, %r27884, 16; + add.s32 %r27886, %r27885, %r27844; + xor.b32 %r27887, %r27886, %r27831; + shf.l.wrap.b32 %r27888, %r27887, %r27887, 20; + add.s32 %r27889, %r27835, %r27888; + xor.b32 %r27890, %r27889, %r27885; + shf.l.wrap.b32 %r27891, %r27890, %r27890, 24; + add.s32 %r27892, %r27891, %r27886; + xor.b32 %r27893, %r27892, %r27888; + shf.l.wrap.b32 %r27894, %r27893, %r27893, 25; + add.s32 %r27895, %r27836, %r27846; + xor.b32 %r27896, %r27895, %r27812; + shf.l.wrap.b32 %r27897, %r27896, %r27896, 16; + add.s32 %r27898, %r27897, %r27854; + xor.b32 %r27899, %r27898, %r27846; + shf.l.wrap.b32 %r27900, %r27899, %r27899, 20; + add.s32 %r27901, %r27895, %r27789; + add.s32 %r27902, %r27901, %r27900; + xor.b32 %r27903, %r27902, %r27897; + shf.l.wrap.b32 %r27904, %r27903, %r27903, 24; + add.s32 %r27905, %r27904, %r27898; + xor.b32 %r27906, %r27905, %r27900; + shf.l.wrap.b32 %r27907, %r27906, %r27906, 25; + add.s32 %r27908, %r27865, %r27646; + add.s32 %r27909, %r27908, %r27907; + xor.b32 %r27910, %r27909, %r27880; + shf.l.wrap.b32 %r27911, %r27910, %r27910, 16; + add.s32 %r27912, %r27911, %r27892; + xor.b32 %r27913, %r27912, %r27907; + shf.l.wrap.b32 %r27914, %r27913, %r27913, 20; + add.s32 %r27915, %r27909, %r27690; + add.s32 %r27916, %r27915, %r27914; + xor.b32 %r27917, %r27916, %r27911; + shf.l.wrap.b32 %r27918, %r27917, %r27917, 24; + add.s32 %r27919, %r27918, %r27912; + xor.b32 %r27920, %r27919, %r27914; + shf.l.wrap.b32 %r27921, %r27920, %r27920, 25; + add.s32 %r27922, %r27878, %r27657; + add.s32 %r27923, %r27922, %r27870; + xor.b32 %r27924, %r27923, %r27891; + shf.l.wrap.b32 %r27925, %r27924, %r27924, 16; + add.s32 %r27926, %r27925, %r27905; + xor.b32 %r27927, %r27926, %r27870; + shf.l.wrap.b32 %r27928, %r27927, %r27927, 20; + add.s32 %r27929, %r27923, %r27734; + add.s32 %r27930, %r27929, %r27928; + xor.b32 %r27931, %r27930, %r27925; + shf.l.wrap.b32 %r27932, %r27931, %r27931, 24; + add.s32 %r27933, %r27932, %r27926; + xor.b32 %r27934, %r27933, %r27928; + shf.l.wrap.b32 %r27935, %r27934, %r27934, 25; + add.s32 %r27936, %r27889, %r27701; + add.s32 %r27937, %r27936, %r27883; + xor.b32 %r27938, %r27904, %r27937; + shf.l.wrap.b32 %r27939, %r27938, %r27938, 16; + 
add.s32 %r27940, %r27939, %r27868; + xor.b32 %r27941, %r27940, %r27883; + shf.l.wrap.b32 %r27942, %r27941, %r27941, 20; + add.s32 %r27943, %r27937, %r27624; + add.s32 %r27944, %r27943, %r27942; + xor.b32 %r27945, %r27944, %r27939; + shf.l.wrap.b32 %r27946, %r27945, %r27945, 24; + add.s32 %r27947, %r27946, %r27940; + xor.b32 %r27948, %r27947, %r27942; + shf.l.wrap.b32 %r27949, %r27948, %r27948, 25; + add.s32 %r27950, %r27902, %r27668; + add.s32 %r27951, %r27950, %r27894; + xor.b32 %r27952, %r27867, %r27951; + shf.l.wrap.b32 %r27953, %r27952, %r27952, 16; + add.s32 %r27954, %r27953, %r27881; + xor.b32 %r27955, %r27954, %r27894; + shf.l.wrap.b32 %r27956, %r27955, %r27955, 20; + add.s32 %r27957, %r27951, %r27767; + add.s32 %r27958, %r27957, %r27956; + xor.b32 %r27959, %r27958, %r27953; + shf.l.wrap.b32 %r27960, %r27959, %r27959, 24; + add.s32 %r27961, %r27960, %r27954; + xor.b32 %r27962, %r27961, %r27956; + shf.l.wrap.b32 %r27963, %r27962, %r27962, 25; + add.s32 %r27964, %r27916, %r27635; + add.s32 %r27965, %r27964, %r27935; + xor.b32 %r27966, %r27965, %r27960; + shf.l.wrap.b32 %r27967, %r27966, %r27966, 16; + add.s32 %r27968, %r27967, %r27947; + xor.b32 %r27969, %r27968, %r27935; + shf.l.wrap.b32 %r27970, %r27969, %r27969, 20; + add.s32 %r27971, %r27965, %r27745; + add.s32 %r27972, %r27971, %r27970; + xor.b32 %r27973, %r27972, %r27967; + shf.l.wrap.b32 %r27974, %r27973, %r27973, 24; + add.s32 %r27975, %r27974, %r27968; + xor.b32 %r27976, %r27975, %r27970; + shf.l.wrap.b32 %r27977, %r27976, %r27976, 25; + add.s32 %r27978, %r27949, %r27756; + add.s32 %r27979, %r27978, %r27930; + xor.b32 %r27980, %r27918, %r27979; + shf.l.wrap.b32 %r27981, %r27980, %r27980, 16; + add.s32 %r27982, %r27981, %r27961; + xor.b32 %r27983, %r27982, %r27949; + shf.l.wrap.b32 %r27984, %r27983, %r27983, 20; + add.s32 %r27985, %r27979, %r27679; + add.s32 %r27986, %r27985, %r27984; + xor.b32 %r27987, %r27986, %r27981; + shf.l.wrap.b32 %r27988, %r27987, %r27987, 24; + add.s32 %r27989, %r27988, %r27982; + xor.b32 %r27990, %r27989, %r27984; + shf.l.wrap.b32 %r27991, %r27990, %r27990, 25; + add.s32 %r27992, %r27944, %r27723; + add.s32 %r27993, %r27992, %r27963; + xor.b32 %r27994, %r27932, %r27993; + shf.l.wrap.b32 %r27995, %r27994, %r27994, 16; + add.s32 %r27996, %r27995, %r27919; + xor.b32 %r27997, %r27996, %r27963; + shf.l.wrap.b32 %r27998, %r27997, %r27997, 20; + add.s32 %r27999, %r27993, %r27778; + add.s32 %r28000, %r27999, %r27998; + xor.b32 %r28001, %r28000, %r27995; + shf.l.wrap.b32 %r28002, %r28001, %r28001, 24; + add.s32 %r28003, %r28002, %r27996; + xor.b32 %r28004, %r28003, %r27998; + shf.l.wrap.b32 %r28005, %r28004, %r28004, 25; + add.s32 %r28006, %r27958, %r27789; + add.s32 %r28007, %r28006, %r27921; + xor.b32 %r28008, %r28007, %r27946; + shf.l.wrap.b32 %r28009, %r28008, %r28008, 16; + add.s32 %r28010, %r28009, %r27933; + xor.b32 %r28011, %r28010, %r27921; + shf.l.wrap.b32 %r28012, %r28011, %r28011, 20; + add.s32 %r28013, %r28007, %r27712; + add.s32 %r28014, %r28013, %r28012; + xor.b32 %r28015, %r28014, %r28009; + shf.l.wrap.b32 %r28016, %r28015, %r28015, 24; + add.s32 %r28017, %r28016, %r28010; + xor.b32 %r28018, %r28017, %r28012; + shf.l.wrap.b32 %r28019, %r28018, %r28018, 25; + add.s32 %r28020, %r27972, %r27657; + add.s32 %r28021, %r28020, %r28019; + xor.b32 %r28022, %r28021, %r27988; + shf.l.wrap.b32 %r28023, %r28022, %r28022, 16; + add.s32 %r28024, %r28023, %r28003; + xor.b32 %r28025, %r28024, %r28019; + shf.l.wrap.b32 %r28026, %r28025, %r28025, 20; + add.s32 %r28027, %r28021, %r27668; + add.s32 %r28028, 
%r28027, %r28026; + xor.b32 %r28029, %r28028, %r28023; + shf.l.wrap.b32 %r28030, %r28029, %r28029, 24; + add.s32 %r28031, %r28030, %r28024; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 25; + add.s32 %r28034, %r27986, %r27734; + add.s32 %r28035, %r28034, %r27977; + xor.b32 %r28036, %r28035, %r28002; + shf.l.wrap.b32 %r28037, %r28036, %r28036, 16; + add.s32 %r28038, %r28037, %r28017; + xor.b32 %r28039, %r28038, %r27977; + shf.l.wrap.b32 %r28040, %r28039, %r28039, 20; + add.s32 %r28041, %r28035, %r27756; + add.s32 %r28042, %r28041, %r28040; + xor.b32 %r28043, %r28042, %r28037; + shf.l.wrap.b32 %r28044, %r28043, %r28043, 24; + add.s32 %r28045, %r28044, %r28038; + xor.b32 %r28046, %r28045, %r28040; + shf.l.wrap.b32 %r28047, %r28046, %r28046, 25; + add.s32 %r28048, %r28000, %r27767; + add.s32 %r28049, %r28048, %r27991; + xor.b32 %r28050, %r28016, %r28049; + shf.l.wrap.b32 %r28051, %r28050, %r28050, 16; + add.s32 %r28052, %r28051, %r27975; + xor.b32 %r28053, %r28052, %r27991; + shf.l.wrap.b32 %r28054, %r28053, %r28053, 20; + add.s32 %r28055, %r28049, %r27646; + add.s32 %r28056, %r28055, %r28054; + xor.b32 %r28057, %r28056, %r28051; + shf.l.wrap.b32 %r28058, %r28057, %r28057, 24; + add.s32 %r28059, %r28058, %r28052; + xor.b32 %r28060, %r28059, %r28054; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 25; + add.s32 %r28062, %r28014, %r27701; + add.s32 %r28063, %r28062, %r28005; + xor.b32 %r28064, %r27974, %r28063; + shf.l.wrap.b32 %r28065, %r28064, %r28064, 16; + add.s32 %r28066, %r28065, %r27989; + xor.b32 %r28067, %r28066, %r28005; + shf.l.wrap.b32 %r28068, %r28067, %r28067, 20; + add.s32 %r28069, %r28063, %r27778; + add.s32 %r28070, %r28069, %r28068; + xor.b32 %r28071, %r28070, %r28065; + shf.l.wrap.b32 %r28072, %r28071, %r28071, 24; + add.s32 %r28073, %r28072, %r28066; + xor.b32 %r28074, %r28073, %r28068; + shf.l.wrap.b32 %r28075, %r28074, %r28074, 25; + add.s32 %r28076, %r28028, %r27690; + add.s32 %r28077, %r28076, %r28047; + xor.b32 %r28078, %r28077, %r28072; + shf.l.wrap.b32 %r28079, %r28078, %r28078, 16; + add.s32 %r28080, %r28079, %r28059; + xor.b32 %r28081, %r28080, %r28047; + shf.l.wrap.b32 %r28082, %r28081, %r28081, 20; + add.s32 %r28083, %r28077, %r27679; + add.s32 %r28084, %r28083, %r28082; + xor.b32 %r28085, %r28084, %r28079; + shf.l.wrap.b32 %r28086, %r28085, %r28085, 24; + add.s32 %r28087, %r28086, %r28080; + xor.b32 %r28088, %r28087, %r28082; + shf.l.wrap.b32 %r28089, %r28088, %r28088, 25; + add.s32 %r28090, %r28061, %r27723; + add.s32 %r28091, %r28090, %r28042; + xor.b32 %r28092, %r28030, %r28091; + shf.l.wrap.b32 %r28093, %r28092, %r28092, 16; + add.s32 %r28094, %r28093, %r28073; + xor.b32 %r28095, %r28094, %r28061; + shf.l.wrap.b32 %r28096, %r28095, %r28095, 20; + add.s32 %r28097, %r28091, %r27624; + add.s32 %r28098, %r28097, %r28096; + xor.b32 %r28099, %r28098, %r28093; + shf.l.wrap.b32 %r28100, %r28099, %r28099, 24; + add.s32 %r28101, %r28100, %r28094; + xor.b32 %r28102, %r28101, %r28096; + shf.l.wrap.b32 %r28103, %r28102, %r28102, 25; + add.s32 %r28104, %r28056, %r27745; + add.s32 %r28105, %r28104, %r28075; + xor.b32 %r28106, %r28044, %r28105; + shf.l.wrap.b32 %r28107, %r28106, %r28106, 16; + add.s32 %r28108, %r28107, %r28031; + xor.b32 %r28109, %r28108, %r28075; + shf.l.wrap.b32 %r28110, %r28109, %r28109, 20; + add.s32 %r28111, %r28105, %r27789; + add.s32 %r28112, %r28111, %r28110; + xor.b32 %r28113, %r28112, %r28107; + shf.l.wrap.b32 %r28114, %r28113, %r28113, 24; + add.s32 %r28115, %r28114, %r28108; + xor.b32 %r28116, %r28115, %r28110; + 
shf.l.wrap.b32 %r28117, %r28116, %r28116, 25; + add.s32 %r28118, %r28070, %r27712; + add.s32 %r28119, %r28118, %r28033; + xor.b32 %r28120, %r28119, %r28058; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 16; + add.s32 %r28122, %r28121, %r28045; + xor.b32 %r28123, %r28122, %r28033; + shf.l.wrap.b32 %r28124, %r28123, %r28123, 20; + add.s32 %r28125, %r28119, %r27635; + add.s32 %r28126, %r28125, %r28124; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 24; + add.s32 %r28129, %r28128, %r28122; + xor.b32 %r28130, %r28129, %r28124; + shf.l.wrap.b32 %r28131, %r28130, %r28130, 25; + add.s32 %r28132, %r28084, %r27734; + add.s32 %r28133, %r28132, %r28131; + xor.b32 %r28134, %r28133, %r28100; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 16; + add.s32 %r28136, %r28135, %r28115; + xor.b32 %r28137, %r28136, %r28131; + shf.l.wrap.b32 %r28138, %r28137, %r28137, 20; + add.s32 %r28139, %r28133, %r27701; + add.s32 %r28140, %r28139, %r28138; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 24; + add.s32 %r28143, %r28142, %r28136; + xor.b32 %r28144, %r28143, %r28138; + shf.l.wrap.b32 %r28145, %r28144, %r28144, 25; + add.s32 %r28146, %r28098, %r27756; + add.s32 %r28147, %r28146, %r28089; + xor.b32 %r28148, %r28147, %r28114; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 16; + add.s32 %r28150, %r28149, %r28129; + xor.b32 %r28151, %r28150, %r28089; + shf.l.wrap.b32 %r28152, %r28151, %r28151, 20; + add.s32 %r28153, %r28147, %r27723; + add.s32 %r28154, %r28153, %r28152; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 24; + add.s32 %r28157, %r28156, %r28150; + xor.b32 %r28158, %r28157, %r28152; + shf.l.wrap.b32 %r28159, %r28158, %r28158, 25; + add.s32 %r28160, %r28112, %r27778; + add.s32 %r28161, %r28160, %r28103; + xor.b32 %r28162, %r28128, %r28161; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 16; + add.s32 %r28164, %r28163, %r28087; + xor.b32 %r28165, %r28164, %r28103; + shf.l.wrap.b32 %r28166, %r28165, %r28165, 20; + add.s32 %r28167, %r28161, %r27657; + add.s32 %r28168, %r28167, %r28166; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 24; + add.s32 %r28171, %r28170, %r28164; + xor.b32 %r28172, %r28171, %r28166; + shf.l.wrap.b32 %r28173, %r28172, %r28172, 25; + add.s32 %r28174, %r28126, %r27767; + add.s32 %r28175, %r28174, %r28117; + xor.b32 %r28176, %r28086, %r28175; + shf.l.wrap.b32 %r28177, %r28176, %r28176, 16; + add.s32 %r28178, %r28177, %r28101; + xor.b32 %r28179, %r28178, %r28117; + shf.l.wrap.b32 %r28180, %r28179, %r28179, 20; + add.s32 %r28181, %r28175, %r27789; + add.s32 %r28182, %r28181, %r28180; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 24; + add.s32 %r28185, %r28184, %r28178; + xor.b32 %r28186, %r28185, %r28180; + shf.l.wrap.b32 %r28187, %r28186, %r28186, 25; + add.s32 %r28188, %r28140, %r27668; + add.s32 %r28189, %r28188, %r28159; + xor.b32 %r28190, %r28189, %r28184; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 16; + add.s32 %r28192, %r28191, %r28171; + xor.b32 %r28193, %r28192, %r28159; + shf.l.wrap.b32 %r28194, %r28193, %r28193, 20; + add.s32 %r28195, %r28189, %r27624; + add.s32 %r28196, %r28195, %r28194; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 24; + add.s32 %r28199, %r28198, %r28192; + xor.b32 %r28200, %r28199, %r28194; + shf.l.wrap.b32 %r28201, %r28200, %r28200, 25; + add.s32 %r28202, %r28173, %r27745; + add.s32 %r28203, %r28202, %r28154; + xor.b32 %r28204, %r28142, %r28203; + shf.l.wrap.b32 %r28205, 
%r28204, %r28204, 16; + add.s32 %r28206, %r28205, %r28185; + xor.b32 %r28207, %r28206, %r28173; + shf.l.wrap.b32 %r28208, %r28207, %r28207, 20; + add.s32 %r28209, %r28203, %r27646; + add.s32 %r28210, %r28209, %r28208; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 24; + add.s32 %r28213, %r28212, %r28206; + xor.b32 %r28214, %r28213, %r28208; + shf.l.wrap.b32 %r28215, %r28214, %r28214, 25; + add.s32 %r28216, %r28168, %r27679; + add.s32 %r28217, %r28216, %r28187; + xor.b32 %r28218, %r28156, %r28217; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 16; + add.s32 %r28220, %r28219, %r28143; + xor.b32 %r28221, %r28220, %r28187; + shf.l.wrap.b32 %r28222, %r28221, %r28221, 20; + add.s32 %r28223, %r28217, %r27712; + add.s32 %r28224, %r28223, %r28222; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 24; + add.s32 %r28227, %r28226, %r28220; + xor.b32 %r28228, %r28227, %r28222; + shf.l.wrap.b32 %r28229, %r28228, %r28228, 25; + add.s32 %r28230, %r28182, %r27635; + add.s32 %r28231, %r28230, %r28145; + xor.b32 %r28232, %r28231, %r28170; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 16; + add.s32 %r28234, %r28233, %r28157; + xor.b32 %r28235, %r28234, %r28145; + shf.l.wrap.b32 %r28236, %r28235, %r28235, 20; + add.s32 %r28237, %r28231, %r27690; + add.s32 %r28238, %r28237, %r28236; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 24; + add.s32 %r28241, %r28240, %r28234; + xor.b32 %r28242, %r28241, %r28236; + shf.l.wrap.b32 %r28243, %r28242, %r28242, 25; + add.s32 %r28244, %r28196, %r27756; + add.s32 %r28245, %r28244, %r28243; + xor.b32 %r28246, %r28245, %r28212; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 16; + add.s32 %r28248, %r28247, %r28227; + xor.b32 %r28249, %r28248, %r28243; + shf.l.wrap.b32 %r28250, %r28249, %r28249, 20; + add.s32 %r28251, %r28245, %r27767; + add.s32 %r28252, %r28251, %r28250; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 24; + add.s32 %r28255, %r28254, %r28248; + xor.b32 %r28256, %r28255, %r28250; + shf.l.wrap.b32 %r28257, %r28256, %r28256, 25; + add.s32 %r28258, %r28210, %r27723; + add.s32 %r28259, %r28258, %r28201; + xor.b32 %r28260, %r28259, %r28226; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 16; + add.s32 %r28262, %r28261, %r28241; + xor.b32 %r28263, %r28262, %r28201; + shf.l.wrap.b32 %r28264, %r28263, %r28263, 20; + add.s32 %r28265, %r28259, %r27745; + add.s32 %r28266, %r28265, %r28264; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 24; + add.s32 %r28269, %r28268, %r28262; + xor.b32 %r28270, %r28269, %r28264; + shf.l.wrap.b32 %r28271, %r28270, %r28270, 25; + add.s32 %r28272, %r28224, %r27789; + add.s32 %r28273, %r28272, %r28215; + xor.b32 %r28274, %r28240, %r28273; + shf.l.wrap.b32 %r28275, %r28274, %r28274, 16; + add.s32 %r28276, %r28275, %r28199; + xor.b32 %r28277, %r28276, %r28215; + shf.l.wrap.b32 %r28278, %r28277, %r28277, 20; + add.s32 %r28279, %r28273, %r27734; + add.s32 %r28280, %r28279, %r28278; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 24; + add.s32 %r28283, %r28282, %r28276; + xor.b32 %r28284, %r28283, %r28278; + shf.l.wrap.b32 %r28285, %r28284, %r28284, 25; + add.s32 %r28286, %r28238, %r27778; + add.s32 %r28287, %r28286, %r28229; + xor.b32 %r28288, %r28198, %r28287; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 16; + add.s32 %r28290, %r28289, %r28213; + xor.b32 %r28291, %r28290, %r28229; + shf.l.wrap.b32 %r28292, %r28291, %r28291, 20; + add.s32 %r28293, %r28287, %r27712; + 
add.s32 %r28294, %r28293, %r28292; + xor.b32 %r28295, %r28294, %r28289; + shf.l.wrap.b32 %r28296, %r28295, %r28295, 24; + add.s32 %r28297, %r28296, %r28290; + xor.b32 %r28298, %r28297, %r28292; + shf.l.wrap.b32 %r28299, %r28298, %r28298, 25; + add.s32 %r28300, %r28252, %r27701; + add.s32 %r28301, %r28300, %r28271; + xor.b32 %r28302, %r28301, %r28296; + shf.l.wrap.b32 %r28303, %r28302, %r28302, 16; + add.s32 %r28304, %r28303, %r28283; + xor.b32 %r28305, %r28304, %r28271; + shf.l.wrap.b32 %r28306, %r28305, %r28305, 20; + add.s32 %r28307, %r28301, %r27646; + add.s32 %r28308, %r28307, %r28306; + xor.b32 %r28309, %r28308, %r28303; + shf.l.wrap.b32 %r28310, %r28309, %r28309, 24; + add.s32 %r28311, %r28310, %r28304; + xor.b32 %r28312, %r28311, %r28306; + shf.l.wrap.b32 %r28313, %r28312, %r28312, 25; + add.s32 %r28314, %r28285, %r27679; + add.s32 %r28315, %r28314, %r28266; + xor.b32 %r28316, %r28254, %r28315; + shf.l.wrap.b32 %r28317, %r28316, %r28316, 16; + add.s32 %r28318, %r28317, %r28297; + xor.b32 %r28319, %r28318, %r28285; + shf.l.wrap.b32 %r28320, %r28319, %r28319, 20; + add.s32 %r28321, %r28315, %r27657; + add.s32 %r28322, %r28321, %r28320; + xor.b32 %r28323, %r28322, %r28317; + shf.l.wrap.b32 %r28324, %r28323, %r28323, 24; + add.s32 %r28325, %r28324, %r28318; + xor.b32 %r28326, %r28325, %r28320; + shf.l.wrap.b32 %r28327, %r28326, %r28326, 25; + add.s32 %r28328, %r28280, %r27624; + add.s32 %r28329, %r28328, %r28299; + xor.b32 %r28330, %r28268, %r28329; + shf.l.wrap.b32 %r28331, %r28330, %r28330, 16; + add.s32 %r28332, %r28331, %r28255; + xor.b32 %r28333, %r28332, %r28299; + shf.l.wrap.b32 %r28334, %r28333, %r28333, 20; + add.s32 %r28335, %r28329, %r27635; + add.s32 %r28336, %r28335, %r28334; + xor.b32 %r28337, %r28336, %r28331; + shf.l.wrap.b32 %r28338, %r28337, %r28337, 24; + add.s32 %r28339, %r28338, %r28332; + xor.b32 %r28340, %r28339, %r28334; + shf.l.wrap.b32 %r28341, %r28340, %r28340, 25; + add.s32 %r28342, %r28294, %r27690; + add.s32 %r28343, %r28342, %r28257; + xor.b32 %r28344, %r28343, %r28282; + shf.l.wrap.b32 %r28345, %r28344, %r28344, 16; + add.s32 %r28346, %r28345, %r28269; + xor.b32 %r28347, %r28346, %r28257; + shf.l.wrap.b32 %r28348, %r28347, %r28347, 20; + add.s32 %r28349, %r28343, %r27668; + add.s32 %r28350, %r28349, %r28348; + xor.b32 %r28351, %r28350, %r28345; + shf.l.wrap.b32 %r28352, %r28351, %r28351, 24; + add.s32 %r28353, %r28352, %r28346; + xor.b32 %r28354, %r28353, %r28348; + shf.l.wrap.b32 %r28355, %r28354, %r28354, 25; + add.s32 %r28356, %r28308, %r27723; + add.s32 %r28357, %r28356, %r28355; + xor.b32 %r28358, %r28357, %r28324; + shf.l.wrap.b32 %r28359, %r28358, %r28358, 16; + add.s32 %r28360, %r28359, %r28339; + xor.b32 %r28361, %r28360, %r28355; + shf.l.wrap.b32 %r28362, %r28361, %r28361, 20; + add.s32 %r28363, %r28357, %r27778; + add.s32 %r28364, %r28363, %r28362; + xor.b32 %r28365, %r28364, %r28359; + shf.l.wrap.b32 %r28366, %r28365, %r28365, 24; + add.s32 %r28367, %r28366, %r28360; + xor.b32 %r28368, %r28367, %r28362; + shf.l.wrap.b32 %r28369, %r28368, %r28368, 25; + add.s32 %r28370, %r28322, %r27745; + add.s32 %r28371, %r28370, %r28313; + xor.b32 %r28372, %r28371, %r28338; + shf.l.wrap.b32 %r28373, %r28372, %r28372, 16; + add.s32 %r28374, %r28373, %r28353; + xor.b32 %r28375, %r28374, %r28313; + shf.l.wrap.b32 %r28376, %r28375, %r28375, 20; + add.s32 %r28377, %r28371, %r27679; + add.s32 %r28378, %r28377, %r28376; + xor.b32 %r28379, %r28378, %r28373; + shf.l.wrap.b32 %r28380, %r28379, %r28379, 24; + add.s32 %r28381, %r28380, %r28374; + xor.b32 %r28382, 
%r28381, %r28376; + shf.l.wrap.b32 %r28383, %r28382, %r28382, 25; + add.s32 %r28384, %r28336, %r27712; + add.s32 %r28385, %r28384, %r28327; + xor.b32 %r28386, %r28352, %r28385; + shf.l.wrap.b32 %r28387, %r28386, %r28386, 16; + add.s32 %r28388, %r28387, %r28311; + xor.b32 %r28389, %r28388, %r28327; + shf.l.wrap.b32 %r28390, %r28389, %r28389, 20; + add.s32 %r28391, %r28385, %r27756; + add.s32 %r28392, %r28391, %r28390; + xor.b32 %r28393, %r28392, %r28387; + shf.l.wrap.b32 %r28394, %r28393, %r28393, 24; + add.s32 %r28395, %r28394, %r28388; + xor.b32 %r28396, %r28395, %r28390; + shf.l.wrap.b32 %r28397, %r28396, %r28396, 25; + add.s32 %r28398, %r28350, %r27789; + add.s32 %r28399, %r28398, %r28341; + xor.b32 %r28400, %r28310, %r28399; + shf.l.wrap.b32 %r28401, %r28400, %r28400, 16; + add.s32 %r28402, %r28401, %r28325; + xor.b32 %r28403, %r28402, %r28341; + shf.l.wrap.b32 %r28404, %r28403, %r28403, 20; + add.s32 %r28405, %r28399, %r27635; + add.s32 %r28406, %r28405, %r28404; + xor.b32 %r28407, %r28406, %r28401; + shf.l.wrap.b32 %r28408, %r28407, %r28407, 24; + add.s32 %r28409, %r28408, %r28402; + xor.b32 %r28410, %r28409, %r28404; + shf.l.wrap.b32 %r28411, %r28410, %r28410, 25; + add.s32 %r28412, %r28364, %r27767; + add.s32 %r28413, %r28412, %r28383; + xor.b32 %r28414, %r28413, %r28408; + shf.l.wrap.b32 %r28415, %r28414, %r28414, 16; + add.s32 %r28416, %r28415, %r28395; + xor.b32 %r28417, %r28416, %r28383; + shf.l.wrap.b32 %r28418, %r28417, %r28417, 20; + add.s32 %r28419, %r28413, %r27657; + add.s32 %r28420, %r28419, %r28418; + xor.b32 %r28421, %r28420, %r28415; + shf.l.wrap.b32 %r28422, %r28421, %r28421, 24; + add.s32 %r28423, %r28422, %r28416; + xor.b32 %r28424, %r28423, %r28418; + shf.l.wrap.b32 %r28425, %r28424, %r28424, 25; + add.s32 %r28426, %r28397, %r27624; + add.s32 %r28427, %r28426, %r28378; + xor.b32 %r28428, %r28366, %r28427; + shf.l.wrap.b32 %r28429, %r28428, %r28428, 16; + add.s32 %r28430, %r28429, %r28409; + xor.b32 %r28431, %r28430, %r28397; + shf.l.wrap.b32 %r28432, %r28431, %r28431, 20; + add.s32 %r28433, %r28427, %r27734; + add.s32 %r28434, %r28433, %r28432; + xor.b32 %r28435, %r28434, %r28429; + shf.l.wrap.b32 %r28436, %r28435, %r28435, 24; + add.s32 %r28437, %r28436, %r28430; + xor.b32 %r28438, %r28437, %r28432; + shf.l.wrap.b32 %r28439, %r28438, %r28438, 25; + add.s32 %r28440, %r28392, %r27646; + add.s32 %r28441, %r28440, %r28411; + xor.b32 %r28442, %r28380, %r28441; + shf.l.wrap.b32 %r28443, %r28442, %r28442, 16; + add.s32 %r28444, %r28443, %r28367; + xor.b32 %r28445, %r28444, %r28411; + shf.l.wrap.b32 %r28446, %r28445, %r28445, 20; + add.s32 %r28447, %r28441, %r27690; + add.s32 %r28448, %r28447, %r28446; + xor.b32 %r28449, %r28448, %r28443; + shf.l.wrap.b32 %r28450, %r28449, %r28449, 24; + add.s32 %r28451, %r28450, %r28444; + xor.b32 %r28452, %r28451, %r28446; + shf.l.wrap.b32 %r28453, %r28452, %r28452, 25; + add.s32 %r28454, %r28406, %r27668; + add.s32 %r28455, %r28454, %r28369; + xor.b32 %r28456, %r28455, %r28394; + shf.l.wrap.b32 %r28457, %r28456, %r28456, 16; + add.s32 %r28458, %r28457, %r28381; + xor.b32 %r28459, %r28458, %r28369; + shf.l.wrap.b32 %r28460, %r28459, %r28459, 20; + add.s32 %r28461, %r28455, %r27701; + add.s32 %r28462, %r28461, %r28460; + xor.b32 %r28463, %r28462, %r28457; + shf.l.wrap.b32 %r28464, %r28463, %r28463, 24; + add.s32 %r28465, %r28464, %r28458; + xor.b32 %r28466, %r28465, %r28460; + shf.l.wrap.b32 %r28467, %r28466, %r28466, 25; + add.s32 %r28468, %r28420, %r27745; + add.s32 %r28469, %r28468, %r28467; + xor.b32 %r28470, %r28469, %r28436; + 
shf.l.wrap.b32 %r28471, %r28470, %r28470, 16; + add.s32 %r28472, %r28471, %r28451; + xor.b32 %r28473, %r28472, %r28467; + shf.l.wrap.b32 %r28474, %r28473, %r28473, 20; + add.s32 %r28475, %r28469, %r27789; + add.s32 %r28476, %r28475, %r28474; + xor.b32 %r28477, %r28476, %r28471; + shf.l.wrap.b32 %r28478, %r28477, %r28477, 24; + add.s32 %r28479, %r28478, %r28472; + xor.b32 %r28480, %r28479, %r28474; + shf.l.wrap.b32 %r28481, %r28480, %r28480, 25; + add.s32 %r28482, %r28434, %r27679; + add.s32 %r28483, %r28482, %r28425; + xor.b32 %r28484, %r28483, %r28450; + shf.l.wrap.b32 %r28485, %r28484, %r28484, 16; + add.s32 %r28486, %r28485, %r28465; + xor.b32 %r28487, %r28486, %r28425; + shf.l.wrap.b32 %r28488, %r28487, %r28487, 20; + add.s32 %r28489, %r28483, %r27624; + add.s32 %r28490, %r28489, %r28488; + xor.b32 %r28491, %r28490, %r28485; + shf.l.wrap.b32 %r28492, %r28491, %r28491, 24; + add.s32 %r28493, %r28492, %r28486; + xor.b32 %r28494, %r28493, %r28488; + shf.l.wrap.b32 %r28495, %r28494, %r28494, 25; + add.s32 %r28496, %r28448, %r27635; + add.s32 %r28497, %r28496, %r28439; + xor.b32 %r28498, %r28464, %r28497; + shf.l.wrap.b32 %r28499, %r28498, %r28498, 16; + add.s32 %r28500, %r28499, %r28423; + xor.b32 %r28501, %r28500, %r28439; + shf.l.wrap.b32 %r28502, %r28501, %r28501, 20; + add.s32 %r28503, %r28497, %r27723; + add.s32 %r28504, %r28503, %r28502; + xor.b32 %r28505, %r28504, %r28499; + shf.l.wrap.b32 %r28506, %r28505, %r28505, 24; + add.s32 %r28507, %r28506, %r28500; + xor.b32 %r28508, %r28507, %r28502; + shf.l.wrap.b32 %r28509, %r28508, %r28508, 25; + add.s32 %r28510, %r28462, %r27712; + add.s32 %r28511, %r28510, %r28453; + xor.b32 %r28512, %r28422, %r28511; + shf.l.wrap.b32 %r28513, %r28512, %r28512, 16; + add.s32 %r28514, %r28513, %r28437; + xor.b32 %r28515, %r28514, %r28453; + shf.l.wrap.b32 %r28516, %r28515, %r28515, 20; + add.s32 %r28517, %r28511, %r27690; + add.s32 %r28518, %r28517, %r28516; + xor.b32 %r28519, %r28518, %r28513; + shf.l.wrap.b32 %r28520, %r28519, %r28519, 24; + add.s32 %r28521, %r28520, %r28514; + xor.b32 %r28522, %r28521, %r28516; + shf.l.wrap.b32 %r28523, %r28522, %r28522, 25; + add.s32 %r28524, %r28476, %r27778; + add.s32 %r28525, %r28524, %r28495; + xor.b32 %r28526, %r28525, %r28520; + shf.l.wrap.b32 %r28527, %r28526, %r28526, 16; + add.s32 %r28528, %r28527, %r28507; + xor.b32 %r28529, %r28528, %r28495; + shf.l.wrap.b32 %r28530, %r28529, %r28529, 20; + add.s32 %r28531, %r28525, %r27734; + add.s32 %r28532, %r28531, %r28530; + xor.b32 %r28533, %r28532, %r28527; + shf.l.wrap.b32 %r28534, %r28533, %r28533, 24; + add.s32 %r28535, %r28534, %r28528; + xor.b32 %r28536, %r28535, %r28530; + shf.l.wrap.b32 %r28537, %r28536, %r28536, 25; + add.s32 %r28538, %r28509, %r27646; + add.s32 %r28539, %r28538, %r28490; + xor.b32 %r28540, %r28478, %r28539; + shf.l.wrap.b32 %r28541, %r28540, %r28540, 16; + add.s32 %r28542, %r28541, %r28521; + xor.b32 %r28543, %r28542, %r28509; + shf.l.wrap.b32 %r28544, %r28543, %r28543, 20; + add.s32 %r28545, %r28539, %r27756; + add.s32 %r28546, %r28545, %r28544; + xor.b32 %r28547, %r28546, %r28541; + shf.l.wrap.b32 %r28548, %r28547, %r28547, 24; + add.s32 %r28549, %r28548, %r28542; + xor.b32 %r28550, %r28549, %r28544; + shf.l.wrap.b32 %r28551, %r28550, %r28550, 25; + add.s32 %r28552, %r28504, %r27657; + add.s32 %r28553, %r28552, %r28523; + xor.b32 %r28554, %r28492, %r28553; + shf.l.wrap.b32 %r28555, %r28554, %r28554, 16; + add.s32 %r28556, %r28555, %r28479; + xor.b32 %r28557, %r28556, %r28523; + shf.l.wrap.b32 %r28558, %r28557, %r28557, 20; + add.s32 
%r28559, %r28553, %r27668; + add.s32 %r28560, %r28559, %r28558; + xor.b32 %r28561, %r28560, %r28555; + shf.l.wrap.b32 %r28562, %r28561, %r28561, 24; + add.s32 %r28563, %r28562, %r28556; + xor.b32 %r28564, %r28563, %r28558; + shf.l.wrap.b32 %r28565, %r28564, %r28564, 25; + add.s32 %r28566, %r28518, %r27701; + add.s32 %r28567, %r28566, %r28481; + xor.b32 %r28568, %r28567, %r28506; + shf.l.wrap.b32 %r28569, %r28568, %r28568, 16; + add.s32 %r28570, %r28569, %r28493; + xor.b32 %r28571, %r28570, %r28481; + shf.l.wrap.b32 %r28572, %r28571, %r28571, 20; + add.s32 %r28573, %r28567, %r27767; + add.s32 %r28574, %r28573, %r28572; + xor.b32 %r28575, %r28574, %r28569; + shf.l.wrap.b32 %r28576, %r28575, %r28575, 24; + add.s32 %r28577, %r28576, %r28570; + xor.b32 %r28578, %r28577, %r28572; + shf.l.wrap.b32 %r28579, %r28578, %r28578, 25; + xor.b32 %r28580, %r28532, %r28563; + cvt.u64.u32 %rd342, %r28580; + xor.b32 %r28581, %r28577, %r28546; + and.b32 %r28582, %r28581, 255; + cvt.u64.u32 %rd1147, %r28582; + cvt.u64.u32 %rd1148, %r28581; + shl.b64 %rd1149, %rd1148, 32; + and.b64 %rd1150, %rd1149, 280375465082880; + and.b64 %rd1151, %rd1149, 71776119061217280; + shr.u32 %r28583, %r28581, 24; + cvt.u64.u32 %rd1152, %r28583; + shl.b64 %rd1153, %rd1152, 56; + bfi.b64 %rd1154, %rd1147, %rd342, 32, 32; + or.b64 %rd1155, %rd1154, %rd1150; + or.b64 %rd1156, %rd1155, %rd1151; + or.b64 %rd341, %rd1156, %rd1153; + xor.b32 %r28584, %r28535, %r28560; + cvt.u64.u32 %rd1157, %r28584; + xor.b32 %r28585, %r28574, %r28549; + and.b32 %r28586, %r28585, 255; + cvt.u64.u32 %rd1158, %r28586; + cvt.u64.u32 %rd1159, %r28585; + shl.b64 %rd1160, %rd1159, 32; + and.b64 %rd1161, %rd1160, 280375465082880; + and.b64 %rd1162, %rd1160, 71776119061217280; + shr.u32 %r28587, %r28585, 24; + cvt.u64.u32 %rd1163, %r28587; + shl.b64 %rd1164, %rd1163, 56; + bfi.b64 %rd1165, %rd1158, %rd1157, 32, 32; + or.b64 %rd1166, %rd1165, %rd1161; + or.b64 %rd1167, %rd1166, %rd1162; + or.b64 %rd345, %rd1167, %rd1164; + xor.b32 %r28588, %r28579, %r28548; + cvt.u64.u32 %rd1168, %r28588; + xor.b32 %r28589, %r28537, %r28562; + and.b32 %r28590, %r28589, 255; + cvt.u64.u32 %rd1169, %r28590; + cvt.u64.u32 %rd1170, %r28589; + shl.b64 %rd1171, %rd1170, 32; + and.b64 %rd1172, %rd1171, 280375465082880; + and.b64 %rd1173, %rd1171, 71776119061217280; + shr.u32 %r28591, %r28589, 24; + cvt.u64.u32 %rd1174, %r28591; + shl.b64 %rd1175, %rd1174, 56; + bfi.b64 %rd1176, %rd1169, %rd1168, 32, 32; + or.b64 %rd1177, %rd1176, %rd1172; + or.b64 %rd1178, %rd1177, %rd1173; + or.b64 %rd1280, %rd1178, %rd1175; + xor.b32 %r28592, %r28576, %r28551; + cvt.u64.u32 %rd1179, %r28592; + xor.b32 %r28593, %r28534, %r28565; + and.b32 %r28594, %r28593, 255; + cvt.u64.u32 %rd1180, %r28594; + cvt.u64.u32 %rd1181, %r28593; + shl.b64 %rd1182, %rd1181, 32; + and.b64 %rd1183, %rd1182, 280375465082880; + and.b64 %rd1184, %rd1182, 71776119061217280; + shr.u32 %r28595, %r28593, 24; + cvt.u64.u32 %rd1185, %r28595; + shl.b64 %rd1186, %rd1185, 56; + bfi.b64 %rd1187, %rd1180, %rd1179, 32, 32; + or.b64 %rd1188, %rd1187, %rd1183; + or.b64 %rd1189, %rd1188, %rd1184; + or.b64 %rd1279, %rd1189, %rd1186; + +$L__BB2_104: + ld.const.u64 %rd346, [target+24]; + setp.eq.s64 %p59, %rd1279, %rd346; + @%p59 bra $L__BB2_106; + bra.uni $L__BB2_105; + +$L__BB2_106: + ld.const.u64 %rd347, [target+16]; + setp.eq.s64 %p60, %rd1280, %rd347; + @%p60 bra $L__BB2_108; + bra.uni $L__BB2_107; + +$L__BB2_108: + ld.const.u64 %rd348, [target+8]; + setp.eq.s64 %p61, %rd345, %rd348; + @%p61 bra $L__BB2_110; + bra.uni $L__BB2_109; + 
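+ // Annotation (added): $L__BB2_104..$L__BB2_111 compare the 256-bit hash with
+ // the constant `target` one 64-bit word at a time, starting at [target+24] and
+ // falling through to lower words only on equality; %p63 ends up true iff
+ // hash < target. On success, the code below combines what appears to be the
+ // candidate nonce with heavy_hash_param_0 (AND) and heavy_hash_param_1 (OR)
+ // and publishes it with atom.global.cas.b64 against 0, so only the first
+ // winning thread writes the output slot.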
+$L__BB2_110: + and.b64 %rd1234, %rd342, 255; + and.b64 %rd1235, %rd341, -256; + or.b64 %rd1236, %rd1235, %rd1234; + ld.const.u64 %rd1237, [target]; + setp.lt.u64 %p63, %rd1236, %rd1237; + bra.uni $L__BB2_111; + +$L__BB2_105: + setp.lt.u64 %p63, %rd1279, %rd346; + bra.uni $L__BB2_111; + +$L__BB2_107: + setp.lt.u64 %p63, %rd1280, %rd347; + bra.uni $L__BB2_111; + +$L__BB2_109: + setp.lt.u64 %p63, %rd345, %rd348; + +$L__BB2_111: + not.pred %p62, %p63; + @%p62 bra $L__BB2_113; + + ld.param.u64 %rd1247, [heavy_hash_param_0]; + ld.param.u64 %rd1246, [heavy_hash_param_1]; + and.b64 %rd1245, %rd1255, %rd1247; + or.b64 %rd1244, %rd1245, %rd1246; + ld.param.u64 %rd1243, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1238, %rd1243; + mov.u64 %rd1239, 0; + atom.global.cas.b64 %rd1240, [%rd1238], %rd1239, %rd1244; + +$L__BB2_113: ret; } diff --git a/plugins/cuda/resources/kaspa-cuda-sm75.ptx b/plugins/cuda/resources/kaspa-cuda-sm75.ptx index a001843..cf13603 100644 --- a/plugins/cuda/resources/kaspa-cuda-sm75.ptx +++ b/plugins/cuda/resources/kaspa-cuda-sm75.ptx @@ -10,7 +10,12 @@ .target sm_75 .address_size 64 - // .globl heavy_hash +.extern .func (.param .b32 func_retval0) vprintf +( + .param .b64 vprintf_param_0, + .param .b64 vprintf_param_1 +) +; .global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; .global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; .global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; @@ -18,7063 +23,41900 @@ .global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; .global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; .global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 
0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; .const .align 8 .b8 target[32]; .const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; .const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; +.global .align 1 .b8 $str[5] = {37, 48, 50, 120, 0}; +.global .align 1 .b8 $str$1[2] = {10, 0}; +.global .align 1 .b8 $str$2[32] = {104, 101, 97, 118, 121, 95, 104, 97, 115, 104, 32, 84, 104, 114, 101, 97, 100, 32, 37, 100, 44, 32, 66, 108, 111, 99, 107, 32, 37, 100, 10, 0}; +.global .align 1 .b8 $str$3[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 48, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$4[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 52, 50, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$5[25] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 50, 51, 52, 53, 93, 32, 105, 115, 32, 58, 32, 0}; -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 +.func (.param .b64 func_retval0) 
_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 ) { - .local .align 8 .b8 __local_depot0[1912]; + .local .align 16 .b8 __local_depot0[224]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6245>; - .reg .b64 %rd<490>; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<175>; mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd463, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 %rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd171, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd165, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd155, %rd73; + 
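+ // Annotation (added): the mangled name demangles to
+ // blake3_compress_subtree_wide(const unsigned char*, unsigned long long,
+ // const unsigned int*, unsigned long long, unsigned char, unsigned char*),
+ // matching the (input, input_len, key, chunk_counter, flags, out) signature
+ // of the BLAKE3 reference implementation, compiled here as a device function.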
cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd153, %SPL, 16; + add.u64 %rd149, %SP, 96; + cvta.to.local.u64 %rd4, %rd149; + setp.lt.u64 %p1, %rd171, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd162, %SPL, 0; + setp.ne.s64 %p16, %rd171, 1024; + mov.u64 %rd159, 0; + mov.u64 %rd151, %rd159; + @%p16 bra $L__BB0_16; + + mov.u64 %rd171, 0; + st.local.u64 [%rd162], %rd69; + mov.u64 %rd151, 1; + mov.u64 %rd159, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd151, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd163, %rd151; + +$L__BB0_18: + ld.local.u64 %rd166, [%rd162]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd167, 16; + ld.local.u8 %r1084, [%rd2+16]; + ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 %r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, %r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd143, %rd165, 32; + cvt.u32.u64 %r3940, %rd143; + cvt.u32.u64 %r3939, %rd165; + setp.eq.s64 %p18, %rd167, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd166]; + ld.u8 %r1109, [%rd166+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd166+2]; + prmt.b32 %r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd166+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd166+4]; + ld.u8 %r1116, [%rd166+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd166+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd166+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd166+8]; + ld.u8 %r1123, [%rd166+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd166+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 
%r1127, [%rd166+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd166+12]; + ld.u8 %r1130, [%rd166+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd166+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd166+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd166+16]; + ld.u8 %r1137, [%rd166+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd166+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd166+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd166+20]; + ld.u8 %r1144, [%rd166+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd166+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd166+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd166+24]; + ld.u8 %r1151, [%rd166+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd166+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd166+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd166+28]; + ld.u8 %r1158, [%rd166+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd166+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd166+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd166+32]; + ld.u8 %r1165, [%rd166+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd166+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd166+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd166+36]; + ld.u8 %r1172, [%rd166+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd166+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 %r1176, [%rd166+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd166+40]; + ld.u8 %r1179, [%rd166+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd166+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd166+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd166+44]; + ld.u8 %r1186, [%rd166+45]; + prmt.b32 %r1187, %r1186, %r1185, 30212; + ld.u8 %r1188, [%rd166+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd166+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd166+48]; + ld.u8 %r1193, [%rd166+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd166+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 %r1197, [%rd166+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd166+52]; + ld.u8 %r1200, [%rd166+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd166+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd166+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd166+56]; + ld.u8 %r1207, [%rd166+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd166+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd166+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd166+60]; + ld.u8 %r1214, [%rd166+61]; + prmt.b32 %r1215, %r1214, %r1213, 30212; + ld.u8 %r1216, [%rd166+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd166+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + 
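+// Each add/xor/shf.l.wrap cluster here is one step of the BLAKE3 G function: the
+// left-rotates by 16, 20, 24 and 25 are the usual right-rotates by 16, 12, 8 and 7,
+// and the constants 1779033703, -1150833019, 1013904242 and -1521486534 are
+// IV[0..3] (0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A) as signed 32-bit
+// values. The prmt.b32 runs above assemble little-endian message words from bytes.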
add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, %r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, %r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, %r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, 
%r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 %r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 %r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, %r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + 
add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, %r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, %r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 %r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, 
%r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 %r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, %r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, 
%r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, %r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, %r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; 
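+// The seven compression rounds are fully unrolled, so no message-schedule table is
+// consulted at runtime: the permuted word order of each BLAKE3 round is baked into
+// which %r registers feed the add.s32 instructions.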
+ add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 %r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, %r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, %r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 %r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, 
%r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; + add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 %r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, 
%r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, %r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, %r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; + add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, %r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; 
+ add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd166, %rd166, 64; + add.s64 %rd167, %rd167, -1; + setp.ne.s64 %p19, %rd167, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd155], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd155+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd155+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd155+3], %r2012; + st.local.u8 [%rd155+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd155+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd155+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd155+7], %r2015; + st.local.u8 [%rd155+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd155+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd155+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd155+11], %r2018; + st.local.u8 [%rd155+12], %r3945; + shr.u32 %r2019, %r3945, 8; + st.local.u8 [%rd155+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd155+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd155+15], %r2021; + st.local.u8 [%rd155+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd155+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd155+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 [%rd155+19], %r2024; + st.local.u8 [%rd155+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd155+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd155+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd155+23], %r2027; + st.local.u8 [%rd155+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd155+25], %r2028; + shr.u32 %r2029, %r3942, 16; + st.local.u8 [%rd155+26], %r2029; + shr.u32 %r2030, %r3942, 24; + st.local.u8 [%rd155+27], %r2030; + st.local.u8 [%rd155+28], %r3941; + shr.u32 %r2031, %r3941, 8; + st.local.u8 [%rd155+29], %r2031; + shr.u32 %r2032, %r3941, 16; + st.local.u8 [%rd155+30], %r2032; + shr.u32 %r2033, %r3941, 24; + st.local.u8 [%rd155+31], %r2033; + add.s64 %rd165, %rd165, 1; + add.s64 %rd162, %rd162, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd163, %rd163, -1; + setp.ne.s64 %p20, %rd163, 0; + @%p20 bra $L__BB0_18; + +$L__BB0_21: + ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + setp.ge.u64 %p21, %rd159, %rd139; + @%p21 bra $L__BB0_30; + + ld.param.u64 %rd140, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd135, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + add.s64 %rd127, %rd151, %rd135; + ld.local.u8 %r2034, [%rd2]; + ld.local.u8 %r2035, [%rd2+1]; + prmt.b32 %r2036, %r2035, %r2034, 30212; + ld.local.u8 %r2037, [%rd2+2]; + 
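+// The $L__BB0_21 path entered above appears to handle the tail of the input: once
+// the whole 1024-byte chunks are done, the remaining bytes are hashed as one more
+// chunk, so the key words are reloaded here as a fresh chaining value, the chunk
+// counter is advanced by the number of chunks already consumed, and a 64-byte
+// block buffer on the local stack is zeroed before use.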
ld.local.u8 %r2038, [%rd2+3]; + prmt.b32 %r2039, %r2038, %r2037, 30212; + prmt.b32 %r3964, %r2039, %r2036, 4180; + ld.local.u8 %r2040, [%rd2+4]; + ld.local.u8 %r2041, [%rd2+5]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd2+6]; + ld.local.u8 %r2044, [%rd2+7]; + prmt.b32 %r2045, %r2044, %r2043, 30212; + prmt.b32 %r3963, %r2045, %r2042, 4180; + ld.local.u8 %r2046, [%rd2+8]; + ld.local.u8 %r2047, [%rd2+9]; + prmt.b32 %r2048, %r2047, %r2046, 30212; + ld.local.u8 %r2049, [%rd2+10]; + ld.local.u8 %r2050, [%rd2+11]; + prmt.b32 %r2051, %r2050, %r2049, 30212; + prmt.b32 %r3962, %r2051, %r2048, 4180; + ld.local.u8 %r2052, [%rd2+12]; + ld.local.u8 %r2053, [%rd2+13]; + prmt.b32 %r2054, %r2053, %r2052, 30212; + ld.local.u8 %r2055, [%rd2+14]; + ld.local.u8 %r2056, [%rd2+15]; + prmt.b32 %r2057, %r2056, %r2055, 30212; + prmt.b32 %r3961, %r2057, %r2054, 4180; + ld.local.u8 %r2058, [%rd2+16]; + ld.local.u8 %r2059, [%rd2+17]; + prmt.b32 %r2060, %r2059, %r2058, 30212; + ld.local.u8 %r2061, [%rd2+18]; + ld.local.u8 %r2062, [%rd2+19]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + prmt.b32 %r3960, %r2063, %r2060, 4180; + ld.local.u8 %r2064, [%rd2+20]; + ld.local.u8 %r2065, [%rd2+21]; + prmt.b32 %r2066, %r2065, %r2064, 30212; + ld.local.u8 %r2067, [%rd2+22]; + ld.local.u8 %r2068, [%rd2+23]; + prmt.b32 %r2069, %r2068, %r2067, 30212; + prmt.b32 %r3959, %r2069, %r2066, 4180; + ld.local.u8 %r2070, [%rd2+24]; + ld.local.u8 %r2071, [%rd2+25]; + prmt.b32 %r2072, %r2071, %r2070, 30212; + ld.local.u8 %r2073, [%rd2+26]; + ld.local.u8 %r2074, [%rd2+27]; + prmt.b32 %r2075, %r2074, %r2073, 30212; + prmt.b32 %r3958, %r2075, %r2072, 4180; + ld.local.u8 %r2076, [%rd2+28]; + ld.local.u8 %r2077, [%rd2+29]; + prmt.b32 %r2078, %r2077, %r2076, 30212; + ld.local.u8 %r2079, [%rd2+30]; + ld.local.u8 %r2080, [%rd2+31]; + prmt.b32 %r2081, %r2080, %r2079, 30212; + prmt.b32 %r3957, %r2081, %r2078, 4180; + add.u64 %rd53, %SPL, 16; + mov.u32 %r2082, 0; + st.local.v2.u32 [%rd53], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+8], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+16], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+24], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+32], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+40], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+48], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+56], {%r2082, %r2082}; + mov.u16 %rs199, 0; + st.local.v2.u8 [%rd53+64], {%rs199, %rs199}; + st.local.u8 [%rd53+66], %rs75; + add.s64 %rd170, %rd140, %rd159; + cvt.u32.u64 %r36, %rd127; + shr.u64 %rd129, %rd127, 32; + cvt.u32.u64 %r37, %rd129; + setp.lt.u64 %p22, %rd171, 65; + @%p22 bra $L__BB0_25; + + add.s64 %rd56, %rd53, 64; + mov.u16 %rs198, 0; + +$L__BB0_24: + and.b16 %rs83, %rs198, 255; + setp.eq.s16 %p23, %rs83, 0; + selp.u16 %rs84, 1, 0, %p23; + or.b16 %rs85, %rs84, %rs75; + ld.u8 %r2083, [%rd170]; + ld.u8 %r2084, [%rd170+1]; + prmt.b32 %r2085, %r2084, %r2083, 30212; + ld.u8 %r2086, [%rd170+2]; + prmt.b32 %r2087, %r2086, %r2085, 28756; + ld.u8 %r2088, [%rd170+3]; + prmt.b32 %r2089, %r2088, %r2087, 1620; + ld.u8 %r2090, [%rd170+4]; + ld.u8 %r2091, [%rd170+5]; + prmt.b32 %r2092, %r2091, %r2090, 30212; + ld.u8 %r2093, [%rd170+6]; + prmt.b32 %r2094, %r2093, %r2092, 28756; + ld.u8 %r2095, [%rd170+7]; + prmt.b32 %r2096, %r2095, %r2094, 1620; + ld.u8 %r2097, [%rd170+8]; + ld.u8 %r2098, [%rd170+9]; + prmt.b32 %r2099, %r2098, %r2097, 30212; + ld.u8 %r2100, [%rd170+10]; + prmt.b32 %r2101, %r2100, %r2099, 28756; + ld.u8 %r2102, [%rd170+11]; + prmt.b32 %r2103, %r2102, %r2101, 1620; + ld.u8 %r2104, [%rd170+12]; + ld.u8 %r2105, 
[%rd170+13]; + prmt.b32 %r2106, %r2105, %r2104, 30212; + ld.u8 %r2107, [%rd170+14]; + prmt.b32 %r2108, %r2107, %r2106, 28756; + ld.u8 %r2109, [%rd170+15]; + prmt.b32 %r2110, %r2109, %r2108, 1620; + ld.u8 %r2111, [%rd170+16]; + ld.u8 %r2112, [%rd170+17]; + prmt.b32 %r2113, %r2112, %r2111, 30212; + ld.u8 %r2114, [%rd170+18]; + prmt.b32 %r2115, %r2114, %r2113, 28756; + ld.u8 %r2116, [%rd170+19]; + prmt.b32 %r2117, %r2116, %r2115, 1620; + ld.u8 %r2118, [%rd170+20]; + ld.u8 %r2119, [%rd170+21]; + prmt.b32 %r2120, %r2119, %r2118, 30212; + ld.u8 %r2121, [%rd170+22]; + prmt.b32 %r2122, %r2121, %r2120, 28756; + ld.u8 %r2123, [%rd170+23]; + prmt.b32 %r2124, %r2123, %r2122, 1620; + ld.u8 %r2125, [%rd170+24]; + ld.u8 %r2126, [%rd170+25]; + prmt.b32 %r2127, %r2126, %r2125, 30212; + ld.u8 %r2128, [%rd170+26]; + prmt.b32 %r2129, %r2128, %r2127, 28756; + ld.u8 %r2130, [%rd170+27]; + prmt.b32 %r2131, %r2130, %r2129, 1620; + ld.u8 %r2132, [%rd170+28]; + ld.u8 %r2133, [%rd170+29]; + prmt.b32 %r2134, %r2133, %r2132, 30212; + ld.u8 %r2135, [%rd170+30]; + prmt.b32 %r2136, %r2135, %r2134, 28756; + ld.u8 %r2137, [%rd170+31]; + prmt.b32 %r2138, %r2137, %r2136, 1620; + ld.u8 %r2139, [%rd170+32]; + ld.u8 %r2140, [%rd170+33]; + prmt.b32 %r2141, %r2140, %r2139, 30212; + ld.u8 %r2142, [%rd170+34]; + prmt.b32 %r2143, %r2142, %r2141, 28756; + ld.u8 %r2144, [%rd170+35]; + prmt.b32 %r2145, %r2144, %r2143, 1620; + ld.u8 %r2146, [%rd170+36]; + ld.u8 %r2147, [%rd170+37]; + prmt.b32 %r2148, %r2147, %r2146, 30212; + ld.u8 %r2149, [%rd170+38]; + prmt.b32 %r2150, %r2149, %r2148, 28756; + ld.u8 %r2151, [%rd170+39]; + prmt.b32 %r2152, %r2151, %r2150, 1620; + ld.u8 %r2153, [%rd170+40]; + ld.u8 %r2154, [%rd170+41]; + prmt.b32 %r2155, %r2154, %r2153, 30212; + ld.u8 %r2156, [%rd170+42]; + prmt.b32 %r2157, %r2156, %r2155, 28756; + ld.u8 %r2158, [%rd170+43]; + prmt.b32 %r2159, %r2158, %r2157, 1620; + ld.u8 %r2160, [%rd170+44]; + ld.u8 %r2161, [%rd170+45]; + prmt.b32 %r2162, %r2161, %r2160, 30212; + ld.u8 %r2163, [%rd170+46]; + prmt.b32 %r2164, %r2163, %r2162, 28756; + ld.u8 %r2165, [%rd170+47]; + prmt.b32 %r2166, %r2165, %r2164, 1620; + ld.u8 %r2167, [%rd170+48]; + ld.u8 %r2168, [%rd170+49]; + prmt.b32 %r2169, %r2168, %r2167, 30212; + ld.u8 %r2170, [%rd170+50]; + prmt.b32 %r2171, %r2170, %r2169, 28756; + ld.u8 %r2172, [%rd170+51]; + prmt.b32 %r2173, %r2172, %r2171, 1620; + ld.u8 %r2174, [%rd170+52]; + ld.u8 %r2175, [%rd170+53]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.u8 %r2177, [%rd170+54]; + prmt.b32 %r2178, %r2177, %r2176, 28756; + ld.u8 %r2179, [%rd170+55]; + prmt.b32 %r2180, %r2179, %r2178, 1620; + ld.u8 %r2181, [%rd170+56]; + ld.u8 %r2182, [%rd170+57]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.u8 %r2184, [%rd170+58]; + prmt.b32 %r2185, %r2184, %r2183, 28756; + ld.u8 %r2186, [%rd170+59]; + prmt.b32 %r2187, %r2186, %r2185, 1620; + ld.u8 %r2188, [%rd170+60]; + ld.u8 %r2189, [%rd170+61]; + prmt.b32 %r2190, %r2189, %r2188, 30212; + ld.u8 %r2191, [%rd170+62]; + prmt.b32 %r2192, %r2191, %r2190, 28756; + ld.u8 %r2193, [%rd170+63]; + prmt.b32 %r2194, %r2193, %r2192, 1620; + cvt.u32.u16 %r2195, %rs85; + add.s32 %r2196, %r3964, %r2089; + add.s32 %r2197, %r2196, %r3960; + xor.b32 %r2198, %r2197, %r36; + shf.l.wrap.b32 %r2199, %r2198, %r2198, 16; + add.s32 %r2200, %r2199, 1779033703; + xor.b32 %r2201, %r2200, %r3960; + shf.l.wrap.b32 %r2202, %r2201, %r2201, 20; + add.s32 %r2203, %r2197, %r2096; + add.s32 %r2204, %r2203, %r2202; + xor.b32 %r2205, %r2204, %r2199; + shf.l.wrap.b32 %r2206, %r2205, %r2205, 24; + add.s32 %r2207, %r2206, 
%r2200; + xor.b32 %r2208, %r2207, %r2202; + shf.l.wrap.b32 %r2209, %r2208, %r2208, 25; + add.s32 %r2210, %r3963, %r2103; + add.s32 %r2211, %r2210, %r3959; + xor.b32 %r2212, %r2211, %r37; + shf.l.wrap.b32 %r2213, %r2212, %r2212, 16; + add.s32 %r2214, %r2213, -1150833019; + xor.b32 %r2215, %r2214, %r3959; + shf.l.wrap.b32 %r2216, %r2215, %r2215, 20; + add.s32 %r2217, %r2211, %r2110; + add.s32 %r2218, %r2217, %r2216; + xor.b32 %r2219, %r2218, %r2213; + shf.l.wrap.b32 %r2220, %r2219, %r2219, 24; + add.s32 %r2221, %r2220, %r2214; + xor.b32 %r2222, %r2221, %r2216; + shf.l.wrap.b32 %r2223, %r2222, %r2222, 25; + add.s32 %r2224, %r3962, %r2117; + add.s32 %r2225, %r2224, %r3958; + shr.u32 %r2226, %r2225, 16; + shl.b32 %r2227, %r2225, 16; + xor.b32 %r2228, %r2227, 4194304; + or.b32 %r2229, %r2228, %r2226; + add.s32 %r2230, %r2229, 1013904242; + xor.b32 %r2231, %r2230, %r3958; + shf.l.wrap.b32 %r2232, %r2231, %r2231, 20; + add.s32 %r2233, %r2225, %r2124; + add.s32 %r2234, %r2233, %r2232; + xor.b32 %r2235, %r2234, %r2229; + shf.l.wrap.b32 %r2236, %r2235, %r2235, 24; + add.s32 %r2237, %r2236, %r2230; + xor.b32 %r2238, %r2237, %r2232; + shf.l.wrap.b32 %r2239, %r2238, %r2238, 25; + add.s32 %r2240, %r3961, %r2131; + add.s32 %r2241, %r2240, %r3957; + xor.b32 %r2242, %r2241, %r2195; + shr.u32 %r2243, %r2241, 16; + shl.b32 %r2244, %r2242, 16; + or.b32 %r2245, %r2244, %r2243; + add.s32 %r2246, %r2245, -1521486534; + xor.b32 %r2247, %r2246, %r3957; + shf.l.wrap.b32 %r2248, %r2247, %r2247, 20; + add.s32 %r2249, %r2241, %r2138; + add.s32 %r2250, %r2249, %r2248; + xor.b32 %r2251, %r2250, %r2245; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 24; + add.s32 %r2253, %r2252, %r2246; + xor.b32 %r2254, %r2253, %r2248; + shf.l.wrap.b32 %r2255, %r2254, %r2254, 25; + add.s32 %r2256, %r2204, %r2145; + add.s32 %r2257, %r2256, %r2223; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 16; + add.s32 %r2260, %r2259, %r2237; + xor.b32 %r2261, %r2260, %r2223; + shf.l.wrap.b32 %r2262, %r2261, %r2261, 20; + add.s32 %r2263, %r2257, %r2152; + add.s32 %r2264, %r2263, %r2262; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 24; + add.s32 %r2267, %r2266, %r2260; + xor.b32 %r2268, %r2267, %r2262; + shf.l.wrap.b32 %r2269, %r2268, %r2268, 25; + add.s32 %r2270, %r2218, %r2159; + add.s32 %r2271, %r2270, %r2239; + xor.b32 %r2272, %r2271, %r2206; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 16; + add.s32 %r2274, %r2273, %r2253; + xor.b32 %r2275, %r2274, %r2239; + shf.l.wrap.b32 %r2276, %r2275, %r2275, 20; + add.s32 %r2277, %r2271, %r2166; + add.s32 %r2278, %r2277, %r2276; + xor.b32 %r2279, %r2278, %r2273; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 24; + add.s32 %r2281, %r2280, %r2274; + xor.b32 %r2282, %r2281, %r2276; + shf.l.wrap.b32 %r2283, %r2282, %r2282, 25; + add.s32 %r2284, %r2234, %r2173; + add.s32 %r2285, %r2284, %r2255; + xor.b32 %r2286, %r2285, %r2220; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 16; + add.s32 %r2288, %r2287, %r2207; + xor.b32 %r2289, %r2288, %r2255; + shf.l.wrap.b32 %r2290, %r2289, %r2289, 20; + add.s32 %r2291, %r2285, %r2180; + add.s32 %r2292, %r2291, %r2290; + xor.b32 %r2293, %r2292, %r2287; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 24; + add.s32 %r2295, %r2294, %r2288; + xor.b32 %r2296, %r2295, %r2290; + shf.l.wrap.b32 %r2297, %r2296, %r2296, 25; + add.s32 %r2298, %r2250, %r2187; + add.s32 %r2299, %r2298, %r2209; + xor.b32 %r2300, %r2299, %r2236; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 16; + add.s32 %r2302, %r2301, %r2221; + xor.b32 %r2303, %r2302, %r2209; + shf.l.wrap.b32 
%r2304, %r2303, %r2303, 20; + add.s32 %r2305, %r2299, %r2194; + add.s32 %r2306, %r2305, %r2304; + xor.b32 %r2307, %r2306, %r2301; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 24; + add.s32 %r2309, %r2308, %r2302; + xor.b32 %r2310, %r2309, %r2304; + shf.l.wrap.b32 %r2311, %r2310, %r2310, 25; + add.s32 %r2312, %r2264, %r2103; + add.s32 %r2313, %r2312, %r2311; + xor.b32 %r2314, %r2313, %r2280; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 16; + add.s32 %r2316, %r2315, %r2295; + xor.b32 %r2317, %r2316, %r2311; + shf.l.wrap.b32 %r2318, %r2317, %r2317, 20; + add.s32 %r2319, %r2313, %r2131; + add.s32 %r2320, %r2319, %r2318; + xor.b32 %r2321, %r2320, %r2315; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 24; + add.s32 %r2323, %r2322, %r2316; + xor.b32 %r2324, %r2323, %r2318; + shf.l.wrap.b32 %r2325, %r2324, %r2324, 25; + add.s32 %r2326, %r2278, %r2110; + add.s32 %r2327, %r2326, %r2269; + xor.b32 %r2328, %r2327, %r2294; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 16; + add.s32 %r2330, %r2329, %r2309; + xor.b32 %r2331, %r2330, %r2269; + shf.l.wrap.b32 %r2332, %r2331, %r2331, 20; + add.s32 %r2333, %r2327, %r2159; + add.s32 %r2334, %r2333, %r2332; + xor.b32 %r2335, %r2334, %r2329; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 24; + add.s32 %r2337, %r2336, %r2330; + xor.b32 %r2338, %r2337, %r2332; + shf.l.wrap.b32 %r2339, %r2338, %r2338, 25; + add.s32 %r2340, %r2292, %r2138; + add.s32 %r2341, %r2340, %r2283; + xor.b32 %r2342, %r2341, %r2308; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 16; + add.s32 %r2344, %r2343, %r2267; + xor.b32 %r2345, %r2344, %r2283; + shf.l.wrap.b32 %r2346, %r2345, %r2345, 20; + add.s32 %r2347, %r2341, %r2089; + add.s32 %r2348, %r2347, %r2346; + xor.b32 %r2349, %r2348, %r2343; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 24; + add.s32 %r2351, %r2350, %r2344; + xor.b32 %r2352, %r2351, %r2346; + shf.l.wrap.b32 %r2353, %r2352, %r2352, 25; + add.s32 %r2354, %r2306, %r2117; + add.s32 %r2355, %r2354, %r2297; + xor.b32 %r2356, %r2355, %r2266; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 16; + add.s32 %r2358, %r2357, %r2281; + xor.b32 %r2359, %r2358, %r2297; + shf.l.wrap.b32 %r2360, %r2359, %r2359, 20; + add.s32 %r2361, %r2355, %r2180; + add.s32 %r2362, %r2361, %r2360; + xor.b32 %r2363, %r2362, %r2357; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 24; + add.s32 %r2365, %r2364, %r2358; + xor.b32 %r2366, %r2365, %r2360; + shf.l.wrap.b32 %r2367, %r2366, %r2366, 25; + add.s32 %r2368, %r2320, %r2096; + add.s32 %r2369, %r2368, %r2339; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 16; + add.s32 %r2372, %r2371, %r2351; + xor.b32 %r2373, %r2372, %r2339; + shf.l.wrap.b32 %r2374, %r2373, %r2373, 20; + add.s32 %r2375, %r2369, %r2166; + add.s32 %r2376, %r2375, %r2374; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 24; + add.s32 %r2379, %r2378, %r2372; + xor.b32 %r2380, %r2379, %r2374; + shf.l.wrap.b32 %r2381, %r2380, %r2380, 25; + add.s32 %r2382, %r2334, %r2173; + add.s32 %r2383, %r2382, %r2353; + xor.b32 %r2384, %r2383, %r2322; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 16; + add.s32 %r2386, %r2385, %r2365; + xor.b32 %r2387, %r2386, %r2353; + shf.l.wrap.b32 %r2388, %r2387, %r2387, 20; + add.s32 %r2389, %r2383, %r2124; + add.s32 %r2390, %r2389, %r2388; + xor.b32 %r2391, %r2390, %r2385; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 24; + add.s32 %r2393, %r2392, %r2386; + xor.b32 %r2394, %r2393, %r2388; + shf.l.wrap.b32 %r2395, %r2394, %r2394, 25; + add.s32 %r2396, %r2348, %r2152; + add.s32 %r2397, %r2396, %r2367; + xor.b32 %r2398, %r2397, %r2336; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 16; 
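+// Same unrolled BLAKE3 compression as in the main loop, here inside $L__BB0_24,
+// which consumes the tail chunk 64 bytes at a time; %rs84 sets the CHUNK_START
+// flag (1) only on the first block, mirroring the or.b16 with 1 in the main path.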
+ add.s32 %r2400, %r2399, %r2323; + xor.b32 %r2401, %r2400, %r2367; + shf.l.wrap.b32 %r2402, %r2401, %r2401, 20; + add.s32 %r2403, %r2397, %r2187; + add.s32 %r2404, %r2403, %r2402; + xor.b32 %r2405, %r2404, %r2399; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 24; + add.s32 %r2407, %r2406, %r2400; + xor.b32 %r2408, %r2407, %r2402; + shf.l.wrap.b32 %r2409, %r2408, %r2408, 25; + add.s32 %r2410, %r2362, %r2194; + add.s32 %r2411, %r2410, %r2325; + xor.b32 %r2412, %r2411, %r2350; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 16; + add.s32 %r2414, %r2413, %r2337; + xor.b32 %r2415, %r2414, %r2325; + shf.l.wrap.b32 %r2416, %r2415, %r2415, 20; + add.s32 %r2417, %r2411, %r2145; + add.s32 %r2418, %r2417, %r2416; + xor.b32 %r2419, %r2418, %r2413; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 24; + add.s32 %r2421, %r2420, %r2414; + xor.b32 %r2422, %r2421, %r2416; + shf.l.wrap.b32 %r2423, %r2422, %r2422, 25; + add.s32 %r2424, %r2376, %r2110; + add.s32 %r2425, %r2424, %r2423; + xor.b32 %r2426, %r2425, %r2392; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 16; + add.s32 %r2428, %r2427, %r2407; + xor.b32 %r2429, %r2428, %r2423; + shf.l.wrap.b32 %r2430, %r2429, %r2429, 20; + add.s32 %r2431, %r2425, %r2117; + add.s32 %r2432, %r2431, %r2430; + xor.b32 %r2433, %r2432, %r2427; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 24; + add.s32 %r2435, %r2434, %r2428; + xor.b32 %r2436, %r2435, %r2430; + shf.l.wrap.b32 %r2437, %r2436, %r2436, 25; + add.s32 %r2438, %r2390, %r2159; + add.s32 %r2439, %r2438, %r2381; + xor.b32 %r2440, %r2439, %r2406; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 16; + add.s32 %r2442, %r2441, %r2421; + xor.b32 %r2443, %r2442, %r2381; + shf.l.wrap.b32 %r2444, %r2443, %r2443, 20; + add.s32 %r2445, %r2439, %r2173; + add.s32 %r2446, %r2445, %r2444; + xor.b32 %r2447, %r2446, %r2441; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 24; + add.s32 %r2449, %r2448, %r2442; + xor.b32 %r2450, %r2449, %r2444; + shf.l.wrap.b32 %r2451, %r2450, %r2450, 25; + add.s32 %r2452, %r2404, %r2180; + add.s32 %r2453, %r2452, %r2395; + xor.b32 %r2454, %r2453, %r2420; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 16; + add.s32 %r2456, %r2455, %r2379; + xor.b32 %r2457, %r2456, %r2395; + shf.l.wrap.b32 %r2458, %r2457, %r2457, 20; + add.s32 %r2459, %r2453, %r2103; + add.s32 %r2460, %r2459, %r2458; + xor.b32 %r2461, %r2460, %r2455; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 24; + add.s32 %r2463, %r2462, %r2456; + xor.b32 %r2464, %r2463, %r2458; + shf.l.wrap.b32 %r2465, %r2464, %r2464, 25; + add.s32 %r2466, %r2418, %r2138; + add.s32 %r2467, %r2466, %r2409; + xor.b32 %r2468, %r2467, %r2378; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 16; + add.s32 %r2470, %r2469, %r2393; + xor.b32 %r2471, %r2470, %r2409; + shf.l.wrap.b32 %r2472, %r2471, %r2471, 20; + add.s32 %r2473, %r2467, %r2187; + add.s32 %r2474, %r2473, %r2472; + xor.b32 %r2475, %r2474, %r2469; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 24; + add.s32 %r2477, %r2476, %r2470; + xor.b32 %r2478, %r2477, %r2472; + shf.l.wrap.b32 %r2479, %r2478, %r2478, 25; + add.s32 %r2480, %r2432, %r2131; + add.s32 %r2481, %r2480, %r2451; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 16; + add.s32 %r2484, %r2483, %r2463; + xor.b32 %r2485, %r2484, %r2451; + shf.l.wrap.b32 %r2486, %r2485, %r2485, 20; + add.s32 %r2487, %r2481, %r2124; + add.s32 %r2488, %r2487, %r2486; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 24; + add.s32 %r2491, %r2490, %r2484; + xor.b32 %r2492, %r2491, %r2486; + shf.l.wrap.b32 %r2493, %r2492, %r2492, 25; + add.s32 %r2494, %r2446, %r2152; + add.s32 %r2495, %r2494, 
%r2465; + xor.b32 %r2496, %r2495, %r2434; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 16; + add.s32 %r2498, %r2497, %r2477; + xor.b32 %r2499, %r2498, %r2465; + shf.l.wrap.b32 %r2500, %r2499, %r2499, 20; + add.s32 %r2501, %r2495, %r2089; + add.s32 %r2502, %r2501, %r2500; + xor.b32 %r2503, %r2502, %r2497; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 24; + add.s32 %r2505, %r2504, %r2498; + xor.b32 %r2506, %r2505, %r2500; + shf.l.wrap.b32 %r2507, %r2506, %r2506, 25; + add.s32 %r2508, %r2460, %r2166; + add.s32 %r2509, %r2508, %r2479; + xor.b32 %r2510, %r2509, %r2448; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 16; + add.s32 %r2512, %r2511, %r2435; + xor.b32 %r2513, %r2512, %r2479; + shf.l.wrap.b32 %r2514, %r2513, %r2513, 20; + add.s32 %r2515, %r2509, %r2194; + add.s32 %r2516, %r2515, %r2514; + xor.b32 %r2517, %r2516, %r2511; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 24; + add.s32 %r2519, %r2518, %r2512; + xor.b32 %r2520, %r2519, %r2514; + shf.l.wrap.b32 %r2521, %r2520, %r2520, 25; + add.s32 %r2522, %r2474, %r2145; + add.s32 %r2523, %r2522, %r2437; + xor.b32 %r2524, %r2523, %r2462; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 16; + add.s32 %r2526, %r2525, %r2449; + xor.b32 %r2527, %r2526, %r2437; + shf.l.wrap.b32 %r2528, %r2527, %r2527, 20; + add.s32 %r2529, %r2523, %r2096; + add.s32 %r2530, %r2529, %r2528; + xor.b32 %r2531, %r2530, %r2525; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 24; + add.s32 %r2533, %r2532, %r2526; + xor.b32 %r2534, %r2533, %r2528; + shf.l.wrap.b32 %r2535, %r2534, %r2534, 25; + add.s32 %r2536, %r2488, %r2159; + add.s32 %r2537, %r2536, %r2535; + xor.b32 %r2538, %r2537, %r2504; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 16; + add.s32 %r2540, %r2539, %r2519; + xor.b32 %r2541, %r2540, %r2535; + shf.l.wrap.b32 %r2542, %r2541, %r2541, 20; + add.s32 %r2543, %r2537, %r2138; + add.s32 %r2544, %r2543, %r2542; + xor.b32 %r2545, %r2544, %r2539; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 24; + add.s32 %r2547, %r2546, %r2540; + xor.b32 %r2548, %r2547, %r2542; + shf.l.wrap.b32 %r2549, %r2548, %r2548, 25; + add.s32 %r2550, %r2502, %r2173; + add.s32 %r2551, %r2550, %r2493; + xor.b32 %r2552, %r2551, %r2518; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 16; + add.s32 %r2554, %r2553, %r2533; + xor.b32 %r2555, %r2554, %r2493; + shf.l.wrap.b32 %r2556, %r2555, %r2555, 20; + add.s32 %r2557, %r2551, %r2152; + add.s32 %r2558, %r2557, %r2556; + xor.b32 %r2559, %r2558, %r2553; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 24; + add.s32 %r2561, %r2560, %r2554; + xor.b32 %r2562, %r2561, %r2556; + shf.l.wrap.b32 %r2563, %r2562, %r2562, 25; + add.s32 %r2564, %r2516, %r2187; + add.s32 %r2565, %r2564, %r2507; + xor.b32 %r2566, %r2565, %r2532; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 16; + add.s32 %r2568, %r2567, %r2491; + xor.b32 %r2569, %r2568, %r2507; + shf.l.wrap.b32 %r2570, %r2569, %r2569, 20; + add.s32 %r2571, %r2565, %r2110; + add.s32 %r2572, %r2571, %r2570; + xor.b32 %r2573, %r2572, %r2567; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 24; + add.s32 %r2575, %r2574, %r2568; + xor.b32 %r2576, %r2575, %r2570; + shf.l.wrap.b32 %r2577, %r2576, %r2576, 25; + add.s32 %r2578, %r2530, %r2180; + add.s32 %r2579, %r2578, %r2521; + xor.b32 %r2580, %r2579, %r2490; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 16; + add.s32 %r2582, %r2581, %r2505; + xor.b32 %r2583, %r2582, %r2521; + shf.l.wrap.b32 %r2584, %r2583, %r2583, 20; + add.s32 %r2585, %r2579, %r2194; + add.s32 %r2586, %r2585, %r2584; + xor.b32 %r2587, %r2586, %r2581; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 24; + add.s32 %r2589, %r2588, %r2582; + xor.b32 %r2590, %r2589, %r2584; + shf.l.wrap.b32 
%r2591, %r2590, %r2590, 25; + add.s32 %r2592, %r2544, %r2117; + add.s32 %r2593, %r2592, %r2563; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 16; + add.s32 %r2596, %r2595, %r2575; + xor.b32 %r2597, %r2596, %r2563; + shf.l.wrap.b32 %r2598, %r2597, %r2597, 20; + add.s32 %r2599, %r2593, %r2089; + add.s32 %r2600, %r2599, %r2598; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 24; + add.s32 %r2603, %r2602, %r2596; + xor.b32 %r2604, %r2603, %r2598; + shf.l.wrap.b32 %r2605, %r2604, %r2604, 25; + add.s32 %r2606, %r2558, %r2166; + add.s32 %r2607, %r2606, %r2577; + xor.b32 %r2608, %r2607, %r2546; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 16; + add.s32 %r2610, %r2609, %r2589; + xor.b32 %r2611, %r2610, %r2577; + shf.l.wrap.b32 %r2612, %r2611, %r2611, 20; + add.s32 %r2613, %r2607, %r2103; + add.s32 %r2614, %r2613, %r2612; + xor.b32 %r2615, %r2614, %r2609; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 24; + add.s32 %r2617, %r2616, %r2610; + xor.b32 %r2618, %r2617, %r2612; + shf.l.wrap.b32 %r2619, %r2618, %r2618, 25; + add.s32 %r2620, %r2572, %r2124; + add.s32 %r2621, %r2620, %r2591; + xor.b32 %r2622, %r2621, %r2560; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 16; + add.s32 %r2624, %r2623, %r2547; + xor.b32 %r2625, %r2624, %r2591; + shf.l.wrap.b32 %r2626, %r2625, %r2625, 20; + add.s32 %r2627, %r2621, %r2145; + add.s32 %r2628, %r2627, %r2626; + xor.b32 %r2629, %r2628, %r2623; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 24; + add.s32 %r2631, %r2630, %r2624; + xor.b32 %r2632, %r2631, %r2626; + shf.l.wrap.b32 %r2633, %r2632, %r2632, 25; + add.s32 %r2634, %r2586, %r2096; + add.s32 %r2635, %r2634, %r2549; + xor.b32 %r2636, %r2635, %r2574; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 16; + add.s32 %r2638, %r2637, %r2561; + xor.b32 %r2639, %r2638, %r2549; + shf.l.wrap.b32 %r2640, %r2639, %r2639, 20; + add.s32 %r2641, %r2635, %r2131; + add.s32 %r2642, %r2641, %r2640; + xor.b32 %r2643, %r2642, %r2637; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 24; + add.s32 %r2645, %r2644, %r2638; + xor.b32 %r2646, %r2645, %r2640; + shf.l.wrap.b32 %r2647, %r2646, %r2646, 25; + add.s32 %r2648, %r2600, %r2173; + add.s32 %r2649, %r2648, %r2647; + xor.b32 %r2650, %r2649, %r2616; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 16; + add.s32 %r2652, %r2651, %r2631; + xor.b32 %r2653, %r2652, %r2647; + shf.l.wrap.b32 %r2654, %r2653, %r2653, 20; + add.s32 %r2655, %r2649, %r2180; + add.s32 %r2656, %r2655, %r2654; + xor.b32 %r2657, %r2656, %r2651; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 24; + add.s32 %r2659, %r2658, %r2652; + xor.b32 %r2660, %r2659, %r2654; + shf.l.wrap.b32 %r2661, %r2660, %r2660, 25; + add.s32 %r2662, %r2614, %r2152; + add.s32 %r2663, %r2662, %r2605; + xor.b32 %r2664, %r2663, %r2630; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 16; + add.s32 %r2666, %r2665, %r2645; + xor.b32 %r2667, %r2666, %r2605; + shf.l.wrap.b32 %r2668, %r2667, %r2667, 20; + add.s32 %r2669, %r2663, %r2166; + add.s32 %r2670, %r2669, %r2668; + xor.b32 %r2671, %r2670, %r2665; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 24; + add.s32 %r2673, %r2672, %r2666; + xor.b32 %r2674, %r2673, %r2668; + shf.l.wrap.b32 %r2675, %r2674, %r2674, 25; + add.s32 %r2676, %r2628, %r2194; + add.s32 %r2677, %r2676, %r2619; + xor.b32 %r2678, %r2677, %r2644; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 16; + add.s32 %r2680, %r2679, %r2603; + xor.b32 %r2681, %r2680, %r2619; + shf.l.wrap.b32 %r2682, %r2681, %r2681, 20; + add.s32 %r2683, %r2677, %r2159; + add.s32 %r2684, %r2683, %r2682; + xor.b32 %r2685, %r2684, %r2679; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 24; 
+ add.s32 %r2687, %r2686, %r2680; + xor.b32 %r2688, %r2687, %r2682; + shf.l.wrap.b32 %r2689, %r2688, %r2688, 25; + add.s32 %r2690, %r2642, %r2187; + add.s32 %r2691, %r2690, %r2633; + xor.b32 %r2692, %r2691, %r2602; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 16; + add.s32 %r2694, %r2693, %r2617; + xor.b32 %r2695, %r2694, %r2633; + shf.l.wrap.b32 %r2696, %r2695, %r2695, 20; + add.s32 %r2697, %r2691, %r2145; + add.s32 %r2698, %r2697, %r2696; + xor.b32 %r2699, %r2698, %r2693; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 24; + add.s32 %r2701, %r2700, %r2694; + xor.b32 %r2702, %r2701, %r2696; + shf.l.wrap.b32 %r2703, %r2702, %r2702, 25; + add.s32 %r2704, %r2656, %r2138; + add.s32 %r2705, %r2704, %r2675; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 16; + add.s32 %r2708, %r2707, %r2687; + xor.b32 %r2709, %r2708, %r2675; + shf.l.wrap.b32 %r2710, %r2709, %r2709, 20; + add.s32 %r2711, %r2705, %r2103; + add.s32 %r2712, %r2711, %r2710; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 24; + add.s32 %r2715, %r2714, %r2708; + xor.b32 %r2716, %r2715, %r2710; + shf.l.wrap.b32 %r2717, %r2716, %r2716, 25; + add.s32 %r2718, %r2670, %r2124; + add.s32 %r2719, %r2718, %r2689; + xor.b32 %r2720, %r2719, %r2658; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 16; + add.s32 %r2722, %r2721, %r2701; + xor.b32 %r2723, %r2722, %r2689; + shf.l.wrap.b32 %r2724, %r2723, %r2723, 20; + add.s32 %r2725, %r2719, %r2110; + add.s32 %r2726, %r2725, %r2724; + xor.b32 %r2727, %r2726, %r2721; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 24; + add.s32 %r2729, %r2728, %r2722; + xor.b32 %r2730, %r2729, %r2724; + shf.l.wrap.b32 %r2731, %r2730, %r2730, 25; + add.s32 %r2732, %r2684, %r2089; + add.s32 %r2733, %r2732, %r2703; + xor.b32 %r2734, %r2733, %r2672; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 16; + add.s32 %r2736, %r2735, %r2659; + xor.b32 %r2737, %r2736, %r2703; + shf.l.wrap.b32 %r2738, %r2737, %r2737, 20; + add.s32 %r2739, %r2733, %r2096; + add.s32 %r2740, %r2739, %r2738; + xor.b32 %r2741, %r2740, %r2735; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 24; + add.s32 %r2743, %r2742, %r2736; + xor.b32 %r2744, %r2743, %r2738; + shf.l.wrap.b32 %r2745, %r2744, %r2744, 25; + add.s32 %r2746, %r2698, %r2131; + add.s32 %r2747, %r2746, %r2661; + xor.b32 %r2748, %r2747, %r2686; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 16; + add.s32 %r2750, %r2749, %r2673; + xor.b32 %r2751, %r2750, %r2661; + shf.l.wrap.b32 %r2752, %r2751, %r2751, 20; + add.s32 %r2753, %r2747, %r2117; + add.s32 %r2754, %r2753, %r2752; + xor.b32 %r2755, %r2754, %r2749; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 24; + add.s32 %r2757, %r2756, %r2750; + xor.b32 %r2758, %r2757, %r2752; + shf.l.wrap.b32 %r2759, %r2758, %r2758, 25; + add.s32 %r2760, %r2712, %r2152; + add.s32 %r2761, %r2760, %r2759; + xor.b32 %r2762, %r2761, %r2728; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 16; + add.s32 %r2764, %r2763, %r2743; + xor.b32 %r2765, %r2764, %r2759; + shf.l.wrap.b32 %r2766, %r2765, %r2765, 20; + add.s32 %r2767, %r2761, %r2187; + add.s32 %r2768, %r2767, %r2766; + xor.b32 %r2769, %r2768, %r2763; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 24; + add.s32 %r2771, %r2770, %r2764; + xor.b32 %r2772, %r2771, %r2766; + shf.l.wrap.b32 %r2773, %r2772, %r2772, 25; + add.s32 %r2774, %r2726, %r2166; + add.s32 %r2775, %r2774, %r2717; + xor.b32 %r2776, %r2775, %r2742; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 16; + add.s32 %r2778, %r2777, %r2757; + xor.b32 %r2779, %r2778, %r2717; + shf.l.wrap.b32 %r2780, %r2779, %r2779, 20; + add.s32 %r2781, %r2775, %r2124; + add.s32 %r2782, %r2781, 
%r2780; + xor.b32 %r2783, %r2782, %r2777; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 24; + add.s32 %r2785, %r2784, %r2778; + xor.b32 %r2786, %r2785, %r2780; + shf.l.wrap.b32 %r2787, %r2786, %r2786, 25; + add.s32 %r2788, %r2740, %r2145; + add.s32 %r2789, %r2788, %r2731; + xor.b32 %r2790, %r2789, %r2756; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 16; + add.s32 %r2792, %r2791, %r2715; + xor.b32 %r2793, %r2792, %r2731; + shf.l.wrap.b32 %r2794, %r2793, %r2793, 20; + add.s32 %r2795, %r2789, %r2173; + add.s32 %r2796, %r2795, %r2794; + xor.b32 %r2797, %r2796, %r2791; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 24; + add.s32 %r2799, %r2798, %r2792; + xor.b32 %r2800, %r2799, %r2794; + shf.l.wrap.b32 %r2801, %r2800, %r2800, 25; + add.s32 %r2802, %r2754, %r2194; + add.s32 %r2803, %r2802, %r2745; + xor.b32 %r2804, %r2803, %r2714; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 16; + add.s32 %r2806, %r2805, %r2729; + xor.b32 %r2807, %r2806, %r2745; + shf.l.wrap.b32 %r2808, %r2807, %r2807, 20; + add.s32 %r2809, %r2803, %r2096; + add.s32 %r2810, %r2809, %r2808; + xor.b32 %r2811, %r2810, %r2805; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 24; + add.s32 %r2813, %r2812, %r2806; + xor.b32 %r2814, %r2813, %r2808; + shf.l.wrap.b32 %r2815, %r2814, %r2814, 25; + add.s32 %r2816, %r2768, %r2180; + add.s32 %r2817, %r2816, %r2787; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 16; + add.s32 %r2820, %r2819, %r2799; + xor.b32 %r2821, %r2820, %r2787; + shf.l.wrap.b32 %r2822, %r2821, %r2821, 20; + add.s32 %r2823, %r2817, %r2110; + add.s32 %r2824, %r2823, %r2822; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 24; + add.s32 %r2827, %r2826, %r2820; + xor.b32 %r2828, %r2827, %r2822; + shf.l.wrap.b32 %r2829, %r2828, %r2828, 25; + add.s32 %r2830, %r2782, %r2089; + add.s32 %r2831, %r2830, %r2801; + xor.b32 %r2832, %r2831, %r2770; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 16; + add.s32 %r2834, %r2833, %r2813; + xor.b32 %r2835, %r2834, %r2801; + shf.l.wrap.b32 %r2836, %r2835, %r2835, 20; + add.s32 %r2837, %r2831, %r2159; + add.s32 %r2838, %r2837, %r2836; + xor.b32 %r2839, %r2838, %r2833; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 24; + add.s32 %r2841, %r2840, %r2834; + xor.b32 %r2842, %r2841, %r2836; + shf.l.wrap.b32 %r2843, %r2842, %r2842, 25; + add.s32 %r2844, %r2796, %r2103; + add.s32 %r2845, %r2844, %r2815; + xor.b32 %r2846, %r2845, %r2784; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 16; + add.s32 %r2848, %r2847, %r2771; + xor.b32 %r2849, %r2848, %r2815; + shf.l.wrap.b32 %r2850, %r2849, %r2849, 20; + add.s32 %r2851, %r2845, %r2131; + add.s32 %r2852, %r2851, %r2850; + xor.b32 %r2853, %r2852, %r2847; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 24; + add.s32 %r2855, %r2854, %r2848; + xor.b32 %r2856, %r2855, %r2850; + shf.l.wrap.b32 %r2857, %r2856, %r2856, 25; + add.s32 %r2858, %r2810, %r2117; + add.s32 %r2859, %r2858, %r2773; + xor.b32 %r2860, %r2859, %r2798; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 16; + add.s32 %r2862, %r2861, %r2785; + xor.b32 %r2863, %r2862, %r2773; + shf.l.wrap.b32 %r2864, %r2863, %r2863, 20; + add.s32 %r2865, %r2859, %r2138; + add.s32 %r2866, %r2865, %r2864; + xor.b32 %r2867, %r2866, %r2861; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 24; + add.s32 %r2869, %r2868, %r2862; + xor.b32 %r2870, %r2869, %r2864; + shf.l.wrap.b32 %r2871, %r2870, %r2870, 25; + add.s32 %r2872, %r2824, %r2166; + add.s32 %r2873, %r2872, %r2871; + xor.b32 %r2874, %r2873, %r2840; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 16; + add.s32 %r2876, %r2875, %r2855; + xor.b32 %r2877, %r2876, %r2871; + shf.l.wrap.b32 
%r2878, %r2877, %r2877, 20; + add.s32 %r2879, %r2873, %r2194; + add.s32 %r2880, %r2879, %r2878; + xor.b32 %r2881, %r2880, %r2875; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 24; + add.s32 %r2883, %r2882, %r2876; + xor.b32 %r2884, %r2883, %r2878; + shf.l.wrap.b32 %r2885, %r2884, %r2884, 25; + add.s32 %r2886, %r2838, %r2124; + add.s32 %r2887, %r2886, %r2829; + xor.b32 %r2888, %r2887, %r2854; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 16; + add.s32 %r2890, %r2889, %r2869; + xor.b32 %r2891, %r2890, %r2829; + shf.l.wrap.b32 %r2892, %r2891, %r2891, 20; + add.s32 %r2893, %r2887, %r2089; + add.s32 %r2894, %r2893, %r2892; + xor.b32 %r2895, %r2894, %r2889; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 24; + add.s32 %r2897, %r2896, %r2890; + xor.b32 %r2898, %r2897, %r2892; + shf.l.wrap.b32 %r2899, %r2898, %r2898, 25; + add.s32 %r2900, %r2852, %r2096; + add.s32 %r2901, %r2900, %r2843; + xor.b32 %r2902, %r2901, %r2868; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 16; + add.s32 %r2904, %r2903, %r2827; + xor.b32 %r2905, %r2904, %r2843; + shf.l.wrap.b32 %r2906, %r2905, %r2905, 20; + add.s32 %r2907, %r2901, %r2152; + add.s32 %r2908, %r2907, %r2906; + xor.b32 %r2909, %r2908, %r2903; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 24; + add.s32 %r2911, %r2910, %r2904; + xor.b32 %r2912, %r2911, %r2906; + shf.l.wrap.b32 %r2913, %r2912, %r2912, 25; + add.s32 %r2914, %r2866, %r2145; + add.s32 %r2915, %r2914, %r2857; + xor.b32 %r2916, %r2915, %r2826; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 16; + add.s32 %r2918, %r2917, %r2841; + xor.b32 %r2919, %r2918, %r2857; + shf.l.wrap.b32 %r2920, %r2919, %r2919, 20; + add.s32 %r2921, %r2915, %r2131; + add.s32 %r2922, %r2921, %r2920; + xor.b32 %r2923, %r2922, %r2917; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 24; + add.s32 %r2925, %r2924, %r2918; + xor.b32 %r2926, %r2925, %r2920; + shf.l.wrap.b32 %r2927, %r2926, %r2926, 25; + add.s32 %r2928, %r2880, %r2187; + add.s32 %r2929, %r2928, %r2899; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 16; + add.s32 %r2932, %r2931, %r2911; + xor.b32 %r2933, %r2932, %r2899; + shf.l.wrap.b32 %r2934, %r2933, %r2933, 20; + add.s32 %r2935, %r2929, %r2159; + add.s32 %r2936, %r2935, %r2934; + xor.b32 %r2937, %r2936, %r2931; + shf.l.wrap.b32 %r2938, %r2937, %r2937, 24; + add.s32 %r2939, %r2938, %r2932; + xor.b32 %r2940, %r2939, %r2934; + shf.l.wrap.b32 %r2941, %r2940, %r2940, 25; + add.s32 %r2942, %r2894, %r2103; + add.s32 %r2943, %r2942, %r2913; + xor.b32 %r2944, %r2943, %r2882; + shf.l.wrap.b32 %r2945, %r2944, %r2944, 16; + add.s32 %r2946, %r2945, %r2925; + xor.b32 %r2947, %r2946, %r2913; + shf.l.wrap.b32 %r2948, %r2947, %r2947, 20; + add.s32 %r2949, %r2943, %r2173; + add.s32 %r2950, %r2949, %r2948; + xor.b32 %r2951, %r2950, %r2945; + shf.l.wrap.b32 %r2952, %r2951, %r2951, 24; + add.s32 %r2953, %r2952, %r2946; + xor.b32 %r2954, %r2953, %r2948; + shf.l.wrap.b32 %r2955, %r2954, %r2954, 25; + add.s32 %r2956, %r2908, %r2110; + add.s32 %r2957, %r2956, %r2927; + xor.b32 %r2958, %r2957, %r2896; + shf.l.wrap.b32 %r2959, %r2958, %r2958, 16; + add.s32 %r2960, %r2959, %r2883; + xor.b32 %r2961, %r2960, %r2927; + shf.l.wrap.b32 %r2962, %r2961, %r2961, 20; + add.s32 %r2963, %r2957, %r2117; + add.s32 %r2964, %r2963, %r2962; + xor.b32 %r2965, %r2964, %r2959; + shf.l.wrap.b32 %r2966, %r2965, %r2965, 24; + add.s32 %r2967, %r2966, %r2960; + xor.b32 %r2968, %r2967, %r2962; + shf.l.wrap.b32 %r2969, %r2968, %r2968, 25; + add.s32 %r2970, %r2922, %r2138; + add.s32 %r2971, %r2970, %r2885; + xor.b32 %r2972, %r2971, %r2910; + shf.l.wrap.b32 %r2973, %r2972, %r2972, 16; 
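+ // End of the last unrolled round for this 64-byte block: the eight xor.b32 folds just below feed the 16-word working state forward into the next 8-word chaining value, bump the per-chunk block counter, and branch back to $L__BB0_24 while more than 64 input bytes remain.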
+ add.s32 %r2974, %r2973, %r2897; + xor.b32 %r2975, %r2974, %r2885; + shf.l.wrap.b32 %r2976, %r2975, %r2975, 20; + add.s32 %r2977, %r2971, %r2180; + add.s32 %r2978, %r2977, %r2976; + xor.b32 %r2979, %r2978, %r2973; + shf.l.wrap.b32 %r2980, %r2979, %r2979, 24; + add.s32 %r2981, %r2980, %r2974; + xor.b32 %r2982, %r2981, %r2976; + shf.l.wrap.b32 %r2983, %r2982, %r2982, 25; + xor.b32 %r3964, %r2967, %r2936; + xor.b32 %r3963, %r2981, %r2950; + xor.b32 %r3962, %r2939, %r2964; + xor.b32 %r3961, %r2953, %r2978; + xor.b32 %r3960, %r2983, %r2952; + xor.b32 %r3959, %r2941, %r2966; + xor.b32 %r3958, %r2955, %r2980; + xor.b32 %r3957, %r2969, %r2938; + add.s16 %rs198, %rs198, 1; + st.local.u8 [%rd56+1], %rs198; + add.s64 %rd170, %rd170, 64; + add.s64 %rd171, %rd171, -64; + setp.gt.u64 %p24, %rd171, 64; + @%p24 bra $L__BB0_24; + +$L__BB0_25: + min.u64 %rd63, %rd171, 64; + setp.eq.s64 %p25, %rd63, 0; + mov.u16 %rs200, %rs199; + mov.u16 %rs201, %rs199; + mov.u16 %rs202, %rs199; + mov.u16 %rs203, %rs199; + mov.u16 %rs204, %rs199; + mov.u16 %rs205, %rs199; + mov.u16 %rs206, %rs199; + mov.u16 %rs207, %rs199; + mov.u16 %rs208, %rs199; + mov.u16 %rs209, %rs199; + mov.u16 %rs210, %rs199; + mov.u16 %rs211, %rs199; + mov.u16 %rs212, %rs199; + mov.u16 %rs213, %rs199; + mov.u16 %rs214, %rs199; + mov.u16 %rs215, %rs199; + mov.u16 %rs216, %rs199; + mov.u16 %rs217, %rs199; + mov.u16 %rs218, %rs199; + mov.u16 %rs219, %rs199; + mov.u16 %rs220, %rs199; + mov.u16 %rs221, %rs199; + mov.u16 %rs222, %rs199; + mov.u16 %rs223, %rs199; + mov.u16 %rs224, %rs199; + mov.u16 %rs225, %rs199; + mov.u16 %rs226, %rs199; + mov.u16 %rs227, %rs199; + mov.u16 %rs228, %rs199; + mov.u16 %rs229, %rs199; + mov.u16 %rs230, %rs199; + mov.u16 %rs231, %rs199; + mov.u16 %rs232, %rs199; + @%p25 bra $L__BB0_29; + + mov.u64 %rd172, 0; + +$L__BB0_27: + add.s64 %rd131, %rd170, %rd172; + ld.u8 %rs121, [%rd131]; + add.s64 %rd132, %rd53, %rd172; + st.local.u8 [%rd132], %rs121; + add.s64 %rd172, %rd172, 1; + setp.lt.u64 %p26, %rd172, %rd63; + @%p26 bra $L__BB0_27; + + ld.local.v4.u16 {%rs229, %rs230, %rs231, %rs232}, [%rd53]; + ld.local.v4.u16 {%rs225, %rs226, %rs227, %rs228}, [%rd53+8]; + ld.local.v4.u16 {%rs221, %rs222, %rs223, %rs224}, [%rd53+16]; + ld.local.v4.u16 {%rs217, %rs218, %rs219, %rs220}, [%rd53+24]; + ld.local.v4.u16 {%rs213, %rs214, %rs215, %rs216}, [%rd53+32]; + ld.local.v4.u16 {%rs209, %rs210, %rs211, %rs212}, [%rd53+40]; + ld.local.v4.u16 {%rs205, %rs206, %rs207, %rs208}, [%rd53+48]; + ld.local.v4.u16 {%rs202, %rs203, %rs204, %rs153}, [%rd53+56]; + ld.local.u8 %rs201, [%rd53+61]; + ld.local.v2.u8 {%rs199, %rs200}, [%rd53+62]; + +$L__BB0_29: + ld.param.u64 %rd138, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd137, %rd138; + ld.local.v4.u8 {%rs156, %rs157, %rs158, %rs159}, [%rd53+64]; + cvt.u16.u64 %rs162, %rd63; + add.s16 %rs163, %rs156, %rs162; + st.local.u8 [%rd53+64], %rs163; + setp.eq.s16 %p27, %rs157, 0; + selp.u16 %rs164, 1, 0, %p27; + or.b16 %rs165, %rs158, %rs164; + or.b16 %rs166, %rs165, 2; + shr.u16 %rs167, %rs229, 8; + shr.u16 %rs168, %rs230, 8; + shr.u16 %rs169, %rs231, 8; + shr.u16 %rs170, %rs232, 8; + shr.u16 %rs171, %rs225, 8; + shr.u16 %rs172, %rs226, 8; + shr.u16 %rs173, %rs227, 8; + shr.u16 %rs174, %rs228, 8; + shr.u16 %rs175, %rs221, 8; + shr.u16 %rs176, %rs222, 8; + shr.u16 %rs177, %rs223, 8; + shr.u16 %rs178, %rs224, 8; + shr.u16 %rs179, %rs217, 8; + shr.u16 %rs180, %rs218, 8; + shr.u16 %rs181, %rs219, 8; + shr.u16 %rs182, %rs220, 8; + 
shr.u16 %rs183, %rs213, 8; + shr.u16 %rs184, %rs214, 8; + shr.u16 %rs185, %rs215, 8; + shr.u16 %rs186, %rs216, 8; + shr.u16 %rs187, %rs209, 8; + shr.u16 %rs188, %rs210, 8; + shr.u16 %rs189, %rs211, 8; + shr.u16 %rs190, %rs212, 8; + shr.u16 %rs191, %rs205, 8; + shr.u16 %rs192, %rs206, 8; + shr.u16 %rs193, %rs207, 8; + shr.u16 %rs194, %rs208, 8; + shr.u16 %rs195, %rs202, 8; + shr.u16 %rs196, %rs203, 8; + shl.b64 %rd133, %rd151, 5; + add.s64 %rd134, %rd137, %rd133; + cvt.u32.u16 %r2984, %rs229; + and.b32 %r2985, %r2984, 255; + cvt.u32.u16 %r2986, %rs167; + prmt.b32 %r2987, %r2986, %r2985, 30212; + cvt.u32.u16 %r2988, %rs230; + prmt.b32 %r2989, %r2988, %r2987, 28756; + cvt.u32.u16 %r2990, %rs168; + prmt.b32 %r2991, %r2990, %r2989, 1620; + cvt.u32.u16 %r2992, %rs231; + and.b32 %r2993, %r2992, 255; + cvt.u32.u16 %r2994, %rs169; + prmt.b32 %r2995, %r2994, %r2993, 30212; + cvt.u32.u16 %r2996, %rs232; + prmt.b32 %r2997, %r2996, %r2995, 28756; + cvt.u32.u16 %r2998, %rs170; + prmt.b32 %r2999, %r2998, %r2997, 1620; + cvt.u32.u16 %r3000, %rs225; + and.b32 %r3001, %r3000, 255; + cvt.u32.u16 %r3002, %rs171; + prmt.b32 %r3003, %r3002, %r3001, 30212; + cvt.u32.u16 %r3004, %rs226; + prmt.b32 %r3005, %r3004, %r3003, 28756; + cvt.u32.u16 %r3006, %rs172; + prmt.b32 %r3007, %r3006, %r3005, 1620; + cvt.u32.u16 %r3008, %rs227; + and.b32 %r3009, %r3008, 255; + cvt.u32.u16 %r3010, %rs173; + prmt.b32 %r3011, %r3010, %r3009, 30212; + cvt.u32.u16 %r3012, %rs228; + prmt.b32 %r3013, %r3012, %r3011, 28756; + cvt.u32.u16 %r3014, %rs174; + prmt.b32 %r3015, %r3014, %r3013, 1620; + cvt.u32.u16 %r3016, %rs221; + and.b32 %r3017, %r3016, 255; + cvt.u32.u16 %r3018, %rs175; + prmt.b32 %r3019, %r3018, %r3017, 30212; + cvt.u32.u16 %r3020, %rs222; + prmt.b32 %r3021, %r3020, %r3019, 28756; + cvt.u32.u16 %r3022, %rs176; + prmt.b32 %r3023, %r3022, %r3021, 1620; + cvt.u32.u16 %r3024, %rs223; + and.b32 %r3025, %r3024, 255; + cvt.u32.u16 %r3026, %rs177; + prmt.b32 %r3027, %r3026, %r3025, 30212; + cvt.u32.u16 %r3028, %rs224; + prmt.b32 %r3029, %r3028, %r3027, 28756; + cvt.u32.u16 %r3030, %rs178; + prmt.b32 %r3031, %r3030, %r3029, 1620; + cvt.u32.u16 %r3032, %rs217; + and.b32 %r3033, %r3032, 255; + cvt.u32.u16 %r3034, %rs179; + prmt.b32 %r3035, %r3034, %r3033, 30212; + cvt.u32.u16 %r3036, %rs218; + prmt.b32 %r3037, %r3036, %r3035, 28756; + cvt.u32.u16 %r3038, %rs180; + prmt.b32 %r3039, %r3038, %r3037, 1620; + cvt.u32.u16 %r3040, %rs219; + and.b32 %r3041, %r3040, 255; + cvt.u32.u16 %r3042, %rs181; + prmt.b32 %r3043, %r3042, %r3041, 30212; + cvt.u32.u16 %r3044, %rs220; + prmt.b32 %r3045, %r3044, %r3043, 28756; + cvt.u32.u16 %r3046, %rs182; + prmt.b32 %r3047, %r3046, %r3045, 1620; + cvt.u32.u16 %r3048, %rs213; + and.b32 %r3049, %r3048, 255; + cvt.u32.u16 %r3050, %rs183; + prmt.b32 %r3051, %r3050, %r3049, 30212; + cvt.u32.u16 %r3052, %rs214; + prmt.b32 %r3053, %r3052, %r3051, 28756; + cvt.u32.u16 %r3054, %rs184; + prmt.b32 %r3055, %r3054, %r3053, 1620; + cvt.u32.u16 %r3056, %rs215; + and.b32 %r3057, %r3056, 255; + cvt.u32.u16 %r3058, %rs185; + prmt.b32 %r3059, %r3058, %r3057, 30212; + cvt.u32.u16 %r3060, %rs216; + prmt.b32 %r3061, %r3060, %r3059, 28756; + cvt.u32.u16 %r3062, %rs186; + prmt.b32 %r3063, %r3062, %r3061, 1620; + cvt.u32.u16 %r3064, %rs209; + and.b32 %r3065, %r3064, 255; + cvt.u32.u16 %r3066, %rs187; + prmt.b32 %r3067, %r3066, %r3065, 30212; + cvt.u32.u16 %r3068, %rs210; + prmt.b32 %r3069, %r3068, %r3067, 28756; + cvt.u32.u16 %r3070, %rs188; + prmt.b32 %r3071, %r3070, %r3069, 1620; + cvt.u32.u16 %r3072, %rs211; + and.b32 %r3073, 
%r3072, 255; + cvt.u32.u16 %r3074, %rs189; + prmt.b32 %r3075, %r3074, %r3073, 30212; + cvt.u32.u16 %r3076, %rs212; + prmt.b32 %r3077, %r3076, %r3075, 28756; + cvt.u32.u16 %r3078, %rs190; + prmt.b32 %r3079, %r3078, %r3077, 1620; + cvt.u32.u16 %r3080, %rs205; + and.b32 %r3081, %r3080, 255; + cvt.u32.u16 %r3082, %rs191; + prmt.b32 %r3083, %r3082, %r3081, 30212; + cvt.u32.u16 %r3084, %rs206; + prmt.b32 %r3085, %r3084, %r3083, 28756; + cvt.u32.u16 %r3086, %rs192; + prmt.b32 %r3087, %r3086, %r3085, 1620; + cvt.u32.u16 %r3088, %rs207; + and.b32 %r3089, %r3088, 255; + cvt.u32.u16 %r3090, %rs193; + prmt.b32 %r3091, %r3090, %r3089, 30212; + cvt.u32.u16 %r3092, %rs208; + prmt.b32 %r3093, %r3092, %r3091, 28756; + cvt.u32.u16 %r3094, %rs194; + prmt.b32 %r3095, %r3094, %r3093, 1620; + cvt.u32.u16 %r3096, %rs202; + and.b32 %r3097, %r3096, 255; + cvt.u32.u16 %r3098, %rs195; + prmt.b32 %r3099, %r3098, %r3097, 30212; + cvt.u32.u16 %r3100, %rs203; + prmt.b32 %r3101, %r3100, %r3099, 28756; + cvt.u32.u16 %r3102, %rs196; + prmt.b32 %r3103, %r3102, %r3101, 1620; + cvt.u32.u16 %r3104, %rs204; + and.b32 %r3105, %r3104, 255; + cvt.u32.u16 %r3106, %rs201; + prmt.b32 %r3107, %r3106, %r3105, 30212; + cvt.u32.u16 %r3108, %rs199; + shl.b32 %r3109, %r3108, 16; + and.b32 %r3110, %r3109, 16711680; + or.b32 %r3111, %r3107, %r3110; + cvt.u32.u16 %r3112, %rs200; + shl.b32 %r3113, %r3112, 24; + or.b32 %r3114, %r3111, %r3113; + cvt.u32.u16 %r3115, %rs163; + and.b32 %r3116, %r3115, 255; + cvt.u32.u16 %r3117, %rs166; + and.b32 %r3118, %r3117, 255; + add.s32 %r3119, %r3960, %r3964; + add.s32 %r3120, %r3119, %r2991; + xor.b32 %r3121, %r3120, %r36; + shf.l.wrap.b32 %r3122, %r3121, %r3121, 16; + add.s32 %r3123, %r3122, 1779033703; + xor.b32 %r3124, %r3123, %r3960; + shf.l.wrap.b32 %r3125, %r3124, %r3124, 20; + add.s32 %r3126, %r2999, %r3120; + add.s32 %r3127, %r3126, %r3125; + xor.b32 %r3128, %r3127, %r3122; + shf.l.wrap.b32 %r3129, %r3128, %r3128, 24; + add.s32 %r3130, %r3129, %r3123; + xor.b32 %r3131, %r3130, %r3125; + shf.l.wrap.b32 %r3132, %r3131, %r3131, 25; + add.s32 %r3133, %r3959, %r3963; + add.s32 %r3134, %r3133, %r3007; + xor.b32 %r3135, %r3134, %r37; + shf.l.wrap.b32 %r3136, %r3135, %r3135, 16; + add.s32 %r3137, %r3136, -1150833019; + xor.b32 %r3138, %r3137, %r3959; + shf.l.wrap.b32 %r3139, %r3138, %r3138, 20; + add.s32 %r3140, %r3015, %r3134; + add.s32 %r3141, %r3140, %r3139; + xor.b32 %r3142, %r3141, %r3136; + shf.l.wrap.b32 %r3143, %r3142, %r3142, 24; + add.s32 %r3144, %r3143, %r3137; + xor.b32 %r3145, %r3144, %r3139; + shf.l.wrap.b32 %r3146, %r3145, %r3145, 25; + add.s32 %r3147, %r3958, %r3962; + add.s32 %r3148, %r3147, %r3023; + xor.b32 %r3149, %r3148, %r3116; + shr.u32 %r3150, %r3148, 16; + shl.b32 %r3151, %r3149, 16; + or.b32 %r3152, %r3151, %r3150; + add.s32 %r3153, %r3152, 1013904242; + xor.b32 %r3154, %r3153, %r3958; + shf.l.wrap.b32 %r3155, %r3154, %r3154, 20; + add.s32 %r3156, %r3031, %r3148; + add.s32 %r3157, %r3156, %r3155; + xor.b32 %r3158, %r3157, %r3152; + shf.l.wrap.b32 %r3159, %r3158, %r3158, 24; + add.s32 %r3160, %r3159, %r3153; + xor.b32 %r3161, %r3160, %r3155; + shf.l.wrap.b32 %r3162, %r3161, %r3161, 25; + add.s32 %r3163, %r3957, %r3961; + add.s32 %r3164, %r3163, %r3039; + xor.b32 %r3165, %r3164, %r3118; + shr.u32 %r3166, %r3164, 16; + shl.b32 %r3167, %r3165, 16; + or.b32 %r3168, %r3167, %r3166; + add.s32 %r3169, %r3168, -1521486534; + xor.b32 %r3170, %r3169, %r3957; + shf.l.wrap.b32 %r3171, %r3170, %r3170, 20; + add.s32 %r3172, %r3047, %r3164; + add.s32 %r3173, %r3172, %r3171; + xor.b32 %r3174, 
%r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 24; + add.s32 %r3176, %r3175, %r3169; + xor.b32 %r3177, %r3176, %r3171; + shf.l.wrap.b32 %r3178, %r3177, %r3177, 25; + add.s32 %r3179, %r3146, %r3127; + add.s32 %r3180, %r3179, %r3055; + xor.b32 %r3181, %r3175, %r3180; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 16; + add.s32 %r3183, %r3182, %r3160; + xor.b32 %r3184, %r3183, %r3146; + shf.l.wrap.b32 %r3185, %r3184, %r3184, 20; + add.s32 %r3186, %r3063, %r3180; + add.s32 %r3187, %r3186, %r3185; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 24; + add.s32 %r3190, %r3189, %r3183; + xor.b32 %r3191, %r3190, %r3185; + shf.l.wrap.b32 %r3192, %r3191, %r3191, 25; + add.s32 %r3193, %r3162, %r3141; + add.s32 %r3194, %r3193, %r3071; + xor.b32 %r3195, %r3194, %r3129; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 16; + add.s32 %r3197, %r3196, %r3176; + xor.b32 %r3198, %r3197, %r3162; + shf.l.wrap.b32 %r3199, %r3198, %r3198, 20; + add.s32 %r3200, %r3079, %r3194; + add.s32 %r3201, %r3200, %r3199; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 24; + add.s32 %r3204, %r3203, %r3197; + xor.b32 %r3205, %r3204, %r3199; + shf.l.wrap.b32 %r3206, %r3205, %r3205, 25; + add.s32 %r3207, %r3178, %r3157; + add.s32 %r3208, %r3207, %r3087; + xor.b32 %r3209, %r3208, %r3143; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 16; + add.s32 %r3211, %r3210, %r3130; + xor.b32 %r3212, %r3211, %r3178; + shf.l.wrap.b32 %r3213, %r3212, %r3212, 20; + add.s32 %r3214, %r3095, %r3208; + add.s32 %r3215, %r3214, %r3213; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 24; + add.s32 %r3218, %r3217, %r3211; + xor.b32 %r3219, %r3218, %r3213; + shf.l.wrap.b32 %r3220, %r3219, %r3219, 25; + add.s32 %r3221, %r3173, %r3132; + add.s32 %r3222, %r3221, %r3103; + xor.b32 %r3223, %r3222, %r3159; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 16; + add.s32 %r3225, %r3224, %r3144; + xor.b32 %r3226, %r3225, %r3132; + shf.l.wrap.b32 %r3227, %r3226, %r3226, 20; + add.s32 %r3228, %r3114, %r3222; + add.s32 %r3229, %r3228, %r3227; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 24; + add.s32 %r3232, %r3231, %r3225; + xor.b32 %r3233, %r3232, %r3227; + shf.l.wrap.b32 %r3234, %r3233, %r3233, 25; + add.s32 %r3235, %r3187, %r3007; + add.s32 %r3236, %r3235, %r3234; + xor.b32 %r3237, %r3236, %r3203; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 16; + add.s32 %r3239, %r3238, %r3218; + xor.b32 %r3240, %r3239, %r3234; + shf.l.wrap.b32 %r3241, %r3240, %r3240, 20; + add.s32 %r3242, %r3236, %r3039; + add.s32 %r3243, %r3242, %r3241; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 24; + add.s32 %r3246, %r3245, %r3239; + xor.b32 %r3247, %r3246, %r3241; + shf.l.wrap.b32 %r3248, %r3247, %r3247, 25; + add.s32 %r3249, %r3201, %r3015; + add.s32 %r3250, %r3249, %r3192; + xor.b32 %r3251, %r3217, %r3250; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 16; + add.s32 %r3253, %r3232, %r3252; + xor.b32 %r3254, %r3253, %r3192; + shf.l.wrap.b32 %r3255, %r3254, %r3254, 20; + add.s32 %r3256, %r3250, %r3071; + add.s32 %r3257, %r3256, %r3255; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 24; + add.s32 %r3260, %r3259, %r3253; + xor.b32 %r3261, %r3260, %r3255; + shf.l.wrap.b32 %r3262, %r3261, %r3261, 25; + add.s32 %r3263, %r3206, %r3047; + add.s32 %r3264, %r3263, %r3215; + xor.b32 %r3265, %r3231, %r3264; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 16; + add.s32 %r3267, %r3266, %r3190; + xor.b32 %r3268, %r3267, %r3206; + shf.l.wrap.b32 %r3269, %r3268, %r3268, 20; 
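+ // Compression of the final short block: %r3116 carries the tail length and %r3118 the flag byte (or'ed with 2, evidently BLAKE3's CHUNK_END, with bit 0 set via selp when no prior block was compressed, i.e. CHUNK_START), xor'ed into the last two state columns alongside the re-injected IV constants 1779033703, -1150833019, 1013904242 and -1521486534.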
+ add.s32 %r3270, %r3264, %r2991; + add.s32 %r3271, %r3270, %r3269; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 24; + add.s32 %r3274, %r3273, %r3267; + xor.b32 %r3275, %r3274, %r3269; + shf.l.wrap.b32 %r3276, %r3275, %r3275, 25; + add.s32 %r3277, %r3220, %r3023; + add.s32 %r3278, %r3277, %r3229; + xor.b32 %r3279, %r3278, %r3189; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 16; + add.s32 %r3281, %r3280, %r3204; + xor.b32 %r3282, %r3281, %r3220; + shf.l.wrap.b32 %r3283, %r3282, %r3282, 20; + add.s32 %r3284, %r3278, %r3095; + add.s32 %r3285, %r3284, %r3283; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 24; + add.s32 %r3288, %r3287, %r3281; + xor.b32 %r3289, %r3288, %r3283; + shf.l.wrap.b32 %r3290, %r3289, %r3289, 25; + add.s32 %r3291, %r3262, %r2999; + add.s32 %r3292, %r3291, %r3243; + xor.b32 %r3293, %r3292, %r3287; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 16; + add.s32 %r3295, %r3294, %r3274; + xor.b32 %r3296, %r3295, %r3262; + shf.l.wrap.b32 %r3297, %r3296, %r3296, 20; + add.s32 %r3298, %r3292, %r3079; + add.s32 %r3299, %r3298, %r3297; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 24; + add.s32 %r3302, %r3301, %r3295; + xor.b32 %r3303, %r3302, %r3297; + shf.l.wrap.b32 %r3304, %r3303, %r3303, 25; + add.s32 %r3305, %r3257, %r3087; + add.s32 %r3306, %r3305, %r3276; + xor.b32 %r3307, %r3245, %r3306; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 16; + add.s32 %r3309, %r3308, %r3288; + xor.b32 %r3310, %r3309, %r3276; + shf.l.wrap.b32 %r3311, %r3310, %r3310, 20; + add.s32 %r3312, %r3306, %r3031; + add.s32 %r3313, %r3312, %r3311; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 24; + add.s32 %r3316, %r3315, %r3309; + xor.b32 %r3317, %r3316, %r3311; + shf.l.wrap.b32 %r3318, %r3317, %r3317, 25; + add.s32 %r3319, %r3271, %r3063; + add.s32 %r3320, %r3319, %r3290; + xor.b32 %r3321, %r3320, %r3259; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 16; + add.s32 %r3323, %r3322, %r3246; + xor.b32 %r3324, %r3323, %r3290; + shf.l.wrap.b32 %r3325, %r3324, %r3324, 20; + add.s32 %r3326, %r3320, %r3103; + add.s32 %r3327, %r3326, %r3325; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 24; + add.s32 %r3330, %r3329, %r3323; + xor.b32 %r3331, %r3330, %r3325; + shf.l.wrap.b32 %r3332, %r3331, %r3331, 25; + add.s32 %r3333, %r3285, %r3114; + add.s32 %r3334, %r3333, %r3248; + xor.b32 %r3335, %r3334, %r3273; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 16; + add.s32 %r3337, %r3336, %r3260; + xor.b32 %r3338, %r3337, %r3248; + shf.l.wrap.b32 %r3339, %r3338, %r3338, 20; + add.s32 %r3340, %r3334, %r3055; + add.s32 %r3341, %r3340, %r3339; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 24; + add.s32 %r3344, %r3343, %r3337; + xor.b32 %r3345, %r3344, %r3339; + shf.l.wrap.b32 %r3346, %r3345, %r3345, 25; + add.s32 %r3347, %r3299, %r3015; + add.s32 %r3348, %r3347, %r3346; + xor.b32 %r3349, %r3348, %r3315; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 16; + add.s32 %r3351, %r3350, %r3330; + xor.b32 %r3352, %r3351, %r3346; + shf.l.wrap.b32 %r3353, %r3352, %r3352, 20; + add.s32 %r3354, %r3348, %r3023; + add.s32 %r3355, %r3354, %r3353; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 24; + add.s32 %r3358, %r3357, %r3351; + xor.b32 %r3359, %r3358, %r3353; + shf.l.wrap.b32 %r3360, %r3359, %r3359, 25; + add.s32 %r3361, %r3313, %r3071; + add.s32 %r3362, %r3361, %r3304; + xor.b32 %r3363, %r3362, %r3329; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 16; + add.s32 %r3365, %r3364, 
%r3344; + xor.b32 %r3366, %r3365, %r3304; + shf.l.wrap.b32 %r3367, %r3366, %r3366, 20; + add.s32 %r3368, %r3362, %r3087; + add.s32 %r3369, %r3368, %r3367; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 24; + add.s32 %r3372, %r3371, %r3365; + xor.b32 %r3373, %r3372, %r3367; + shf.l.wrap.b32 %r3374, %r3373, %r3373, 25; + add.s32 %r3375, %r3327, %r3095; + add.s32 %r3376, %r3375, %r3318; + xor.b32 %r3377, %r3343, %r3376; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 16; + add.s32 %r3379, %r3378, %r3302; + xor.b32 %r3380, %r3379, %r3318; + shf.l.wrap.b32 %r3381, %r3380, %r3380, 20; + add.s32 %r3382, %r3376, %r3007; + add.s32 %r3383, %r3382, %r3381; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 24; + add.s32 %r3386, %r3385, %r3379; + xor.b32 %r3387, %r3386, %r3381; + shf.l.wrap.b32 %r3388, %r3387, %r3387, 25; + add.s32 %r3389, %r3332, %r3047; + add.s32 %r3390, %r3389, %r3341; + xor.b32 %r3391, %r3390, %r3301; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 16; + add.s32 %r3393, %r3392, %r3316; + xor.b32 %r3394, %r3393, %r3332; + shf.l.wrap.b32 %r3395, %r3394, %r3394, 20; + add.s32 %r3396, %r3390, %r3103; + add.s32 %r3397, %r3396, %r3395; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 24; + add.s32 %r3400, %r3399, %r3393; + xor.b32 %r3401, %r3400, %r3395; + shf.l.wrap.b32 %r3402, %r3401, %r3401, 25; + add.s32 %r3403, %r3374, %r3039; + add.s32 %r3404, %r3403, %r3355; + xor.b32 %r3405, %r3404, %r3399; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 16; + add.s32 %r3407, %r3406, %r3386; + xor.b32 %r3408, %r3407, %r3374; + shf.l.wrap.b32 %r3409, %r3408, %r3408, 20; + add.s32 %r3410, %r3404, %r3031; + add.s32 %r3411, %r3410, %r3409; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 24; + add.s32 %r3414, %r3413, %r3407; + xor.b32 %r3415, %r3414, %r3409; + shf.l.wrap.b32 %r3416, %r3415, %r3415, 25; + add.s32 %r3417, %r3369, %r3063; + add.s32 %r3418, %r3417, %r3388; + xor.b32 %r3419, %r3357, %r3418; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 16; + add.s32 %r3421, %r3420, %r3400; + xor.b32 %r3422, %r3421, %r3388; + shf.l.wrap.b32 %r3423, %r3422, %r3422, 20; + add.s32 %r3424, %r3418, %r2991; + add.s32 %r3425, %r3424, %r3423; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 24; + add.s32 %r3428, %r3427, %r3421; + xor.b32 %r3429, %r3428, %r3423; + shf.l.wrap.b32 %r3430, %r3429, %r3429, 25; + add.s32 %r3431, %r3383, %r3079; + add.s32 %r3432, %r3431, %r3402; + xor.b32 %r3433, %r3432, %r3371; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 16; + add.s32 %r3435, %r3434, %r3358; + xor.b32 %r3436, %r3435, %r3402; + shf.l.wrap.b32 %r3437, %r3436, %r3436, 20; + add.s32 %r3438, %r3432, %r3114; + add.s32 %r3439, %r3438, %r3437; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 24; + add.s32 %r3442, %r3441, %r3435; + xor.b32 %r3443, %r3442, %r3437; + shf.l.wrap.b32 %r3444, %r3443, %r3443, 25; + add.s32 %r3445, %r3397, %r3055; + add.s32 %r3446, %r3445, %r3360; + xor.b32 %r3447, %r3446, %r3385; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 16; + add.s32 %r3449, %r3448, %r3372; + xor.b32 %r3450, %r3449, %r3360; + shf.l.wrap.b32 %r3451, %r3450, %r3450, 20; + add.s32 %r3452, %r3446, %r2999; + add.s32 %r3453, %r3452, %r3451; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 24; + add.s32 %r3456, %r3455, %r3449; + xor.b32 %r3457, %r3456, %r3451; + shf.l.wrap.b32 %r3458, %r3457, %r3457, 25; + add.s32 %r3459, %r3411, %r3071; + add.s32 %r3460, %r3459, %r3458; + xor.b32 %r3461, 
%r3460, %r3427; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 16; + add.s32 %r3463, %r3462, %r3442; + xor.b32 %r3464, %r3463, %r3458; + shf.l.wrap.b32 %r3465, %r3464, %r3464, 20; + add.s32 %r3466, %r3460, %r3047; + add.s32 %r3467, %r3466, %r3465; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 24; + add.s32 %r3470, %r3469, %r3463; + xor.b32 %r3471, %r3470, %r3465; + shf.l.wrap.b32 %r3472, %r3471, %r3471, 25; + add.s32 %r3473, %r3425, %r3087; + add.s32 %r3474, %r3473, %r3416; + xor.b32 %r3475, %r3474, %r3441; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 16; + add.s32 %r3477, %r3476, %r3456; + xor.b32 %r3478, %r3477, %r3416; + shf.l.wrap.b32 %r3479, %r3478, %r3478, 20; + add.s32 %r3480, %r3474, %r3063; + add.s32 %r3481, %r3480, %r3479; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 24; + add.s32 %r3484, %r3483, %r3477; + xor.b32 %r3485, %r3484, %r3479; + shf.l.wrap.b32 %r3486, %r3485, %r3485, 25; + add.s32 %r3487, %r3439, %r3103; + add.s32 %r3488, %r3487, %r3430; + xor.b32 %r3489, %r3455, %r3488; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 16; + add.s32 %r3491, %r3490, %r3414; + xor.b32 %r3492, %r3491, %r3430; + shf.l.wrap.b32 %r3493, %r3492, %r3492, 20; + add.s32 %r3494, %r3488, %r3015; + add.s32 %r3495, %r3494, %r3493; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 24; + add.s32 %r3498, %r3497, %r3491; + xor.b32 %r3499, %r3498, %r3493; + shf.l.wrap.b32 %r3500, %r3499, %r3499, 25; + add.s32 %r3501, %r3444, %r3095; + add.s32 %r3502, %r3501, %r3453; + xor.b32 %r3503, %r3502, %r3413; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 16; + add.s32 %r3505, %r3504, %r3428; + xor.b32 %r3506, %r3505, %r3444; + shf.l.wrap.b32 %r3507, %r3506, %r3506, 20; + add.s32 %r3508, %r3502, %r3114; + add.s32 %r3509, %r3508, %r3507; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 24; + add.s32 %r3512, %r3511, %r3505; + xor.b32 %r3513, %r3512, %r3507; + shf.l.wrap.b32 %r3514, %r3513, %r3513, 25; + add.s32 %r3515, %r3486, %r3023; + add.s32 %r3516, %r3515, %r3467; + xor.b32 %r3517, %r3516, %r3511; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 16; + add.s32 %r3519, %r3518, %r3498; + xor.b32 %r3520, %r3519, %r3486; + shf.l.wrap.b32 %r3521, %r3520, %r3520, 20; + add.s32 %r3522, %r3516, %r2991; + add.s32 %r3523, %r3522, %r3521; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 24; + add.s32 %r3526, %r3525, %r3519; + xor.b32 %r3527, %r3526, %r3521; + shf.l.wrap.b32 %r3528, %r3527, %r3527, 25; + add.s32 %r3529, %r3481, %r3079; + add.s32 %r3530, %r3529, %r3500; + xor.b32 %r3531, %r3469, %r3530; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 16; + add.s32 %r3533, %r3532, %r3512; + xor.b32 %r3534, %r3533, %r3500; + shf.l.wrap.b32 %r3535, %r3534, %r3534, 20; + add.s32 %r3536, %r3530, %r3007; + add.s32 %r3537, %r3536, %r3535; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 24; + add.s32 %r3540, %r3539, %r3533; + xor.b32 %r3541, %r3540, %r3535; + shf.l.wrap.b32 %r3542, %r3541, %r3541, 25; + add.s32 %r3543, %r3495, %r3031; + add.s32 %r3544, %r3543, %r3514; + xor.b32 %r3545, %r3544, %r3483; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 16; + add.s32 %r3547, %r3546, %r3470; + xor.b32 %r3548, %r3547, %r3514; + shf.l.wrap.b32 %r3549, %r3548, %r3548, 20; + add.s32 %r3550, %r3544, %r3055; + add.s32 %r3551, %r3550, %r3549; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 24; + add.s32 %r3554, %r3553, %r3547; + xor.b32 %r3555, %r3554, %r3549; + shf.l.wrap.b32 %r3556, %r3555, %r3555, 25; 
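+ // Each unrolled round is four column steps followed by four diagonal steps over the 4x4 word state; the packed message words %r2991..%r3114 rotate through the operand slots per BLAKE3's fixed seven-round message schedule.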
+ add.s32 %r3557, %r3509, %r2999; + add.s32 %r3558, %r3557, %r3472; + xor.b32 %r3559, %r3558, %r3497; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 16; + add.s32 %r3561, %r3560, %r3484; + xor.b32 %r3562, %r3561, %r3472; + shf.l.wrap.b32 %r3563, %r3562, %r3562, 20; + add.s32 %r3564, %r3558, %r3039; + add.s32 %r3565, %r3564, %r3563; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 24; + add.s32 %r3568, %r3567, %r3561; + xor.b32 %r3569, %r3568, %r3563; + shf.l.wrap.b32 %r3570, %r3569, %r3569, 25; + add.s32 %r3571, %r3523, %r3087; + add.s32 %r3572, %r3571, %r3570; + xor.b32 %r3573, %r3572, %r3539; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 16; + add.s32 %r3575, %r3574, %r3554; + xor.b32 %r3576, %r3575, %r3570; + shf.l.wrap.b32 %r3577, %r3576, %r3576, 20; + add.s32 %r3578, %r3572, %r3095; + add.s32 %r3579, %r3578, %r3577; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 24; + add.s32 %r3582, %r3581, %r3575; + xor.b32 %r3583, %r3582, %r3577; + shf.l.wrap.b32 %r3584, %r3583, %r3583, 25; + add.s32 %r3585, %r3537, %r3063; + add.s32 %r3586, %r3585, %r3528; + xor.b32 %r3587, %r3586, %r3553; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 16; + add.s32 %r3589, %r3588, %r3568; + xor.b32 %r3590, %r3589, %r3528; + shf.l.wrap.b32 %r3591, %r3590, %r3590, 20; + add.s32 %r3592, %r3586, %r3079; + add.s32 %r3593, %r3592, %r3591; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 24; + add.s32 %r3596, %r3595, %r3589; + xor.b32 %r3597, %r3596, %r3591; + shf.l.wrap.b32 %r3598, %r3597, %r3597, 25; + add.s32 %r3599, %r3551, %r3114; + add.s32 %r3600, %r3599, %r3542; + xor.b32 %r3601, %r3567, %r3600; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 16; + add.s32 %r3603, %r3602, %r3526; + xor.b32 %r3604, %r3603, %r3542; + shf.l.wrap.b32 %r3605, %r3604, %r3604, 20; + add.s32 %r3606, %r3600, %r3071; + add.s32 %r3607, %r3606, %r3605; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 24; + add.s32 %r3610, %r3609, %r3603; + xor.b32 %r3611, %r3610, %r3605; + shf.l.wrap.b32 %r3612, %r3611, %r3611, 25; + add.s32 %r3613, %r3556, %r3103; + add.s32 %r3614, %r3613, %r3565; + xor.b32 %r3615, %r3614, %r3525; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 16; + add.s32 %r3617, %r3616, %r3540; + xor.b32 %r3618, %r3617, %r3556; + shf.l.wrap.b32 %r3619, %r3618, %r3618, 20; + add.s32 %r3620, %r3614, %r3055; + add.s32 %r3621, %r3620, %r3619; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 24; + add.s32 %r3624, %r3623, %r3617; + xor.b32 %r3625, %r3624, %r3619; + shf.l.wrap.b32 %r3626, %r3625, %r3625, 25; + add.s32 %r3627, %r3598, %r3047; + add.s32 %r3628, %r3627, %r3579; + xor.b32 %r3629, %r3628, %r3623; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 16; + add.s32 %r3631, %r3630, %r3610; + xor.b32 %r3632, %r3631, %r3598; + shf.l.wrap.b32 %r3633, %r3632, %r3632, 20; + add.s32 %r3634, %r3628, %r3007; + add.s32 %r3635, %r3634, %r3633; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 24; + add.s32 %r3638, %r3637, %r3631; + xor.b32 %r3639, %r3638, %r3633; + shf.l.wrap.b32 %r3640, %r3639, %r3639, 25; + add.s32 %r3641, %r3593, %r3031; + add.s32 %r3642, %r3641, %r3612; + xor.b32 %r3643, %r3581, %r3642; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 16; + add.s32 %r3645, %r3644, %r3624; + xor.b32 %r3646, %r3645, %r3612; + shf.l.wrap.b32 %r3647, %r3646, %r3646, 20; + add.s32 %r3648, %r3642, %r3015; + add.s32 %r3649, %r3648, %r3647; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 24; + add.s32 %r3652, %r3651, 
%r3645; + xor.b32 %r3653, %r3652, %r3647; + shf.l.wrap.b32 %r3654, %r3653, %r3653, 25; + add.s32 %r3655, %r3607, %r2991; + add.s32 %r3656, %r3655, %r3626; + xor.b32 %r3657, %r3656, %r3595; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 16; + add.s32 %r3659, %r3658, %r3582; + xor.b32 %r3660, %r3659, %r3626; + shf.l.wrap.b32 %r3661, %r3660, %r3660, 20; + add.s32 %r3662, %r3656, %r2999; + add.s32 %r3663, %r3662, %r3661; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 24; + add.s32 %r3666, %r3665, %r3659; + xor.b32 %r3667, %r3666, %r3661; + shf.l.wrap.b32 %r3668, %r3667, %r3667, 25; + add.s32 %r3669, %r3621, %r3039; + add.s32 %r3670, %r3669, %r3584; + xor.b32 %r3671, %r3670, %r3609; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 16; + add.s32 %r3673, %r3672, %r3596; + xor.b32 %r3674, %r3673, %r3584; + shf.l.wrap.b32 %r3675, %r3674, %r3674, 20; + add.s32 %r3676, %r3670, %r3023; + add.s32 %r3677, %r3676, %r3675; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 24; + add.s32 %r3680, %r3679, %r3673; + xor.b32 %r3681, %r3680, %r3675; + shf.l.wrap.b32 %r3682, %r3681, %r3681, 25; + add.s32 %r3683, %r3635, %r3063; + add.s32 %r3684, %r3683, %r3682; + xor.b32 %r3685, %r3684, %r3651; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 16; + add.s32 %r3687, %r3686, %r3666; + xor.b32 %r3688, %r3687, %r3682; + shf.l.wrap.b32 %r3689, %r3688, %r3688, 20; + add.s32 %r3690, %r3684, %r3103; + add.s32 %r3691, %r3690, %r3689; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 24; + add.s32 %r3694, %r3693, %r3687; + xor.b32 %r3695, %r3694, %r3689; + shf.l.wrap.b32 %r3696, %r3695, %r3695, 25; + add.s32 %r3697, %r3649, %r3079; + add.s32 %r3698, %r3697, %r3640; + xor.b32 %r3699, %r3698, %r3665; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 16; + add.s32 %r3701, %r3700, %r3680; + xor.b32 %r3702, %r3701, %r3640; + shf.l.wrap.b32 %r3703, %r3702, %r3702, 20; + add.s32 %r3704, %r3698, %r3031; + add.s32 %r3705, %r3704, %r3703; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 24; + add.s32 %r3708, %r3707, %r3701; + xor.b32 %r3709, %r3708, %r3703; + shf.l.wrap.b32 %r3710, %r3709, %r3709, 25; + add.s32 %r3711, %r3663, %r3055; + add.s32 %r3712, %r3711, %r3654; + xor.b32 %r3713, %r3679, %r3712; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 16; + add.s32 %r3715, %r3714, %r3638; + xor.b32 %r3716, %r3715, %r3654; + shf.l.wrap.b32 %r3717, %r3716, %r3716, 20; + add.s32 %r3718, %r3712, %r3087; + add.s32 %r3719, %r3718, %r3717; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 24; + add.s32 %r3722, %r3721, %r3715; + xor.b32 %r3723, %r3722, %r3717; + shf.l.wrap.b32 %r3724, %r3723, %r3723, 25; + add.s32 %r3725, %r3668, %r3114; + add.s32 %r3726, %r3725, %r3677; + xor.b32 %r3727, %r3726, %r3637; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 16; + add.s32 %r3729, %r3728, %r3652; + xor.b32 %r3730, %r3729, %r3668; + shf.l.wrap.b32 %r3731, %r3730, %r3730, 20; + add.s32 %r3732, %r3726, %r2999; + add.s32 %r3733, %r3732, %r3731; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 24; + add.s32 %r3736, %r3735, %r3729; + xor.b32 %r3737, %r3736, %r3731; + shf.l.wrap.b32 %r3738, %r3737, %r3737, 25; + add.s32 %r3739, %r3710, %r3095; + add.s32 %r3740, %r3739, %r3691; + xor.b32 %r3741, %r3740, %r3735; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 16; + add.s32 %r3743, %r3742, %r3722; + xor.b32 %r3744, %r3743, %r3710; + shf.l.wrap.b32 %r3745, %r3744, %r3744, 20; + add.s32 %r3746, %r3740, %r3015; + add.s32 %r3747, %r3746, %r3745; + xor.b32 %r3748, 
%r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 24; + add.s32 %r3750, %r3749, %r3743; + xor.b32 %r3751, %r3750, %r3745; + shf.l.wrap.b32 %r3752, %r3751, %r3751, 25; + add.s32 %r3753, %r3705, %r2991; + add.s32 %r3754, %r3753, %r3724; + xor.b32 %r3755, %r3693, %r3754; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 16; + add.s32 %r3757, %r3756, %r3736; + xor.b32 %r3758, %r3757, %r3724; + shf.l.wrap.b32 %r3759, %r3758, %r3758, 20; + add.s32 %r3760, %r3754, %r3071; + add.s32 %r3761, %r3760, %r3759; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 24; + add.s32 %r3764, %r3763, %r3757; + xor.b32 %r3765, %r3764, %r3759; + shf.l.wrap.b32 %r3766, %r3765, %r3765, 25; + add.s32 %r3767, %r3719, %r3007; + add.s32 %r3768, %r3767, %r3738; + xor.b32 %r3769, %r3768, %r3707; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 16; + add.s32 %r3771, %r3770, %r3694; + xor.b32 %r3772, %r3771, %r3738; + shf.l.wrap.b32 %r3773, %r3772, %r3772, 20; + add.s32 %r3774, %r3768, %r3039; + add.s32 %r3775, %r3774, %r3773; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 24; + add.s32 %r3778, %r3777, %r3771; + xor.b32 %r3779, %r3778, %r3773; + shf.l.wrap.b32 %r3780, %r3779, %r3779, 25; + add.s32 %r3781, %r3733, %r3023; + add.s32 %r3782, %r3781, %r3696; + xor.b32 %r3783, %r3782, %r3721; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 16; + add.s32 %r3785, %r3784, %r3708; + xor.b32 %r3786, %r3785, %r3696; + shf.l.wrap.b32 %r3787, %r3786, %r3786, 20; + add.s32 %r3788, %r3782, %r3047; + add.s32 %r3789, %r3788, %r3787; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 24; + add.s32 %r3792, %r3791, %r3785; + xor.b32 %r3793, %r3792, %r3787; + shf.l.wrap.b32 %r3794, %r3793, %r3793, 25; + add.s32 %r3795, %r3747, %r3079; + add.s32 %r3796, %r3795, %r3794; + xor.b32 %r3797, %r3796, %r3763; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 16; + add.s32 %r3799, %r3798, %r3778; + xor.b32 %r3800, %r3799, %r3794; + shf.l.wrap.b32 %r3801, %r3800, %r3800, 20; + add.s32 %r3802, %r3796, %r3114; + add.s32 %r3803, %r3802, %r3801; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 24; + add.s32 %r3806, %r3805, %r3799; + xor.b32 %r3807, %r3806, %r3801; + shf.l.wrap.b32 %r3808, %r3807, %r3807, 25; + add.s32 %r3809, %r3761, %r3031; + add.s32 %r3810, %r3809, %r3752; + xor.b32 %r3811, %r3810, %r3777; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 16; + add.s32 %r3813, %r3812, %r3792; + xor.b32 %r3814, %r3813, %r3752; + shf.l.wrap.b32 %r3815, %r3814, %r3814, 20; + add.s32 %r3816, %r3810, %r2991; + add.s32 %r3817, %r3816, %r3815; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 24; + add.s32 %r3820, %r3819, %r3813; + xor.b32 %r3821, %r3820, %r3815; + shf.l.wrap.b32 %r3822, %r3821, %r3821, 25; + add.s32 %r3823, %r3775, %r2999; + add.s32 %r3824, %r3823, %r3766; + xor.b32 %r3825, %r3791, %r3824; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 16; + add.s32 %r3827, %r3826, %r3750; + xor.b32 %r3828, %r3827, %r3766; + shf.l.wrap.b32 %r3829, %r3828, %r3828, 20; + add.s32 %r3830, %r3824, %r3063; + add.s32 %r3831, %r3830, %r3829; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 24; + add.s32 %r3834, %r3833, %r3827; + xor.b32 %r3835, %r3834, %r3829; + shf.l.wrap.b32 %r3836, %r3835, %r3835, 25; + add.s32 %r3837, %r3780, %r3055; + add.s32 %r3838, %r3837, %r3789; + xor.b32 %r3839, %r3838, %r3749; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 16; + add.s32 %r3841, %r3840, %r3764; + xor.b32 %r3842, %r3841, %r3780; + shf.l.wrap.b32 %r3843, %r3842, %r3842, 20; 
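+ // Closing round of this compression: the xor folds that follow produce the 8-word output CV, which the st.local.u8 stores spill little-endian into the 32-byte slot at %rd134 (indexed by %rd151, incremented afterwards) before branching to $L__BB0_30.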
+ add.s32 %r3844, %r3838, %r3039; + add.s32 %r3845, %r3844, %r3843; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 24; + add.s32 %r3848, %r3847, %r3841; + xor.b32 %r3849, %r3848, %r3843; + shf.l.wrap.b32 %r3850, %r3849, %r3849, 25; + add.s32 %r3851, %r3822, %r3103; + add.s32 %r3852, %r3851, %r3803; + xor.b32 %r3853, %r3852, %r3847; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 16; + add.s32 %r3855, %r3854, %r3834; + xor.b32 %r3856, %r3855, %r3822; + shf.l.wrap.b32 %r3857, %r3856, %r3856, 20; + add.s32 %r3858, %r3852, %r3071; + add.s32 %r3859, %r3858, %r3857; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 24; + add.s32 %r3862, %r3861, %r3855; + xor.b32 %r3863, %r3862, %r3857; + shf.l.wrap.b32 %r3864, %r3863, %r3863, 25; + add.s32 %r3865, %r3817, %r3007; + add.s32 %r3866, %r3865, %r3836; + xor.b32 %r3867, %r3805, %r3866; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 16; + add.s32 %r3869, %r3868, %r3848; + xor.b32 %r3870, %r3869, %r3836; + shf.l.wrap.b32 %r3871, %r3870, %r3870, 20; + add.s32 %r3872, %r3866, %r3087; + add.s32 %r3873, %r3872, %r3871; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 24; + add.s32 %r3876, %r3875, %r3869; + xor.b32 %r3877, %r3876, %r3871; + shf.l.wrap.b32 %r3878, %r3877, %r3877, 25; + add.s32 %r3879, %r3831, %r3015; + add.s32 %r3880, %r3879, %r3850; + xor.b32 %r3881, %r3880, %r3819; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 16; + add.s32 %r3883, %r3882, %r3806; + xor.b32 %r3884, %r3883, %r3850; + shf.l.wrap.b32 %r3885, %r3884, %r3884, 20; + add.s32 %r3886, %r3880, %r3023; + add.s32 %r3887, %r3886, %r3885; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 24; + add.s32 %r3890, %r3889, %r3883; + xor.b32 %r3891, %r3890, %r3885; + shf.l.wrap.b32 %r3892, %r3891, %r3891, 25; + add.s32 %r3893, %r3845, %r3047; + add.s32 %r3894, %r3893, %r3808; + xor.b32 %r3895, %r3894, %r3833; + shf.l.wrap.b32 %r3896, %r3895, %r3895, 16; + add.s32 %r3897, %r3896, %r3820; + xor.b32 %r3898, %r3897, %r3808; + shf.l.wrap.b32 %r3899, %r3898, %r3898, 20; + add.s32 %r3900, %r3894, %r3095; + add.s32 %r3901, %r3900, %r3899; + xor.b32 %r3902, %r3901, %r3896; + shf.l.wrap.b32 %r3903, %r3902, %r3902, 24; + add.s32 %r3904, %r3903, %r3897; + xor.b32 %r3905, %r3904, %r3899; + shf.l.wrap.b32 %r3906, %r3905, %r3905, 25; + xor.b32 %r3907, %r3890, %r3859; + xor.b32 %r3908, %r3904, %r3873; + xor.b32 %r3909, %r3862, %r3887; + xor.b32 %r3910, %r3901, %r3876; + xor.b32 %r3911, %r3906, %r3875; + xor.b32 %r3912, %r3864, %r3889; + xor.b32 %r3913, %r3903, %r3878; + xor.b32 %r3914, %r3892, %r3861; + st.local.u8 [%rd134], %r3907; + shr.u32 %r3915, %r3907, 8; + st.local.u8 [%rd134+1], %r3915; + shr.u32 %r3916, %r3907, 16; + st.local.u8 [%rd134+2], %r3916; + shr.u32 %r3917, %r3907, 24; + st.local.u8 [%rd134+3], %r3917; + st.local.u8 [%rd134+4], %r3908; + shr.u32 %r3918, %r3908, 8; + st.local.u8 [%rd134+5], %r3918; + shr.u32 %r3919, %r3908, 16; + st.local.u8 [%rd134+6], %r3919; + shr.u32 %r3920, %r3908, 24; + st.local.u8 [%rd134+7], %r3920; + st.local.u8 [%rd134+8], %r3909; + shr.u32 %r3921, %r3909, 8; + st.local.u8 [%rd134+9], %r3921; + shr.u32 %r3922, %r3909, 16; + st.local.u8 [%rd134+10], %r3922; + shr.u32 %r3923, %r3909, 24; + st.local.u8 [%rd134+11], %r3923; + st.local.u8 [%rd134+12], %r3910; + shr.u32 %r3924, %r3910, 8; + st.local.u8 [%rd134+13], %r3924; + shr.u32 %r3925, %r3910, 16; + st.local.u8 [%rd134+14], %r3925; + shr.u32 %r3926, %r3910, 24; + st.local.u8 [%rd134+15], %r3926; + st.local.u8 [%rd134+16], %r3911; + 
shr.u32 %r3927, %r3911, 8; + st.local.u8 [%rd134+17], %r3927; + shr.u32 %r3928, %r3911, 16; + st.local.u8 [%rd134+18], %r3928; + shr.u32 %r3929, %r3911, 24; + st.local.u8 [%rd134+19], %r3929; + st.local.u8 [%rd134+20], %r3912; + shr.u32 %r3930, %r3912, 8; + st.local.u8 [%rd134+21], %r3930; + shr.u32 %r3931, %r3912, 16; + st.local.u8 [%rd134+22], %r3931; + shr.u32 %r3932, %r3912, 24; + st.local.u8 [%rd134+23], %r3932; + st.local.u8 [%rd134+24], %r3913; + shr.u32 %r3933, %r3913, 8; + st.local.u8 [%rd134+25], %r3933; + shr.u32 %r3934, %r3913, 16; + st.local.u8 [%rd134+26], %r3934; + shr.u32 %r3935, %r3913, 24; + st.local.u8 [%rd134+27], %r3935; + st.local.u8 [%rd134+28], %r3914; + shr.u32 %r3936, %r3914, 8; + st.local.u8 [%rd134+29], %r3936; + shr.u32 %r3937, %r3914, 16; + st.local.u8 [%rd134+30], %r3937; + shr.u32 %r3938, %r3914, 24; + st.local.u8 [%rd134+31], %r3938; + add.s64 %rd151, %rd151, 1; + bra.uni $L__BB0_30; + +$L__BB0_1: + add.s64 %rd76, %rd171, -1; + shr.u64 %rd77, %rd76, 10; + or.b64 %rd78, %rd77, 1; + setp.gt.u64 %p2, %rd78, 4294967295; + shr.u64 %rd79, %rd76, 42; + selp.b64 %rd80, %rd79, %rd78, %p2; + selp.b32 %r62, 32, 0, %p2; + and.b64 %rd81, %rd80, 4294901760; + setp.ne.s64 %p3, %rd81, 0; + shr.u64 %rd82, %rd80, 16; + or.b32 %r63, %r62, 16; + selp.b64 %rd83, %rd82, %rd80, %p3; + selp.b32 %r64, %r63, %r62, %p3; + and.b64 %rd84, %rd83, 65280; + setp.ne.s64 %p4, %rd84, 0; + shr.u64 %rd85, %rd83, 8; + or.b32 %r65, %r64, 8; + selp.b64 %rd86, %rd85, %rd83, %p4; + selp.b32 %r66, %r65, %r64, %p4; + and.b64 %rd87, %rd86, 240; + setp.ne.s64 %p5, %rd87, 0; + shr.u64 %rd88, %rd86, 4; + or.b32 %r67, %r66, 4; + selp.b64 %rd89, %rd88, %rd86, %p5; + selp.b32 %r68, %r67, %r66, %p5; + and.b64 %rd90, %rd89, 12; + setp.ne.s64 %p6, %rd90, 0; + shr.u64 %rd91, %rd89, 2; + add.s32 %r69, %r68, 2; + selp.b64 %rd92, %rd91, %rd89, %p6; + selp.b32 %r70, %r69, %r68, %p6; + and.b64 %rd93, %rd92, 2; + shr.u64 %rd94, %rd93, 1; + cvt.u32.u64 %r71, %rd94; + add.s32 %r72, %r70, %r71; + mov.u64 %rd95, 1024; + shl.b64 %rd96, %rd95, %r72; + sub.s64 %rd97, %rd171, %rd96; + add.s64 %rd98, %rd69, %rd96; + shr.u64 %rd99, %rd96, 10; + add.s64 %rd100, %rd99, %rd165; + setp.gt.u64 %p7, %rd96, 1024; + selp.b64 %rd101, 64, 32, %p7; + add.s64 %rd103, %rd149, %rd101; + cvt.u32.u16 %r73, %rs75; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd69; + .param .b64 param1; + st.param.b64 [param1+0], %rd96; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd165; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd149; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd5, [retval0+0]; + } // callseq 0 + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd98; + .param .b64 param1; + st.param.b64 [param1+0], %rd97; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd100; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd103; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 
%rd6, [retval0+0]; + } // callseq 1 + setp.eq.s64 %p8, %rd5, 1; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_2; + +$L__BB0_12: + mov.u64 %rd158, 0; + +$L__BB0_13: + add.s64 %rd117, %rd4, %rd158; + ld.local.u8 %rs78, [%rd117]; + add.s64 %rd118, %rd155, %rd158; + st.local.u8 [%rd118], %rs78; + add.s64 %rd158, %rd158, 1; + setp.lt.u64 %p15, %rd158, 64; + mov.u64 %rd151, 2; + @%p15 bra $L__BB0_13; + bra.uni $L__BB0_30; + +$L__BB0_2: + add.s64 %rd7, %rd6, %rd5; + setp.lt.u64 %p9, %rd7, 2; + mov.u64 %rd151, 0; + mov.u64 %rd152, %rd151; + @%p9 bra $L__BB0_5; + + mov.u64 %rd146, %rd153; + mov.u64 %rd147, %rd7; + +$L__BB0_4: + st.local.u64 [%rd146], %rd149; + add.s64 %rd151, %rd151, 1; + add.s64 %rd149, %rd149, 64; + add.s64 %rd152, %rd152, 2; + add.s64 %rd146, %rd146, 8; + add.s64 %rd147, %rd147, -2; + setp.gt.u64 %p10, %rd147, 1; + @%p10 bra $L__BB0_4; $L__BB0_5: - ld.global.u64 %rd104, [%rd1]; - xor.b64 %rd463, %rd104, %rd4; - -$L__BB0_6: - and.b64 %rd105, %rd463, %rd78; - or.b64 %rd8, %rd105, %rd79; - mov.b64 {%r26, %r27}, %rd8; - mov.u64 %rd106, 0; - ld.const.u64 %rd107, [hash_header]; - cvt.u32.u64 %r28, %rd107; - shr.u64 %rd108, %rd107, 8; - cvt.u32.u64 %r29, %rd108; - shr.u64 %rd109, %rd107, 16; - cvt.u32.u64 %r30, %rd109; - shr.u64 %rd110, %rd107, 32; - cvt.u32.u64 %r31, %rd110; - shr.u64 %rd111, %rd107, 40; - cvt.u32.u64 %r32, %rd111; - shr.u64 %rd112, %rd107, 48; - cvt.u32.u64 %r33, %rd112; - ld.const.u64 %rd113, [hash_header+8]; - cvt.u32.u64 %r34, %rd113; - shr.u64 %rd114, %rd113, 8; - cvt.u32.u64 %r35, %rd114; - shr.u64 %rd115, %rd113, 16; - cvt.u32.u64 %r36, %rd115; - shr.u64 %rd116, %rd113, 32; - cvt.u32.u64 %r37, %rd116; - shr.u64 %rd117, %rd113, 40; - cvt.u32.u64 %r38, %rd117; - shr.u64 %rd118, %rd113, 48; - cvt.u32.u64 %r39, %rd118; - ld.const.u64 %rd119, [hash_header+16]; - cvt.u32.u64 %r40, %rd119; - shr.u64 %rd120, %rd119, 8; - cvt.u32.u64 %r41, %rd120; - shr.u64 %rd121, %rd119, 16; - cvt.u32.u64 %r42, %rd121; - shr.u64 %rd122, %rd119, 32; - cvt.u32.u64 %r43, %rd122; - shr.u64 %rd123, %rd119, 40; - cvt.u32.u64 %r44, %rd123; - shr.u64 %rd124, %rd119, 48; - cvt.u32.u64 %r45, %rd124; - ld.const.u64 %rd125, [hash_header+24]; - cvt.u32.u64 %r46, %rd125; - shr.u64 %rd126, %rd125, 8; - cvt.u32.u64 %r47, %rd126; - shr.u64 %rd127, %rd125, 16; - cvt.u32.u64 %r48, %rd127; - shr.u64 %rd128, %rd125, 32; - cvt.u32.u64 %r49, %rd128; - shr.u64 %rd129, %rd125, 40; - cvt.u32.u64 %r50, %rd129; - shr.u64 %rd130, %rd125, 48; - cvt.u32.u64 %r51, %rd130; - ld.const.v4.u16 {%rs12, %rs13, %rs14, %rs15}, [hash_header+32]; - shr.u16 %rs17, %rs12, 8; - shr.u16 %rs19, %rs13, 8; - shr.u16 %rs21, %rs14, 8; - shr.u16 %rs23, %rs15, 8; - ld.const.v4.u16 {%rs24, %rs25, %rs26, %rs27}, [hash_header+40]; - shr.u16 %rs29, %rs24, 8; - shr.u16 %rs31, %rs25, 8; - shr.u16 %rs33, %rs26, 8; - shr.u16 %rs35, %rs27, 8; - ld.const.v4.u16 {%rs36, %rs37, %rs38, %rs39}, [hash_header+48]; - shr.u16 %rs41, %rs36, 8; - shr.u16 %rs43, %rs37, 8; - shr.u16 %rs45, %rs38, 8; - shr.u16 %rs47, %rs39, 8; - ld.const.v4.u16 {%rs48, %rs49, %rs50, %rs51}, [hash_header+56]; - shr.u16 %rs53, %rs48, 8; - shr.u16 %rs55, %rs49, 8; - shr.u16 %rs57, %rs50, 8; - shr.u16 %rs59, %rs51, 8; - ld.const.u64 %rd131, [hash_header+64]; - mov.b64 {%r52, %r53}, %rd131; - mov.u32 %r54, -1150833019; - mov.u32 %r55, 1779033703; - st.local.v2.u32 [%rd3], {%r55, %r54}; - mov.u32 %r56, -1521486534; - mov.u32 %r57, 1013904242; - st.local.v2.u32 [%rd3+8], {%r57, %r56}; - mov.u32 %r58, -1694144372; - mov.u32 %r59, 1359893119; - st.local.v2.u32 [%rd3+16], {%r59, %r58}; - 
mov.u32 %r60, 1541459225; - mov.u32 %r61, 528734635; - st.local.v2.u32 [%rd3+24], {%r61, %r60}; - st.local.u64 [%rd3+64], %rd106; - mov.u32 %r62, 0; - st.local.v2.u32 [%rd3+88], {%r62, %r62}; - st.local.v2.u32 [%rd3+96], {%r62, %r62}; - st.local.v2.u32 [%rd3+104], {%r62, %r62}; - st.local.v2.u32 [%rd3+112], {%r62, %r62}; - st.local.v2.u32 [%rd3+120], {%r62, %r62}; - st.local.v2.u32 [%rd3+128], {%r62, %r62}; - mov.u16 %rs60, 0; - st.local.v2.u8 [%rd3+136], {%rs60, %rs60}; - st.local.u8 [%rd3+138], %rs60; - st.local.v2.u32 [%rd3+32], {%r55, %r54}; - st.local.v2.u32 [%rd3+40], {%r57, %r56}; - st.local.v2.u32 [%rd3+48], {%r59, %r58}; - st.local.v2.u32 [%rd3+56], {%r61, %r60}; - st.local.v2.u32 [%rd3+72], {%r62, %r62}; - st.local.v2.u32 [%rd3+80], {%r62, %r62}; - st.local.u8 [%rd3+144], %rs60; - ld.local.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [%rd3+136]; - setp.eq.s16 %p9, %rs62, 0; - selp.u16 %rs68, 1, 0, %p9; - or.b16 %rs69, %rs63, %rs68; - shr.u32 %r63, %r28, 24; - mov.u32 %r64, 64; - prmt.b32 %r65, %r28, %r29, %r64; - mov.u32 %r66, 1040; - prmt.b32 %r67, %r65, %r30, %r66; - mov.u32 %r68, 16912; - prmt.b32 %r69, %r67, %r63, %r68; - and.b32 %r70, %r31, 255; - and.b32 %r71, %r32, 255; - prmt.b32 %r72, %r71, %r70, 30212; - shl.b32 %r73, %r33, 16; - and.b32 %r74, %r73, 16711680; - or.b32 %r75, %r72, %r74; - and.b32 %r76, %r31, -16777216; - or.b32 %r77, %r75, %r76; - shr.u32 %r78, %r34, 24; - prmt.b32 %r79, %r34, %r35, %r64; - prmt.b32 %r80, %r79, %r36, %r66; - prmt.b32 %r81, %r80, %r78, %r68; - and.b32 %r82, %r37, 255; - and.b32 %r83, %r38, 255; - prmt.b32 %r84, %r83, %r82, 30212; - shl.b32 %r85, %r39, 16; - and.b32 %r86, %r85, 16711680; - or.b32 %r87, %r84, %r86; - and.b32 %r88, %r37, -16777216; - or.b32 %r89, %r87, %r88; - shr.u32 %r90, %r40, 24; - prmt.b32 %r91, %r40, %r41, %r64; - prmt.b32 %r92, %r91, %r42, %r66; - prmt.b32 %r93, %r92, %r90, %r68; - and.b32 %r94, %r43, 255; - and.b32 %r95, %r44, 255; - prmt.b32 %r96, %r95, %r94, 30212; - shl.b32 %r97, %r45, 16; - and.b32 %r98, %r97, 16711680; - or.b32 %r99, %r96, %r98; - and.b32 %r100, %r43, -16777216; - or.b32 %r101, %r99, %r100; - shr.u32 %r102, %r46, 24; - prmt.b32 %r103, %r46, %r47, %r64; - prmt.b32 %r104, %r103, %r48, %r66; - prmt.b32 %r105, %r104, %r102, %r68; - and.b32 %r106, %r49, 255; - and.b32 %r107, %r50, 255; - prmt.b32 %r108, %r107, %r106, 30212; - shl.b32 %r109, %r51, 16; - and.b32 %r110, %r109, 16711680; - or.b32 %r111, %r108, %r110; - and.b32 %r112, %r49, -16777216; - or.b32 %r113, %r111, %r112; - cvt.u32.u16 %r114, %rs12; - and.b32 %r115, %r114, 255; - cvt.u32.u16 %r116, %rs17; - prmt.b32 %r117, %r116, %r115, 30212; - cvt.u32.u16 %r118, %rs13; - prmt.b32 %r119, %r118, %r117, 28756; - cvt.u32.u16 %r120, %rs19; - prmt.b32 %r121, %r120, %r119, 1620; - cvt.u32.u16 %r122, %rs14; - and.b32 %r123, %r122, 255; - cvt.u32.u16 %r124, %rs21; + setp.eq.s64 %p11, %rd151, 0; + @%p11 bra $L__BB0_8; + + or.b16 %rs76, %rs75, 4; + cvt.u32.u16 %r1, %rs76; + mov.u64 %rd154, %rd151; + +$L__BB0_7: + ld.local.u64 %rd109, [%rd153]; + ld.u8 %r74, [%rd109]; + ld.u8 %r75, [%rd109+1]; + prmt.b32 %r76, %r75, %r74, 30212; + ld.u8 %r77, [%rd109+2]; + prmt.b32 %r78, %r77, %r76, 28756; + ld.u8 %r79, [%rd109+3]; + prmt.b32 %r80, %r79, %r78, 1620; + ld.u8 %r81, [%rd109+4]; + ld.u8 %r82, [%rd109+5]; + prmt.b32 %r83, %r82, %r81, 30212; + ld.u8 %r84, [%rd109+6]; + prmt.b32 %r85, %r84, %r83, 28756; + ld.u8 %r86, [%rd109+7]; + prmt.b32 %r87, %r86, %r85, 1620; + ld.u8 %r88, [%rd109+8]; + ld.u8 %r89, [%rd109+9]; + prmt.b32 %r90, %r89, %r88, 30212; + ld.u8 %r91, 
[%rd109+10]; + prmt.b32 %r92, %r91, %r90, 28756; + ld.u8 %r93, [%rd109+11]; + prmt.b32 %r94, %r93, %r92, 1620; + ld.u8 %r95, [%rd109+12]; + ld.u8 %r96, [%rd109+13]; + prmt.b32 %r97, %r96, %r95, 30212; + ld.u8 %r98, [%rd109+14]; + prmt.b32 %r99, %r98, %r97, 28756; + ld.u8 %r100, [%rd109+15]; + prmt.b32 %r101, %r100, %r99, 1620; + ld.u8 %r102, [%rd109+16]; + ld.u8 %r103, [%rd109+17]; + prmt.b32 %r104, %r103, %r102, 30212; + ld.u8 %r105, [%rd109+18]; + prmt.b32 %r106, %r105, %r104, 28756; + ld.u8 %r107, [%rd109+19]; + prmt.b32 %r108, %r107, %r106, 1620; + ld.u8 %r109, [%rd109+20]; + ld.u8 %r110, [%rd109+21]; + prmt.b32 %r111, %r110, %r109, 30212; + ld.u8 %r112, [%rd109+22]; + prmt.b32 %r113, %r112, %r111, 28756; + ld.u8 %r114, [%rd109+23]; + prmt.b32 %r115, %r114, %r113, 1620; + ld.u8 %r116, [%rd109+24]; + ld.u8 %r117, [%rd109+25]; + prmt.b32 %r118, %r117, %r116, 30212; + ld.u8 %r119, [%rd109+26]; + prmt.b32 %r120, %r119, %r118, 28756; + ld.u8 %r121, [%rd109+27]; + prmt.b32 %r122, %r121, %r120, 1620; + ld.u8 %r123, [%rd109+28]; + ld.u8 %r124, [%rd109+29]; prmt.b32 %r125, %r124, %r123, 30212; - cvt.u32.u16 %r126, %rs15; + ld.u8 %r126, [%rd109+30]; prmt.b32 %r127, %r126, %r125, 28756; - cvt.u32.u16 %r128, %rs23; + ld.u8 %r128, [%rd109+31]; prmt.b32 %r129, %r128, %r127, 1620; - cvt.u32.u16 %r130, %rs24; - and.b32 %r131, %r130, 255; - cvt.u32.u16 %r132, %rs29; - prmt.b32 %r133, %r132, %r131, 30212; - cvt.u32.u16 %r134, %rs25; - prmt.b32 %r135, %r134, %r133, 28756; - cvt.u32.u16 %r136, %rs31; - prmt.b32 %r137, %r136, %r135, 1620; - cvt.u32.u16 %r138, %rs26; - and.b32 %r139, %r138, 255; - cvt.u32.u16 %r140, %rs33; - prmt.b32 %r141, %r140, %r139, 30212; - cvt.u32.u16 %r142, %rs27; - prmt.b32 %r143, %r142, %r141, 28756; - cvt.u32.u16 %r144, %rs35; - prmt.b32 %r145, %r144, %r143, 1620; - cvt.u32.u16 %r146, %rs36; - and.b32 %r147, %r146, 255; - cvt.u32.u16 %r148, %rs41; - prmt.b32 %r149, %r148, %r147, 30212; - cvt.u32.u16 %r150, %rs37; - prmt.b32 %r151, %r150, %r149, 28756; - cvt.u32.u16 %r152, %rs43; - prmt.b32 %r153, %r152, %r151, 1620; - cvt.u32.u16 %r154, %rs38; - and.b32 %r155, %r154, 255; - cvt.u32.u16 %r156, %rs45; + ld.u8 %r130, [%rd109+32]; + ld.u8 %r131, [%rd109+33]; + prmt.b32 %r132, %r131, %r130, 30212; + ld.u8 %r133, [%rd109+34]; + prmt.b32 %r134, %r133, %r132, 28756; + ld.u8 %r135, [%rd109+35]; + prmt.b32 %r136, %r135, %r134, 1620; + ld.u8 %r137, [%rd109+36]; + ld.u8 %r138, [%rd109+37]; + prmt.b32 %r139, %r138, %r137, 30212; + ld.u8 %r140, [%rd109+38]; + prmt.b32 %r141, %r140, %r139, 28756; + ld.u8 %r142, [%rd109+39]; + prmt.b32 %r143, %r142, %r141, 1620; + ld.u8 %r144, [%rd109+40]; + ld.u8 %r145, [%rd109+41]; + prmt.b32 %r146, %r145, %r144, 30212; + ld.u8 %r147, [%rd109+42]; + prmt.b32 %r148, %r147, %r146, 28756; + ld.u8 %r149, [%rd109+43]; + prmt.b32 %r150, %r149, %r148, 1620; + ld.u8 %r151, [%rd109+44]; + ld.u8 %r152, [%rd109+45]; + prmt.b32 %r153, %r152, %r151, 30212; + ld.u8 %r154, [%rd109+46]; + prmt.b32 %r155, %r154, %r153, 28756; + ld.u8 %r156, [%rd109+47]; + prmt.b32 %r157, %r156, %r155, 1620; + ld.u8 %r158, [%rd109+48]; + ld.u8 %r159, [%rd109+49]; + prmt.b32 %r160, %r159, %r158, 30212; + ld.u8 %r161, [%rd109+50]; + prmt.b32 %r162, %r161, %r160, 28756; + ld.u8 %r163, [%rd109+51]; + prmt.b32 %r164, %r163, %r162, 1620; + ld.u8 %r165, [%rd109+52]; + ld.u8 %r166, [%rd109+53]; + prmt.b32 %r167, %r166, %r165, 30212; + ld.u8 %r168, [%rd109+54]; + prmt.b32 %r169, %r168, %r167, 28756; + ld.u8 %r170, [%rd109+55]; + prmt.b32 %r171, %r170, %r169, 1620; + ld.u8 %r172, [%rd109+56]; + ld.u8 
%r173, [%rd109+57]; + prmt.b32 %r174, %r173, %r172, 30212; + ld.u8 %r175, [%rd109+58]; + prmt.b32 %r176, %r175, %r174, 28756; + ld.u8 %r177, [%rd109+59]; + prmt.b32 %r178, %r177, %r176, 1620; + ld.u8 %r179, [%rd109+60]; + ld.u8 %r180, [%rd109+61]; + prmt.b32 %r181, %r180, %r179, 30212; + ld.u8 %r182, [%rd109+62]; + prmt.b32 %r183, %r182, %r181, 28756; + ld.u8 %r184, [%rd109+63]; + prmt.b32 %r185, %r184, %r183, 1620; + ld.local.u8 %r186, [%rd2+16]; + ld.local.u8 %r187, [%rd2+17]; + prmt.b32 %r188, %r187, %r186, 30212; + ld.local.u8 %r189, [%rd2+18]; + ld.local.u8 %r190, [%rd2+19]; + prmt.b32 %r191, %r190, %r189, 30212; + prmt.b32 %r192, %r191, %r188, 4180; + ld.local.u8 %r193, [%rd2]; + ld.local.u8 %r194, [%rd2+1]; + prmt.b32 %r195, %r194, %r193, 30212; + ld.local.u8 %r196, [%rd2+2]; + ld.local.u8 %r197, [%rd2+3]; + prmt.b32 %r198, %r197, %r196, 30212; + prmt.b32 %r199, %r198, %r195, 4180; + add.s32 %r200, %r192, %r199; + add.s32 %r201, %r200, %r80; + shf.l.wrap.b32 %r202, %r201, %r201, 16; + add.s32 %r203, %r202, 1779033703; + xor.b32 %r204, %r203, %r192; + shf.l.wrap.b32 %r205, %r204, %r204, 20; + add.s32 %r206, %r87, %r201; + add.s32 %r207, %r206, %r205; + xor.b32 %r208, %r207, %r202; + shf.l.wrap.b32 %r209, %r208, %r208, 24; + add.s32 %r210, %r209, %r203; + xor.b32 %r211, %r210, %r205; + shf.l.wrap.b32 %r212, %r211, %r211, 25; + ld.local.u8 %r213, [%rd2+20]; + ld.local.u8 %r214, [%rd2+21]; + prmt.b32 %r215, %r214, %r213, 30212; + ld.local.u8 %r216, [%rd2+22]; + ld.local.u8 %r217, [%rd2+23]; + prmt.b32 %r218, %r217, %r216, 30212; + prmt.b32 %r219, %r218, %r215, 4180; + ld.local.u8 %r220, [%rd2+4]; + ld.local.u8 %r221, [%rd2+5]; + prmt.b32 %r222, %r221, %r220, 30212; + ld.local.u8 %r223, [%rd2+6]; + ld.local.u8 %r224, [%rd2+7]; + prmt.b32 %r225, %r224, %r223, 30212; + prmt.b32 %r226, %r225, %r222, 4180; + add.s32 %r227, %r219, %r226; + add.s32 %r228, %r227, %r94; + shf.l.wrap.b32 %r229, %r228, %r228, 16; + add.s32 %r230, %r229, -1150833019; + xor.b32 %r231, %r230, %r219; + shf.l.wrap.b32 %r232, %r231, %r231, 20; + add.s32 %r233, %r101, %r228; + add.s32 %r234, %r233, %r232; + xor.b32 %r235, %r234, %r229; + shf.l.wrap.b32 %r236, %r235, %r235, 24; + add.s32 %r237, %r236, %r230; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 25; + ld.local.u8 %r240, [%rd2+24]; + ld.local.u8 %r241, [%rd2+25]; + prmt.b32 %r242, %r241, %r240, 30212; + ld.local.u8 %r243, [%rd2+26]; + ld.local.u8 %r244, [%rd2+27]; + prmt.b32 %r245, %r244, %r243, 30212; + prmt.b32 %r246, %r245, %r242, 4180; + ld.local.u8 %r247, [%rd2+8]; + ld.local.u8 %r248, [%rd2+9]; + prmt.b32 %r249, %r248, %r247, 30212; + ld.local.u8 %r250, [%rd2+10]; + ld.local.u8 %r251, [%rd2+11]; + prmt.b32 %r252, %r251, %r250, 30212; + prmt.b32 %r253, %r252, %r249, 4180; + add.s32 %r254, %r246, %r253; + add.s32 %r255, %r254, %r108; + shr.u32 %r256, %r255, 16; + shl.b32 %r257, %r255, 16; + xor.b32 %r258, %r257, 4194304; + or.b32 %r259, %r258, %r256; + add.s32 %r260, %r259, 1013904242; + xor.b32 %r261, %r260, %r246; + shf.l.wrap.b32 %r262, %r261, %r261, 20; + add.s32 %r263, %r115, %r255; + add.s32 %r264, %r263, %r262; + xor.b32 %r265, %r264, %r259; + shf.l.wrap.b32 %r266, %r265, %r265, 24; + add.s32 %r267, %r266, %r260; + xor.b32 %r268, %r267, %r262; + shf.l.wrap.b32 %r269, %r268, %r268, 25; + ld.local.u8 %r270, [%rd2+28]; + ld.local.u8 %r271, [%rd2+29]; + prmt.b32 %r272, %r271, %r270, 30212; + ld.local.u8 %r273, [%rd2+30]; + ld.local.u8 %r274, [%rd2+31]; + prmt.b32 %r275, %r274, %r273, 30212; + prmt.b32 %r276, %r275, %r272, 4180; + 
ld.local.u8 %r277, [%rd2+12]; + ld.local.u8 %r278, [%rd2+13]; + prmt.b32 %r279, %r278, %r277, 30212; + ld.local.u8 %r280, [%rd2+14]; + ld.local.u8 %r281, [%rd2+15]; + prmt.b32 %r282, %r281, %r280, 30212; + prmt.b32 %r283, %r282, %r279, 4180; + add.s32 %r284, %r276, %r283; + add.s32 %r285, %r284, %r122; + xor.b32 %r286, %r285, %r1; + shr.u32 %r287, %r285, 16; + shl.b32 %r288, %r286, 16; + or.b32 %r289, %r288, %r287; + add.s32 %r290, %r289, -1521486534; + xor.b32 %r291, %r290, %r276; + shf.l.wrap.b32 %r292, %r291, %r291, 20; + add.s32 %r293, %r129, %r285; + add.s32 %r294, %r293, %r292; + xor.b32 %r295, %r294, %r289; + shf.l.wrap.b32 %r296, %r295, %r295, 24; + add.s32 %r297, %r296, %r290; + xor.b32 %r298, %r297, %r292; + shf.l.wrap.b32 %r299, %r298, %r298, 25; + add.s32 %r300, %r239, %r207; + add.s32 %r301, %r300, %r136; + xor.b32 %r302, %r296, %r301; + shf.l.wrap.b32 %r303, %r302, %r302, 16; + add.s32 %r304, %r303, %r267; + xor.b32 %r305, %r304, %r239; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r143, %r301; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + add.s32 %r314, %r269, %r234; + add.s32 %r315, %r314, %r150; + xor.b32 %r316, %r315, %r209; + shf.l.wrap.b32 %r317, %r316, %r316, 16; + add.s32 %r318, %r317, %r297; + xor.b32 %r319, %r318, %r269; + shf.l.wrap.b32 %r320, %r319, %r319, 20; + add.s32 %r321, %r157, %r315; + add.s32 %r322, %r321, %r320; + xor.b32 %r323, %r322, %r317; + shf.l.wrap.b32 %r324, %r323, %r323, 24; + add.s32 %r325, %r324, %r318; + xor.b32 %r326, %r325, %r320; + shf.l.wrap.b32 %r327, %r326, %r326, 25; + add.s32 %r328, %r299, %r264; + add.s32 %r329, %r328, %r164; + xor.b32 %r330, %r329, %r236; + shf.l.wrap.b32 %r331, %r330, %r330, 16; + add.s32 %r332, %r331, %r210; + xor.b32 %r333, %r332, %r299; + shf.l.wrap.b32 %r334, %r333, %r333, 20; + add.s32 %r335, %r171, %r329; + add.s32 %r336, %r335, %r334; + xor.b32 %r337, %r336, %r331; + shf.l.wrap.b32 %r338, %r337, %r337, 24; + add.s32 %r339, %r338, %r332; + xor.b32 %r340, %r339, %r334; + shf.l.wrap.b32 %r341, %r340, %r340, 25; + add.s32 %r342, %r294, %r212; + add.s32 %r343, %r342, %r178; + xor.b32 %r344, %r343, %r266; + shf.l.wrap.b32 %r345, %r344, %r344, 16; + add.s32 %r346, %r345, %r237; + xor.b32 %r347, %r346, %r212; + shf.l.wrap.b32 %r348, %r347, %r347, 20; + add.s32 %r349, %r185, %r343; + add.s32 %r350, %r349, %r348; + xor.b32 %r351, %r350, %r345; + shf.l.wrap.b32 %r352, %r351, %r351, 24; + add.s32 %r353, %r352, %r346; + xor.b32 %r354, %r353, %r348; + shf.l.wrap.b32 %r355, %r354, %r354, 25; + add.s32 %r356, %r308, %r94; + add.s32 %r357, %r356, %r355; + xor.b32 %r358, %r357, %r324; + shf.l.wrap.b32 %r359, %r358, %r358, 16; + add.s32 %r360, %r359, %r339; + xor.b32 %r361, %r360, %r355; + shf.l.wrap.b32 %r362, %r361, %r361, 20; + add.s32 %r363, %r357, %r122; + add.s32 %r364, %r363, %r362; + xor.b32 %r365, %r364, %r359; + shf.l.wrap.b32 %r366, %r365, %r365, 24; + add.s32 %r367, %r366, %r360; + xor.b32 %r368, %r367, %r362; + shf.l.wrap.b32 %r369, %r368, %r368, 25; + add.s32 %r370, %r322, %r101; + add.s32 %r371, %r370, %r313; + xor.b32 %r372, %r338, %r371; + shf.l.wrap.b32 %r373, %r372, %r372, 16; + add.s32 %r374, %r353, %r373; + xor.b32 %r375, %r374, %r313; + shf.l.wrap.b32 %r376, %r375, %r375, 20; + add.s32 %r377, %r371, %r150; + add.s32 %r378, %r377, %r376; + xor.b32 %r379, %r378, %r373; + shf.l.wrap.b32 %r380, %r379, %r379, 24; + add.s32 %r381, %r380, 
%r374; + xor.b32 %r382, %r381, %r376; + shf.l.wrap.b32 %r383, %r382, %r382, 25; + add.s32 %r384, %r327, %r129; + add.s32 %r385, %r384, %r336; + xor.b32 %r386, %r352, %r385; + shf.l.wrap.b32 %r387, %r386, %r386, 16; + add.s32 %r388, %r387, %r311; + xor.b32 %r389, %r388, %r327; + shf.l.wrap.b32 %r390, %r389, %r389, 20; + add.s32 %r391, %r385, %r80; + add.s32 %r392, %r391, %r390; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 24; + add.s32 %r395, %r394, %r388; + xor.b32 %r396, %r395, %r390; + shf.l.wrap.b32 %r397, %r396, %r396, 25; + add.s32 %r398, %r341, %r108; + add.s32 %r399, %r398, %r350; + xor.b32 %r400, %r399, %r310; + shf.l.wrap.b32 %r401, %r400, %r400, 16; + add.s32 %r402, %r401, %r325; + xor.b32 %r403, %r402, %r341; + shf.l.wrap.b32 %r404, %r403, %r403, 20; + add.s32 %r405, %r399, %r171; + add.s32 %r406, %r405, %r404; + xor.b32 %r407, %r406, %r401; + shf.l.wrap.b32 %r408, %r407, %r407, 24; + add.s32 %r409, %r408, %r402; + xor.b32 %r410, %r409, %r404; + shf.l.wrap.b32 %r411, %r410, %r410, 25; + add.s32 %r412, %r383, %r87; + add.s32 %r413, %r412, %r364; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 16; + add.s32 %r416, %r415, %r395; + xor.b32 %r417, %r416, %r383; + shf.l.wrap.b32 %r418, %r417, %r417, 20; + add.s32 %r419, %r413, %r157; + add.s32 %r420, %r419, %r418; + xor.b32 %r421, %r420, %r415; + shf.l.wrap.b32 %r422, %r421, %r421, 24; + add.s32 %r423, %r422, %r416; + xor.b32 %r424, %r423, %r418; + shf.l.wrap.b32 %r425, %r424, %r424, 25; + add.s32 %r426, %r378, %r164; + add.s32 %r427, %r426, %r397; + xor.b32 %r428, %r366, %r427; + shf.l.wrap.b32 %r429, %r428, %r428, 16; + add.s32 %r430, %r429, %r409; + xor.b32 %r431, %r430, %r397; + shf.l.wrap.b32 %r432, %r431, %r431, 20; + add.s32 %r433, %r427, %r115; + add.s32 %r434, %r433, %r432; + xor.b32 %r435, %r434, %r429; + shf.l.wrap.b32 %r436, %r435, %r435, 24; + add.s32 %r437, %r436, %r430; + xor.b32 %r438, %r437, %r432; + shf.l.wrap.b32 %r439, %r438, %r438, 25; + add.s32 %r440, %r392, %r143; + add.s32 %r441, %r440, %r411; + xor.b32 %r442, %r441, %r380; + shf.l.wrap.b32 %r443, %r442, %r442, 16; + add.s32 %r444, %r443, %r367; + xor.b32 %r445, %r444, %r411; + shf.l.wrap.b32 %r446, %r445, %r445, 20; + add.s32 %r447, %r441, %r178; + add.s32 %r448, %r447, %r446; + xor.b32 %r449, %r448, %r443; + shf.l.wrap.b32 %r450, %r449, %r449, 24; + add.s32 %r451, %r450, %r444; + xor.b32 %r452, %r451, %r446; + shf.l.wrap.b32 %r453, %r452, %r452, 25; + add.s32 %r454, %r406, %r185; + add.s32 %r455, %r454, %r369; + xor.b32 %r456, %r455, %r394; + shf.l.wrap.b32 %r457, %r456, %r456, 16; + add.s32 %r458, %r457, %r381; + xor.b32 %r459, %r458, %r369; + shf.l.wrap.b32 %r460, %r459, %r459, 20; + add.s32 %r461, %r455, %r136; + add.s32 %r462, %r461, %r460; + xor.b32 %r463, %r462, %r457; + shf.l.wrap.b32 %r464, %r463, %r463, 24; + add.s32 %r465, %r464, %r458; + xor.b32 %r466, %r465, %r460; + shf.l.wrap.b32 %r467, %r466, %r466, 25; + add.s32 %r468, %r420, %r101; + add.s32 %r469, %r468, %r467; + xor.b32 %r470, %r469, %r436; + shf.l.wrap.b32 %r471, %r470, %r470, 16; + add.s32 %r472, %r471, %r451; + xor.b32 %r473, %r472, %r467; + shf.l.wrap.b32 %r474, %r473, %r473, 20; + add.s32 %r475, %r469, %r108; + add.s32 %r476, %r475, %r474; + xor.b32 %r477, %r476, %r471; + shf.l.wrap.b32 %r478, %r477, %r477, 24; + add.s32 %r479, %r478, %r472; + xor.b32 %r480, %r479, %r474; + shf.l.wrap.b32 %r481, %r480, %r480, 25; + add.s32 %r482, %r434, %r150; + add.s32 %r483, %r482, %r425; + xor.b32 %r484, %r483, %r450; + shf.l.wrap.b32 %r485, %r484, %r484, 
16; + add.s32 %r486, %r485, %r465; + xor.b32 %r487, %r486, %r425; + shf.l.wrap.b32 %r488, %r487, %r487, 20; + add.s32 %r489, %r483, %r164; + add.s32 %r490, %r489, %r488; + xor.b32 %r491, %r490, %r485; + shf.l.wrap.b32 %r492, %r491, %r491, 24; + add.s32 %r493, %r492, %r486; + xor.b32 %r494, %r493, %r488; + shf.l.wrap.b32 %r495, %r494, %r494, 25; + add.s32 %r496, %r448, %r171; + add.s32 %r497, %r496, %r439; + xor.b32 %r498, %r464, %r497; + shf.l.wrap.b32 %r499, %r498, %r498, 16; + add.s32 %r500, %r499, %r423; + xor.b32 %r501, %r500, %r439; + shf.l.wrap.b32 %r502, %r501, %r501, 20; + add.s32 %r503, %r497, %r94; + add.s32 %r504, %r503, %r502; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 24; + add.s32 %r507, %r506, %r500; + xor.b32 %r508, %r507, %r502; + shf.l.wrap.b32 %r509, %r508, %r508, 25; + add.s32 %r510, %r453, %r129; + add.s32 %r511, %r510, %r462; + xor.b32 %r512, %r511, %r422; + shf.l.wrap.b32 %r513, %r512, %r512, 16; + add.s32 %r514, %r513, %r437; + xor.b32 %r515, %r514, %r453; + shf.l.wrap.b32 %r516, %r515, %r515, 20; + add.s32 %r517, %r511, %r178; + add.s32 %r518, %r517, %r516; + xor.b32 %r519, %r518, %r513; + shf.l.wrap.b32 %r520, %r519, %r519, 24; + add.s32 %r521, %r520, %r514; + xor.b32 %r522, %r521, %r516; + shf.l.wrap.b32 %r523, %r522, %r522, 25; + add.s32 %r524, %r495, %r122; + add.s32 %r525, %r524, %r476; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 16; + add.s32 %r528, %r527, %r507; + xor.b32 %r529, %r528, %r495; + shf.l.wrap.b32 %r530, %r529, %r529, 20; + add.s32 %r531, %r525, %r115; + add.s32 %r532, %r531, %r530; + xor.b32 %r533, %r532, %r527; + shf.l.wrap.b32 %r534, %r533, %r533, 24; + add.s32 %r535, %r534, %r528; + xor.b32 %r536, %r535, %r530; + shf.l.wrap.b32 %r537, %r536, %r536, 25; + add.s32 %r538, %r490, %r143; + add.s32 %r539, %r538, %r509; + xor.b32 %r540, %r478, %r539; + shf.l.wrap.b32 %r541, %r540, %r540, 16; + add.s32 %r542, %r541, %r521; + xor.b32 %r543, %r542, %r509; + shf.l.wrap.b32 %r544, %r543, %r543, 20; + add.s32 %r545, %r539, %r80; + add.s32 %r546, %r545, %r544; + xor.b32 %r547, %r546, %r541; + shf.l.wrap.b32 %r548, %r547, %r547, 24; + add.s32 %r549, %r548, %r542; + xor.b32 %r550, %r549, %r544; + shf.l.wrap.b32 %r551, %r550, %r550, 25; + add.s32 %r552, %r504, %r157; + add.s32 %r553, %r552, %r523; + xor.b32 %r554, %r553, %r492; + shf.l.wrap.b32 %r555, %r554, %r554, 16; + add.s32 %r556, %r555, %r479; + xor.b32 %r557, %r556, %r523; + shf.l.wrap.b32 %r558, %r557, %r557, 20; + add.s32 %r559, %r553, %r185; + add.s32 %r560, %r559, %r558; + xor.b32 %r561, %r560, %r555; + shf.l.wrap.b32 %r562, %r561, %r561, 24; + add.s32 %r563, %r562, %r556; + xor.b32 %r564, %r563, %r558; + shf.l.wrap.b32 %r565, %r564, %r564, 25; + add.s32 %r566, %r518, %r136; + add.s32 %r567, %r566, %r481; + xor.b32 %r568, %r567, %r506; + shf.l.wrap.b32 %r569, %r568, %r568, 16; + add.s32 %r570, %r569, %r493; + xor.b32 %r571, %r570, %r481; + shf.l.wrap.b32 %r572, %r571, %r571, 20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + 
shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, %r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + 
xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 %r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + 
add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, %r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + 
add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, %r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 %r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + 
shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 %r1032, %r1027, %r996; + xor.b32 %r1033, %r985, %r1010; + xor.b32 %r1034, %r1024, %r999; + xor.b32 %r1035, %r1013, %r982; + st.local.u8 [%rd155], %r1028; + shr.u32 %r1036, %r1028, 8; + st.local.u8 [%rd155+1], %r1036; + shr.u32 %r1037, %r1028, 16; + st.local.u8 [%rd155+2], %r1037; + shr.u32 %r1038, %r1028, 24; + st.local.u8 [%rd155+3], %r1038; + st.local.u8 [%rd155+4], %r1029; + shr.u32 %r1039, %r1029, 8; + st.local.u8 [%rd155+5], %r1039; + shr.u32 %r1040, %r1029, 16; + st.local.u8 [%rd155+6], %r1040; + shr.u32 %r1041, %r1029, 24; + st.local.u8 [%rd155+7], %r1041; + st.local.u8 [%rd155+8], %r1030; + shr.u32 %r1042, %r1030, 8; + st.local.u8 [%rd155+9], %r1042; + shr.u32 %r1043, %r1030, 16; + st.local.u8 [%rd155+10], %r1043; + shr.u32 %r1044, %r1030, 24; + st.local.u8 [%rd155+11], %r1044; + st.local.u8 [%rd155+12], %r1031; + shr.u32 %r1045, %r1031, 8; + st.local.u8 [%rd155+13], %r1045; + shr.u32 %r1046, %r1031, 16; + st.local.u8 [%rd155+14], %r1046; + shr.u32 %r1047, %r1031, 24; + st.local.u8 [%rd155+15], %r1047; + st.local.u8 [%rd155+16], %r1032; + shr.u32 %r1048, %r1032, 8; + st.local.u8 [%rd155+17], %r1048; + shr.u32 %r1049, %r1032, 16; + st.local.u8 [%rd155+18], %r1049; + shr.u32 %r1050, %r1032, 24; + st.local.u8 [%rd155+19], %r1050; + st.local.u8 [%rd155+20], %r1033; + shr.u32 %r1051, %r1033, 8; + st.local.u8 [%rd155+21], %r1051; + shr.u32 %r1052, %r1033, 16; + st.local.u8 [%rd155+22], %r1052; + shr.u32 %r1053, %r1033, 24; + st.local.u8 [%rd155+23], %r1053; + st.local.u8 [%rd155+24], %r1034; + shr.u32 %r1054, %r1034, 8; + st.local.u8 [%rd155+25], %r1054; + shr.u32 %r1055, %r1034, 16; + st.local.u8 [%rd155+26], %r1055; + shr.u32 %r1056, %r1034, 24; + st.local.u8 [%rd155+27], %r1056; + st.local.u8 [%rd155+28], %r1035; + shr.u32 %r1057, %r1035, 8; + st.local.u8 [%rd155+29], %r1057; + shr.u32 %r1058, %r1035, 16; + st.local.u8 [%rd155+30], %r1058; + shr.u32 %r1059, %r1035, 24; + st.local.u8 [%rd155+31], %r1059; + add.s64 %rd153, %rd153, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd154, %rd154, -1; + setp.ne.s64 %p12, %rd154, 0; + @%p12 bra $L__BB0_7; + +$L__BB0_8: + setp.le.u64 %p13, %rd7, %rd152; + @%p13 bra $L__BB0_30; + + add.u64 %rd144, %SPL, 96; + ld.param.u64 %rd142, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd141, %rd142; + shl.b64 %rd111, %rd151, 6; + shl.b64 %rd112, %rd151, 5; + add.s64 %rd27, %rd141, %rd112; + add.s64 %rd28, %rd144, %rd111; + mov.u64 %rd156, 0; + +$L__BB0_10: + add.s64 %rd113, %rd28, %rd156; + ld.local.u8 %rs77, [%rd113]; + add.s64 %rd114, %rd27, 
%rd156; + st.local.u8 [%rd114], %rs77; + add.s64 %rd156, %rd156, 1; + setp.lt.u64 %p14, %rd156, 32; + @%p14 bra $L__BB0_10; + + add.s64 %rd151, %rd151, 1; + +$L__BB0_30: + st.param.b64 [func_retval0+0], %rd151; + ret; + +} +.func _Z20blake3_hasher_updateP13blake3_hasherPKvy( + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0, + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1 +) +{ + .local .align 16 .b8 __local_depot1[144]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<54>; + .reg .b16 %rs<393>; + .reg .b32 %r<11690>; + .reg .b64 %rd<273>; + + + mov.u64 %SPL, __local_depot1; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + ld.param.u64 %rd254, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd261, %rd254; + cvta.to.local.u64 %rd2, %rd98; + add.s64 %rd3, %rd2, 136; + ld.local.v2.u8 {%rs102, %rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + mov.u64 %rd262, 80; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 80; + setp.eq.s16 %p2, %rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd244, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd237, 0; + +$L__BB1_4: + add.s64 %rd111, %rd261, %rd237; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd237; + st.local.u8 [%rd112], %rs107; + add.s64 %rd237, %rd237, 1; + setp.lt.u64 %p4, %rd237, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd244, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd261, %rd261, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd238, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; + ld.local.u8 %r158, [%rd3+-58]; prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, %rs47; + ld.local.u8 %r160, [%rd3+-57]; prmt.b32 %r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - cvt.u32.u16 %r178, %rs69; - and.b32 
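+ // _Z20blake3_hasher_updateP13blake3_hasherPKvy demangles to
+ // blake3_hasher_update(blake3_hasher*, const void*, unsigned long long).
+ // In the generated body, the ld.local.u8 / prmt.b32 chains (selectors 30212,
+ // 28756, 1620) gather four bytes at a time into little-endian 32-bit BLAKE3
+ // message words, and each add.s32 / xor.b32 / shf.l.wrap.b32 {16, 20, 24, 25}
+ // sequence is one fully unrolled BLAKE3 G quarter-round: a 32-bit left-rotate
+ // by 32-r implements BLAKE3's right-rotate by r = 16, 12, 8, 7.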
%r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 %r265, %r264, %r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, %r191; - shf.l.wrap.b32 %r282, %r281, 
%r281, 20; - add.s32 %r283, %r177, %r277; - add.s32 %r284, %r283, %r282; - xor.b32 %r285, %r284, %r279; - shf.l.wrap.b32 %r286, %r285, %r285, 24; - add.s32 %r287, %r286, %r280; - xor.b32 %r288, %r287, %r282; - shf.l.wrap.b32 %r289, %r288, %r288, 25; - add.s32 %r290, %r242, %r81; - add.s32 %r291, %r290, %r289; - xor.b32 %r292, %r291, %r258; - shf.l.wrap.b32 %r293, %r292, %r292, 16; - add.s32 %r294, %r293, %r273; - xor.b32 %r295, %r294, %r289; - shf.l.wrap.b32 %r296, %r295, %r295, 20; - add.s32 %r297, %r291, %r105; - add.s32 %r298, %r297, %r296; - xor.b32 %r299, %r298, %r293; - shf.l.wrap.b32 %r300, %r299, %r299, 24; - add.s32 %r301, %r300, %r294; - xor.b32 %r302, %r301, %r296; - shf.l.wrap.b32 %r303, %r302, %r302, 25; - add.s32 %r304, %r256, %r89; - add.s32 %r305, %r304, %r247; - xor.b32 %r306, %r272, %r305; - shf.l.wrap.b32 %r307, %r306, %r306, 16; - add.s32 %r308, %r287, %r307; - xor.b32 %r309, %r308, %r247; - shf.l.wrap.b32 %r310, %r309, %r309, 20; - add.s32 %r311, %r305, %r137; - add.s32 %r312, %r311, %r310; - xor.b32 %r313, %r312, %r307; - shf.l.wrap.b32 %r314, %r313, %r313, 24; - add.s32 %r315, %r314, %r308; - xor.b32 %r316, %r315, %r310; - shf.l.wrap.b32 %r317, %r316, %r316, 25; - add.s32 %r318, %r261, %r113; - add.s32 %r319, %r318, %r270; - xor.b32 %r320, %r286, %r319; - shf.l.wrap.b32 %r321, %r320, %r320, 16; - add.s32 %r322, %r321, %r245; - xor.b32 %r323, %r322, %r261; + ld.local.u8 %r162, [%rd3+-56]; + ld.local.u8 %r163, [%rd3+-55]; + prmt.b32 %r164, %r163, %r162, 30212; + ld.local.u8 %r165, [%rd3+-54]; + prmt.b32 %r166, %r165, %r164, 28756; + ld.local.u8 %r167, [%rd3+-53]; + prmt.b32 %r168, %r167, %r166, 1620; + ld.local.u8 %r169, [%rd3+-52]; + ld.local.u8 %r170, [%rd3+-51]; + prmt.b32 %r171, %r170, %r169, 30212; + ld.local.u8 %r172, [%rd3+-50]; + prmt.b32 %r173, %r172, %r171, 28756; + ld.local.u8 %r174, [%rd3+-49]; + prmt.b32 %r175, %r174, %r173, 1620; + ld.local.u8 %r176, [%rd3+-48]; + ld.local.u8 %r177, [%rd3+-47]; + prmt.b32 %r178, %r177, %r176, 30212; + ld.local.u8 %r179, [%rd3+-46]; + prmt.b32 %r180, %r179, %r178, 28756; + ld.local.u8 %r181, [%rd3+-45]; + prmt.b32 %r182, %r181, %r180, 1620; + ld.local.u8 %r183, [%rd3+-44]; + ld.local.u8 %r184, [%rd3+-43]; + prmt.b32 %r185, %r184, %r183, 30212; + ld.local.u8 %r186, [%rd3+-42]; + prmt.b32 %r187, %r186, %r185, 28756; + ld.local.u8 %r188, [%rd3+-41]; + prmt.b32 %r189, %r188, %r187, 1620; + ld.local.u8 %r190, [%rd3+-40]; + ld.local.u8 %r191, [%rd3+-39]; + prmt.b32 %r192, %r191, %r190, 30212; + ld.local.u8 %r193, [%rd3+-38]; + prmt.b32 %r194, %r193, %r192, 28756; + ld.local.u8 %r195, [%rd3+-37]; + prmt.b32 %r196, %r195, %r194, 1620; + ld.local.u8 %r197, [%rd3+-36]; + ld.local.u8 %r198, [%rd3+-35]; + prmt.b32 %r199, %r198, %r197, 30212; + ld.local.u8 %r200, [%rd3+-34]; + prmt.b32 %r201, %r200, %r199, 28756; + ld.local.u8 %r202, [%rd3+-33]; + prmt.b32 %r203, %r202, %r201, 1620; + ld.local.u8 %r204, [%rd3+-32]; + ld.local.u8 %r205, [%rd3+-31]; + prmt.b32 %r206, %r205, %r204, 30212; + ld.local.u8 %r207, [%rd3+-30]; + prmt.b32 %r208, %r207, %r206, 28756; + ld.local.u8 %r209, [%rd3+-29]; + prmt.b32 %r210, %r209, %r208, 1620; + ld.local.u8 %r211, [%rd3+-28]; + ld.local.u8 %r212, [%rd3+-27]; + prmt.b32 %r213, %r212, %r211, 30212; + ld.local.u8 %r214, [%rd3+-26]; + prmt.b32 %r215, %r214, %r213, 28756; + ld.local.u8 %r216, [%rd3+-25]; + prmt.b32 %r217, %r216, %r215, 1620; + ld.local.u8 %r218, [%rd3+-24]; + ld.local.u8 %r219, [%rd3+-23]; + prmt.b32 %r220, %r219, %r218, 30212; + ld.local.u8 %r221, [%rd3+-22]; + prmt.b32 %r222, %r221, %r220, 
28756; + ld.local.u8 %r223, [%rd3+-21]; + prmt.b32 %r224, %r223, %r222, 1620; + ld.local.u8 %r225, [%rd3+-20]; + ld.local.u8 %r226, [%rd3+-19]; + prmt.b32 %r227, %r226, %r225, 30212; + ld.local.u8 %r228, [%rd3+-18]; + prmt.b32 %r229, %r228, %r227, 28756; + ld.local.u8 %r230, [%rd3+-17]; + prmt.b32 %r231, %r230, %r229, 1620; + ld.local.u8 %r232, [%rd3+-16]; + ld.local.u8 %r233, [%rd3+-15]; + prmt.b32 %r234, %r233, %r232, 30212; + ld.local.u8 %r235, [%rd3+-14]; + prmt.b32 %r236, %r235, %r234, 28756; + ld.local.u8 %r237, [%rd3+-13]; + prmt.b32 %r238, %r237, %r236, 1620; + ld.local.u8 %r239, [%rd3+-12]; + ld.local.u8 %r240, [%rd3+-11]; + prmt.b32 %r241, %r240, %r239, 30212; + ld.local.u8 %r242, [%rd3+-10]; + prmt.b32 %r243, %r242, %r241, 28756; + ld.local.u8 %r244, [%rd3+-9]; + prmt.b32 %r245, %r244, %r243, 1620; + ld.local.u8 %r246, [%rd3+-8]; + ld.local.u8 %r247, [%rd3+-7]; + prmt.b32 %r248, %r247, %r246, 30212; + ld.local.u8 %r249, [%rd3+-6]; + prmt.b32 %r250, %r249, %r248, 28756; + ld.local.u8 %r251, [%rd3+-5]; + prmt.b32 %r252, %r251, %r250, 1620; + ld.local.u8 %r253, [%rd3+-4]; + ld.local.u8 %r254, [%rd3+-3]; + prmt.b32 %r255, %r254, %r253, 30212; + ld.local.u8 %r256, [%rd3+-2]; + prmt.b32 %r257, %r256, %r255, 28756; + ld.local.u8 %r258, [%rd3+-1]; + prmt.b32 %r259, %r258, %r257, 1620; + ld.local.u64 %rd115, [%rd3+-72]; + cvt.u32.u64 %r260, %rd115; + shr.u64 %rd116, %rd115, 32; + cvt.u32.u64 %r261, %rd116; + cvt.u32.u16 %r262, %rs113; + and.b32 %r263, %r262, 255; + ld.local.u32 %r264, [%rd3+-104]; + add.s32 %r265, %r264, %r154; + ld.local.u32 %r266, [%rd3+-88]; + add.s32 %r267, %r265, %r266; + xor.b32 %r268, %r267, %r260; + shf.l.wrap.b32 %r269, %r268, %r268, 16; + add.s32 %r270, %r269, 1779033703; + xor.b32 %r271, %r270, %r266; + shf.l.wrap.b32 %r272, %r271, %r271, 20; + add.s32 %r273, %r267, %r161; + add.s32 %r274, %r273, %r272; + xor.b32 %r275, %r274, %r269; + shf.l.wrap.b32 %r276, %r275, %r275, 24; + add.s32 %r277, %r276, %r270; + xor.b32 %r278, %r277, %r272; + shf.l.wrap.b32 %r279, %r278, %r278, 25; + ld.local.u32 %r280, [%rd3+-100]; + add.s32 %r281, %r280, %r168; + ld.local.u32 %r282, [%rd3+-84]; + add.s32 %r283, %r281, %r282; + xor.b32 %r284, %r283, %r261; + shf.l.wrap.b32 %r285, %r284, %r284, 16; + add.s32 %r286, %r285, -1150833019; + xor.b32 %r287, %r286, %r282; + shf.l.wrap.b32 %r288, %r287, %r287, 20; + add.s32 %r289, %r283, %r175; + add.s32 %r290, %r289, %r288; + xor.b32 %r291, %r290, %r285; + shf.l.wrap.b32 %r292, %r291, %r291, 24; + add.s32 %r293, %r292, %r286; + xor.b32 %r294, %r293, %r288; + shf.l.wrap.b32 %r295, %r294, %r294, 25; + ld.local.u32 %r296, [%rd3+-96]; + add.s32 %r297, %r296, %r182; + ld.local.u32 %r298, [%rd3+-80]; + add.s32 %r299, %r297, %r298; + shr.u32 %r300, %r299, 16; + shl.b32 %r301, %r299, 16; + xor.b32 %r302, %r301, 4194304; + or.b32 %r303, %r302, %r300; + add.s32 %r304, %r303, 1013904242; + xor.b32 %r305, %r304, %r298; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r299, %r189; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + ld.local.u32 %r314, [%rd3+-92]; + add.s32 %r315, %r314, %r196; + ld.local.u32 %r316, [%rd3+-76]; + add.s32 %r317, %r315, %r316; + xor.b32 %r318, %r317, %r263; + shr.u32 %r319, %r317, 16; + shl.b32 %r320, %r318, 16; + or.b32 %r321, %r320, %r319; + add.s32 %r322, %r321, -1521486534; + xor.b32 %r323, %r322, %r316; shf.l.wrap.b32 %r324, %r323, %r323, 20; - 
add.s32 %r325, %r319, %r69; + add.s32 %r325, %r317, %r203; add.s32 %r326, %r325, %r324; xor.b32 %r327, %r326, %r321; shf.l.wrap.b32 %r328, %r327, %r327, 24; add.s32 %r329, %r328, %r322; xor.b32 %r330, %r329, %r324; shf.l.wrap.b32 %r331, %r330, %r330, 25; - add.s32 %r332, %r275, %r93; - add.s32 %r333, %r332, %r284; - xor.b32 %r334, %r333, %r244; + add.s32 %r332, %r274, %r210; + add.s32 %r333, %r332, %r295; + xor.b32 %r334, %r333, %r328; shf.l.wrap.b32 %r335, %r334, %r334, 16; - add.s32 %r336, %r335, %r259; - xor.b32 %r337, %r336, %r275; + add.s32 %r336, %r335, %r311; + xor.b32 %r337, %r336, %r295; shf.l.wrap.b32 %r338, %r337, %r337, 20; - add.s32 %r339, %r333, %r161; + add.s32 %r339, %r333, %r217; add.s32 %r340, %r339, %r338; xor.b32 %r341, %r340, %r335; shf.l.wrap.b32 %r342, %r341, %r341, 24; add.s32 %r343, %r342, %r336; xor.b32 %r344, %r343, %r338; shf.l.wrap.b32 %r345, %r344, %r344, 25; - add.s32 %r346, %r298, %r77; - add.s32 %r347, %r346, %r317; - xor.b32 %r348, %r347, %r342; + add.s32 %r346, %r290, %r224; + add.s32 %r347, %r346, %r313; + xor.b32 %r348, %r347, %r276; shf.l.wrap.b32 %r349, %r348, %r348, 16; add.s32 %r350, %r349, %r329; - xor.b32 %r351, %r350, %r317; + xor.b32 %r351, %r350, %r313; shf.l.wrap.b32 %r352, %r351, %r351, 20; - add.s32 %r353, %r347, %r145; + add.s32 %r353, %r347, %r231; add.s32 %r354, %r353, %r352; xor.b32 %r355, %r354, %r349; shf.l.wrap.b32 %r356, %r355, %r355, 24; add.s32 %r357, %r356, %r350; xor.b32 %r358, %r357, %r352; shf.l.wrap.b32 %r359, %r358, %r358, 25; - add.s32 %r360, %r312, %r153; + add.s32 %r360, %r308, %r238; add.s32 %r361, %r360, %r331; - xor.b32 %r362, %r361, %r300; + xor.b32 %r362, %r361, %r292; shf.l.wrap.b32 %r363, %r362, %r362, 16; - add.s32 %r364, %r363, %r343; + add.s32 %r364, %r363, %r277; xor.b32 %r365, %r364, %r331; shf.l.wrap.b32 %r366, %r365, %r365, 20; - add.s32 %r367, %r361, %r101; + add.s32 %r367, %r361, %r245; add.s32 %r368, %r367, %r366; xor.b32 %r369, %r368, %r363; shf.l.wrap.b32 %r370, %r369, %r369, 24; add.s32 %r371, %r370, %r364; xor.b32 %r372, %r371, %r366; shf.l.wrap.b32 %r373, %r372, %r372, 25; - add.s32 %r374, %r326, %r129; - add.s32 %r375, %r374, %r345; - xor.b32 %r376, %r375, %r314; + add.s32 %r374, %r326, %r252; + add.s32 %r375, %r374, %r279; + xor.b32 %r376, %r375, %r310; shf.l.wrap.b32 %r377, %r376, %r376, 16; - add.s32 %r378, %r377, %r301; - xor.b32 %r379, %r378, %r345; + add.s32 %r378, %r377, %r293; + xor.b32 %r379, %r378, %r279; shf.l.wrap.b32 %r380, %r379, %r379, 20; - add.s32 %r381, %r375, %r169; + add.s32 %r381, %r375, %r259; add.s32 %r382, %r381, %r380; xor.b32 %r383, %r382, %r377; shf.l.wrap.b32 %r384, %r383, %r383, 24; add.s32 %r385, %r384, %r378; xor.b32 %r386, %r385, %r380; shf.l.wrap.b32 %r387, %r386, %r386, 25; - add.s32 %r388, %r340, %r177; - add.s32 %r389, %r388, %r303; - xor.b32 %r390, %r389, %r328; + add.s32 %r388, %r340, %r168; + add.s32 %r389, %r388, %r387; + xor.b32 %r390, %r389, %r356; shf.l.wrap.b32 %r391, %r390, %r390, 16; - add.s32 %r392, %r391, %r315; - xor.b32 %r393, %r392, %r303; + add.s32 %r392, %r391, %r371; + xor.b32 %r393, %r392, %r387; shf.l.wrap.b32 %r394, %r393, %r393, 20; - add.s32 %r395, %r389, %r121; + add.s32 %r395, %r389, %r196; add.s32 %r396, %r395, %r394; xor.b32 %r397, %r396, %r391; shf.l.wrap.b32 %r398, %r397, %r397, 24; add.s32 %r399, %r398, %r392; xor.b32 %r400, %r399, %r394; shf.l.wrap.b32 %r401, %r400, %r400, 25; - add.s32 %r402, %r354, %r89; - add.s32 %r403, %r402, %r401; + add.s32 %r402, %r354, %r175; + add.s32 %r403, %r402, %r345; xor.b32 %r404, %r403, %r370; 
shf.l.wrap.b32 %r405, %r404, %r404, 16; add.s32 %r406, %r405, %r385; - xor.b32 %r407, %r406, %r401; + xor.b32 %r407, %r406, %r345; shf.l.wrap.b32 %r408, %r407, %r407, 20; - add.s32 %r409, %r403, %r93; + add.s32 %r409, %r403, %r224; add.s32 %r410, %r409, %r408; xor.b32 %r411, %r410, %r405; shf.l.wrap.b32 %r412, %r411, %r411, 24; add.s32 %r413, %r412, %r406; xor.b32 %r414, %r413, %r408; shf.l.wrap.b32 %r415, %r414, %r414, 25; - add.s32 %r416, %r368, %r137; + add.s32 %r416, %r368, %r203; add.s32 %r417, %r416, %r359; xor.b32 %r418, %r417, %r384; shf.l.wrap.b32 %r419, %r418, %r418, 16; - add.s32 %r420, %r419, %r399; + add.s32 %r420, %r419, %r343; xor.b32 %r421, %r420, %r359; shf.l.wrap.b32 %r422, %r421, %r421, 20; - add.s32 %r423, %r417, %r153; + add.s32 %r423, %r417, %r154; add.s32 %r424, %r423, %r422; xor.b32 %r425, %r424, %r419; shf.l.wrap.b32 %r426, %r425, %r425, 24; add.s32 %r427, %r426, %r420; xor.b32 %r428, %r427, %r422; shf.l.wrap.b32 %r429, %r428, %r428, 25; - add.s32 %r430, %r382, %r161; + add.s32 %r430, %r382, %r182; add.s32 %r431, %r430, %r373; - xor.b32 %r432, %r431, %r398; + xor.b32 %r432, %r431, %r342; shf.l.wrap.b32 %r433, %r432, %r432, 16; add.s32 %r434, %r433, %r357; xor.b32 %r435, %r434, %r373; shf.l.wrap.b32 %r436, %r435, %r435, 20; - add.s32 %r437, %r431, %r81; + add.s32 %r437, %r431, %r245; add.s32 %r438, %r437, %r436; xor.b32 %r439, %r438, %r433; shf.l.wrap.b32 %r440, %r439, %r439, 24; add.s32 %r441, %r440, %r434; xor.b32 %r442, %r441, %r436; shf.l.wrap.b32 %r443, %r442, %r442, 25; - add.s32 %r444, %r396, %r113; - add.s32 %r445, %r444, %r387; - xor.b32 %r446, %r445, %r356; + add.s32 %r444, %r396, %r161; + add.s32 %r445, %r444, %r415; + xor.b32 %r446, %r445, %r440; shf.l.wrap.b32 %r447, %r446, %r446, 16; - add.s32 %r448, %r447, %r371; - xor.b32 %r449, %r448, %r387; + add.s32 %r448, %r447, %r427; + xor.b32 %r449, %r448, %r415; shf.l.wrap.b32 %r450, %r449, %r449, 20; - add.s32 %r451, %r445, %r169; + add.s32 %r451, %r445, %r231; add.s32 %r452, %r451, %r450; xor.b32 %r453, %r452, %r447; shf.l.wrap.b32 %r454, %r453, %r453, 24; add.s32 %r455, %r454, %r448; xor.b32 %r456, %r455, %r450; shf.l.wrap.b32 %r457, %r456, %r456, 25; - add.s32 %r458, %r410, %r105; + add.s32 %r458, %r410, %r238; add.s32 %r459, %r458, %r429; - xor.b32 %r460, %r459, %r454; + xor.b32 %r460, %r459, %r398; shf.l.wrap.b32 %r461, %r460, %r460, 16; add.s32 %r462, %r461, %r441; xor.b32 %r463, %r462, %r429; shf.l.wrap.b32 %r464, %r463, %r463, 20; - add.s32 %r465, %r459, %r101; + add.s32 %r465, %r459, %r189; add.s32 %r466, %r465, %r464; xor.b32 %r467, %r466, %r461; shf.l.wrap.b32 %r468, %r467, %r467, 24; add.s32 %r469, %r468, %r462; xor.b32 %r470, %r469, %r464; shf.l.wrap.b32 %r471, %r470, %r470, 25; - add.s32 %r472, %r424, %r129; + add.s32 %r472, %r424, %r217; add.s32 %r473, %r472, %r443; xor.b32 %r474, %r473, %r412; shf.l.wrap.b32 %r475, %r474, %r474, 16; - add.s32 %r476, %r475, %r455; + add.s32 %r476, %r475, %r399; xor.b32 %r477, %r476, %r443; shf.l.wrap.b32 %r478, %r477, %r477, 20; - add.s32 %r479, %r473, %r69; + add.s32 %r479, %r473, %r252; add.s32 %r480, %r479, %r478; xor.b32 %r481, %r480, %r475; shf.l.wrap.b32 %r482, %r481, %r481, 24; add.s32 %r483, %r482, %r476; xor.b32 %r484, %r483, %r478; shf.l.wrap.b32 %r485, %r484, %r484, 25; - add.s32 %r486, %r438, %r145; - add.s32 %r487, %r486, %r457; + add.s32 %r486, %r438, %r259; + add.s32 %r487, %r486, %r401; xor.b32 %r488, %r487, %r426; shf.l.wrap.b32 %r489, %r488, %r488, 16; add.s32 %r490, %r489, %r413; - xor.b32 %r491, %r490, %r457; + xor.b32 %r491, %r490, %r401; 
shf.l.wrap.b32 %r492, %r491, %r491, 20; - add.s32 %r493, %r487, %r177; + add.s32 %r493, %r487, %r210; add.s32 %r494, %r493, %r492; xor.b32 %r495, %r494, %r489; shf.l.wrap.b32 %r496, %r495, %r495, 24; add.s32 %r497, %r496, %r490; xor.b32 %r498, %r497, %r492; shf.l.wrap.b32 %r499, %r498, %r498, 25; - add.s32 %r500, %r452, %r121; - add.s32 %r501, %r500, %r415; - xor.b32 %r502, %r501, %r440; + add.s32 %r500, %r452, %r175; + add.s32 %r501, %r500, %r499; + xor.b32 %r502, %r501, %r468; shf.l.wrap.b32 %r503, %r502, %r502, 16; - add.s32 %r504, %r503, %r427; - xor.b32 %r505, %r504, %r415; + add.s32 %r504, %r503, %r483; + xor.b32 %r505, %r504, %r499; shf.l.wrap.b32 %r506, %r505, %r505, 20; - add.s32 %r507, %r501, %r77; + add.s32 %r507, %r501, %r182; add.s32 %r508, %r507, %r506; xor.b32 %r509, %r508, %r503; shf.l.wrap.b32 %r510, %r509, %r509, 24; add.s32 %r511, %r510, %r504; xor.b32 %r512, %r511, %r506; shf.l.wrap.b32 %r513, %r512, %r512, 25; - add.s32 %r514, %r466, %r137; - add.s32 %r515, %r514, %r513; + add.s32 %r514, %r466, %r224; + add.s32 %r515, %r514, %r457; xor.b32 %r516, %r515, %r482; shf.l.wrap.b32 %r517, %r516, %r516, 16; add.s32 %r518, %r517, %r497; - xor.b32 %r519, %r518, %r513; + xor.b32 %r519, %r518, %r457; shf.l.wrap.b32 %r520, %r519, %r519, 20; - add.s32 %r521, %r515, %r113; + add.s32 %r521, %r515, %r238; add.s32 %r522, %r521, %r520; xor.b32 %r523, %r522, %r517; shf.l.wrap.b32 %r524, %r523, %r523, 24; add.s32 %r525, %r524, %r518; xor.b32 %r526, %r525, %r520; shf.l.wrap.b32 %r527, %r526, %r526, 25; - add.s32 %r528, %r480, %r153; + add.s32 %r528, %r480, %r245; add.s32 %r529, %r528, %r471; xor.b32 %r530, %r529, %r496; shf.l.wrap.b32 %r531, %r530, %r530, 16; - add.s32 %r532, %r531, %r511; + add.s32 %r532, %r531, %r455; xor.b32 %r533, %r532, %r471; shf.l.wrap.b32 %r534, %r533, %r533, 20; - add.s32 %r535, %r529, %r129; + add.s32 %r535, %r529, %r168; add.s32 %r536, %r535, %r534; xor.b32 %r537, %r536, %r531; shf.l.wrap.b32 %r538, %r537, %r537, 24; add.s32 %r539, %r538, %r532; xor.b32 %r540, %r539, %r534; shf.l.wrap.b32 %r541, %r540, %r540, 25; - add.s32 %r542, %r494, %r169; + add.s32 %r542, %r494, %r203; add.s32 %r543, %r542, %r485; - xor.b32 %r544, %r543, %r510; + xor.b32 %r544, %r543, %r454; shf.l.wrap.b32 %r545, %r544, %r544, 16; add.s32 %r546, %r545, %r469; xor.b32 %r547, %r546, %r485; shf.l.wrap.b32 %r548, %r547, %r547, 20; - add.s32 %r549, %r543, %r89; + add.s32 %r549, %r543, %r252; add.s32 %r550, %r549, %r548; xor.b32 %r551, %r550, %r545; shf.l.wrap.b32 %r552, %r551, %r551, 24; add.s32 %r553, %r552, %r546; xor.b32 %r554, %r553, %r548; shf.l.wrap.b32 %r555, %r554, %r554, 25; - add.s32 %r556, %r508, %r161; - add.s32 %r557, %r556, %r499; - xor.b32 %r558, %r557, %r468; + add.s32 %r556, %r508, %r196; + add.s32 %r557, %r556, %r527; + xor.b32 %r558, %r557, %r552; shf.l.wrap.b32 %r559, %r558, %r558, 16; - add.s32 %r560, %r559, %r483; - xor.b32 %r561, %r560, %r499; + add.s32 %r560, %r559, %r539; + xor.b32 %r561, %r560, %r527; shf.l.wrap.b32 %r562, %r561, %r561, 20; - add.s32 %r563, %r557, %r177; + add.s32 %r563, %r557, %r189; add.s32 %r564, %r563, %r562; xor.b32 %r565, %r564, %r559; shf.l.wrap.b32 %r566, %r565, %r565, 24; add.s32 %r567, %r566, %r560; xor.b32 %r568, %r567, %r562; shf.l.wrap.b32 %r569, %r568, %r568, 25; - add.s32 %r570, %r522, %r93; + add.s32 %r570, %r522, %r217; add.s32 %r571, %r570, %r541; - xor.b32 %r572, %r571, %r566; + xor.b32 %r572, %r571, %r510; shf.l.wrap.b32 %r573, %r572, %r572, 16; add.s32 %r574, %r573, %r553; xor.b32 %r575, %r574, %r541; shf.l.wrap.b32 %r576, %r575, 
%r575, 20; - add.s32 %r577, %r571, %r69; + add.s32 %r577, %r571, %r154; add.s32 %r578, %r577, %r576; xor.b32 %r579, %r578, %r573; shf.l.wrap.b32 %r580, %r579, %r579, 24; add.s32 %r581, %r580, %r574; xor.b32 %r582, %r581, %r576; shf.l.wrap.b32 %r583, %r582, %r582, 25; - add.s32 %r584, %r536, %r145; + add.s32 %r584, %r536, %r231; add.s32 %r585, %r584, %r555; xor.b32 %r586, %r585, %r524; shf.l.wrap.b32 %r587, %r586, %r586, 16; - add.s32 %r588, %r587, %r567; + add.s32 %r588, %r587, %r511; xor.b32 %r589, %r588, %r555; shf.l.wrap.b32 %r590, %r589, %r589, 20; - add.s32 %r591, %r585, %r81; + add.s32 %r591, %r585, %r259; add.s32 %r592, %r591, %r590; xor.b32 %r593, %r592, %r587; shf.l.wrap.b32 %r594, %r593, %r593, 24; add.s32 %r595, %r594, %r588; xor.b32 %r596, %r595, %r590; shf.l.wrap.b32 %r597, %r596, %r596, 25; - add.s32 %r598, %r550, %r101; - add.s32 %r599, %r598, %r569; + add.s32 %r598, %r550, %r210; + add.s32 %r599, %r598, %r513; xor.b32 %r600, %r599, %r538; shf.l.wrap.b32 %r601, %r600, %r600, 16; add.s32 %r602, %r601, %r525; - xor.b32 %r603, %r602, %r569; + xor.b32 %r603, %r602, %r513; shf.l.wrap.b32 %r604, %r603, %r603, 20; - add.s32 %r605, %r599, %r121; + add.s32 %r605, %r599, %r161; add.s32 %r606, %r605, %r604; xor.b32 %r607, %r606, %r601; shf.l.wrap.b32 %r608, %r607, %r607, 24; add.s32 %r609, %r608, %r602; xor.b32 %r610, %r609, %r604; shf.l.wrap.b32 %r611, %r610, %r610, 25; - add.s32 %r612, %r564, %r77; - add.s32 %r613, %r612, %r527; - xor.b32 %r614, %r613, %r552; + add.s32 %r612, %r564, %r224; + add.s32 %r613, %r612, %r611; + xor.b32 %r614, %r613, %r580; shf.l.wrap.b32 %r615, %r614, %r614, 16; - add.s32 %r616, %r615, %r539; - xor.b32 %r617, %r616, %r527; + add.s32 %r616, %r615, %r595; + xor.b32 %r617, %r616, %r611; shf.l.wrap.b32 %r618, %r617, %r617, 20; - add.s32 %r619, %r613, %r105; + add.s32 %r619, %r613, %r203; add.s32 %r620, %r619, %r618; xor.b32 %r621, %r620, %r615; shf.l.wrap.b32 %r622, %r621, %r621, 24; add.s32 %r623, %r622, %r616; xor.b32 %r624, %r623, %r618; shf.l.wrap.b32 %r625, %r624, %r624, 25; - add.s32 %r626, %r578, %r153; - add.s32 %r627, %r626, %r625; + add.s32 %r626, %r578, %r238; + add.s32 %r627, %r626, %r569; xor.b32 %r628, %r627, %r594; shf.l.wrap.b32 %r629, %r628, %r628, 16; add.s32 %r630, %r629, %r609; - xor.b32 %r631, %r630, %r625; + xor.b32 %r631, %r630, %r569; shf.l.wrap.b32 %r632, %r631, %r631, 20; - add.s32 %r633, %r627, %r161; + add.s32 %r633, %r627, %r217; add.s32 %r634, %r633, %r632; xor.b32 %r635, %r634, %r629; shf.l.wrap.b32 %r636, %r635, %r635, 24; add.s32 %r637, %r636, %r630; xor.b32 %r638, %r637, %r632; shf.l.wrap.b32 %r639, %r638, %r638, 25; - add.s32 %r640, %r592, %r129; + add.s32 %r640, %r592, %r252; add.s32 %r641, %r640, %r583; xor.b32 %r642, %r641, %r608; shf.l.wrap.b32 %r643, %r642, %r642, 16; - add.s32 %r644, %r643, %r623; + add.s32 %r644, %r643, %r567; xor.b32 %r645, %r644, %r583; shf.l.wrap.b32 %r646, %r645, %r645, 20; - add.s32 %r647, %r641, %r145; + add.s32 %r647, %r641, %r175; add.s32 %r648, %r647, %r646; xor.b32 %r649, %r648, %r643; shf.l.wrap.b32 %r650, %r649, %r649, 24; add.s32 %r651, %r650, %r644; xor.b32 %r652, %r651, %r646; shf.l.wrap.b32 %r653, %r652, %r652, 25; - add.s32 %r654, %r606, %r177; + add.s32 %r654, %r606, %r245; add.s32 %r655, %r654, %r597; - xor.b32 %r656, %r655, %r622; + xor.b32 %r656, %r655, %r566; shf.l.wrap.b32 %r657, %r656, %r656, 16; add.s32 %r658, %r657, %r581; xor.b32 %r659, %r658, %r597; shf.l.wrap.b32 %r660, %r659, %r659, 20; - add.s32 %r661, %r655, %r137; + add.s32 %r661, %r655, %r259; add.s32 %r662, %r661, 
%r660; xor.b32 %r663, %r662, %r657; shf.l.wrap.b32 %r664, %r663, %r663, 24; add.s32 %r665, %r664, %r658; xor.b32 %r666, %r665, %r660; shf.l.wrap.b32 %r667, %r666, %r666, 25; - add.s32 %r668, %r620, %r169; - add.s32 %r669, %r668, %r611; - xor.b32 %r670, %r669, %r580; + add.s32 %r668, %r620, %r182; + add.s32 %r669, %r668, %r639; + xor.b32 %r670, %r669, %r664; shf.l.wrap.b32 %r671, %r670, %r670, 16; - add.s32 %r672, %r671, %r595; - xor.b32 %r673, %r672, %r611; + add.s32 %r672, %r671, %r651; + xor.b32 %r673, %r672, %r639; shf.l.wrap.b32 %r674, %r673, %r673, 20; - add.s32 %r675, %r669, %r121; + add.s32 %r675, %r669, %r154; add.s32 %r676, %r675, %r674; xor.b32 %r677, %r676, %r671; shf.l.wrap.b32 %r678, %r677, %r677, 24; add.s32 %r679, %r678, %r672; xor.b32 %r680, %r679, %r674; shf.l.wrap.b32 %r681, %r680, %r680, 25; - add.s32 %r682, %r634, %r113; + add.s32 %r682, %r634, %r231; add.s32 %r683, %r682, %r653; - xor.b32 %r684, %r683, %r678; + xor.b32 %r684, %r683, %r622; shf.l.wrap.b32 %r685, %r684, %r684, 16; add.s32 %r686, %r685, %r665; xor.b32 %r687, %r686, %r653; shf.l.wrap.b32 %r688, %r687, %r687, 20; - add.s32 %r689, %r683, %r81; + add.s32 %r689, %r683, %r168; add.s32 %r690, %r689, %r688; xor.b32 %r691, %r690, %r685; shf.l.wrap.b32 %r692, %r691, %r691, 24; add.s32 %r693, %r692, %r686; xor.b32 %r694, %r693, %r688; shf.l.wrap.b32 %r695, %r694, %r694, 25; - add.s32 %r696, %r648, %r101; + add.s32 %r696, %r648, %r189; add.s32 %r697, %r696, %r667; xor.b32 %r698, %r697, %r636; shf.l.wrap.b32 %r699, %r698, %r698, 16; - add.s32 %r700, %r699, %r679; + add.s32 %r700, %r699, %r623; xor.b32 %r701, %r700, %r667; shf.l.wrap.b32 %r702, %r701, %r701, 20; - add.s32 %r703, %r697, %r89; + add.s32 %r703, %r697, %r210; add.s32 %r704, %r703, %r702; xor.b32 %r705, %r704, %r699; shf.l.wrap.b32 %r706, %r705, %r705, 24; add.s32 %r707, %r706, %r700; xor.b32 %r708, %r707, %r702; shf.l.wrap.b32 %r709, %r708, %r708, 25; - add.s32 %r710, %r662, %r69; - add.s32 %r711, %r710, %r681; + add.s32 %r710, %r662, %r161; + add.s32 %r711, %r710, %r625; xor.b32 %r712, %r711, %r650; shf.l.wrap.b32 %r713, %r712, %r712, 16; add.s32 %r714, %r713, %r637; - xor.b32 %r715, %r714, %r681; + xor.b32 %r715, %r714, %r625; shf.l.wrap.b32 %r716, %r715, %r715, 20; - add.s32 %r717, %r711, %r77; + add.s32 %r717, %r711, %r196; add.s32 %r718, %r717, %r716; xor.b32 %r719, %r718, %r713; shf.l.wrap.b32 %r720, %r719, %r719, 24; add.s32 %r721, %r720, %r714; xor.b32 %r722, %r721, %r716; shf.l.wrap.b32 %r723, %r722, %r722, 25; - add.s32 %r724, %r676, %r105; - add.s32 %r725, %r724, %r639; - xor.b32 %r726, %r725, %r664; + add.s32 %r724, %r676, %r238; + add.s32 %r725, %r724, %r723; + xor.b32 %r726, %r725, %r692; shf.l.wrap.b32 %r727, %r726, %r726, 16; - add.s32 %r728, %r727, %r651; - xor.b32 %r729, %r728, %r639; + add.s32 %r728, %r727, %r707; + xor.b32 %r729, %r728, %r723; shf.l.wrap.b32 %r730, %r729, %r729, 20; - add.s32 %r731, %r725, %r93; + add.s32 %r731, %r725, %r245; add.s32 %r732, %r731, %r730; xor.b32 %r733, %r732, %r727; shf.l.wrap.b32 %r734, %r733, %r733, 24; add.s32 %r735, %r734, %r728; xor.b32 %r736, %r735, %r730; shf.l.wrap.b32 %r737, %r736, %r736, 25; - add.s32 %r738, %r690, %r129; - add.s32 %r739, %r738, %r737; + add.s32 %r738, %r690, %r217; + add.s32 %r739, %r738, %r681; xor.b32 %r740, %r739, %r706; shf.l.wrap.b32 %r741, %r740, %r740, 16; add.s32 %r742, %r741, %r721; - xor.b32 %r743, %r742, %r737; + xor.b32 %r743, %r742, %r681; shf.l.wrap.b32 %r744, %r743, %r743, 20; - add.s32 %r745, %r739, %r169; + add.s32 %r745, %r739, %r231; add.s32 %r746, %r745, 
%r744; xor.b32 %r747, %r746, %r741; shf.l.wrap.b32 %r748, %r747, %r747, 24; add.s32 %r749, %r748, %r742; xor.b32 %r750, %r749, %r744; shf.l.wrap.b32 %r751, %r750, %r750, 25; - add.s32 %r752, %r704, %r145; + add.s32 %r752, %r704, %r259; add.s32 %r753, %r752, %r695; xor.b32 %r754, %r753, %r720; shf.l.wrap.b32 %r755, %r754, %r754, 16; - add.s32 %r756, %r755, %r735; + add.s32 %r756, %r755, %r679; xor.b32 %r757, %r756, %r695; shf.l.wrap.b32 %r758, %r757, %r757, 20; - add.s32 %r759, %r753, %r101; + add.s32 %r759, %r753, %r224; add.s32 %r760, %r759, %r758; xor.b32 %r761, %r760, %r755; shf.l.wrap.b32 %r762, %r761, %r761, 24; add.s32 %r763, %r762, %r756; xor.b32 %r764, %r763, %r758; shf.l.wrap.b32 %r765, %r764, %r764, 25; - add.s32 %r766, %r718, %r121; + add.s32 %r766, %r718, %r252; add.s32 %r767, %r766, %r709; - xor.b32 %r768, %r767, %r734; + xor.b32 %r768, %r767, %r678; shf.l.wrap.b32 %r769, %r768, %r768, 16; add.s32 %r770, %r769, %r693; xor.b32 %r771, %r770, %r709; shf.l.wrap.b32 %r772, %r771, %r771, 20; - add.s32 %r773, %r767, %r153; + add.s32 %r773, %r767, %r210; add.s32 %r774, %r773, %r772; xor.b32 %r775, %r774, %r769; shf.l.wrap.b32 %r776, %r775, %r775, 24; add.s32 %r777, %r776, %r770; xor.b32 %r778, %r777, %r772; shf.l.wrap.b32 %r779, %r778, %r778, 25; - add.s32 %r780, %r732, %r177; - add.s32 %r781, %r780, %r723; - xor.b32 %r782, %r781, %r692; + add.s32 %r780, %r732, %r203; + add.s32 %r781, %r780, %r751; + xor.b32 %r782, %r781, %r776; shf.l.wrap.b32 %r783, %r782, %r782, 16; - add.s32 %r784, %r783, %r707; - xor.b32 %r785, %r784, %r723; + add.s32 %r784, %r783, %r763; + xor.b32 %r785, %r784, %r751; shf.l.wrap.b32 %r786, %r785, %r785, 20; - add.s32 %r787, %r781, %r77; + add.s32 %r787, %r781, %r168; add.s32 %r788, %r787, %r786; xor.b32 %r789, %r788, %r783; shf.l.wrap.b32 %r790, %r789, %r789, 24; add.s32 %r791, %r790, %r784; xor.b32 %r792, %r791, %r786; shf.l.wrap.b32 %r793, %r792, %r792, 25; - add.s32 %r794, %r746, %r161; + add.s32 %r794, %r746, %r189; add.s32 %r795, %r794, %r765; - xor.b32 %r796, %r795, %r790; + xor.b32 %r796, %r795, %r734; shf.l.wrap.b32 %r797, %r796, %r796, 16; add.s32 %r798, %r797, %r777; xor.b32 %r799, %r798, %r765; shf.l.wrap.b32 %r800, %r799, %r799, 20; - add.s32 %r801, %r795, %r89; + add.s32 %r801, %r795, %r175; add.s32 %r802, %r801, %r800; xor.b32 %r803, %r802, %r797; shf.l.wrap.b32 %r804, %r803, %r803, 24; add.s32 %r805, %r804, %r798; xor.b32 %r806, %r805, %r800; shf.l.wrap.b32 %r807, %r806, %r806, 25; - add.s32 %r808, %r760, %r69; + add.s32 %r808, %r760, %r154; add.s32 %r809, %r808, %r779; xor.b32 %r810, %r809, %r748; shf.l.wrap.b32 %r811, %r810, %r810, 16; - add.s32 %r812, %r811, %r791; + add.s32 %r812, %r811, %r735; xor.b32 %r813, %r812, %r779; shf.l.wrap.b32 %r814, %r813, %r813, 20; - add.s32 %r815, %r809, %r137; + add.s32 %r815, %r809, %r161; add.s32 %r816, %r815, %r814; xor.b32 %r817, %r816, %r811; shf.l.wrap.b32 %r818, %r817, %r817, 24; add.s32 %r819, %r818, %r812; xor.b32 %r820, %r819, %r814; shf.l.wrap.b32 %r821, %r820, %r820, 25; - add.s32 %r822, %r774, %r81; - add.s32 %r823, %r822, %r793; + add.s32 %r822, %r774, %r196; + add.s32 %r823, %r822, %r737; xor.b32 %r824, %r823, %r762; shf.l.wrap.b32 %r825, %r824, %r824, 16; add.s32 %r826, %r825, %r749; - xor.b32 %r827, %r826, %r793; + xor.b32 %r827, %r826, %r737; shf.l.wrap.b32 %r828, %r827, %r827, 20; - add.s32 %r829, %r823, %r105; + add.s32 %r829, %r823, %r182; add.s32 %r830, %r829, %r828; xor.b32 %r831, %r830, %r825; shf.l.wrap.b32 %r832, %r831, %r831, 24; add.s32 %r833, %r832, %r826; xor.b32 %r834, %r833, 
%r828; shf.l.wrap.b32 %r835, %r834, %r834, 25; - add.s32 %r836, %r788, %r93; - add.s32 %r837, %r836, %r751; - xor.b32 %r838, %r837, %r776; + add.s32 %r836, %r788, %r217; + add.s32 %r837, %r836, %r835; + xor.b32 %r838, %r837, %r804; shf.l.wrap.b32 %r839, %r838, %r838, 16; - add.s32 %r840, %r839, %r763; - xor.b32 %r841, %r840, %r751; + add.s32 %r840, %r839, %r819; + xor.b32 %r841, %r840, %r835; shf.l.wrap.b32 %r842, %r841, %r841, 20; - add.s32 %r843, %r837, %r113; + add.s32 %r843, %r837, %r252; add.s32 %r844, %r843, %r842; xor.b32 %r845, %r844, %r839; shf.l.wrap.b32 %r846, %r845, %r845, 24; add.s32 %r847, %r846, %r840; xor.b32 %r848, %r847, %r842; shf.l.wrap.b32 %r849, %r848, %r848, 25; - add.s32 %r850, %r802, %r145; - add.s32 %r851, %r850, %r849; + add.s32 %r850, %r802, %r231; + add.s32 %r851, %r850, %r793; xor.b32 %r852, %r851, %r818; shf.l.wrap.b32 %r853, %r852, %r852, 16; add.s32 %r854, %r853, %r833; - xor.b32 %r855, %r854, %r849; + xor.b32 %r855, %r854, %r793; shf.l.wrap.b32 %r856, %r855, %r855, 20; - add.s32 %r857, %r851, %r177; + add.s32 %r857, %r851, %r189; add.s32 %r858, %r857, %r856; xor.b32 %r859, %r858, %r853; shf.l.wrap.b32 %r860, %r859, %r859, 24; add.s32 %r861, %r860, %r854; xor.b32 %r862, %r861, %r856; shf.l.wrap.b32 %r863, %r862, %r862, 25; - add.s32 %r864, %r816, %r101; + add.s32 %r864, %r816, %r210; add.s32 %r865, %r864, %r807; xor.b32 %r866, %r865, %r832; shf.l.wrap.b32 %r867, %r866, %r866, 16; - add.s32 %r868, %r867, %r847; + add.s32 %r868, %r867, %r791; xor.b32 %r869, %r868, %r807; shf.l.wrap.b32 %r870, %r869, %r869, 20; - add.s32 %r871, %r865, %r69; + add.s32 %r871, %r865, %r238; add.s32 %r872, %r871, %r870; xor.b32 %r873, %r872, %r867; shf.l.wrap.b32 %r874, %r873, %r873, 24; add.s32 %r875, %r874, %r868; xor.b32 %r876, %r875, %r870; shf.l.wrap.b32 %r877, %r876, %r876, 25; - add.s32 %r878, %r830, %r77; + add.s32 %r878, %r830, %r259; add.s32 %r879, %r878, %r821; - xor.b32 %r880, %r879, %r846; + xor.b32 %r880, %r879, %r790; shf.l.wrap.b32 %r881, %r880, %r880, 16; add.s32 %r882, %r881, %r805; xor.b32 %r883, %r882, %r821; shf.l.wrap.b32 %r884, %r883, %r883, 20; - add.s32 %r885, %r879, %r129; + add.s32 %r885, %r879, %r161; add.s32 %r886, %r885, %r884; xor.b32 %r887, %r886, %r881; shf.l.wrap.b32 %r888, %r887, %r887, 24; add.s32 %r889, %r888, %r882; xor.b32 %r890, %r889, %r884; shf.l.wrap.b32 %r891, %r890, %r890, 25; - add.s32 %r892, %r844, %r121; - add.s32 %r893, %r892, %r835; - xor.b32 %r894, %r893, %r804; + add.s32 %r892, %r844, %r245; + add.s32 %r893, %r892, %r863; + xor.b32 %r894, %r893, %r888; shf.l.wrap.b32 %r895, %r894, %r894, 16; - add.s32 %r896, %r895, %r819; - xor.b32 %r897, %r896, %r835; + add.s32 %r896, %r895, %r875; + xor.b32 %r897, %r896, %r863; shf.l.wrap.b32 %r898, %r897, %r897, 20; - add.s32 %r899, %r893, %r105; + add.s32 %r899, %r893, %r175; add.s32 %r900, %r899, %r898; xor.b32 %r901, %r900, %r895; shf.l.wrap.b32 %r902, %r901, %r901, 24; add.s32 %r903, %r902, %r896; xor.b32 %r904, %r903, %r898; shf.l.wrap.b32 %r905, %r904, %r904, 25; - add.s32 %r906, %r858, %r169; + add.s32 %r906, %r858, %r154; add.s32 %r907, %r906, %r877; - xor.b32 %r908, %r907, %r902; + xor.b32 %r908, %r907, %r846; shf.l.wrap.b32 %r909, %r908, %r908, 16; add.s32 %r910, %r909, %r889; xor.b32 %r911, %r910, %r877; shf.l.wrap.b32 %r912, %r911, %r911, 20; - add.s32 %r913, %r907, %r137; + add.s32 %r913, %r907, %r224; add.s32 %r914, %r913, %r912; xor.b32 %r915, %r914, %r909; shf.l.wrap.b32 %r916, %r915, %r915, 24; add.s32 %r917, %r916, %r910; xor.b32 %r918, %r917, %r912; shf.l.wrap.b32 %r919, 
%r918, %r918, 25; - add.s32 %r920, %r872, %r81; + add.s32 %r920, %r872, %r168; add.s32 %r921, %r920, %r891; xor.b32 %r922, %r921, %r860; shf.l.wrap.b32 %r923, %r922, %r922, 16; - add.s32 %r924, %r923, %r903; + add.s32 %r924, %r923, %r847; xor.b32 %r925, %r924, %r891; shf.l.wrap.b32 %r926, %r925, %r925, 20; - add.s32 %r927, %r921, %r153; + add.s32 %r927, %r921, %r196; add.s32 %r928, %r927, %r926; xor.b32 %r929, %r928, %r923; shf.l.wrap.b32 %r930, %r929, %r929, 24; add.s32 %r931, %r930, %r924; xor.b32 %r932, %r931, %r926; shf.l.wrap.b32 %r933, %r932, %r932, 25; - add.s32 %r934, %r886, %r89; - add.s32 %r935, %r934, %r905; + add.s32 %r934, %r886, %r182; + add.s32 %r935, %r934, %r849; xor.b32 %r936, %r935, %r874; shf.l.wrap.b32 %r937, %r936, %r936, 16; add.s32 %r938, %r937, %r861; - xor.b32 %r939, %r938, %r905; + xor.b32 %r939, %r938, %r849; shf.l.wrap.b32 %r940, %r939, %r939, 20; - add.s32 %r941, %r935, %r93; + add.s32 %r941, %r935, %r203; add.s32 %r942, %r941, %r940; xor.b32 %r943, %r942, %r937; shf.l.wrap.b32 %r944, %r943, %r943, 24; add.s32 %r945, %r944, %r938; xor.b32 %r946, %r945, %r940; shf.l.wrap.b32 %r947, %r946, %r946, 25; - add.s32 %r948, %r900, %r113; - add.s32 %r949, %r948, %r863; - xor.b32 %r950, %r949, %r888; + add.s32 %r948, %r900, %r231; + add.s32 %r949, %r948, %r947; + xor.b32 %r950, %r949, %r916; shf.l.wrap.b32 %r951, %r950, %r950, 16; - add.s32 %r952, %r951, %r875; - xor.b32 %r953, %r952, %r863; + add.s32 %r952, %r951, %r931; + xor.b32 %r953, %r952, %r947; shf.l.wrap.b32 %r954, %r953, %r953, 20; - add.s32 %r955, %r949, %r161; + add.s32 %r955, %r949, %r259; add.s32 %r956, %r955, %r954; xor.b32 %r957, %r956, %r951; shf.l.wrap.b32 %r958, %r957, %r957, 24; add.s32 %r959, %r958, %r952; xor.b32 %r960, %r959, %r954; shf.l.wrap.b32 %r961, %r960, %r960, 25; - xor.b32 %r1, %r945, %r914; - xor.b32 %r2, %r959, %r928; - st.local.v2.u32 [%rd3+32], {%r1, %r2}; - xor.b32 %r3, %r917, %r942; - xor.b32 %r4, %r956, %r931; - st.local.v2.u32 [%rd3+40], {%r3, %r4}; - xor.b32 %r5, %r961, %r930; - xor.b32 %r6, %r919, %r944; - st.local.v2.u32 [%rd3+48], {%r5, %r6}; - xor.b32 %r7, %r958, %r933; - xor.b32 %r8, %r947, %r916; - st.local.v2.u32 [%rd3+56], {%r7, %r8}; - st.local.u64 [%rd3+72], %rd131; - st.local.u64 [%rd3+80], %rd8; - add.s16 %rs1, %rs61, 16; - and.b16 %rs70, %rs1, 255; - add.s16 %rs71, %rs62, 1; - st.local.v2.u8 [%rd3+136], {%rs1, %rs71}; - cvt.u32.u16 %r962, %rs71; - cvt.u32.u16 %r963, %rs70; - prmt.b32 %r964, %r962, %r963, 30212; - cvt.u16.u32 %rs72, %r964; - shr.u16 %rs2, %rs72, 8; - mov.b32 {%rs5, %rs6}, %r53; - mov.b32 {%rs3, %rs4}, %r52; - mov.b32 {%rs9, %rs10}, %r27; - mov.b32 {%rs7, %rs8}, %r26; - setp.eq.s16 %p10, %rs2, 0; - selp.u16 %rs73, 1, 0, %p10; - shr.u16 %rs74, %rs3, 8; - shr.u16 %rs75, %rs4, 8; - shr.u16 %rs76, %rs5, 8; - shr.u16 %rs77, %rs6, 8; - shr.u16 %rs78, %rs7, 8; - shr.u16 %rs79, %rs8, 8; - shr.u16 %rs80, %rs9, 8; - shr.u16 %rs81, %rs10, 8; - or.b16 %rs82, %rs73, 10; - cvt.u32.u16 %r965, %rs3; - and.b32 %r966, %r965, 255; - cvt.u32.u16 %r967, %rs74; - prmt.b32 %r968, %r967, %r966, 30212; - cvt.u32.u16 %r969, %rs4; - prmt.b32 %r970, %r969, %r968, 28756; - cvt.u32.u16 %r971, %rs75; - prmt.b32 %r972, %r971, %r970, 1620; - cvt.u32.u16 %r973, %rs5; - and.b32 %r974, %r973, 255; - cvt.u32.u16 %r975, %rs76; - prmt.b32 %r976, %r975, %r974, 30212; - cvt.u32.u16 %r977, %rs6; - prmt.b32 %r978, %r977, %r976, 28756; - cvt.u32.u16 %r979, %rs77; - prmt.b32 %r980, %r979, %r978, 1620; - cvt.u32.u16 %r981, %rs7; - and.b32 %r982, %r981, 255; - cvt.u32.u16 %r983, %rs78; - prmt.b32 
%r984, %r983, %r982, 30212; - cvt.u32.u16 %r985, %rs8; - prmt.b32 %r986, %r985, %r984, 28756; - cvt.u32.u16 %r987, %rs79; - prmt.b32 %r988, %r987, %r986, 1620; - cvt.u32.u16 %r989, %rs9; - and.b32 %r990, %r989, 255; - cvt.u32.u16 %r991, %rs80; - prmt.b32 %r992, %r991, %r990, 30212; - cvt.u32.u16 %r993, %rs10; - prmt.b32 %r994, %r993, %r992, 28756; - cvt.u32.u16 %r995, %rs81; - prmt.b32 %r996, %r995, %r994, 1620; - cvt.u32.u16 %r997, %rs82; - add.s32 %r998, %r5, %r1; - add.s32 %r999, %r998, %r972; - add.s32 %r1000, %r980, %r999; - add.s32 %r1001, %r6, %r2; - add.s32 %r1002, %r1001, %r988; - add.s32 %r1003, %r996, %r1002; - add.s32 %r1004, %r7, %r3; - cvt.u32.u16 %r1005, %rs1; - and.b32 %r1006, %r1005, 255; - xor.b32 %r1007, %r1004, %r1006; - shr.u32 %r1008, %r1004, 16; - shl.b32 %r1009, %r1007, 16; - or.b32 %r1010, %r1009, %r1008; - add.s32 %r1011, %r1010, 1013904242; - xor.b32 %r1012, %r1011, %r7; - shf.l.wrap.b32 %r1013, %r1012, %r1012, 20; - add.s32 %r1014, %r1004, %r1013; - xor.b32 %r1015, %r1014, %r1010; - shf.l.wrap.b32 %r1016, %r1015, %r1015, 24; - add.s32 %r1017, %r1016, %r1011; - xor.b32 %r1018, %r1017, %r1013; - shf.l.wrap.b32 %r1019, %r1018, %r1018, 25; - add.s32 %r1020, %r8, %r4; - xor.b32 %r1021, %r1020, %r997; - shr.u32 %r1022, %r1020, 16; - shl.b32 %r1023, %r1021, 16; - or.b32 %r1024, %r1023, %r1022; - add.s32 %r1025, %r1024, -1521486534; - xor.b32 %r1026, %r1025, %r8; - shf.l.wrap.b32 %r1027, %r1026, %r1026, 20; - add.s32 %r1028, %r1020, %r1027; - xor.b32 %r1029, %r1028, %r1024; - shf.l.wrap.b32 %r1030, %r1029, %r1029, 24; - add.s32 %r1031, %r1030, %r1025; - xor.b32 %r1032, %r1031, %r1027; - shf.l.wrap.b32 %r1033, %r1032, %r1032, 25; - add.s32 %r1034, %r1033, %r1014; - shf.l.wrap.b32 %r1035, %r999, %r999, 16; - add.s32 %r1036, %r1035, 1779033703; - xor.b32 %r1037, %r1036, %r5; + add.s32 %r962, %r914, %r189; + add.s32 %r963, %r962, %r905; + xor.b32 %r964, %r963, %r930; + shf.l.wrap.b32 %r965, %r964, %r964, 16; + add.s32 %r966, %r965, %r945; + xor.b32 %r967, %r966, %r905; + shf.l.wrap.b32 %r968, %r967, %r967, 20; + add.s32 %r969, %r963, %r154; + add.s32 %r970, %r969, %r968; + xor.b32 %r971, %r970, %r965; + shf.l.wrap.b32 %r972, %r971, %r971, 24; + add.s32 %r973, %r972, %r966; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 25; + add.s32 %r976, %r928, %r161; + add.s32 %r977, %r976, %r919; + xor.b32 %r978, %r977, %r944; + shf.l.wrap.b32 %r979, %r978, %r978, 16; + add.s32 %r980, %r979, %r903; + xor.b32 %r981, %r980, %r919; + shf.l.wrap.b32 %r982, %r981, %r981, 20; + add.s32 %r983, %r977, %r217; + add.s32 %r984, %r983, %r982; + xor.b32 %r985, %r984, %r979; + shf.l.wrap.b32 %r986, %r985, %r985, 24; + add.s32 %r987, %r986, %r980; + xor.b32 %r988, %r987, %r982; + shf.l.wrap.b32 %r989, %r988, %r988, 25; + add.s32 %r990, %r942, %r210; + add.s32 %r991, %r990, %r933; + xor.b32 %r992, %r991, %r902; + shf.l.wrap.b32 %r993, %r992, %r992, 16; + add.s32 %r994, %r993, %r917; + xor.b32 %r995, %r994, %r933; + shf.l.wrap.b32 %r996, %r995, %r995, 20; + add.s32 %r997, %r991, %r196; + add.s32 %r998, %r997, %r996; + xor.b32 %r999, %r998, %r993; + shf.l.wrap.b32 %r1000, %r999, %r999, 24; + add.s32 %r1001, %r1000, %r994; + xor.b32 %r1002, %r1001, %r996; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 25; + add.s32 %r1004, %r956, %r252; + add.s32 %r1005, %r1004, %r975; + xor.b32 %r1006, %r1005, %r1000; + shf.l.wrap.b32 %r1007, %r1006, %r1006, 16; + add.s32 %r1008, %r1007, %r987; + xor.b32 %r1009, %r1008, %r975; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 20; + add.s32 %r1011, %r1005, %r224; + 
add.s32 %r1012, %r1011, %r1010; + xor.b32 %r1013, %r1012, %r1007; + shf.l.wrap.b32 %r1014, %r1013, %r1013, 24; + add.s32 %r1015, %r1014, %r1008; + xor.b32 %r1016, %r1015, %r1010; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 25; + add.s32 %r1018, %r970, %r168; + add.s32 %r1019, %r1018, %r989; + xor.b32 %r1020, %r1019, %r958; + shf.l.wrap.b32 %r1021, %r1020, %r1020, 16; + add.s32 %r1022, %r1021, %r1001; + xor.b32 %r1023, %r1022, %r989; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 20; + add.s32 %r1025, %r1019, %r238; + add.s32 %r1026, %r1025, %r1024; + xor.b32 %r1027, %r1026, %r1021; + shf.l.wrap.b32 %r1028, %r1027, %r1027, 24; + add.s32 %r1029, %r1028, %r1022; + xor.b32 %r1030, %r1029, %r1024; + shf.l.wrap.b32 %r1031, %r1030, %r1030, 25; + add.s32 %r1032, %r984, %r175; + add.s32 %r1033, %r1032, %r1003; + xor.b32 %r1034, %r1033, %r972; + shf.l.wrap.b32 %r1035, %r1034, %r1034, 16; + add.s32 %r1036, %r1035, %r959; + xor.b32 %r1037, %r1036, %r1003; shf.l.wrap.b32 %r1038, %r1037, %r1037, 20; - add.s32 %r1039, %r1000, %r1038; - xor.b32 %r1040, %r1039, %r1035; - shf.l.wrap.b32 %r1041, %r1040, %r1040, 24; - add.s32 %r1042, %r1041, %r1036; - xor.b32 %r1043, %r1042, %r1038; - shf.l.wrap.b32 %r1044, %r1043, %r1043, 25; - shf.l.wrap.b32 %r1045, %r1002, %r1002, 16; - add.s32 %r1046, %r1045, -1150833019; - xor.b32 %r1047, %r1046, %r6; - shf.l.wrap.b32 %r1048, %r1047, %r1047, 20; - add.s32 %r1049, %r1003, %r1048; - xor.b32 %r1050, %r1049, %r1045; - shf.l.wrap.b32 %r1051, %r1050, %r1050, 24; - add.s32 %r1052, %r1051, %r1046; - xor.b32 %r1053, %r1052, %r1048; - shf.l.wrap.b32 %r1054, %r1053, %r1053, 25; - add.s32 %r1055, %r1039, %r1054; - xor.b32 %r1056, %r1055, %r1030; - shf.l.wrap.b32 %r1057, %r1056, %r1056, 16; - add.s32 %r1058, %r1057, %r1017; - xor.b32 %r1059, %r1058, %r1054; - shf.l.wrap.b32 %r1060, %r1059, %r1059, 20; - add.s32 %r1061, %r1055, %r1060; - xor.b32 %r1062, %r1061, %r1057; - shf.l.wrap.b32 %r1063, %r1062, %r1062, 24; - add.s32 %r1064, %r1063, %r1058; - xor.b32 %r1065, %r1064, %r1060; - shf.l.wrap.b32 %r1066, %r1065, %r1065, 25; - add.s32 %r1067, %r1019, %r1049; - xor.b32 %r1068, %r1041, %r1067; - shf.l.wrap.b32 %r1069, %r1068, %r1068, 16; - add.s32 %r1070, %r1069, %r1031; - xor.b32 %r1071, %r1070, %r1019; - shf.l.wrap.b32 %r1072, %r1071, %r1071, 20; - add.s32 %r1073, %r1067, %r1072; - xor.b32 %r1074, %r1073, %r1069; - shf.l.wrap.b32 %r1075, %r1074, %r1074, 24; - add.s32 %r1076, %r1075, %r1070; - xor.b32 %r1077, %r1076, %r1072; - shf.l.wrap.b32 %r1078, %r1077, %r1077, 25; - xor.b32 %r1079, %r1051, %r1034; - shf.l.wrap.b32 %r1080, %r1079, %r1079, 16; - add.s32 %r1081, %r1080, %r1042; - xor.b32 %r1082, %r1081, %r1033; - shf.l.wrap.b32 %r1083, %r1082, %r1082, 20; - add.s32 %r1084, %r1034, %r1083; - xor.b32 %r1085, %r1084, %r1080; - shf.l.wrap.b32 %r1086, %r1085, %r1085, 24; - add.s32 %r1087, %r1086, %r1081; - xor.b32 %r1088, %r1087, %r1083; - shf.l.wrap.b32 %r1089, %r1088, %r1088, 25; - add.s32 %r1090, %r1028, %r1044; - xor.b32 %r1091, %r1090, %r1016; - shf.l.wrap.b32 %r1092, %r1091, %r1091, 16; - add.s32 %r1093, %r1092, %r1052; - xor.b32 %r1094, %r1093, %r1044; - shf.l.wrap.b32 %r1095, %r1094, %r1094, 20; - add.s32 %r1096, %r1090, %r1095; - xor.b32 %r1097, %r1096, %r1092; - shf.l.wrap.b32 %r1098, %r1097, %r1097, 24; - add.s32 %r1099, %r1098, %r1093; - xor.b32 %r1100, %r1099, %r1095; - shf.l.wrap.b32 %r1101, %r1100, %r1100, 25; - add.s32 %r1102, %r1061, %r988; - add.s32 %r1103, %r1102, %r1101; - xor.b32 %r1104, %r1103, %r1075; - shf.l.wrap.b32 %r1105, %r1104, %r1104, 16; - add.s32 %r1106, %r1105, 
%r1087; - xor.b32 %r1107, %r1106, %r1101; - shf.l.wrap.b32 %r1108, %r1107, %r1107, 20; - add.s32 %r1109, %r1103, %r1108; - xor.b32 %r1110, %r1109, %r1105; - shf.l.wrap.b32 %r1111, %r1110, %r1110, 24; - add.s32 %r1112, %r1111, %r1106; - xor.b32 %r1113, %r1112, %r1108; - shf.l.wrap.b32 %r1114, %r1113, %r1113, 25; - add.s32 %r1115, %r1073, %r996; - add.s32 %r1116, %r1115, %r1066; - xor.b32 %r1117, %r1116, %r1086; - shf.l.wrap.b32 %r1118, %r1117, %r1117, 16; - add.s32 %r1119, %r1118, %r1099; - xor.b32 %r1120, %r1119, %r1066; - shf.l.wrap.b32 %r1121, %r1120, %r1120, 20; - add.s32 %r1122, %r1116, %r1121; - xor.b32 %r1123, %r1122, %r1118; - shf.l.wrap.b32 %r1124, %r1123, %r1123, 24; - add.s32 %r1125, %r1124, %r1119; - xor.b32 %r1126, %r1125, %r1121; - shf.l.wrap.b32 %r1127, %r1126, %r1126, 25; - add.s32 %r1128, %r1084, %r1078; - xor.b32 %r1129, %r1098, %r1128; - shf.l.wrap.b32 %r1130, %r1129, %r1129, 16; - add.s32 %r1131, %r1130, %r1064; - xor.b32 %r1132, %r1131, %r1078; - shf.l.wrap.b32 %r1133, %r1132, %r1132, 20; - add.s32 %r1134, %r1128, %r972; - add.s32 %r1135, %r1134, %r1133; - xor.b32 %r1136, %r1135, %r1130; - shf.l.wrap.b32 %r1137, %r1136, %r1136, 24; - add.s32 %r1138, %r1137, %r1131; - xor.b32 %r1139, %r1138, %r1133; - shf.l.wrap.b32 %r1140, %r1139, %r1139, 25; - add.s32 %r1141, %r1096, %r1089; - xor.b32 %r1142, %r1063, %r1141; - shf.l.wrap.b32 %r1143, %r1142, %r1142, 16; - add.s32 %r1144, %r1143, %r1076; - xor.b32 %r1145, %r1144, %r1089; - shf.l.wrap.b32 %r1146, %r1145, %r1145, 20; - add.s32 %r1147, %r1141, %r1146; - xor.b32 %r1148, %r1147, %r1143; - shf.l.wrap.b32 %r1149, %r1148, %r1148, 24; - add.s32 %r1150, %r1149, %r1144; - xor.b32 %r1151, %r1150, %r1146; - shf.l.wrap.b32 %r1152, %r1151, %r1151, 25; - add.s32 %r1153, %r1109, %r980; - add.s32 %r1154, %r1153, %r1127; - xor.b32 %r1155, %r1154, %r1149; - shf.l.wrap.b32 %r1156, %r1155, %r1155, 16; - add.s32 %r1157, %r1156, %r1138; - xor.b32 %r1158, %r1157, %r1127; - shf.l.wrap.b32 %r1159, %r1158, %r1158, 20; - add.s32 %r1160, %r1154, %r1159; - xor.b32 %r1161, %r1160, %r1156; - shf.l.wrap.b32 %r1162, %r1161, %r1161, 24; - add.s32 %r1163, %r1162, %r1157; - xor.b32 %r1164, %r1163, %r1159; - shf.l.wrap.b32 %r1165, %r1164, %r1164, 25; - add.s32 %r1166, %r1140, %r1122; - xor.b32 %r1167, %r1111, %r1166; - shf.l.wrap.b32 %r1168, %r1167, %r1167, 16; - add.s32 %r1169, %r1168, %r1150; - xor.b32 %r1170, %r1169, %r1140; - shf.l.wrap.b32 %r1171, %r1170, %r1170, 20; - add.s32 %r1172, %r1166, %r1171; - xor.b32 %r1173, %r1172, %r1168; - shf.l.wrap.b32 %r1174, %r1173, %r1173, 24; - add.s32 %r1175, %r1174, %r1169; - xor.b32 %r1176, %r1175, %r1171; - shf.l.wrap.b32 %r1177, %r1176, %r1176, 25; - add.s32 %r1178, %r1135, %r1152; - xor.b32 %r1179, %r1124, %r1178; - shf.l.wrap.b32 %r1180, %r1179, %r1179, 16; - add.s32 %r1181, %r1180, %r1112; - xor.b32 %r1182, %r1181, %r1152; - shf.l.wrap.b32 %r1183, %r1182, %r1182, 20; - add.s32 %r1184, %r1178, %r1183; - xor.b32 %r1185, %r1184, %r1180; - shf.l.wrap.b32 %r1186, %r1185, %r1185, 24; - add.s32 %r1187, %r1186, %r1181; - xor.b32 %r1188, %r1187, %r1183; - shf.l.wrap.b32 %r1189, %r1188, %r1188, 25; - add.s32 %r1190, %r1147, %r1114; - xor.b32 %r1191, %r1190, %r1137; - shf.l.wrap.b32 %r1192, %r1191, %r1191, 16; - add.s32 %r1193, %r1192, %r1125; - xor.b32 %r1194, %r1193, %r1114; - shf.l.wrap.b32 %r1195, %r1194, %r1194, 20; - add.s32 %r1196, %r1190, %r1195; - xor.b32 %r1197, %r1196, %r1192; - shf.l.wrap.b32 %r1198, %r1197, %r1197, 24; - add.s32 %r1199, %r1198, %r1193; - xor.b32 %r1200, %r1199, %r1195; - shf.l.wrap.b32 
%r1201, %r1200, %r1200, 25; - add.s32 %r1202, %r1160, %r996; - add.s32 %r1203, %r1202, %r1201; - xor.b32 %r1204, %r1203, %r1174; - shf.l.wrap.b32 %r1205, %r1204, %r1204, 16; - add.s32 %r1206, %r1205, %r1187; - xor.b32 %r1207, %r1206, %r1201; - shf.l.wrap.b32 %r1208, %r1207, %r1207, 20; - add.s32 %r1209, %r1203, %r1208; - xor.b32 %r1210, %r1209, %r1205; - shf.l.wrap.b32 %r1211, %r1210, %r1210, 24; - add.s32 %r1212, %r1211, %r1206; - xor.b32 %r1213, %r1212, %r1208; - shf.l.wrap.b32 %r1214, %r1213, %r1213, 25; - add.s32 %r1215, %r1172, %r1165; - xor.b32 %r1216, %r1215, %r1186; - shf.l.wrap.b32 %r1217, %r1216, %r1216, 16; - add.s32 %r1218, %r1217, %r1199; - xor.b32 %r1219, %r1218, %r1165; - shf.l.wrap.b32 %r1220, %r1219, %r1219, 20; - add.s32 %r1221, %r1215, %r1220; - xor.b32 %r1222, %r1221, %r1217; - shf.l.wrap.b32 %r1223, %r1222, %r1222, 24; - add.s32 %r1224, %r1223, %r1218; - xor.b32 %r1225, %r1224, %r1220; - shf.l.wrap.b32 %r1226, %r1225, %r1225, 25; - add.s32 %r1227, %r1184, %r1177; - xor.b32 %r1228, %r1198, %r1227; - shf.l.wrap.b32 %r1229, %r1228, %r1228, 16; - add.s32 %r1230, %r1229, %r1163; - xor.b32 %r1231, %r1230, %r1177; - shf.l.wrap.b32 %r1232, %r1231, %r1231, 20; - add.s32 %r1233, %r1227, %r988; - add.s32 %r1234, %r1233, %r1232; - xor.b32 %r1235, %r1234, %r1229; - shf.l.wrap.b32 %r1236, %r1235, %r1235, 24; - add.s32 %r1237, %r1236, %r1230; - xor.b32 %r1238, %r1237, %r1232; - shf.l.wrap.b32 %r1239, %r1238, %r1238, 25; - add.s32 %r1240, %r1196, %r1189; - xor.b32 %r1241, %r1162, %r1240; - shf.l.wrap.b32 %r1242, %r1241, %r1241, 16; - add.s32 %r1243, %r1242, %r1175; - xor.b32 %r1244, %r1243, %r1189; - shf.l.wrap.b32 %r1245, %r1244, %r1244, 20; - add.s32 %r1246, %r1240, %r1245; - xor.b32 %r1247, %r1246, %r1242; - shf.l.wrap.b32 %r1248, %r1247, %r1247, 24; - add.s32 %r1249, %r1248, %r1243; - xor.b32 %r1250, %r1249, %r1245; - shf.l.wrap.b32 %r1251, %r1250, %r1250, 25; - add.s32 %r1252, %r1209, %r1226; - xor.b32 %r1253, %r1252, %r1248; - shf.l.wrap.b32 %r1254, %r1253, %r1253, 16; - add.s32 %r1255, %r1254, %r1237; - xor.b32 %r1256, %r1255, %r1226; - shf.l.wrap.b32 %r1257, %r1256, %r1256, 20; - add.s32 %r1258, %r1252, %r1257; - xor.b32 %r1259, %r1258, %r1254; - shf.l.wrap.b32 %r1260, %r1259, %r1259, 24; - add.s32 %r1261, %r1260, %r1255; - xor.b32 %r1262, %r1261, %r1257; - shf.l.wrap.b32 %r1263, %r1262, %r1262, 25; - add.s32 %r1264, %r1239, %r1221; - xor.b32 %r1265, %r1211, %r1264; - shf.l.wrap.b32 %r1266, %r1265, %r1265, 16; - add.s32 %r1267, %r1266, %r1249; - xor.b32 %r1268, %r1267, %r1239; - shf.l.wrap.b32 %r1269, %r1268, %r1268, 20; - add.s32 %r1270, %r1264, %r972; - add.s32 %r1271, %r1270, %r1269; - xor.b32 %r1272, %r1271, %r1266; - shf.l.wrap.b32 %r1273, %r1272, %r1272, 24; - add.s32 %r1274, %r1273, %r1267; - xor.b32 %r1275, %r1274, %r1269; - shf.l.wrap.b32 %r1276, %r1275, %r1275, 25; - add.s32 %r1277, %r1234, %r1251; - xor.b32 %r1278, %r1223, %r1277; - shf.l.wrap.b32 %r1279, %r1278, %r1278, 16; - add.s32 %r1280, %r1279, %r1212; - xor.b32 %r1281, %r1280, %r1251; - shf.l.wrap.b32 %r1282, %r1281, %r1281, 20; - add.s32 %r1283, %r1277, %r1282; - xor.b32 %r1284, %r1283, %r1279; - shf.l.wrap.b32 %r1285, %r1284, %r1284, 24; - add.s32 %r1286, %r1285, %r1280; - xor.b32 %r1287, %r1286, %r1282; - shf.l.wrap.b32 %r1288, %r1287, %r1287, 25; - add.s32 %r1289, %r1246, %r1214; - xor.b32 %r1290, %r1289, %r1236; - shf.l.wrap.b32 %r1291, %r1290, %r1290, 16; - add.s32 %r1292, %r1291, %r1224; - xor.b32 %r1293, %r1292, %r1214; - shf.l.wrap.b32 %r1294, %r1293, %r1293, 20; - add.s32 %r1295, %r1289, %r980; - 
add.s32 %r1296, %r1295, %r1294; - xor.b32 %r1297, %r1296, %r1291; - shf.l.wrap.b32 %r1298, %r1297, %r1297, 24; - add.s32 %r1299, %r1298, %r1292; - xor.b32 %r1300, %r1299, %r1294; - shf.l.wrap.b32 %r1301, %r1300, %r1300, 25; - add.s32 %r1302, %r1258, %r1301; - xor.b32 %r1303, %r1302, %r1273; - shf.l.wrap.b32 %r1304, %r1303, %r1303, 16; - add.s32 %r1305, %r1304, %r1286; - xor.b32 %r1306, %r1305, %r1301; - shf.l.wrap.b32 %r1307, %r1306, %r1306, 20; - add.s32 %r1308, %r1302, %r1307; - xor.b32 %r1309, %r1308, %r1304; - shf.l.wrap.b32 %r1310, %r1309, %r1309, 24; - add.s32 %r1311, %r1310, %r1305; - xor.b32 %r1312, %r1311, %r1307; - shf.l.wrap.b32 %r1313, %r1312, %r1312, 25; - add.s32 %r1314, %r1271, %r1263; - xor.b32 %r1315, %r1314, %r1285; - shf.l.wrap.b32 %r1316, %r1315, %r1315, 16; - add.s32 %r1317, %r1316, %r1299; - xor.b32 %r1318, %r1317, %r1263; - shf.l.wrap.b32 %r1319, %r1318, %r1318, 20; - add.s32 %r1320, %r1314, %r1319; - xor.b32 %r1321, %r1320, %r1316; + add.s32 %r1039, %r1033, %r182; + add.s32 %r1040, %r1039, %r1038; + xor.b32 %r1041, %r1040, %r1035; + shf.l.wrap.b32 %r1042, %r1041, %r1041, 24; + add.s32 %r1043, %r1042, %r1036; + xor.b32 %r1044, %r1043, %r1038; + shf.l.wrap.b32 %r1045, %r1044, %r1044, 25; + add.s32 %r1046, %r998, %r203; + add.s32 %r1047, %r1046, %r961; + xor.b32 %r1048, %r1047, %r986; + shf.l.wrap.b32 %r1049, %r1048, %r1048, 16; + add.s32 %r1050, %r1049, %r973; + xor.b32 %r1051, %r1050, %r961; + shf.l.wrap.b32 %r1052, %r1051, %r1051, 20; + add.s32 %r1053, %r1047, %r245; + add.s32 %r1054, %r1053, %r1052; + xor.b32 %r1055, %r1054, %r1049; + shf.l.wrap.b32 %r1056, %r1055, %r1055, 24; + add.s32 %r1057, %r1056, %r1050; + xor.b32 %r1058, %r1057, %r1052; + shf.l.wrap.b32 %r1059, %r1058, %r1058, 25; + xor.b32 %r1060, %r1043, %r1012; + st.local.u32 [%rd3+-104], %r1060; + xor.b32 %r1061, %r1057, %r1026; + st.local.u32 [%rd3+-100], %r1061; + xor.b32 %r1062, %r1015, %r1040; + st.local.u32 [%rd3+-96], %r1062; + xor.b32 %r1063, %r1029, %r1054; + st.local.u32 [%rd3+-92], %r1063; + xor.b32 %r1064, %r1059, %r1028; + st.local.u32 [%rd3+-88], %r1064; + xor.b32 %r1065, %r1017, %r1042; + st.local.u32 [%rd3+-84], %r1065; + xor.b32 %r1066, %r1031, %r1056; + st.local.u32 [%rd3+-80], %r1066; + xor.b32 %r1067, %r1045, %r1014; + st.local.u32 [%rd3+-76], %r1067; + add.s16 %rs114, %rs109, 1; + st.local.v2.u8 [%rd3], {%rs351, %rs114}; + +$L__BB1_8: + add.s64 %rd117, %rd13, %rd238; + st.local.u8 [%rd117], %rs351; + add.s64 %rd238, %rd238, 1; + setp.lt.u64 %p7, %rd238, 64; + mov.u64 %rd244, %rd12; + @%p7 bra $L__BB1_8; + +$L__BB1_9: + setp.lt.u64 %p8, %rd244, 65; + @%p8 bra $L__BB1_12; + + ld.local.u8 %rs9, [%rd3+2]; + ld.local.u8 %rs352, [%rd3+1]; + ld.local.u32 %r11657, [%rd3+-104]; + ld.local.u32 %r11656, [%rd3+-100]; + ld.local.u32 %r11655, [%rd3+-96]; + ld.local.u32 %r11654, [%rd3+-92]; + ld.local.u32 %r11653, [%rd3+-88]; + ld.local.u32 %r11652, [%rd3+-84]; + ld.local.u32 %r11651, [%rd3+-80]; + ld.local.u32 %r11650, [%rd3+-76]; + ld.local.u64 %rd118, [%rd3+-72]; + cvt.u32.u64 %r9, %rd118; + shr.u64 %rd119, %rd118, 32; + cvt.u32.u64 %r10, %rd119; + +$L__BB1_11: + and.b16 %rs116, %rs352, 255; + setp.eq.s16 %p9, %rs116, 0; + selp.u16 %rs117, 1, 0, %p9; + or.b16 %rs118, %rs9, %rs117; + ld.local.u8 %r1068, [%rd261]; + ld.local.u8 %r1069, [%rd261+1]; + prmt.b32 %r1070, %r1069, %r1068, 30212; + ld.local.u8 %r1071, [%rd261+2]; + prmt.b32 %r1072, %r1071, %r1070, 28756; + ld.local.u8 %r1073, [%rd261+3]; + prmt.b32 %r1074, %r1073, %r1072, 1620; + ld.local.u8 %r1075, [%rd261+4]; + ld.local.u8 %r1076, 
[%rd261+5]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + ld.local.u8 %r1078, [%rd261+6]; + prmt.b32 %r1079, %r1078, %r1077, 28756; + ld.local.u8 %r1080, [%rd261+7]; + prmt.b32 %r1081, %r1080, %r1079, 1620; + ld.local.u8 %r1082, [%rd261+8]; + ld.local.u8 %r1083, [%rd261+9]; + prmt.b32 %r1084, %r1083, %r1082, 30212; + ld.local.u8 %r1085, [%rd261+10]; + prmt.b32 %r1086, %r1085, %r1084, 28756; + ld.local.u8 %r1087, [%rd261+11]; + prmt.b32 %r1088, %r1087, %r1086, 1620; + ld.local.u8 %r1089, [%rd261+12]; + ld.local.u8 %r1090, [%rd261+13]; + prmt.b32 %r1091, %r1090, %r1089, 30212; + ld.local.u8 %r1092, [%rd261+14]; + prmt.b32 %r1093, %r1092, %r1091, 28756; + ld.local.u8 %r1094, [%rd261+15]; + prmt.b32 %r1095, %r1094, %r1093, 1620; + ld.local.u8 %r1096, [%rd261+16]; + ld.local.u8 %r1097, [%rd261+17]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd261+18]; + prmt.b32 %r1100, %r1099, %r1098, 28756; + ld.local.u8 %r1101, [%rd261+19]; + prmt.b32 %r1102, %r1101, %r1100, 1620; + ld.local.u8 %r1103, [%rd261+20]; + ld.local.u8 %r1104, [%rd261+21]; + prmt.b32 %r1105, %r1104, %r1103, 30212; + ld.local.u8 %r1106, [%rd261+22]; + prmt.b32 %r1107, %r1106, %r1105, 28756; + ld.local.u8 %r1108, [%rd261+23]; + prmt.b32 %r1109, %r1108, %r1107, 1620; + ld.local.u8 %r1110, [%rd261+24]; + ld.local.u8 %r1111, [%rd261+25]; + prmt.b32 %r1112, %r1111, %r1110, 30212; + ld.local.u8 %r1113, [%rd261+26]; + prmt.b32 %r1114, %r1113, %r1112, 28756; + ld.local.u8 %r1115, [%rd261+27]; + prmt.b32 %r1116, %r1115, %r1114, 1620; + ld.local.u8 %r1117, [%rd261+28]; + ld.local.u8 %r1118, [%rd261+29]; + prmt.b32 %r1119, %r1118, %r1117, 30212; + ld.local.u8 %r1120, [%rd261+30]; + prmt.b32 %r1121, %r1120, %r1119, 28756; + ld.local.u8 %r1122, [%rd261+31]; + prmt.b32 %r1123, %r1122, %r1121, 1620; + ld.local.u8 %r1124, [%rd261+32]; + ld.local.u8 %r1125, [%rd261+33]; + prmt.b32 %r1126, %r1125, %r1124, 30212; + ld.local.u8 %r1127, [%rd261+34]; + prmt.b32 %r1128, %r1127, %r1126, 28756; + ld.local.u8 %r1129, [%rd261+35]; + prmt.b32 %r1130, %r1129, %r1128, 1620; + ld.local.u8 %r1131, [%rd261+36]; + ld.local.u8 %r1132, [%rd261+37]; + prmt.b32 %r1133, %r1132, %r1131, 30212; + ld.local.u8 %r1134, [%rd261+38]; + prmt.b32 %r1135, %r1134, %r1133, 28756; + ld.local.u8 %r1136, [%rd261+39]; + prmt.b32 %r1137, %r1136, %r1135, 1620; + ld.local.u8 %r1138, [%rd261+40]; + ld.local.u8 %r1139, [%rd261+41]; + prmt.b32 %r1140, %r1139, %r1138, 30212; + ld.local.u8 %r1141, [%rd261+42]; + prmt.b32 %r1142, %r1141, %r1140, 28756; + ld.local.u8 %r1143, [%rd261+43]; + prmt.b32 %r1144, %r1143, %r1142, 1620; + ld.local.u8 %r1145, [%rd261+44]; + ld.local.u8 %r1146, [%rd261+45]; + prmt.b32 %r1147, %r1146, %r1145, 30212; + ld.local.u8 %r1148, [%rd261+46]; + prmt.b32 %r1149, %r1148, %r1147, 28756; + ld.local.u8 %r1150, [%rd261+47]; + prmt.b32 %r1151, %r1150, %r1149, 1620; + ld.local.u8 %r1152, [%rd261+48]; + ld.local.u8 %r1153, [%rd261+49]; + prmt.b32 %r1154, %r1153, %r1152, 30212; + ld.local.u8 %r1155, [%rd261+50]; + prmt.b32 %r1156, %r1155, %r1154, 28756; + ld.local.u8 %r1157, [%rd261+51]; + prmt.b32 %r1158, %r1157, %r1156, 1620; + ld.local.u8 %r1159, [%rd261+52]; + ld.local.u8 %r1160, [%rd261+53]; + prmt.b32 %r1161, %r1160, %r1159, 30212; + ld.local.u8 %r1162, [%rd261+54]; + prmt.b32 %r1163, %r1162, %r1161, 28756; + ld.local.u8 %r1164, [%rd261+55]; + prmt.b32 %r1165, %r1164, %r1163, 1620; + ld.local.u8 %r1166, [%rd261+56]; + ld.local.u8 %r1167, [%rd261+57]; + prmt.b32 %r1168, %r1167, %r1166, 30212; + ld.local.u8 %r1169, [%rd261+58]; + prmt.b32 %r1170, 
%r1169, %r1168, 28756; + ld.local.u8 %r1171, [%rd261+59]; + prmt.b32 %r1172, %r1171, %r1170, 1620; + ld.local.u8 %r1173, [%rd261+60]; + ld.local.u8 %r1174, [%rd261+61]; + prmt.b32 %r1175, %r1174, %r1173, 30212; + ld.local.u8 %r1176, [%rd261+62]; + prmt.b32 %r1177, %r1176, %r1175, 28756; + ld.local.u8 %r1178, [%rd261+63]; + prmt.b32 %r1179, %r1178, %r1177, 1620; + cvt.u32.u16 %r1180, %rs118; + and.b32 %r1181, %r1180, 255; + add.s32 %r1182, %r11657, %r1074; + add.s32 %r1183, %r1182, %r11653; + xor.b32 %r1184, %r1183, %r9; + shf.l.wrap.b32 %r1185, %r1184, %r1184, 16; + add.s32 %r1186, %r1185, 1779033703; + xor.b32 %r1187, %r1186, %r11653; + shf.l.wrap.b32 %r1188, %r1187, %r1187, 20; + add.s32 %r1189, %r1183, %r1081; + add.s32 %r1190, %r1189, %r1188; + xor.b32 %r1191, %r1190, %r1185; + shf.l.wrap.b32 %r1192, %r1191, %r1191, 24; + add.s32 %r1193, %r1192, %r1186; + xor.b32 %r1194, %r1193, %r1188; + shf.l.wrap.b32 %r1195, %r1194, %r1194, 25; + add.s32 %r1196, %r11656, %r1088; + add.s32 %r1197, %r1196, %r11652; + xor.b32 %r1198, %r1197, %r10; + shf.l.wrap.b32 %r1199, %r1198, %r1198, 16; + add.s32 %r1200, %r1199, -1150833019; + xor.b32 %r1201, %r1200, %r11652; + shf.l.wrap.b32 %r1202, %r1201, %r1201, 20; + add.s32 %r1203, %r1197, %r1095; + add.s32 %r1204, %r1203, %r1202; + xor.b32 %r1205, %r1204, %r1199; + shf.l.wrap.b32 %r1206, %r1205, %r1205, 24; + add.s32 %r1207, %r1206, %r1200; + xor.b32 %r1208, %r1207, %r1202; + shf.l.wrap.b32 %r1209, %r1208, %r1208, 25; + add.s32 %r1210, %r11655, %r1102; + add.s32 %r1211, %r1210, %r11651; + shr.u32 %r1212, %r1211, 16; + shl.b32 %r1213, %r1211, 16; + xor.b32 %r1214, %r1213, 4194304; + or.b32 %r1215, %r1214, %r1212; + add.s32 %r1216, %r1215, 1013904242; + xor.b32 %r1217, %r1216, %r11651; + shf.l.wrap.b32 %r1218, %r1217, %r1217, 20; + add.s32 %r1219, %r1211, %r1109; + add.s32 %r1220, %r1219, %r1218; + xor.b32 %r1221, %r1220, %r1215; + shf.l.wrap.b32 %r1222, %r1221, %r1221, 24; + add.s32 %r1223, %r1222, %r1216; + xor.b32 %r1224, %r1223, %r1218; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 25; + add.s32 %r1226, %r11654, %r1116; + add.s32 %r1227, %r1226, %r11650; + xor.b32 %r1228, %r1227, %r1181; + shr.u32 %r1229, %r1227, 16; + shl.b32 %r1230, %r1228, 16; + or.b32 %r1231, %r1230, %r1229; + add.s32 %r1232, %r1231, -1521486534; + xor.b32 %r1233, %r1232, %r11650; + shf.l.wrap.b32 %r1234, %r1233, %r1233, 20; + add.s32 %r1235, %r1227, %r1123; + add.s32 %r1236, %r1235, %r1234; + xor.b32 %r1237, %r1236, %r1231; + shf.l.wrap.b32 %r1238, %r1237, %r1237, 24; + add.s32 %r1239, %r1238, %r1232; + xor.b32 %r1240, %r1239, %r1234; + shf.l.wrap.b32 %r1241, %r1240, %r1240, 25; + add.s32 %r1242, %r1190, %r1130; + add.s32 %r1243, %r1242, %r1209; + xor.b32 %r1244, %r1243, %r1238; + shf.l.wrap.b32 %r1245, %r1244, %r1244, 16; + add.s32 %r1246, %r1245, %r1223; + xor.b32 %r1247, %r1246, %r1209; + shf.l.wrap.b32 %r1248, %r1247, %r1247, 20; + add.s32 %r1249, %r1243, %r1137; + add.s32 %r1250, %r1249, %r1248; + xor.b32 %r1251, %r1250, %r1245; + shf.l.wrap.b32 %r1252, %r1251, %r1251, 24; + add.s32 %r1253, %r1252, %r1246; + xor.b32 %r1254, %r1253, %r1248; + shf.l.wrap.b32 %r1255, %r1254, %r1254, 25; + add.s32 %r1256, %r1204, %r1144; + add.s32 %r1257, %r1256, %r1225; + xor.b32 %r1258, %r1257, %r1192; + shf.l.wrap.b32 %r1259, %r1258, %r1258, 16; + add.s32 %r1260, %r1259, %r1239; + xor.b32 %r1261, %r1260, %r1225; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 20; + add.s32 %r1263, %r1257, %r1151; + add.s32 %r1264, %r1263, %r1262; + xor.b32 %r1265, %r1264, %r1259; + shf.l.wrap.b32 %r1266, %r1265, %r1265, 24; 
+ add.s32 %r1267, %r1266, %r1260; + xor.b32 %r1268, %r1267, %r1262; + shf.l.wrap.b32 %r1269, %r1268, %r1268, 25; + add.s32 %r1270, %r1220, %r1158; + add.s32 %r1271, %r1270, %r1241; + xor.b32 %r1272, %r1271, %r1206; + shf.l.wrap.b32 %r1273, %r1272, %r1272, 16; + add.s32 %r1274, %r1273, %r1193; + xor.b32 %r1275, %r1274, %r1241; + shf.l.wrap.b32 %r1276, %r1275, %r1275, 20; + add.s32 %r1277, %r1271, %r1165; + add.s32 %r1278, %r1277, %r1276; + xor.b32 %r1279, %r1278, %r1273; + shf.l.wrap.b32 %r1280, %r1279, %r1279, 24; + add.s32 %r1281, %r1280, %r1274; + xor.b32 %r1282, %r1281, %r1276; + shf.l.wrap.b32 %r1283, %r1282, %r1282, 25; + add.s32 %r1284, %r1236, %r1172; + add.s32 %r1285, %r1284, %r1195; + xor.b32 %r1286, %r1285, %r1222; + shf.l.wrap.b32 %r1287, %r1286, %r1286, 16; + add.s32 %r1288, %r1287, %r1207; + xor.b32 %r1289, %r1288, %r1195; + shf.l.wrap.b32 %r1290, %r1289, %r1289, 20; + add.s32 %r1291, %r1285, %r1179; + add.s32 %r1292, %r1291, %r1290; + xor.b32 %r1293, %r1292, %r1287; + shf.l.wrap.b32 %r1294, %r1293, %r1293, 24; + add.s32 %r1295, %r1294, %r1288; + xor.b32 %r1296, %r1295, %r1290; + shf.l.wrap.b32 %r1297, %r1296, %r1296, 25; + add.s32 %r1298, %r1250, %r1088; + add.s32 %r1299, %r1298, %r1297; + xor.b32 %r1300, %r1299, %r1266; + shf.l.wrap.b32 %r1301, %r1300, %r1300, 16; + add.s32 %r1302, %r1301, %r1281; + xor.b32 %r1303, %r1302, %r1297; + shf.l.wrap.b32 %r1304, %r1303, %r1303, 20; + add.s32 %r1305, %r1299, %r1116; + add.s32 %r1306, %r1305, %r1304; + xor.b32 %r1307, %r1306, %r1301; + shf.l.wrap.b32 %r1308, %r1307, %r1307, 24; + add.s32 %r1309, %r1308, %r1302; + xor.b32 %r1310, %r1309, %r1304; + shf.l.wrap.b32 %r1311, %r1310, %r1310, 25; + add.s32 %r1312, %r1264, %r1095; + add.s32 %r1313, %r1312, %r1255; + xor.b32 %r1314, %r1313, %r1280; + shf.l.wrap.b32 %r1315, %r1314, %r1314, 16; + add.s32 %r1316, %r1315, %r1295; + xor.b32 %r1317, %r1316, %r1255; + shf.l.wrap.b32 %r1318, %r1317, %r1317, 20; + add.s32 %r1319, %r1313, %r1144; + add.s32 %r1320, %r1319, %r1318; + xor.b32 %r1321, %r1320, %r1315; shf.l.wrap.b32 %r1322, %r1321, %r1321, 24; - add.s32 %r1323, %r1322, %r1317; - xor.b32 %r1324, %r1323, %r1319; + add.s32 %r1323, %r1322, %r1316; + xor.b32 %r1324, %r1323, %r1318; shf.l.wrap.b32 %r1325, %r1324, %r1324, 25; - add.s32 %r1326, %r1283, %r1276; - xor.b32 %r1327, %r1298, %r1326; - shf.l.wrap.b32 %r1328, %r1327, %r1327, 16; - add.s32 %r1329, %r1328, %r1261; - xor.b32 %r1330, %r1329, %r1276; - shf.l.wrap.b32 %r1331, %r1330, %r1330, 20; - add.s32 %r1332, %r1326, %r996; - add.s32 %r1333, %r1332, %r1331; - xor.b32 %r1334, %r1333, %r1328; - shf.l.wrap.b32 %r1335, %r1334, %r1334, 24; - add.s32 %r1336, %r1335, %r1329; - xor.b32 %r1337, %r1336, %r1331; - shf.l.wrap.b32 %r1338, %r1337, %r1337, 25; - add.s32 %r1339, %r1296, %r1288; - xor.b32 %r1340, %r1260, %r1339; - shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; - add.s32 %r1342, %r1341, %r1274; - xor.b32 %r1343, %r1342, %r1288; - shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; - add.s32 %r1345, %r1339, %r1344; - xor.b32 %r1346, %r1345, %r1341; - shf.l.wrap.b32 %r1347, %r1346, %r1346, 24; - add.s32 %r1348, %r1347, %r1342; - xor.b32 %r1349, %r1348, %r1344; - shf.l.wrap.b32 %r1350, %r1349, %r1349, 25; - add.s32 %r1351, %r1308, %r1325; - xor.b32 %r1352, %r1351, %r1347; - shf.l.wrap.b32 %r1353, %r1352, %r1352, 16; - add.s32 %r1354, %r1353, %r1336; - xor.b32 %r1355, %r1354, %r1325; - shf.l.wrap.b32 %r1356, %r1355, %r1355, 20; - add.s32 %r1357, %r1351, %r972; - add.s32 %r1358, %r1357, %r1356; - xor.b32 %r1359, %r1358, %r1353; - shf.l.wrap.b32 %r1360, %r1359, 
%r1359, 24; - add.s32 %r1361, %r1360, %r1354; - xor.b32 %r1362, %r1361, %r1356; - shf.l.wrap.b32 %r1363, %r1362, %r1362, 25; - add.s32 %r1364, %r1338, %r1320; - xor.b32 %r1365, %r1310, %r1364; - shf.l.wrap.b32 %r1366, %r1365, %r1365, 16; - add.s32 %r1367, %r1366, %r1348; - xor.b32 %r1368, %r1367, %r1338; - shf.l.wrap.b32 %r1369, %r1368, %r1368, 20; - add.s32 %r1370, %r1364, %r988; - add.s32 %r1371, %r1370, %r1369; - xor.b32 %r1372, %r1371, %r1366; - shf.l.wrap.b32 %r1373, %r1372, %r1372, 24; - add.s32 %r1374, %r1373, %r1367; - xor.b32 %r1375, %r1374, %r1369; - shf.l.wrap.b32 %r1376, %r1375, %r1375, 25; - add.s32 %r1377, %r1333, %r1350; - xor.b32 %r1378, %r1322, %r1377; - shf.l.wrap.b32 %r1379, %r1378, %r1378, 16; - add.s32 %r1380, %r1379, %r1311; - xor.b32 %r1381, %r1380, %r1350; - shf.l.wrap.b32 %r1382, %r1381, %r1381, 20; - add.s32 %r1383, %r1377, %r1382; - xor.b32 %r1384, %r1383, %r1379; - shf.l.wrap.b32 %r1385, %r1384, %r1384, 24; - add.s32 %r1386, %r1385, %r1380; - xor.b32 %r1387, %r1386, %r1382; - shf.l.wrap.b32 %r1388, %r1387, %r1387, 25; - add.s32 %r1389, %r1345, %r980; - add.s32 %r1390, %r1389, %r1313; - xor.b32 %r1391, %r1390, %r1335; - shf.l.wrap.b32 %r1392, %r1391, %r1391, 16; - add.s32 %r1393, %r1392, %r1323; - xor.b32 %r1394, %r1393, %r1313; - shf.l.wrap.b32 %r1395, %r1394, %r1394, 20; - add.s32 %r1396, %r1390, %r1395; - xor.b32 %r1397, %r1396, %r1392; - shf.l.wrap.b32 %r1398, %r1397, %r1397, 24; - add.s32 %r1399, %r1398, %r1393; - xor.b32 %r1400, %r1399, %r1395; - shf.l.wrap.b32 %r1401, %r1400, %r1400, 25; - add.s32 %r1402, %r1358, %r1401; - xor.b32 %r1403, %r1402, %r1373; - shf.l.wrap.b32 %r1404, %r1403, %r1403, 16; - add.s32 %r1405, %r1404, %r1386; - xor.b32 %r1406, %r1405, %r1401; - shf.l.wrap.b32 %r1407, %r1406, %r1406, 20; - add.s32 %r1408, %r1402, %r1407; - xor.b32 %r1409, %r1408, %r1404; - shf.l.wrap.b32 %r1410, %r1409, %r1409, 24; - add.s32 %r1411, %r1410, %r1405; - xor.b32 %r1412, %r1411, %r1407; - shf.l.wrap.b32 %r1413, %r1412, %r1412, 25; - add.s32 %r1414, %r1371, %r1363; - xor.b32 %r1415, %r1414, %r1385; - shf.l.wrap.b32 %r1416, %r1415, %r1415, 16; - add.s32 %r1417, %r1416, %r1399; - xor.b32 %r1418, %r1417, %r1363; - shf.l.wrap.b32 %r1419, %r1418, %r1418, 20; - add.s32 %r1420, %r1414, %r1419; - xor.b32 %r1421, %r1420, %r1416; - shf.l.wrap.b32 %r1422, %r1421, %r1421, 24; - add.s32 %r1423, %r1422, %r1417; - xor.b32 %r1424, %r1423, %r1419; - shf.l.wrap.b32 %r1425, %r1424, %r1424, 25; - add.s32 %r1426, %r1383, %r1376; - xor.b32 %r1427, %r1398, %r1426; - shf.l.wrap.b32 %r1428, %r1427, %r1427, 16; - add.s32 %r1429, %r1428, %r1361; - xor.b32 %r1430, %r1429, %r1376; - shf.l.wrap.b32 %r1431, %r1430, %r1430, 20; - add.s32 %r1432, %r1426, %r1431; - xor.b32 %r1433, %r1432, %r1428; + add.s32 %r1326, %r1278, %r1123; + add.s32 %r1327, %r1326, %r1269; + xor.b32 %r1328, %r1327, %r1294; + shf.l.wrap.b32 %r1329, %r1328, %r1328, 16; + add.s32 %r1330, %r1329, %r1253; + xor.b32 %r1331, %r1330, %r1269; + shf.l.wrap.b32 %r1332, %r1331, %r1331, 20; + add.s32 %r1333, %r1327, %r1074; + add.s32 %r1334, %r1333, %r1332; + xor.b32 %r1335, %r1334, %r1329; + shf.l.wrap.b32 %r1336, %r1335, %r1335, 24; + add.s32 %r1337, %r1336, %r1330; + xor.b32 %r1338, %r1337, %r1332; + shf.l.wrap.b32 %r1339, %r1338, %r1338, 25; + add.s32 %r1340, %r1292, %r1102; + add.s32 %r1341, %r1340, %r1283; + xor.b32 %r1342, %r1341, %r1252; + shf.l.wrap.b32 %r1343, %r1342, %r1342, 16; + add.s32 %r1344, %r1343, %r1267; + xor.b32 %r1345, %r1344, %r1283; + shf.l.wrap.b32 %r1346, %r1345, %r1345, 20; + add.s32 %r1347, %r1341, 
%r1165; + add.s32 %r1348, %r1347, %r1346; + xor.b32 %r1349, %r1348, %r1343; + shf.l.wrap.b32 %r1350, %r1349, %r1349, 24; + add.s32 %r1351, %r1350, %r1344; + xor.b32 %r1352, %r1351, %r1346; + shf.l.wrap.b32 %r1353, %r1352, %r1352, 25; + add.s32 %r1354, %r1306, %r1081; + add.s32 %r1355, %r1354, %r1325; + xor.b32 %r1356, %r1355, %r1350; + shf.l.wrap.b32 %r1357, %r1356, %r1356, 16; + add.s32 %r1358, %r1357, %r1337; + xor.b32 %r1359, %r1358, %r1325; + shf.l.wrap.b32 %r1360, %r1359, %r1359, 20; + add.s32 %r1361, %r1355, %r1151; + add.s32 %r1362, %r1361, %r1360; + xor.b32 %r1363, %r1362, %r1357; + shf.l.wrap.b32 %r1364, %r1363, %r1363, 24; + add.s32 %r1365, %r1364, %r1358; + xor.b32 %r1366, %r1365, %r1360; + shf.l.wrap.b32 %r1367, %r1366, %r1366, 25; + add.s32 %r1368, %r1320, %r1158; + add.s32 %r1369, %r1368, %r1339; + xor.b32 %r1370, %r1369, %r1308; + shf.l.wrap.b32 %r1371, %r1370, %r1370, 16; + add.s32 %r1372, %r1371, %r1351; + xor.b32 %r1373, %r1372, %r1339; + shf.l.wrap.b32 %r1374, %r1373, %r1373, 20; + add.s32 %r1375, %r1369, %r1109; + add.s32 %r1376, %r1375, %r1374; + xor.b32 %r1377, %r1376, %r1371; + shf.l.wrap.b32 %r1378, %r1377, %r1377, 24; + add.s32 %r1379, %r1378, %r1372; + xor.b32 %r1380, %r1379, %r1374; + shf.l.wrap.b32 %r1381, %r1380, %r1380, 25; + add.s32 %r1382, %r1334, %r1137; + add.s32 %r1383, %r1382, %r1353; + xor.b32 %r1384, %r1383, %r1322; + shf.l.wrap.b32 %r1385, %r1384, %r1384, 16; + add.s32 %r1386, %r1385, %r1309; + xor.b32 %r1387, %r1386, %r1353; + shf.l.wrap.b32 %r1388, %r1387, %r1387, 20; + add.s32 %r1389, %r1383, %r1172; + add.s32 %r1390, %r1389, %r1388; + xor.b32 %r1391, %r1390, %r1385; + shf.l.wrap.b32 %r1392, %r1391, %r1391, 24; + add.s32 %r1393, %r1392, %r1386; + xor.b32 %r1394, %r1393, %r1388; + shf.l.wrap.b32 %r1395, %r1394, %r1394, 25; + add.s32 %r1396, %r1348, %r1179; + add.s32 %r1397, %r1396, %r1311; + xor.b32 %r1398, %r1397, %r1336; + shf.l.wrap.b32 %r1399, %r1398, %r1398, 16; + add.s32 %r1400, %r1399, %r1323; + xor.b32 %r1401, %r1400, %r1311; + shf.l.wrap.b32 %r1402, %r1401, %r1401, 20; + add.s32 %r1403, %r1397, %r1130; + add.s32 %r1404, %r1403, %r1402; + xor.b32 %r1405, %r1404, %r1399; + shf.l.wrap.b32 %r1406, %r1405, %r1405, 24; + add.s32 %r1407, %r1406, %r1400; + xor.b32 %r1408, %r1407, %r1402; + shf.l.wrap.b32 %r1409, %r1408, %r1408, 25; + add.s32 %r1410, %r1362, %r1095; + add.s32 %r1411, %r1410, %r1409; + xor.b32 %r1412, %r1411, %r1378; + shf.l.wrap.b32 %r1413, %r1412, %r1412, 16; + add.s32 %r1414, %r1413, %r1393; + xor.b32 %r1415, %r1414, %r1409; + shf.l.wrap.b32 %r1416, %r1415, %r1415, 20; + add.s32 %r1417, %r1411, %r1102; + add.s32 %r1418, %r1417, %r1416; + xor.b32 %r1419, %r1418, %r1413; + shf.l.wrap.b32 %r1420, %r1419, %r1419, 24; + add.s32 %r1421, %r1420, %r1414; + xor.b32 %r1422, %r1421, %r1416; + shf.l.wrap.b32 %r1423, %r1422, %r1422, 25; + add.s32 %r1424, %r1376, %r1144; + add.s32 %r1425, %r1424, %r1367; + xor.b32 %r1426, %r1425, %r1392; + shf.l.wrap.b32 %r1427, %r1426, %r1426, 16; + add.s32 %r1428, %r1427, %r1407; + xor.b32 %r1429, %r1428, %r1367; + shf.l.wrap.b32 %r1430, %r1429, %r1429, 20; + add.s32 %r1431, %r1425, %r1158; + add.s32 %r1432, %r1431, %r1430; + xor.b32 %r1433, %r1432, %r1427; shf.l.wrap.b32 %r1434, %r1433, %r1433, 24; - add.s32 %r1435, %r1434, %r1429; - xor.b32 %r1436, %r1435, %r1431; + add.s32 %r1435, %r1434, %r1428; + xor.b32 %r1436, %r1435, %r1430; shf.l.wrap.b32 %r1437, %r1436, %r1436, 25; - add.s32 %r1438, %r1396, %r1388; - xor.b32 %r1439, %r1360, %r1438; - shf.l.wrap.b32 %r1440, %r1439, %r1439, 16; - add.s32 %r1441, 
%r1440, %r1374; - xor.b32 %r1442, %r1441, %r1388; - shf.l.wrap.b32 %r1443, %r1442, %r1442, 20; - add.s32 %r1444, %r1438, %r1443; - xor.b32 %r1445, %r1444, %r1440; - shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; - add.s32 %r1447, %r1446, %r1441; - xor.b32 %r1448, %r1447, %r1443; - shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; - add.s32 %r1450, %r1408, %r1425; - xor.b32 %r1451, %r1450, %r1446; - shf.l.wrap.b32 %r1452, %r1451, %r1451, 16; - add.s32 %r1453, %r1452, %r1435; - xor.b32 %r1454, %r1453, %r1425; - shf.l.wrap.b32 %r1455, %r1454, %r1454, 20; - add.s32 %r1456, %r1450, %r988; - add.s32 %r1457, %r1456, %r1455; - xor.b32 %r1458, %r1457, %r1452; - shf.l.wrap.b32 %r1459, %r1458, %r1458, 24; - add.s32 %r1460, %r1459, %r1453; + add.s32 %r1438, %r1390, %r1165; + add.s32 %r1439, %r1438, %r1381; + xor.b32 %r1440, %r1439, %r1406; + shf.l.wrap.b32 %r1441, %r1440, %r1440, 16; + add.s32 %r1442, %r1441, %r1365; + xor.b32 %r1443, %r1442, %r1381; + shf.l.wrap.b32 %r1444, %r1443, %r1443, 20; + add.s32 %r1445, %r1439, %r1088; + add.s32 %r1446, %r1445, %r1444; + xor.b32 %r1447, %r1446, %r1441; + shf.l.wrap.b32 %r1448, %r1447, %r1447, 24; + add.s32 %r1449, %r1448, %r1442; + xor.b32 %r1450, %r1449, %r1444; + shf.l.wrap.b32 %r1451, %r1450, %r1450, 25; + add.s32 %r1452, %r1404, %r1123; + add.s32 %r1453, %r1452, %r1395; + xor.b32 %r1454, %r1453, %r1364; + shf.l.wrap.b32 %r1455, %r1454, %r1454, 16; + add.s32 %r1456, %r1455, %r1379; + xor.b32 %r1457, %r1456, %r1395; + shf.l.wrap.b32 %r1458, %r1457, %r1457, 20; + add.s32 %r1459, %r1453, %r1172; + add.s32 %r1460, %r1459, %r1458; xor.b32 %r1461, %r1460, %r1455; - shf.l.wrap.b32 %r1462, %r1461, %r1461, 25; - add.s32 %r1463, %r1437, %r1420; - xor.b32 %r1464, %r1410, %r1463; - shf.l.wrap.b32 %r1465, %r1464, %r1464, 16; - add.s32 %r1466, %r1465, %r1447; - xor.b32 %r1467, %r1466, %r1437; - shf.l.wrap.b32 %r1468, %r1467, %r1467, 20; - add.s32 %r1469, %r1463, %r996; - add.s32 %r1470, %r1469, %r1468; - xor.b32 %r1471, %r1470, %r1465; - shf.l.wrap.b32 %r1472, %r1471, %r1471, 24; - add.s32 %r1473, %r1472, %r1466; - xor.b32 %r1474, %r1473, %r1468; - shf.l.wrap.b32 %r1475, %r1474, %r1474, 25; - add.s32 %r1476, %r1432, %r972; - add.s32 %r1477, %r1476, %r1449; - xor.b32 %r1478, %r1422, %r1477; - shf.l.wrap.b32 %r1479, %r1478, %r1478, 16; - add.s32 %r1480, %r1479, %r1411; - xor.b32 %r1481, %r1480, %r1449; - shf.l.wrap.b32 %r1482, %r1481, %r1481, 20; - add.s32 %r1483, %r1477, %r980; - add.s32 %r1484, %r1483, %r1482; - xor.b32 %r1485, %r1484, %r1479; - shf.l.wrap.b32 %r1486, %r1485, %r1485, 24; - add.s32 %r1487, %r1486, %r1480; - xor.b32 %r1488, %r1487, %r1482; - shf.l.wrap.b32 %r1489, %r1488, %r1488, 25; - add.s32 %r1490, %r1444, %r1413; - xor.b32 %r1491, %r1490, %r1434; - shf.l.wrap.b32 %r1492, %r1491, %r1491, 16; - add.s32 %r1493, %r1492, %r1423; - xor.b32 %r1494, %r1493, %r1413; - shf.l.wrap.b32 %r1495, %r1494, %r1494, 20; - add.s32 %r1496, %r1490, %r1495; - xor.b32 %r1497, %r1496, %r1492; - shf.l.wrap.b32 %r1498, %r1497, %r1497, 24; - add.s32 %r1499, %r1498, %r1493; - xor.b32 %r1500, %r1499, %r1495; - shf.l.wrap.b32 %r1501, %r1500, %r1500, 25; - add.s32 %r1502, %r1457, %r1501; - xor.b32 %r1503, %r1502, %r1472; - shf.l.wrap.b32 %r1504, %r1503, %r1503, 16; - add.s32 %r1505, %r1504, %r1487; - xor.b32 %r1506, %r1505, %r1501; - shf.l.wrap.b32 %r1507, %r1506, %r1506, 20; - add.s32 %r1508, %r1502, %r1507; - xor.b32 %r1509, %r1508, %r1504; - shf.l.wrap.b32 %r1510, %r1509, %r1509, 24; - add.s32 %r1511, %r1510, %r1505; - xor.b32 %r1512, %r1511, %r1507; - shf.l.wrap.b32 %r1513, %r1512, %r1512, 
25; - add.s32 %r1514, %r1470, %r1462; - xor.b32 %r1515, %r1514, %r1486; - shf.l.wrap.b32 %r1516, %r1515, %r1515, 16; - add.s32 %r1517, %r1516, %r1499; - xor.b32 %r1518, %r1517, %r1462; - shf.l.wrap.b32 %r1519, %r1518, %r1518, 20; - add.s32 %r1520, %r1514, %r1519; - xor.b32 %r1521, %r1520, %r1516; - shf.l.wrap.b32 %r1522, %r1521, %r1521, 24; - add.s32 %r1523, %r1522, %r1517; - xor.b32 %r1524, %r1523, %r1519; - shf.l.wrap.b32 %r1525, %r1524, %r1524, 25; - add.s32 %r1526, %r1484, %r1475; - xor.b32 %r1527, %r1498, %r1526; - shf.l.wrap.b32 %r1528, %r1527, %r1527, 16; - add.s32 %r1529, %r1528, %r1460; - xor.b32 %r1530, %r1529, %r1475; - shf.l.wrap.b32 %r1531, %r1530, %r1530, 20; - add.s32 %r1532, %r1526, %r1531; - xor.b32 %r1533, %r1532, %r1528; - shf.l.wrap.b32 %r1534, %r1533, %r1533, 24; - add.s32 %r1535, %r1534, %r1529; - xor.b32 %r1536, %r1535, %r1531; - shf.l.wrap.b32 %r1537, %r1536, %r1536, 25; - add.s32 %r1538, %r1496, %r1489; - xor.b32 %r1539, %r1459, %r1538; - shf.l.wrap.b32 %r1540, %r1539, %r1539, 16; - add.s32 %r1541, %r1540, %r1473; - xor.b32 %r1542, %r1541, %r1489; - shf.l.wrap.b32 %r1543, %r1542, %r1542, 20; - add.s32 %r1544, %r1538, %r980; - add.s32 %r1545, %r1544, %r1543; - xor.b32 %r1546, %r1545, %r1540; - shf.l.wrap.b32 %r1547, %r1546, %r1546, 24; - add.s32 %r1548, %r1547, %r1541; - xor.b32 %r1549, %r1548, %r1543; - shf.l.wrap.b32 %r1550, %r1549, %r1549, 25; - add.s32 %r1551, %r1508, %r1525; - xor.b32 %r1552, %r1551, %r1547; + shf.l.wrap.b32 %r1462, %r1461, %r1461, 24; + add.s32 %r1463, %r1462, %r1456; + xor.b32 %r1464, %r1463, %r1458; + shf.l.wrap.b32 %r1465, %r1464, %r1464, 25; + add.s32 %r1466, %r1418, %r1116; + add.s32 %r1467, %r1466, %r1437; + xor.b32 %r1468, %r1467, %r1462; + shf.l.wrap.b32 %r1469, %r1468, %r1468, 16; + add.s32 %r1470, %r1469, %r1449; + xor.b32 %r1471, %r1470, %r1437; + shf.l.wrap.b32 %r1472, %r1471, %r1471, 20; + add.s32 %r1473, %r1467, %r1109; + add.s32 %r1474, %r1473, %r1472; + xor.b32 %r1475, %r1474, %r1469; + shf.l.wrap.b32 %r1476, %r1475, %r1475, 24; + add.s32 %r1477, %r1476, %r1470; + xor.b32 %r1478, %r1477, %r1472; + shf.l.wrap.b32 %r1479, %r1478, %r1478, 25; + add.s32 %r1480, %r1432, %r1137; + add.s32 %r1481, %r1480, %r1451; + xor.b32 %r1482, %r1481, %r1420; + shf.l.wrap.b32 %r1483, %r1482, %r1482, 16; + add.s32 %r1484, %r1483, %r1463; + xor.b32 %r1485, %r1484, %r1451; + shf.l.wrap.b32 %r1486, %r1485, %r1485, 20; + add.s32 %r1487, %r1481, %r1074; + add.s32 %r1488, %r1487, %r1486; + xor.b32 %r1489, %r1488, %r1483; + shf.l.wrap.b32 %r1490, %r1489, %r1489, 24; + add.s32 %r1491, %r1490, %r1484; + xor.b32 %r1492, %r1491, %r1486; + shf.l.wrap.b32 %r1493, %r1492, %r1492, 25; + add.s32 %r1494, %r1446, %r1151; + add.s32 %r1495, %r1494, %r1465; + xor.b32 %r1496, %r1495, %r1434; + shf.l.wrap.b32 %r1497, %r1496, %r1496, 16; + add.s32 %r1498, %r1497, %r1421; + xor.b32 %r1499, %r1498, %r1465; + shf.l.wrap.b32 %r1500, %r1499, %r1499, 20; + add.s32 %r1501, %r1495, %r1179; + add.s32 %r1502, %r1501, %r1500; + xor.b32 %r1503, %r1502, %r1497; + shf.l.wrap.b32 %r1504, %r1503, %r1503, 24; + add.s32 %r1505, %r1504, %r1498; + xor.b32 %r1506, %r1505, %r1500; + shf.l.wrap.b32 %r1507, %r1506, %r1506, 25; + add.s32 %r1508, %r1460, %r1130; + add.s32 %r1509, %r1508, %r1423; + xor.b32 %r1510, %r1509, %r1448; + shf.l.wrap.b32 %r1511, %r1510, %r1510, 16; + add.s32 %r1512, %r1511, %r1435; + xor.b32 %r1513, %r1512, %r1423; + shf.l.wrap.b32 %r1514, %r1513, %r1513, 20; + add.s32 %r1515, %r1509, %r1081; + add.s32 %r1516, %r1515, %r1514; + xor.b32 %r1517, %r1516, %r1511; + 
shf.l.wrap.b32 %r1518, %r1517, %r1517, 24; + add.s32 %r1519, %r1518, %r1512; + xor.b32 %r1520, %r1519, %r1514; + shf.l.wrap.b32 %r1521, %r1520, %r1520, 25; + add.s32 %r1522, %r1474, %r1144; + add.s32 %r1523, %r1522, %r1521; + xor.b32 %r1524, %r1523, %r1490; + shf.l.wrap.b32 %r1525, %r1524, %r1524, 16; + add.s32 %r1526, %r1525, %r1505; + xor.b32 %r1527, %r1526, %r1521; + shf.l.wrap.b32 %r1528, %r1527, %r1527, 20; + add.s32 %r1529, %r1523, %r1123; + add.s32 %r1530, %r1529, %r1528; + xor.b32 %r1531, %r1530, %r1525; + shf.l.wrap.b32 %r1532, %r1531, %r1531, 24; + add.s32 %r1533, %r1532, %r1526; + xor.b32 %r1534, %r1533, %r1528; + shf.l.wrap.b32 %r1535, %r1534, %r1534, 25; + add.s32 %r1536, %r1488, %r1158; + add.s32 %r1537, %r1536, %r1479; + xor.b32 %r1538, %r1537, %r1504; + shf.l.wrap.b32 %r1539, %r1538, %r1538, 16; + add.s32 %r1540, %r1539, %r1519; + xor.b32 %r1541, %r1540, %r1479; + shf.l.wrap.b32 %r1542, %r1541, %r1541, 20; + add.s32 %r1543, %r1537, %r1137; + add.s32 %r1544, %r1543, %r1542; + xor.b32 %r1545, %r1544, %r1539; + shf.l.wrap.b32 %r1546, %r1545, %r1545, 24; + add.s32 %r1547, %r1546, %r1540; + xor.b32 %r1548, %r1547, %r1542; + shf.l.wrap.b32 %r1549, %r1548, %r1548, 25; + add.s32 %r1550, %r1502, %r1172; + add.s32 %r1551, %r1550, %r1493; + xor.b32 %r1552, %r1551, %r1518; shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; - add.s32 %r1554, %r1553, %r1535; - xor.b32 %r1555, %r1554, %r1525; + add.s32 %r1554, %r1553, %r1477; + xor.b32 %r1555, %r1554, %r1493; shf.l.wrap.b32 %r1556, %r1555, %r1555, 20; - add.s32 %r1557, %r1551, %r996; + add.s32 %r1557, %r1551, %r1095; add.s32 %r1558, %r1557, %r1556; xor.b32 %r1559, %r1558, %r1553; shf.l.wrap.b32 %r1560, %r1559, %r1559, 24; add.s32 %r1561, %r1560, %r1554; xor.b32 %r1562, %r1561, %r1556; shf.l.wrap.b32 %r1563, %r1562, %r1562, 25; - add.s32 %r1564, %r1537, %r972; - add.s32 %r1565, %r1564, %r1520; - xor.b32 %r1566, %r1510, %r1565; + add.s32 %r1564, %r1516, %r1165; + add.s32 %r1565, %r1564, %r1507; + xor.b32 %r1566, %r1565, %r1476; shf.l.wrap.b32 %r1567, %r1566, %r1566, 16; - add.s32 %r1568, %r1567, %r1548; - xor.b32 %r1569, %r1568, %r1537; + add.s32 %r1568, %r1567, %r1491; + xor.b32 %r1569, %r1568, %r1507; shf.l.wrap.b32 %r1570, %r1569, %r1569, 20; - add.s32 %r1571, %r1565, %r1570; - xor.b32 %r1572, %r1571, %r1567; - shf.l.wrap.b32 %r1573, %r1572, %r1572, 24; - add.s32 %r1574, %r1573, %r1568; - xor.b32 %r1575, %r1574, %r1570; - shf.l.wrap.b32 %r1576, %r1575, %r1575, 25; - add.s32 %r1577, %r1532, %r988; - add.s32 %r1578, %r1577, %r1550; - xor.b32 %r1579, %r1522, %r1578; - shf.l.wrap.b32 %r1580, %r1579, %r1579, 16; - add.s32 %r1581, %r1580, %r1511; - xor.b32 %r1582, %r1581, %r1550; - shf.l.wrap.b32 %r1583, %r1582, %r1582, 20; - add.s32 %r1584, %r1578, %r1583; - xor.b32 %r1585, %r1584, %r1580; - shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; - add.s32 %r1587, %r1586, %r1581; - xor.b32 %r1588, %r1587, %r1583; - shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; - add.s32 %r1590, %r1545, %r1513; - xor.b32 %r1591, %r1590, %r1534; - shf.l.wrap.b32 %r1592, %r1591, %r1591, 16; - add.s32 %r1593, %r1592, %r1523; - xor.b32 %r1594, %r1593, %r1513; - shf.l.wrap.b32 %r1595, %r1594, %r1594, 20; - add.s32 %r1596, %r1590, %r1595; - xor.b32 %r1597, %r1596, %r1592; - shf.l.wrap.b32 %r1598, %r1597, %r1597, 24; - add.s32 %r1599, %r1598, %r1593; - xor.b32 %r1600, %r1599, %r1595; - shf.l.wrap.b32 %r1601, %r1600, %r1600, 25; - add.s32 %r1602, %r1558, %r1601; - xor.b32 %r1603, %r1602, %r1573; - shf.l.wrap.b32 %r1604, %r1603, %r1603, 16; - add.s32 %r1605, %r1604, %r1587; - xor.b32 %r1606, 
%r1605, %r1601; - shf.l.wrap.b32 %r1607, %r1606, %r1606, 20; - add.s32 %r1608, %r1602, %r1607; - xor.b32 %r1609, %r1608, %r1604; - shf.l.wrap.b32 %r1610, %r1609, %r1609, 24; - add.s32 %r1611, %r1610, %r1605; - xor.b32 %r1612, %r1611, %r1607; - shf.l.wrap.b32 %r1613, %r1612, %r1612, 25; - add.s32 %r1614, %r1571, %r1563; - xor.b32 %r1615, %r1614, %r1586; - shf.l.wrap.b32 %r1616, %r1615, %r1615, 16; - add.s32 %r1617, %r1616, %r1599; - xor.b32 %r1618, %r1617, %r1563; - shf.l.wrap.b32 %r1619, %r1618, %r1618, 20; - add.s32 %r1620, %r1614, %r972; - add.s32 %r1621, %r1620, %r1619; - xor.b32 %r1622, %r1621, %r1616; - shf.l.wrap.b32 %r1623, %r1622, %r1622, 24; - add.s32 %r1624, %r1623, %r1617; - xor.b32 %r1625, %r1624, %r1619; - shf.l.wrap.b32 %r1626, %r1625, %r1625, 25; - add.s32 %r1627, %r1584, %r980; - add.s32 %r1628, %r1627, %r1576; - xor.b32 %r1629, %r1598, %r1628; - shf.l.wrap.b32 %r1630, %r1629, %r1629, 16; - add.s32 %r1631, %r1630, %r1561; - xor.b32 %r1632, %r1631, %r1576; - shf.l.wrap.b32 %r1633, %r1632, %r1632, 20; - add.s32 %r1634, %r1628, %r1633; - xor.b32 %r1635, %r1634, %r1630; - shf.l.wrap.b32 %r1636, %r1635, %r1635, 24; - add.s32 %r1637, %r1636, %r1631; - xor.b32 %r1638, %r1637, %r1633; - shf.l.wrap.b32 %r1639, %r1638, %r1638, 25; - add.s32 %r1640, %r1596, %r1589; - xor.b32 %r1641, %r1560, %r1640; - shf.l.wrap.b32 %r1642, %r1641, %r1641, 16; - add.s32 %r1643, %r1642, %r1574; - xor.b32 %r1644, %r1643, %r1589; - shf.l.wrap.b32 %r1645, %r1644, %r1644, 20; - add.s32 %r1646, %r1640, %r1645; - xor.b32 %r1647, %r1646, %r1642; - shf.l.wrap.b32 %r1648, %r1647, %r1647, 24; - add.s32 %r1649, %r1648, %r1643; - xor.b32 %r1650, %r1649, %r1645; - shf.l.wrap.b32 %r1651, %r1650, %r1650, 25; - add.s32 %r1652, %r1608, %r1626; - xor.b32 %r1653, %r1652, %r1648; - shf.l.wrap.b32 %r1654, %r1653, %r1653, 16; - add.s32 %r1655, %r1654, %r1637; - xor.b32 %r1656, %r1655, %r1626; - shf.l.wrap.b32 %r1657, %r1656, %r1656, 20; - add.s32 %r1658, %r1652, %r1657; - xor.b32 %r1659, %r1658, %r1654; - shf.l.wrap.b32 %r1660, %r1659, %r1659, 24; - add.s32 %r1661, %r1660, %r1655; - xor.b32 %r1662, %r1661, %r1657; - shf.l.wrap.b32 %r1663, %r1662, %r1662, 25; - add.s32 %r1664, %r1639, %r988; - add.s32 %r1665, %r1664, %r1621; - xor.b32 %r1666, %r1610, %r1665; - shf.l.wrap.b32 %r1667, %r1666, %r1666, 16; - add.s32 %r1668, %r1667, %r1649; - xor.b32 %r1669, %r1668, %r1639; - shf.l.wrap.b32 %r1670, %r1669, %r1669, 20; - add.s32 %r1671, %r1665, %r1670; - xor.b32 %r1672, %r1671, %r1667; - shf.l.wrap.b32 %r1673, %r1672, %r1672, 24; - add.s32 %r1674, %r1673, %r1668; - xor.b32 %r1675, %r1674, %r1670; - shf.l.wrap.b32 %r1676, %r1675, %r1675, 25; - add.s32 %r1677, %r1634, %r996; - add.s32 %r1678, %r1677, %r1651; - xor.b32 %r1679, %r1623, %r1678; - shf.l.wrap.b32 %r1680, %r1679, %r1679, 16; - add.s32 %r1681, %r1680, %r1611; - xor.b32 %r1682, %r1681, %r1651; - shf.l.wrap.b32 %r1683, %r1682, %r1682, 20; - add.s32 %r1684, %r1678, %r1683; - xor.b32 %r1685, %r1684, %r1680; + add.s32 %r1571, %r1565, %r1179; + add.s32 %r1572, %r1571, %r1570; + xor.b32 %r1573, %r1572, %r1567; + shf.l.wrap.b32 %r1574, %r1573, %r1573, 24; + add.s32 %r1575, %r1574, %r1568; + xor.b32 %r1576, %r1575, %r1570; + shf.l.wrap.b32 %r1577, %r1576, %r1576, 25; + add.s32 %r1578, %r1530, %r1102; + add.s32 %r1579, %r1578, %r1549; + xor.b32 %r1580, %r1579, %r1574; + shf.l.wrap.b32 %r1581, %r1580, %r1580, 16; + add.s32 %r1582, %r1581, %r1561; + xor.b32 %r1583, %r1582, %r1549; + shf.l.wrap.b32 %r1584, %r1583, %r1583, 20; + add.s32 %r1585, %r1579, %r1074; + add.s32 %r1586, %r1585, 
%r1584; + xor.b32 %r1587, %r1586, %r1581; + shf.l.wrap.b32 %r1588, %r1587, %r1587, 24; + add.s32 %r1589, %r1588, %r1582; + xor.b32 %r1590, %r1589, %r1584; + shf.l.wrap.b32 %r1591, %r1590, %r1590, 25; + add.s32 %r1592, %r1544, %r1151; + add.s32 %r1593, %r1592, %r1563; + xor.b32 %r1594, %r1593, %r1532; + shf.l.wrap.b32 %r1595, %r1594, %r1594, 16; + add.s32 %r1596, %r1595, %r1575; + xor.b32 %r1597, %r1596, %r1563; + shf.l.wrap.b32 %r1598, %r1597, %r1597, 20; + add.s32 %r1599, %r1593, %r1088; + add.s32 %r1600, %r1599, %r1598; + xor.b32 %r1601, %r1600, %r1595; + shf.l.wrap.b32 %r1602, %r1601, %r1601, 24; + add.s32 %r1603, %r1602, %r1596; + xor.b32 %r1604, %r1603, %r1598; + shf.l.wrap.b32 %r1605, %r1604, %r1604, 25; + add.s32 %r1606, %r1558, %r1109; + add.s32 %r1607, %r1606, %r1577; + xor.b32 %r1608, %r1607, %r1546; + shf.l.wrap.b32 %r1609, %r1608, %r1608, 16; + add.s32 %r1610, %r1609, %r1533; + xor.b32 %r1611, %r1610, %r1577; + shf.l.wrap.b32 %r1612, %r1611, %r1611, 20; + add.s32 %r1613, %r1607, %r1130; + add.s32 %r1614, %r1613, %r1612; + xor.b32 %r1615, %r1614, %r1609; + shf.l.wrap.b32 %r1616, %r1615, %r1615, 24; + add.s32 %r1617, %r1616, %r1610; + xor.b32 %r1618, %r1617, %r1612; + shf.l.wrap.b32 %r1619, %r1618, %r1618, 25; + add.s32 %r1620, %r1572, %r1081; + add.s32 %r1621, %r1620, %r1535; + xor.b32 %r1622, %r1621, %r1560; + shf.l.wrap.b32 %r1623, %r1622, %r1622, 16; + add.s32 %r1624, %r1623, %r1547; + xor.b32 %r1625, %r1624, %r1535; + shf.l.wrap.b32 %r1626, %r1625, %r1625, 20; + add.s32 %r1627, %r1621, %r1116; + add.s32 %r1628, %r1627, %r1626; + xor.b32 %r1629, %r1628, %r1623; + shf.l.wrap.b32 %r1630, %r1629, %r1629, 24; + add.s32 %r1631, %r1630, %r1624; + xor.b32 %r1632, %r1631, %r1626; + shf.l.wrap.b32 %r1633, %r1632, %r1632, 25; + add.s32 %r1634, %r1586, %r1158; + add.s32 %r1635, %r1634, %r1633; + xor.b32 %r1636, %r1635, %r1602; + shf.l.wrap.b32 %r1637, %r1636, %r1636, 16; + add.s32 %r1638, %r1637, %r1617; + xor.b32 %r1639, %r1638, %r1633; + shf.l.wrap.b32 %r1640, %r1639, %r1639, 20; + add.s32 %r1641, %r1635, %r1165; + add.s32 %r1642, %r1641, %r1640; + xor.b32 %r1643, %r1642, %r1637; + shf.l.wrap.b32 %r1644, %r1643, %r1643, 24; + add.s32 %r1645, %r1644, %r1638; + xor.b32 %r1646, %r1645, %r1640; + shf.l.wrap.b32 %r1647, %r1646, %r1646, 25; + add.s32 %r1648, %r1600, %r1137; + add.s32 %r1649, %r1648, %r1591; + xor.b32 %r1650, %r1649, %r1616; + shf.l.wrap.b32 %r1651, %r1650, %r1650, 16; + add.s32 %r1652, %r1651, %r1631; + xor.b32 %r1653, %r1652, %r1591; + shf.l.wrap.b32 %r1654, %r1653, %r1653, 20; + add.s32 %r1655, %r1649, %r1151; + add.s32 %r1656, %r1655, %r1654; + xor.b32 %r1657, %r1656, %r1651; + shf.l.wrap.b32 %r1658, %r1657, %r1657, 24; + add.s32 %r1659, %r1658, %r1652; + xor.b32 %r1660, %r1659, %r1654; + shf.l.wrap.b32 %r1661, %r1660, %r1660, 25; + add.s32 %r1662, %r1614, %r1179; + add.s32 %r1663, %r1662, %r1605; + xor.b32 %r1664, %r1663, %r1630; + shf.l.wrap.b32 %r1665, %r1664, %r1664, 16; + add.s32 %r1666, %r1665, %r1589; + xor.b32 %r1667, %r1666, %r1605; + shf.l.wrap.b32 %r1668, %r1667, %r1667, 20; + add.s32 %r1669, %r1663, %r1144; + add.s32 %r1670, %r1669, %r1668; + xor.b32 %r1671, %r1670, %r1665; + shf.l.wrap.b32 %r1672, %r1671, %r1671, 24; + add.s32 %r1673, %r1672, %r1666; + xor.b32 %r1674, %r1673, %r1668; + shf.l.wrap.b32 %r1675, %r1674, %r1674, 25; + add.s32 %r1676, %r1628, %r1172; + add.s32 %r1677, %r1676, %r1619; + xor.b32 %r1678, %r1677, %r1588; + shf.l.wrap.b32 %r1679, %r1678, %r1678, 16; + add.s32 %r1680, %r1679, %r1603; + xor.b32 %r1681, %r1680, %r1619; + shf.l.wrap.b32 
%r1682, %r1681, %r1681, 20; + add.s32 %r1683, %r1677, %r1130; + add.s32 %r1684, %r1683, %r1682; + xor.b32 %r1685, %r1684, %r1679; shf.l.wrap.b32 %r1686, %r1685, %r1685, 24; - add.s32 %r1687, %r1686, %r1681; - xor.b32 %r1688, %r1687, %r1683; + add.s32 %r1687, %r1686, %r1680; + xor.b32 %r1688, %r1687, %r1682; shf.l.wrap.b32 %r1689, %r1688, %r1688, 25; - add.s32 %r1690, %r1646, %r1613; - xor.b32 %r1691, %r1690, %r1636; - shf.l.wrap.b32 %r1692, %r1691, %r1691, 16; - add.s32 %r1693, %r1692, %r1624; - xor.b32 %r1694, %r1693, %r1613; - shf.l.wrap.b32 %r1695, %r1694, %r1694, 20; - add.s32 %r1696, %r1690, %r1695; - xor.b32 %r1697, %r1696, %r1692; - shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; - add.s32 %r1699, %r1698, %r1693; - xor.b32 %r1700, %r1699, %r1695; - shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; - xor.b32 %r9, %r1658, %r1687; - cvt.u64.u32 %rd132, %r9; - xor.b32 %r1702, %r1699, %r1671; - and.b32 %r1703, %r1702, 255; - cvt.u64.u32 %rd133, %r1703; - bfi.b64 %rd134, %rd133, %rd132, 32, 32; - cvt.u64.u32 %rd135, %r1702; - shl.b64 %rd136, %rd135, 32; - and.b64 %rd137, %rd136, 280375465082880; - or.b64 %rd138, %rd134, %rd137; - and.b64 %rd139, %rd136, 71776119061217280; - shr.u32 %r10, %r1702, 24; - cvt.u64.u32 %rd140, %r10; - shl.b64 %rd141, %rd140, 56; - or.b64 %rd142, %rd138, %rd139; - or.b64 %rd143, %rd142, %rd141; - xor.b32 %r11, %r1661, %r1684; - cvt.u64.u32 %rd144, %r11; - xor.b32 %r1704, %r1696, %r1674; - and.b32 %r1705, %r1704, 255; - cvt.u64.u32 %rd145, %r1705; - bfi.b64 %rd146, %rd145, %rd144, 32, 32; - cvt.u64.u32 %rd147, %r1704; - shl.b64 %rd148, %rd147, 32; - and.b64 %rd149, %rd148, 280375465082880; - or.b64 %rd150, %rd146, %rd149; - and.b64 %rd151, %rd148, 71776119061217280; - shr.u32 %r12, %r1704, 24; - cvt.u64.u32 %rd152, %r12; - shl.b64 %rd153, %rd152, 56; - or.b64 %rd154, %rd150, %rd151; - or.b64 %rd155, %rd154, %rd153; - xor.b32 %r13, %r1701, %r1673; - cvt.u64.u32 %rd156, %r13; - xor.b32 %r1706, %r1663, %r1686; - and.b32 %r1707, %r1706, 255; - cvt.u64.u32 %rd157, %r1707; - bfi.b64 %rd158, %rd157, %rd156, 32, 32; - cvt.u64.u32 %rd159, %r1706; - shl.b64 %rd160, %rd159, 32; - and.b64 %rd161, %rd160, 280375465082880; - or.b64 %rd162, %rd158, %rd161; - and.b64 %rd163, %rd160, 71776119061217280; - shr.u32 %r14, %r1706, 24; - cvt.u64.u32 %rd164, %r14; - shl.b64 %rd165, %rd164, 56; - or.b64 %rd166, %rd162, %rd163; - or.b64 %rd167, %rd166, %rd165; - xor.b32 %r1708, %r1698, %r1676; - cvt.u64.u32 %rd168, %r1708; - xor.b32 %r1709, %r1660, %r1689; - and.b32 %r1710, %r1709, 255; - cvt.u64.u32 %rd169, %r1710; - bfi.b64 %rd170, %rd169, %rd168, 32, 32; - cvt.u64.u32 %rd171, %r1709; - shl.b64 %rd172, %rd171, 32; - and.b64 %rd173, %rd172, 280375465082880; - or.b64 %rd174, %rd170, %rd173; - and.b64 %rd175, %rd172, 71776119061217280; - shr.u32 %r1711, %r1709, 24; - cvt.u64.u32 %rd176, %r1711; - shl.b64 %rd177, %rd176, 56; - or.b64 %rd178, %rd174, %rd175; - or.b64 %rd9, %rd178, %rd177; - shr.u64 %rd10, %rd143, 32; - shr.u64 %rd11, %rd143, 40; - shr.u64 %rd12, %rd143, 48; - shr.u64 %rd13, %rd155, 32; - shr.u64 %rd14, %rd155, 40; - shr.u64 %rd15, %rd155, 48; - shr.u64 %rd16, %rd167, 32; - shr.u64 %rd17, %rd167, 40; - shr.u64 %rd18, %rd167, 48; - shr.u32 %r5809, %r9, 12; - shr.u32 %r5810, %r9, 8; - shr.u32 %r5811, %r9, 4; - and.b32 %r5812, %r5811, 15; - and.b32 %r5813, %r9, 15; - bfi.b32 %r5814, %r5813, %r5812, 8, 4; - shl.b32 %r5815, %r9, 4; - and.b32 %r5816, %r5815, 983040; - or.b32 %r5817, %r5814, %r5816; - shl.b32 %r5818, %r9, 16; - and.b32 %r5819, %r5818, 251658240; - or.b32 %r5746, %r5817, %r5819; 
- shr.u32 %r5820, %r9, 20; - and.b32 %r5821, %r5820, 15; - shr.u32 %r5822, %r9, 16; - and.b32 %r5823, %r5822, 15; - shr.u32 %r5824, %r9, 24; - bfi.b32 %r5825, %r5823, %r5821, 8, 4; - and.b32 %r5826, %r5809, 983040; - or.b32 %r5827, %r5825, %r5826; - and.b32 %r5828, %r9, 251658240; - or.b32 %r5750, %r5827, %r5828; - cvt.u16.u64 %rs83, %rd10; - and.b16 %rs84, %rs83, 240; - shr.u16 %rs85, %rs84, 4; - cvt.u16.u64 %rs86, %rd11; - and.b16 %rs87, %rs86, 240; - shr.u16 %rs88, %rs87, 4; - cvt.u32.u16 %r5829, %rs85; - cvt.u32.u64 %r5830, %rd10; - and.b32 %r5831, %r5830, 15; - prmt.b32 %r5832, %r5831, %r5829, 30212; - cvt.u32.u16 %r5833, %rs88; - prmt.b32 %r5834, %r5833, %r5832, 28756; - cvt.u32.u64 %r5835, %rd11; - shl.b32 %r5836, %r5835, 24; - and.b32 %r5837, %r5836, 251658240; - or.b32 %r5754, %r5834, %r5837; - cvt.u16.u64 %rs89, %rd12; - and.b16 %rs90, %rs89, 240; - shr.u16 %rs91, %rs90, 4; - cvt.u32.u16 %r5838, %rs91; - cvt.u32.u64 %r5839, %rd12; - and.b32 %r5840, %r5839, 15; - prmt.b32 %r5841, %r5840, %r5838, 30212; - shl.b32 %r5842, %r10, 12; - and.b32 %r5843, %r5842, 983040; - or.b32 %r5844, %r5841, %r5843; - shl.b32 %r5845, %r10, 24; - and.b32 %r5846, %r5845, 251658240; - or.b32 %r5758, %r5844, %r5846; - shr.u32 %r5847, %r11, 12; - shr.u32 %r5848, %r11, 8; - shr.u32 %r5849, %r11, 4; - and.b32 %r5850, %r5849, 15; - and.b32 %r5851, %r11, 15; - bfi.b32 %r5852, %r5851, %r5850, 8, 4; - shl.b32 %r5853, %r11, 4; - and.b32 %r5854, %r5853, 983040; - or.b32 %r5855, %r5852, %r5854; - shl.b32 %r5856, %r11, 16; - and.b32 %r5857, %r5856, 251658240; - or.b32 %r5762, %r5855, %r5857; - shr.u32 %r5858, %r11, 20; - and.b32 %r5859, %r5858, 15; - shr.u32 %r5860, %r11, 16; - and.b32 %r5861, %r5860, 15; - shr.u32 %r5862, %r11, 24; - bfi.b32 %r5863, %r5861, %r5859, 8, 4; - and.b32 %r5864, %r5847, 983040; - or.b32 %r5865, %r5863, %r5864; - and.b32 %r5866, %r11, 251658240; - or.b32 %r5766, %r5865, %r5866; - cvt.u16.u64 %rs92, %rd13; - and.b16 %rs93, %rs92, 240; - shr.u16 %rs94, %rs93, 4; - cvt.u16.u64 %rs95, %rd14; - and.b16 %rs96, %rs95, 240; - shr.u16 %rs97, %rs96, 4; - cvt.u32.u16 %r5867, %rs94; - cvt.u32.u64 %r5868, %rd13; - and.b32 %r5869, %r5868, 15; - prmt.b32 %r5870, %r5869, %r5867, 30212; - cvt.u32.u16 %r5871, %rs97; - prmt.b32 %r5872, %r5871, %r5870, 28756; - cvt.u32.u64 %r5873, %rd14; - shl.b32 %r5874, %r5873, 24; - and.b32 %r5875, %r5874, 251658240; - or.b32 %r5770, %r5872, %r5875; - cvt.u16.u64 %rs98, %rd15; - and.b16 %rs99, %rs98, 240; - shr.u16 %rs100, %rs99, 4; - cvt.u32.u16 %r5876, %rs100; - cvt.u32.u64 %r5877, %rd15; - and.b32 %r5878, %r5877, 15; - prmt.b32 %r5879, %r5878, %r5876, 30212; - shl.b32 %r5880, %r12, 12; - and.b32 %r5881, %r5880, 983040; - or.b32 %r5882, %r5879, %r5881; - shl.b32 %r5883, %r12, 24; - and.b32 %r5884, %r5883, 251658240; - or.b32 %r5774, %r5882, %r5884; - shr.u32 %r5885, %r13, 12; - shr.u32 %r5886, %r13, 8; - shr.u32 %r5887, %r13, 4; - and.b32 %r5888, %r5887, 15; - and.b32 %r5889, %r13, 15; - bfi.b32 %r5890, %r5889, %r5888, 8, 4; - shl.b32 %r5891, %r13, 4; - and.b32 %r5892, %r5891, 983040; - or.b32 %r5893, %r5890, %r5892; - shl.b32 %r5894, %r13, 16; - and.b32 %r5895, %r5894, 251658240; - or.b32 %r5778, %r5893, %r5895; - shr.u32 %r5896, %r13, 20; - and.b32 %r5897, %r5896, 15; - shr.u32 %r5898, %r13, 16; - and.b32 %r5899, %r5898, 15; - shr.u32 %r5900, %r13, 24; - bfi.b32 %r5901, %r5899, %r5897, 8, 4; - and.b32 %r5902, %r5885, 983040; - or.b32 %r5903, %r5901, %r5902; - and.b32 %r5904, %r13, 251658240; - or.b32 %r5782, %r5903, %r5904; - cvt.u16.u64 %rs101, %rd16; - and.b16 
%rs102, %rs101, 240; - shr.u16 %rs103, %rs102, 4; - cvt.u16.u64 %rs104, %rd17; - and.b16 %rs105, %rs104, 240; - shr.u16 %rs106, %rs105, 4; - cvt.u32.u16 %r5905, %rs103; - cvt.u32.u64 %r5906, %rd16; - and.b32 %r5907, %r5906, 15; - prmt.b32 %r5908, %r5907, %r5905, 30212; - cvt.u32.u16 %r5909, %rs106; - prmt.b32 %r5910, %r5909, %r5908, 28756; - cvt.u32.u64 %r5911, %rd17; - shl.b32 %r5912, %r5911, 24; - and.b32 %r5913, %r5912, 251658240; - or.b32 %r5786, %r5910, %r5913; - cvt.u16.u64 %rs107, %rd18; - and.b16 %rs108, %rs107, 240; - shr.u16 %rs109, %rs108, 4; - cvt.u32.u16 %r5914, %rs109; - cvt.u32.u64 %r5915, %rd18; - and.b32 %r5916, %r5915, 15; - prmt.b32 %r5917, %r5916, %r5914, 30212; - shl.b32 %r5918, %r14, 12; - and.b32 %r5919, %r5918, 983040; - or.b32 %r5920, %r5917, %r5919; - shl.b32 %r5921, %r14, 24; - and.b32 %r5922, %r5921, 251658240; - or.b32 %r5790, %r5920, %r5922; - cvt.u16.u64 %rs110, %rd9; - and.b16 %rs111, %rs110, 240; - shr.u16 %rs112, %rs111, 4; - shr.u64 %rd201, %rd9, 8; - cvt.u32.u64 %r5923, %rd201; - cvt.u32.u64 %r5924, %rd9; - shr.u32 %r5925, %r5924, 12; - cvt.u32.u16 %r5926, %rs112; - and.b32 %r5927, %r5924, 15; - prmt.b32 %r5928, %r5927, %r5926, 30212; - shl.b32 %r5929, %r5924, 4; - and.b32 %r5930, %r5929, 983040; - or.b32 %r5931, %r5928, %r5930; - shl.b32 %r5932, %r5923, 24; - and.b32 %r5933, %r5932, 251658240; - or.b32 %r5794, %r5931, %r5933; - shr.u64 %rd202, %rd9, 16; - cvt.u32.u64 %r5934, %rd202; - shr.u32 %r5935, %r5924, 20; - and.b32 %r5936, %r5935, 15; - and.b32 %r5937, %r5934, 15; - shr.u64 %rd203, %rd9, 24; - cvt.u32.u64 %r5938, %rd203; - bfi.b32 %r5939, %r5937, %r5936, 8, 4; - and.b32 %r5940, %r5925, 983040; - or.b32 %r5941, %r5939, %r5940; - shl.b32 %r5942, %r5938, 24; - and.b32 %r5943, %r5942, 251658240; - or.b32 %r5798, %r5941, %r5943; - shr.u64 %rd204, %rd9, 32; - cvt.u32.u64 %r5944, %rd204; - shr.u64 %rd205, %rd9, 36; - cvt.u32.u64 %r5945, %rd205; - and.b32 %r5946, %r5945, 15; - and.b32 %r5947, %r5944, 15; - shr.u64 %rd206, %rd9, 40; - cvt.u32.u64 %r5948, %rd206; - shr.u64 %rd207, %rd9, 44; - cvt.u32.u64 %r5949, %rd207; - bfi.b32 %r5950, %r5947, %r5946, 8, 4; - shl.b32 %r5951, %r5949, 16; - and.b32 %r5952, %r5951, 983040; - or.b32 %r5953, %r5950, %r5952; - shl.b32 %r5954, %r5948, 24; - and.b32 %r5955, %r5954, 251658240; - or.b32 %r5802, %r5953, %r5955; - shr.u64 %rd208, %rd9, 48; - cvt.u32.u64 %r5956, %rd208; - shr.u64 %rd209, %rd9, 52; - cvt.u32.u64 %r5957, %rd209; - and.b32 %r5958, %r5957, 15; - and.b32 %r5959, %r5956, 15; - shr.u64 %rd210, %rd9, 56; - cvt.u32.u64 %r5960, %rd210; - bfi.b32 %r5961, %r5959, %r5958, 8, 4; - and.b32 %r5962, %r5949, 983040; - or.b32 %r5963, %r5961, %r5962; - shl.b32 %r5964, %r5960, 24; - and.b32 %r5965, %r5964, 251658240; - or.b32 %r5806, %r5963, %r5965; - ld.const.u32 %r1713, [matrix]; - mov.u32 %r6244, 0; + add.s32 %r1690, %r1642, %r1123; + add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, 
%r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, %r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, %r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, %r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, %r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 
%r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; + add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 %r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; + add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 %r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; 
+ add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, %r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; + add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, %r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + xor.b32 %r11653, %r1969, %r1938; + st.local.u32 [%rd3+-88], %r11653; + xor.b32 %r11652, %r1927, %r1952; + st.local.u32 [%rd3+-84], %r11652; + xor.b32 %r11651, %r1941, %r1966; + st.local.u32 [%rd3+-80], %r11651; + xor.b32 %r11650, %r1955, %r1924; + st.local.u32 [%rd3+-76], %r11650; + add.s16 %rs352, %rs352, 1; + st.local.u8 [%rd3+1], %rs352; + add.s64 %rd261, %rd261, 64; + add.s64 %rd244, %rd244, -64; + setp.gt.u64 %p10, %rd244, 64; + @%p10 bra $L__BB1_11; + +$L__BB1_12: + cvt.u64.u16 %rd120, %rs351; + and.b64 %rd24, %rd120, 255; + mov.u64 %rd121, 64; + sub.s64 %rd122, %rd121, %rd24; + min.u64 %rd25, %rd122, %rd244; + setp.eq.s64 %p11, %rd25, 0; + @%p11 bra $L__BB1_15; + + add.s64 %rd124, 
%rd2, %rd24; + add.s64 %rd26, %rd124, 72; + mov.u64 %rd245, 0; + +$L__BB1_14: + add.s64 %rd125, %rd261, %rd245; + ld.local.u8 %rs119, [%rd125]; + add.s64 %rd126, %rd26, %rd245; + st.local.u8 [%rd126], %rs119; + add.s64 %rd245, %rd245, 1; + setp.lt.u64 %p12, %rd245, %rd25; + @%p12 bra $L__BB1_14; + +$L__BB1_15: + cvt.u16.u64 %rs120, %rd25; + ld.local.u8 %rs121, [%rd3]; + add.s16 %rs13, %rs121, %rs120; + st.local.u8 [%rd3], %rs13; + mov.u64 %rd127, 80; + sub.s64 %rd29, %rd127, %rd6; + setp.eq.s64 %p13, %rd29, 0; + @%p13 bra $L__BB1_68; + + ld.local.u8 %rs122, [%rd3+1]; + setp.eq.s16 %p14, %rs122, 0; + selp.u16 %rs123, 1, 0, %p14; + ld.local.u8 %rs124, [%rd3+2]; + or.b16 %rs125, %rs124, %rs123; + or.b16 %rs126, %rs125, 2; + ld.local.u8 %r1970, [%rd3+-64]; + ld.local.u8 %r1971, [%rd3+-63]; + prmt.b32 %r1972, %r1971, %r1970, 30212; + ld.local.u8 %r1973, [%rd3+-62]; + prmt.b32 %r1974, %r1973, %r1972, 28756; + ld.local.u8 %r1975, [%rd3+-61]; + prmt.b32 %r1976, %r1975, %r1974, 1620; + ld.local.u8 %r1977, [%rd3+-60]; + ld.local.u8 %r1978, [%rd3+-59]; + prmt.b32 %r1979, %r1978, %r1977, 30212; + ld.local.u8 %r1980, [%rd3+-58]; + prmt.b32 %r1981, %r1980, %r1979, 28756; + ld.local.u8 %r1982, [%rd3+-57]; + prmt.b32 %r1983, %r1982, %r1981, 1620; + ld.local.u8 %r1984, [%rd3+-56]; + ld.local.u8 %r1985, [%rd3+-55]; + prmt.b32 %r1986, %r1985, %r1984, 30212; + ld.local.u8 %r1987, [%rd3+-54]; + prmt.b32 %r1988, %r1987, %r1986, 28756; + ld.local.u8 %r1989, [%rd3+-53]; + prmt.b32 %r1990, %r1989, %r1988, 1620; + ld.local.u8 %r1991, [%rd3+-52]; + ld.local.u8 %r1992, [%rd3+-51]; + prmt.b32 %r1993, %r1992, %r1991, 30212; + ld.local.u8 %r1994, [%rd3+-50]; + prmt.b32 %r1995, %r1994, %r1993, 28756; + ld.local.u8 %r1996, [%rd3+-49]; + prmt.b32 %r1997, %r1996, %r1995, 1620; + ld.local.u8 %r1998, [%rd3+-48]; + ld.local.u8 %r1999, [%rd3+-47]; + prmt.b32 %r2000, %r1999, %r1998, 30212; + ld.local.u8 %r2001, [%rd3+-46]; + prmt.b32 %r2002, %r2001, %r2000, 28756; + ld.local.u8 %r2003, [%rd3+-45]; + prmt.b32 %r2004, %r2003, %r2002, 1620; + ld.local.u8 %r2005, [%rd3+-44]; + ld.local.u8 %r2006, [%rd3+-43]; + prmt.b32 %r2007, %r2006, %r2005, 30212; + ld.local.u8 %r2008, [%rd3+-42]; + prmt.b32 %r2009, %r2008, %r2007, 28756; + ld.local.u8 %r2010, [%rd3+-41]; + prmt.b32 %r2011, %r2010, %r2009, 1620; + ld.local.u8 %r2012, [%rd3+-40]; + ld.local.u8 %r2013, [%rd3+-39]; + prmt.b32 %r2014, %r2013, %r2012, 30212; + ld.local.u8 %r2015, [%rd3+-38]; + prmt.b32 %r2016, %r2015, %r2014, 28756; + ld.local.u8 %r2017, [%rd3+-37]; + prmt.b32 %r2018, %r2017, %r2016, 1620; + ld.local.u8 %r2019, [%rd3+-36]; + ld.local.u8 %r2020, [%rd3+-35]; + prmt.b32 %r2021, %r2020, %r2019, 30212; + ld.local.u8 %r2022, [%rd3+-34]; + prmt.b32 %r2023, %r2022, %r2021, 28756; + ld.local.u8 %r2024, [%rd3+-33]; + prmt.b32 %r2025, %r2024, %r2023, 1620; + ld.local.u8 %r2026, [%rd3+-32]; + ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, 
[%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, %r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, %r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 %r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; 
+ xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, %r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 %r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 %r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + 
xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 %r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, 
%r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, %r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + 
add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, %r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, 
%r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, %r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 %r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, %r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, %r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, 
%r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, %r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; 
+ add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; + add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, %r2765, %r1997; + add.s32 %r2772, %r2771, %r2770; + xor.b32 %r2773, %r2772, %r2767; + shf.l.wrap.b32 %r2774, %r2773, %r2773, 24; + add.s32 %r2775, %r2774, %r2768; + xor.b32 %r2776, %r2775, %r2770; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 25; + add.s32 %r2778, %r2730, %r1976; + add.s32 %r2779, %r2778, %r2749; + xor.b32 %r2780, %r2718, %r2779; + shf.l.wrap.b32 %r2781, %r2780, %r2780, 16; + add.s32 %r2782, %r2781, %r2761; + xor.b32 %r2783, %r2782, %r2749; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 20; + add.s32 %r2785, %r2779, %r2046; + add.s32 %r2786, %r2785, %r2784; + xor.b32 %r2787, %r2786, %r2781; + shf.l.wrap.b32 %r2788, %r2787, %r2787, 24; + add.s32 %r2789, %r2788, %r2782; + xor.b32 %r2790, %r2789, %r2784; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 25; + add.s32 %r2792, %r2744, %r1990; + add.s32 %r2793, %r2792, %r2763; + xor.b32 %r2794, %r2793, %r2732; + shf.l.wrap.b32 %r2795, %r2794, %r2794, 16; + add.s32 %r2796, %r2795, %r2719; + xor.b32 %r2797, %r2796, %r2763; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 20; + add.s32 %r2799, %r2793, %r2018; + add.s32 %r2800, %r2799, %r2798; + xor.b32 %r2801, %r2800, %r2795; + shf.l.wrap.b32 %r2802, %r2801, %r2801, 24; + add.s32 %r2803, %r2802, %r2796; + xor.b32 %r2804, %r2803, %r2798; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 25; + add.s32 %r2806, %r2758, %r2004; + add.s32 %r2807, %r2806, %r2721; + xor.b32 %r2808, %r2807, %r2746; + shf.l.wrap.b32 %r2809, %r2808, %r2808, 16; + add.s32 %r2810, %r2809, 
%r2733; + xor.b32 %r2811, %r2810, %r2721; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 20; + add.s32 %r2813, %r2807, %r2025; + add.s32 %r2814, %r2813, %r2812; + xor.b32 %r2815, %r2814, %r2809; + shf.l.wrap.b32 %r2816, %r2815, %r2815, 24; + add.s32 %r2817, %r2816, %r2810; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 25; + add.s32 %r2820, %r2772, %r2053; + add.s32 %r2821, %r2820, %r2819; + xor.b32 %r2822, %r2821, %r2788; + shf.l.wrap.b32 %r2823, %r2822, %r2822, 16; + add.s32 %r2824, %r2823, %r2803; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 20; + add.s32 %r2827, %r2821, %r2081; + add.s32 %r2828, %r2827, %r2826; + xor.b32 %r2829, %r2828, %r2823; + shf.l.wrap.b32 %r2830, %r2829, %r2829, 24; + add.s32 %r2831, %r2830, %r2824; + xor.b32 %r2832, %r2831, %r2826; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 25; + add.s32 %r2834, %r2786, %r2011; + add.s32 %r2835, %r2834, %r2777; + xor.b32 %r2836, %r2835, %r2802; + shf.l.wrap.b32 %r2837, %r2836, %r2836, 16; + add.s32 %r2838, %r2837, %r2817; + xor.b32 %r2839, %r2838, %r2777; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 20; + add.s32 %r2841, %r2835, %r1976; + add.s32 %r2842, %r2841, %r2840; + xor.b32 %r2843, %r2842, %r2837; + shf.l.wrap.b32 %r2844, %r2843, %r2843, 24; + add.s32 %r2845, %r2844, %r2838; + xor.b32 %r2846, %r2845, %r2840; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 25; + add.s32 %r2848, %r2800, %r1983; + add.s32 %r2849, %r2848, %r2791; + xor.b32 %r2850, %r2816, %r2849; + shf.l.wrap.b32 %r2851, %r2850, %r2850, 16; + add.s32 %r2852, %r2851, %r2775; + xor.b32 %r2853, %r2852, %r2791; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 20; + add.s32 %r2855, %r2849, %r2039; + add.s32 %r2856, %r2855, %r2854; + xor.b32 %r2857, %r2856, %r2851; + shf.l.wrap.b32 %r2858, %r2857, %r2857, 24; + add.s32 %r2859, %r2858, %r2852; + xor.b32 %r2860, %r2859, %r2854; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 25; + add.s32 %r2862, %r2805, %r2032; + add.s32 %r2863, %r2862, %r2814; + xor.b32 %r2864, %r2863, %r2774; + shf.l.wrap.b32 %r2865, %r2864, %r2864, 16; + add.s32 %r2866, %r2865, %r2789; + xor.b32 %r2867, %r2866, %r2805; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 20; + add.s32 %r2869, %r2863, %r2018; + add.s32 %r2870, %r2869, %r2868; + xor.b32 %r2871, %r2870, %r2865; + shf.l.wrap.b32 %r2872, %r2871, %r2871, 24; + add.s32 %r2873, %r2872, %r2866; + xor.b32 %r2874, %r2873, %r2868; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 25; + add.s32 %r2876, %r2847, %r2074; + add.s32 %r2877, %r2876, %r2828; + xor.b32 %r2878, %r2877, %r2872; + shf.l.wrap.b32 %r2879, %r2878, %r2878, 16; + add.s32 %r2880, %r2879, %r2859; + xor.b32 %r2881, %r2880, %r2847; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 20; + add.s32 %r2883, %r2877, %r2046; + add.s32 %r2884, %r2883, %r2882; + xor.b32 %r2885, %r2884, %r2879; + shf.l.wrap.b32 %r2886, %r2885, %r2885, 24; + add.s32 %r2887, %r2886, %r2880; + xor.b32 %r2888, %r2887, %r2882; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 25; + add.s32 %r2890, %r2842, %r1990; + add.s32 %r2891, %r2890, %r2861; + xor.b32 %r2892, %r2830, %r2891; + shf.l.wrap.b32 %r2893, %r2892, %r2892, 16; + add.s32 %r2894, %r2893, %r2873; + xor.b32 %r2895, %r2894, %r2861; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 20; + add.s32 %r2897, %r2891, %r2060; + add.s32 %r2898, %r2897, %r2896; + xor.b32 %r2899, %r2898, %r2893; + shf.l.wrap.b32 %r2900, %r2899, %r2899, 24; + add.s32 %r2901, %r2900, %r2894; + xor.b32 %r2902, %r2901, %r2896; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 25; + add.s32 %r2904, %r2856, %r1997; + add.s32 %r2905, %r2904, %r2875; + xor.b32 %r2906, 
%r2905, %r2844; + shf.l.wrap.b32 %r2907, %r2906, %r2906, 16; + add.s32 %r2908, %r2907, %r2831; + xor.b32 %r2909, %r2908, %r2875; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 20; + add.s32 %r2911, %r2905, %r2004; + add.s32 %r2912, %r2911, %r2910; + xor.b32 %r2913, %r2912, %r2907; + shf.l.wrap.b32 %r2914, %r2913, %r2913, 24; + add.s32 %r2915, %r2914, %r2908; + xor.b32 %r2916, %r2915, %r2910; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 25; + add.s32 %r2918, %r2870, %r2025; + add.s32 %r2919, %r2918, %r2833; + xor.b32 %r2920, %r2919, %r2858; + shf.l.wrap.b32 %r2921, %r2920, %r2920, 16; + add.s32 %r2922, %r2921, %r2845; + xor.b32 %r2923, %r2922, %r2833; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 20; + add.s32 %r2925, %r2919, %r2067; + add.s32 %r2926, %r2925, %r2924; + xor.b32 %r2927, %r2926, %r2921; + shf.l.wrap.b32 %r2928, %r2927, %r2927, 24; + add.s32 %r2929, %r2928, %r2922; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 25; + xor.b32 %r27, %r2915, %r2884; + xor.b32 %r28, %r2929, %r2898; + xor.b32 %r29, %r2887, %r2912; + xor.b32 %r30, %r2926, %r2901; + xor.b32 %r31, %r2931, %r2900; + xor.b32 %r32, %r2889, %r2914; + xor.b32 %r33, %r2928, %r2903; + xor.b32 %r34, %r2917, %r2886; + popc.b64 %r2932, %rd128; + cvt.u64.u32 %rd30, %r2932; + ld.local.u8 %rs127, [%rd3+8]; + cvt.u64.u16 %rd130, %rs127; + setp.ge.u64 %p15, %rd30, %rd130; + mul.wide.u16 %r11659, %rs127, 32; + @%p15 bra $L__BB1_19; + +$L__BB1_18: + add.s32 %r2933, %r11659, -64; + cvt.s64.s32 %rd131, %r2933; + add.s64 %rd132, %rd2, %rd131; + ld.local.u8 %r2934, [%rd3+2]; + ld.local.u8 %r2935, [%rd132+145]; + ld.local.u8 %r2936, [%rd132+146]; + prmt.b32 %r2937, %r2936, %r2935, 30212; + ld.local.u8 %r2938, [%rd132+147]; + prmt.b32 %r2939, %r2938, %r2937, 28756; + ld.local.u8 %r2940, [%rd132+148]; + prmt.b32 %r2941, %r2940, %r2939, 1620; + ld.local.u8 %r2942, [%rd132+149]; + ld.local.u8 %r2943, [%rd132+150]; + prmt.b32 %r2944, %r2943, %r2942, 30212; + ld.local.u8 %r2945, [%rd132+151]; + prmt.b32 %r2946, %r2945, %r2944, 28756; + ld.local.u8 %r2947, [%rd132+152]; + prmt.b32 %r2948, %r2947, %r2946, 1620; + ld.local.u8 %r2949, [%rd132+153]; + ld.local.u8 %r2950, [%rd132+154]; + prmt.b32 %r2951, %r2950, %r2949, 30212; + ld.local.u8 %r2952, [%rd132+155]; + prmt.b32 %r2953, %r2952, %r2951, 28756; + ld.local.u8 %r2954, [%rd132+156]; + prmt.b32 %r2955, %r2954, %r2953, 1620; + ld.local.u8 %r2956, [%rd132+157]; + ld.local.u8 %r2957, [%rd132+158]; + prmt.b32 %r2958, %r2957, %r2956, 30212; + ld.local.u8 %r2959, [%rd132+159]; + prmt.b32 %r2960, %r2959, %r2958, 28756; + ld.local.u8 %r2961, [%rd132+160]; + prmt.b32 %r2962, %r2961, %r2960, 1620; + ld.local.u8 %r2963, [%rd132+161]; + ld.local.u8 %r2964, [%rd132+162]; + prmt.b32 %r2965, %r2964, %r2963, 30212; + ld.local.u8 %r2966, [%rd132+163]; + prmt.b32 %r2967, %r2966, %r2965, 28756; + ld.local.u8 %r2968, [%rd132+164]; + prmt.b32 %r2969, %r2968, %r2967, 1620; + ld.local.u8 %r2970, [%rd132+165]; + ld.local.u8 %r2971, [%rd132+166]; + prmt.b32 %r2972, %r2971, %r2970, 30212; + ld.local.u8 %r2973, [%rd132+167]; + prmt.b32 %r2974, %r2973, %r2972, 28756; + ld.local.u8 %r2975, [%rd132+168]; + prmt.b32 %r2976, %r2975, %r2974, 1620; + ld.local.u8 %r2977, [%rd132+169]; + ld.local.u8 %r2978, [%rd132+170]; + prmt.b32 %r2979, %r2978, %r2977, 30212; + ld.local.u8 %r2980, [%rd132+171]; + prmt.b32 %r2981, %r2980, %r2979, 28756; + ld.local.u8 %r2982, [%rd132+172]; + prmt.b32 %r2983, %r2982, %r2981, 1620; + ld.local.u8 %r2984, [%rd132+173]; + ld.local.u8 %r2985, [%rd132+174]; + prmt.b32 %r2986, %r2985, %r2984, 
30212; + ld.local.u8 %r2987, [%rd132+175]; + prmt.b32 %r2988, %r2987, %r2986, 28756; + ld.local.u8 %r2989, [%rd132+176]; + prmt.b32 %r2990, %r2989, %r2988, 1620; + ld.local.u8 %r2991, [%rd132+177]; + ld.local.u8 %r2992, [%rd132+178]; + prmt.b32 %r2993, %r2992, %r2991, 30212; + ld.local.u8 %r2994, [%rd132+179]; + prmt.b32 %r2995, %r2994, %r2993, 28756; + ld.local.u8 %r2996, [%rd132+180]; + prmt.b32 %r2997, %r2996, %r2995, 1620; + ld.local.u8 %r2998, [%rd132+181]; + ld.local.u8 %r2999, [%rd132+182]; + prmt.b32 %r3000, %r2999, %r2998, 30212; + ld.local.u8 %r3001, [%rd132+183]; + prmt.b32 %r3002, %r3001, %r3000, 28756; + ld.local.u8 %r3003, [%rd132+184]; + prmt.b32 %r3004, %r3003, %r3002, 1620; + ld.local.u8 %r3005, [%rd132+185]; + ld.local.u8 %r3006, [%rd132+186]; + prmt.b32 %r3007, %r3006, %r3005, 30212; + ld.local.u8 %r3008, [%rd132+187]; + prmt.b32 %r3009, %r3008, %r3007, 28756; + ld.local.u8 %r3010, [%rd132+188]; + prmt.b32 %r3011, %r3010, %r3009, 1620; + ld.local.u8 %r3012, [%rd132+189]; + ld.local.u8 %r3013, [%rd132+190]; + prmt.b32 %r3014, %r3013, %r3012, 30212; + ld.local.u8 %r3015, [%rd132+191]; + prmt.b32 %r3016, %r3015, %r3014, 28756; + ld.local.u8 %r3017, [%rd132+192]; + prmt.b32 %r3018, %r3017, %r3016, 1620; + ld.local.u8 %r3019, [%rd132+193]; + ld.local.u8 %r3020, [%rd132+194]; + prmt.b32 %r3021, %r3020, %r3019, 30212; + ld.local.u8 %r3022, [%rd132+195]; + prmt.b32 %r3023, %r3022, %r3021, 28756; + ld.local.u8 %r3024, [%rd132+196]; + prmt.b32 %r3025, %r3024, %r3023, 1620; + ld.local.u8 %r3026, [%rd132+197]; + ld.local.u8 %r3027, [%rd132+198]; + prmt.b32 %r3028, %r3027, %r3026, 30212; + ld.local.u8 %r3029, [%rd132+199]; + prmt.b32 %r3030, %r3029, %r3028, 28756; + ld.local.u8 %r3031, [%rd132+200]; + prmt.b32 %r3032, %r3031, %r3030, 1620; + ld.local.u8 %r3033, [%rd132+201]; + ld.local.u8 %r3034, [%rd132+202]; + prmt.b32 %r3035, %r3034, %r3033, 30212; + ld.local.u8 %r3036, [%rd132+203]; + prmt.b32 %r3037, %r3036, %r3035, 28756; + ld.local.u8 %r3038, [%rd132+204]; + prmt.b32 %r3039, %r3038, %r3037, 1620; + ld.local.u8 %r3040, [%rd132+205]; + ld.local.u8 %r3041, [%rd132+206]; + prmt.b32 %r3042, %r3041, %r3040, 30212; + ld.local.u8 %r3043, [%rd132+207]; + prmt.b32 %r3044, %r3043, %r3042, 28756; + ld.local.u8 %r3045, [%rd132+208]; + prmt.b32 %r3046, %r3045, %r3044, 1620; + or.b32 %r3047, %r2934, 4; + ld.local.u8 %r3048, [%rd3+-120]; + ld.local.u8 %r3049, [%rd3+-119]; + prmt.b32 %r3050, %r3049, %r3048, 30212; + ld.local.u8 %r3051, [%rd3+-118]; + ld.local.u8 %r3052, [%rd3+-117]; + prmt.b32 %r3053, %r3052, %r3051, 30212; + prmt.b32 %r3054, %r3053, %r3050, 4180; + ld.local.u8 %r3055, [%rd3+-136]; + ld.local.u8 %r3056, [%rd3+-135]; + prmt.b32 %r3057, %r3056, %r3055, 30212; + ld.local.u8 %r3058, [%rd3+-134]; + ld.local.u8 %r3059, [%rd3+-133]; + prmt.b32 %r3060, %r3059, %r3058, 30212; + prmt.b32 %r3061, %r3060, %r3057, 4180; + add.s32 %r3062, %r3054, %r3061; + add.s32 %r3063, %r3062, %r2941; + shf.l.wrap.b32 %r3064, %r3063, %r3063, 16; + add.s32 %r3065, %r3064, 1779033703; + xor.b32 %r3066, %r3065, %r3054; + shf.l.wrap.b32 %r3067, %r3066, %r3066, 20; + add.s32 %r3068, %r2948, %r3063; + add.s32 %r3069, %r3068, %r3067; + xor.b32 %r3070, %r3069, %r3064; + shf.l.wrap.b32 %r3071, %r3070, %r3070, 24; + add.s32 %r3072, %r3071, %r3065; + xor.b32 %r3073, %r3072, %r3067; + shf.l.wrap.b32 %r3074, %r3073, %r3073, 25; + ld.local.u8 %r3075, [%rd3+-116]; + ld.local.u8 %r3076, [%rd3+-115]; + prmt.b32 %r3077, %r3076, %r3075, 30212; + ld.local.u8 %r3078, [%rd3+-114]; + ld.local.u8 %r3079, [%rd3+-113]; + prmt.b32 
%r3080, %r3079, %r3078, 30212; + prmt.b32 %r3081, %r3080, %r3077, 4180; + ld.local.u8 %r3082, [%rd3+-132]; + ld.local.u8 %r3083, [%rd3+-131]; + prmt.b32 %r3084, %r3083, %r3082, 30212; + ld.local.u8 %r3085, [%rd3+-130]; + ld.local.u8 %r3086, [%rd3+-129]; + prmt.b32 %r3087, %r3086, %r3085, 30212; + prmt.b32 %r3088, %r3087, %r3084, 4180; + add.s32 %r3089, %r3081, %r3088; + add.s32 %r3090, %r3089, %r2955; + shf.l.wrap.b32 %r3091, %r3090, %r3090, 16; + add.s32 %r3092, %r3091, -1150833019; + xor.b32 %r3093, %r3092, %r3081; + shf.l.wrap.b32 %r3094, %r3093, %r3093, 20; + add.s32 %r3095, %r2962, %r3090; + add.s32 %r3096, %r3095, %r3094; + xor.b32 %r3097, %r3096, %r3091; + shf.l.wrap.b32 %r3098, %r3097, %r3097, 24; + add.s32 %r3099, %r3098, %r3092; + xor.b32 %r3100, %r3099, %r3094; + shf.l.wrap.b32 %r3101, %r3100, %r3100, 25; + ld.local.u8 %r3102, [%rd3+-112]; + ld.local.u8 %r3103, [%rd3+-111]; + prmt.b32 %r3104, %r3103, %r3102, 30212; + ld.local.u8 %r3105, [%rd3+-110]; + ld.local.u8 %r3106, [%rd3+-109]; + prmt.b32 %r3107, %r3106, %r3105, 30212; + prmt.b32 %r3108, %r3107, %r3104, 4180; + ld.local.u8 %r3109, [%rd3+-128]; + ld.local.u8 %r3110, [%rd3+-127]; + prmt.b32 %r3111, %r3110, %r3109, 30212; + ld.local.u8 %r3112, [%rd3+-126]; + ld.local.u8 %r3113, [%rd3+-125]; + prmt.b32 %r3114, %r3113, %r3112, 30212; + prmt.b32 %r3115, %r3114, %r3111, 4180; + add.s32 %r3116, %r3108, %r3115; + add.s32 %r3117, %r3116, %r2969; + shr.u32 %r3118, %r3117, 16; + shl.b32 %r3119, %r3117, 16; + xor.b32 %r3120, %r3119, 4194304; + or.b32 %r3121, %r3120, %r3118; + add.s32 %r3122, %r3121, 1013904242; + xor.b32 %r3123, %r3122, %r3108; + shf.l.wrap.b32 %r3124, %r3123, %r3123, 20; + add.s32 %r3125, %r2976, %r3117; + add.s32 %r3126, %r3125, %r3124; + xor.b32 %r3127, %r3126, %r3121; + shf.l.wrap.b32 %r3128, %r3127, %r3127, 24; + add.s32 %r3129, %r3128, %r3122; + xor.b32 %r3130, %r3129, %r3124; + shf.l.wrap.b32 %r3131, %r3130, %r3130, 25; + ld.local.u8 %r3132, [%rd3+-108]; + ld.local.u8 %r3133, [%rd3+-107]; + prmt.b32 %r3134, %r3133, %r3132, 30212; + ld.local.u8 %r3135, [%rd3+-106]; + ld.local.u8 %r3136, [%rd3+-105]; + prmt.b32 %r3137, %r3136, %r3135, 30212; + prmt.b32 %r3138, %r3137, %r3134, 4180; + ld.local.u8 %r3139, [%rd3+-124]; + ld.local.u8 %r3140, [%rd3+-123]; + prmt.b32 %r3141, %r3140, %r3139, 30212; + ld.local.u8 %r3142, [%rd3+-122]; + ld.local.u8 %r3143, [%rd3+-121]; + prmt.b32 %r3144, %r3143, %r3142, 30212; + prmt.b32 %r3145, %r3144, %r3141, 4180; + add.s32 %r3146, %r3138, %r3145; + add.s32 %r3147, %r3146, %r2983; + xor.b32 %r3148, %r3147, %r3047; + shr.u32 %r3149, %r3147, 16; + shl.b32 %r3150, %r3148, 16; + or.b32 %r3151, %r3150, %r3149; + add.s32 %r3152, %r3151, -1521486534; + xor.b32 %r3153, %r3152, %r3138; + shf.l.wrap.b32 %r3154, %r3153, %r3153, 20; + add.s32 %r3155, %r2990, %r3147; + add.s32 %r3156, %r3155, %r3154; + xor.b32 %r3157, %r3156, %r3151; + shf.l.wrap.b32 %r3158, %r3157, %r3157, 24; + add.s32 %r3159, %r3158, %r3152; + xor.b32 %r3160, %r3159, %r3154; + shf.l.wrap.b32 %r3161, %r3160, %r3160, 25; + add.s32 %r3162, %r3101, %r3069; + add.s32 %r3163, %r3162, %r2997; + xor.b32 %r3164, %r3158, %r3163; + shf.l.wrap.b32 %r3165, %r3164, %r3164, 16; + add.s32 %r3166, %r3165, %r3129; + xor.b32 %r3167, %r3166, %r3101; + shf.l.wrap.b32 %r3168, %r3167, %r3167, 20; + add.s32 %r3169, %r3004, %r3163; + add.s32 %r3170, %r3169, %r3168; + xor.b32 %r3171, %r3170, %r3165; + shf.l.wrap.b32 %r3172, %r3171, %r3171, 24; + add.s32 %r3173, %r3172, %r3166; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 25; 
+ add.s32 %r3176, %r3131, %r3096; + add.s32 %r3177, %r3176, %r3011; + xor.b32 %r3178, %r3177, %r3071; + shf.l.wrap.b32 %r3179, %r3178, %r3178, 16; + add.s32 %r3180, %r3179, %r3159; + xor.b32 %r3181, %r3180, %r3131; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 20; + add.s32 %r3183, %r3018, %r3177; + add.s32 %r3184, %r3183, %r3182; + xor.b32 %r3185, %r3184, %r3179; + shf.l.wrap.b32 %r3186, %r3185, %r3185, 24; + add.s32 %r3187, %r3186, %r3180; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 25; + add.s32 %r3190, %r3161, %r3126; + add.s32 %r3191, %r3190, %r3025; + xor.b32 %r3192, %r3191, %r3098; + shf.l.wrap.b32 %r3193, %r3192, %r3192, 16; + add.s32 %r3194, %r3193, %r3072; + xor.b32 %r3195, %r3194, %r3161; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 20; + add.s32 %r3197, %r3032, %r3191; + add.s32 %r3198, %r3197, %r3196; + xor.b32 %r3199, %r3198, %r3193; + shf.l.wrap.b32 %r3200, %r3199, %r3199, 24; + add.s32 %r3201, %r3200, %r3194; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 25; + add.s32 %r3204, %r3156, %r3074; + add.s32 %r3205, %r3204, %r3039; + xor.b32 %r3206, %r3205, %r3128; + shf.l.wrap.b32 %r3207, %r3206, %r3206, 16; + add.s32 %r3208, %r3207, %r3099; + xor.b32 %r3209, %r3208, %r3074; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 20; + add.s32 %r3211, %r3046, %r3205; + add.s32 %r3212, %r3211, %r3210; + xor.b32 %r3213, %r3212, %r3207; + shf.l.wrap.b32 %r3214, %r3213, %r3213, 24; + add.s32 %r3215, %r3214, %r3208; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 25; + add.s32 %r3218, %r3170, %r2955; + add.s32 %r3219, %r3218, %r3217; + xor.b32 %r3220, %r3219, %r3186; + shf.l.wrap.b32 %r3221, %r3220, %r3220, 16; + add.s32 %r3222, %r3221, %r3201; + xor.b32 %r3223, %r3222, %r3217; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 20; + add.s32 %r3225, %r3219, %r2983; + add.s32 %r3226, %r3225, %r3224; + xor.b32 %r3227, %r3226, %r3221; + shf.l.wrap.b32 %r3228, %r3227, %r3227, 24; + add.s32 %r3229, %r3228, %r3222; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 25; + add.s32 %r3232, %r3184, %r2962; + add.s32 %r3233, %r3232, %r3175; + xor.b32 %r3234, %r3200, %r3233; + shf.l.wrap.b32 %r3235, %r3234, %r3234, 16; + add.s32 %r3236, %r3215, %r3235; + xor.b32 %r3237, %r3236, %r3175; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 20; + add.s32 %r3239, %r3233, %r3011; + add.s32 %r3240, %r3239, %r3238; + xor.b32 %r3241, %r3240, %r3235; + shf.l.wrap.b32 %r3242, %r3241, %r3241, 24; + add.s32 %r3243, %r3242, %r3236; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 25; + add.s32 %r3246, %r3189, %r2990; + add.s32 %r3247, %r3246, %r3198; + xor.b32 %r3248, %r3214, %r3247; + shf.l.wrap.b32 %r3249, %r3248, %r3248, 16; + add.s32 %r3250, %r3249, %r3173; + xor.b32 %r3251, %r3250, %r3189; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 20; + add.s32 %r3253, %r3247, %r2941; + add.s32 %r3254, %r3253, %r3252; + xor.b32 %r3255, %r3254, %r3249; + shf.l.wrap.b32 %r3256, %r3255, %r3255, 24; + add.s32 %r3257, %r3256, %r3250; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 25; + add.s32 %r3260, %r3203, %r2969; + add.s32 %r3261, %r3260, %r3212; + xor.b32 %r3262, %r3261, %r3172; + shf.l.wrap.b32 %r3263, %r3262, %r3262, 16; + add.s32 %r3264, %r3263, %r3187; + xor.b32 %r3265, %r3264, %r3203; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 20; + add.s32 %r3267, %r3261, %r3032; + add.s32 %r3268, %r3267, %r3266; + xor.b32 %r3269, %r3268, %r3263; + shf.l.wrap.b32 %r3270, %r3269, %r3269, 24; + add.s32 %r3271, %r3270, 
%r3264; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 25; + add.s32 %r3274, %r3245, %r2948; + add.s32 %r3275, %r3274, %r3226; + xor.b32 %r3276, %r3275, %r3270; + shf.l.wrap.b32 %r3277, %r3276, %r3276, 16; + add.s32 %r3278, %r3277, %r3257; + xor.b32 %r3279, %r3278, %r3245; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 20; + add.s32 %r3281, %r3275, %r3018; + add.s32 %r3282, %r3281, %r3280; + xor.b32 %r3283, %r3282, %r3277; + shf.l.wrap.b32 %r3284, %r3283, %r3283, 24; + add.s32 %r3285, %r3284, %r3278; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 25; + add.s32 %r3288, %r3240, %r3025; + add.s32 %r3289, %r3288, %r3259; + xor.b32 %r3290, %r3228, %r3289; + shf.l.wrap.b32 %r3291, %r3290, %r3290, 16; + add.s32 %r3292, %r3291, %r3271; + xor.b32 %r3293, %r3292, %r3259; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 20; + add.s32 %r3295, %r3289, %r2976; + add.s32 %r3296, %r3295, %r3294; + xor.b32 %r3297, %r3296, %r3291; + shf.l.wrap.b32 %r3298, %r3297, %r3297, 24; + add.s32 %r3299, %r3298, %r3292; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 25; + add.s32 %r3302, %r3254, %r3004; + add.s32 %r3303, %r3302, %r3273; + xor.b32 %r3304, %r3303, %r3242; + shf.l.wrap.b32 %r3305, %r3304, %r3304, 16; + add.s32 %r3306, %r3305, %r3229; + xor.b32 %r3307, %r3306, %r3273; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 20; + add.s32 %r3309, %r3303, %r3039; + add.s32 %r3310, %r3309, %r3308; + xor.b32 %r3311, %r3310, %r3305; + shf.l.wrap.b32 %r3312, %r3311, %r3311, 24; + add.s32 %r3313, %r3312, %r3306; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 25; + add.s32 %r3316, %r3268, %r3046; + add.s32 %r3317, %r3316, %r3231; + xor.b32 %r3318, %r3317, %r3256; + shf.l.wrap.b32 %r3319, %r3318, %r3318, 16; + add.s32 %r3320, %r3319, %r3243; + xor.b32 %r3321, %r3320, %r3231; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 20; + add.s32 %r3323, %r3317, %r2997; + add.s32 %r3324, %r3323, %r3322; + xor.b32 %r3325, %r3324, %r3319; + shf.l.wrap.b32 %r3326, %r3325, %r3325, 24; + add.s32 %r3327, %r3326, %r3320; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 25; + add.s32 %r3330, %r3282, %r2962; + add.s32 %r3331, %r3330, %r3329; + xor.b32 %r3332, %r3331, %r3298; + shf.l.wrap.b32 %r3333, %r3332, %r3332, 16; + add.s32 %r3334, %r3333, %r3313; + xor.b32 %r3335, %r3334, %r3329; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 20; + add.s32 %r3337, %r3331, %r2969; + add.s32 %r3338, %r3337, %r3336; + xor.b32 %r3339, %r3338, %r3333; + shf.l.wrap.b32 %r3340, %r3339, %r3339, 24; + add.s32 %r3341, %r3340, %r3334; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 25; + add.s32 %r3344, %r3296, %r3011; + add.s32 %r3345, %r3344, %r3287; + xor.b32 %r3346, %r3345, %r3312; + shf.l.wrap.b32 %r3347, %r3346, %r3346, 16; + add.s32 %r3348, %r3347, %r3327; + xor.b32 %r3349, %r3348, %r3287; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 20; + add.s32 %r3351, %r3345, %r3025; + add.s32 %r3352, %r3351, %r3350; + xor.b32 %r3353, %r3352, %r3347; + shf.l.wrap.b32 %r3354, %r3353, %r3353, 24; + add.s32 %r3355, %r3354, %r3348; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 25; + add.s32 %r3358, %r3310, %r3032; + add.s32 %r3359, %r3358, %r3301; + xor.b32 %r3360, %r3326, %r3359; + shf.l.wrap.b32 %r3361, %r3360, %r3360, 16; + add.s32 %r3362, %r3361, %r3285; + xor.b32 %r3363, %r3362, %r3301; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 20; + add.s32 %r3365, %r3359, %r2955; + add.s32 %r3366, %r3365, %r3364; + xor.b32 %r3367, 
%r3366, %r3361; + shf.l.wrap.b32 %r3368, %r3367, %r3367, 24; + add.s32 %r3369, %r3368, %r3362; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 25; + add.s32 %r3372, %r3315, %r2990; + add.s32 %r3373, %r3372, %r3324; + xor.b32 %r3374, %r3373, %r3284; + shf.l.wrap.b32 %r3375, %r3374, %r3374, 16; + add.s32 %r3376, %r3375, %r3299; + xor.b32 %r3377, %r3376, %r3315; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 20; + add.s32 %r3379, %r3373, %r3039; + add.s32 %r3380, %r3379, %r3378; + xor.b32 %r3381, %r3380, %r3375; + shf.l.wrap.b32 %r3382, %r3381, %r3381, 24; + add.s32 %r3383, %r3382, %r3376; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 25; + add.s32 %r3386, %r3357, %r2983; + add.s32 %r3387, %r3386, %r3338; + xor.b32 %r3388, %r3387, %r3382; + shf.l.wrap.b32 %r3389, %r3388, %r3388, 16; + add.s32 %r3390, %r3389, %r3369; + xor.b32 %r3391, %r3390, %r3357; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 20; + add.s32 %r3393, %r3387, %r2976; + add.s32 %r3394, %r3393, %r3392; + xor.b32 %r3395, %r3394, %r3389; + shf.l.wrap.b32 %r3396, %r3395, %r3395, 24; + add.s32 %r3397, %r3396, %r3390; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 25; + add.s32 %r3400, %r3352, %r3004; + add.s32 %r3401, %r3400, %r3371; + xor.b32 %r3402, %r3340, %r3401; + shf.l.wrap.b32 %r3403, %r3402, %r3402, 16; + add.s32 %r3404, %r3403, %r3383; + xor.b32 %r3405, %r3404, %r3371; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 20; + add.s32 %r3407, %r3401, %r2941; + add.s32 %r3408, %r3407, %r3406; + xor.b32 %r3409, %r3408, %r3403; + shf.l.wrap.b32 %r3410, %r3409, %r3409, 24; + add.s32 %r3411, %r3410, %r3404; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 25; + add.s32 %r3414, %r3366, %r3018; + add.s32 %r3415, %r3414, %r3385; + xor.b32 %r3416, %r3415, %r3354; + shf.l.wrap.b32 %r3417, %r3416, %r3416, 16; + add.s32 %r3418, %r3417, %r3341; + xor.b32 %r3419, %r3418, %r3385; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 20; + add.s32 %r3421, %r3415, %r3046; + add.s32 %r3422, %r3421, %r3420; + xor.b32 %r3423, %r3422, %r3417; + shf.l.wrap.b32 %r3424, %r3423, %r3423, 24; + add.s32 %r3425, %r3424, %r3418; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 25; + add.s32 %r3428, %r3380, %r2997; + add.s32 %r3429, %r3428, %r3343; + xor.b32 %r3430, %r3429, %r3368; + shf.l.wrap.b32 %r3431, %r3430, %r3430, 16; + add.s32 %r3432, %r3431, %r3355; + xor.b32 %r3433, %r3432, %r3343; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 20; + add.s32 %r3435, %r3429, %r2948; + add.s32 %r3436, %r3435, %r3434; + xor.b32 %r3437, %r3436, %r3431; + shf.l.wrap.b32 %r3438, %r3437, %r3437, 24; + add.s32 %r3439, %r3438, %r3432; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 25; + add.s32 %r3442, %r3394, %r3011; + add.s32 %r3443, %r3442, %r3441; + xor.b32 %r3444, %r3443, %r3410; + shf.l.wrap.b32 %r3445, %r3444, %r3444, 16; + add.s32 %r3446, %r3445, %r3425; + xor.b32 %r3447, %r3446, %r3441; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 20; + add.s32 %r3449, %r3443, %r2990; + add.s32 %r3450, %r3449, %r3448; + xor.b32 %r3451, %r3450, %r3445; + shf.l.wrap.b32 %r3452, %r3451, %r3451, 24; + add.s32 %r3453, %r3452, %r3446; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 25; + add.s32 %r3456, %r3408, %r3025; + add.s32 %r3457, %r3456, %r3399; + xor.b32 %r3458, %r3457, %r3424; + shf.l.wrap.b32 %r3459, %r3458, %r3458, 16; + add.s32 %r3460, %r3459, %r3439; + xor.b32 %r3461, %r3460, %r3399; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 20; 
+ add.s32 %r3463, %r3457, %r3004; + add.s32 %r3464, %r3463, %r3462; + xor.b32 %r3465, %r3464, %r3459; + shf.l.wrap.b32 %r3466, %r3465, %r3465, 24; + add.s32 %r3467, %r3466, %r3460; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 25; + add.s32 %r3470, %r3422, %r3039; + add.s32 %r3471, %r3470, %r3413; + xor.b32 %r3472, %r3438, %r3471; + shf.l.wrap.b32 %r3473, %r3472, %r3472, 16; + add.s32 %r3474, %r3473, %r3397; + xor.b32 %r3475, %r3474, %r3413; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 20; + add.s32 %r3477, %r3471, %r2962; + add.s32 %r3478, %r3477, %r3476; + xor.b32 %r3479, %r3478, %r3473; + shf.l.wrap.b32 %r3480, %r3479, %r3479, 24; + add.s32 %r3481, %r3480, %r3474; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 25; + add.s32 %r3484, %r3427, %r3032; + add.s32 %r3485, %r3484, %r3436; + xor.b32 %r3486, %r3485, %r3396; + shf.l.wrap.b32 %r3487, %r3486, %r3486, 16; + add.s32 %r3488, %r3487, %r3411; + xor.b32 %r3489, %r3488, %r3427; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 20; + add.s32 %r3491, %r3485, %r3046; + add.s32 %r3492, %r3491, %r3490; + xor.b32 %r3493, %r3492, %r3487; + shf.l.wrap.b32 %r3494, %r3493, %r3493, 24; + add.s32 %r3495, %r3494, %r3488; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 25; + add.s32 %r3498, %r3469, %r2969; + add.s32 %r3499, %r3498, %r3450; + xor.b32 %r3500, %r3499, %r3494; + shf.l.wrap.b32 %r3501, %r3500, %r3500, 16; + add.s32 %r3502, %r3501, %r3481; + xor.b32 %r3503, %r3502, %r3469; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 20; + add.s32 %r3505, %r3499, %r2941; + add.s32 %r3506, %r3505, %r3504; + xor.b32 %r3507, %r3506, %r3501; + shf.l.wrap.b32 %r3508, %r3507, %r3507, 24; + add.s32 %r3509, %r3508, %r3502; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 25; + add.s32 %r3512, %r3464, %r3018; + add.s32 %r3513, %r3512, %r3483; + xor.b32 %r3514, %r3452, %r3513; + shf.l.wrap.b32 %r3515, %r3514, %r3514, 16; + add.s32 %r3516, %r3515, %r3495; + xor.b32 %r3517, %r3516, %r3483; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 20; + add.s32 %r3519, %r3513, %r2955; + add.s32 %r3520, %r3519, %r3518; + xor.b32 %r3521, %r3520, %r3515; + shf.l.wrap.b32 %r3522, %r3521, %r3521, 24; + add.s32 %r3523, %r3522, %r3516; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 25; + add.s32 %r3526, %r3478, %r2976; + add.s32 %r3527, %r3526, %r3497; + xor.b32 %r3528, %r3527, %r3466; + shf.l.wrap.b32 %r3529, %r3528, %r3528, 16; + add.s32 %r3530, %r3529, %r3453; + xor.b32 %r3531, %r3530, %r3497; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 20; + add.s32 %r3533, %r3527, %r2997; + add.s32 %r3534, %r3533, %r3532; + xor.b32 %r3535, %r3534, %r3529; + shf.l.wrap.b32 %r3536, %r3535, %r3535, 24; + add.s32 %r3537, %r3536, %r3530; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 25; + add.s32 %r3540, %r3492, %r2948; + add.s32 %r3541, %r3540, %r3455; + xor.b32 %r3542, %r3541, %r3480; + shf.l.wrap.b32 %r3543, %r3542, %r3542, 16; + add.s32 %r3544, %r3543, %r3467; + xor.b32 %r3545, %r3544, %r3455; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 20; + add.s32 %r3547, %r3541, %r2983; + add.s32 %r3548, %r3547, %r3546; + xor.b32 %r3549, %r3548, %r3543; + shf.l.wrap.b32 %r3550, %r3549, %r3549, 24; + add.s32 %r3551, %r3550, %r3544; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 25; + add.s32 %r3554, %r3506, %r3025; + add.s32 %r3555, %r3554, %r3553; + xor.b32 %r3556, %r3555, %r3522; + shf.l.wrap.b32 %r3557, %r3556, %r3556, 16; + add.s32 %r3558, %r3557, 
%r3537; + xor.b32 %r3559, %r3558, %r3553; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 20; + add.s32 %r3561, %r3555, %r3032; + add.s32 %r3562, %r3561, %r3560; + xor.b32 %r3563, %r3562, %r3557; + shf.l.wrap.b32 %r3564, %r3563, %r3563, 24; + add.s32 %r3565, %r3564, %r3558; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 25; + add.s32 %r3568, %r3520, %r3004; + add.s32 %r3569, %r3568, %r3511; + xor.b32 %r3570, %r3569, %r3536; + shf.l.wrap.b32 %r3571, %r3570, %r3570, 16; + add.s32 %r3572, %r3571, %r3551; + xor.b32 %r3573, %r3572, %r3511; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 20; + add.s32 %r3575, %r3569, %r3018; + add.s32 %r3576, %r3575, %r3574; + xor.b32 %r3577, %r3576, %r3571; + shf.l.wrap.b32 %r3578, %r3577, %r3577, 24; + add.s32 %r3579, %r3578, %r3572; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 25; + add.s32 %r3582, %r3534, %r3046; + add.s32 %r3583, %r3582, %r3525; + xor.b32 %r3584, %r3550, %r3583; + shf.l.wrap.b32 %r3585, %r3584, %r3584, 16; + add.s32 %r3586, %r3585, %r3509; + xor.b32 %r3587, %r3586, %r3525; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 20; + add.s32 %r3589, %r3583, %r3011; + add.s32 %r3590, %r3589, %r3588; + xor.b32 %r3591, %r3590, %r3585; + shf.l.wrap.b32 %r3592, %r3591, %r3591, 24; + add.s32 %r3593, %r3592, %r3586; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 25; + add.s32 %r3596, %r3539, %r3039; + add.s32 %r3597, %r3596, %r3548; + xor.b32 %r3598, %r3597, %r3508; + shf.l.wrap.b32 %r3599, %r3598, %r3598, 16; + add.s32 %r3600, %r3599, %r3523; + xor.b32 %r3601, %r3600, %r3539; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 20; + add.s32 %r3603, %r3597, %r2997; + add.s32 %r3604, %r3603, %r3602; + xor.b32 %r3605, %r3604, %r3599; + shf.l.wrap.b32 %r3606, %r3605, %r3605, 24; + add.s32 %r3607, %r3606, %r3600; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 25; + add.s32 %r3610, %r3581, %r2990; + add.s32 %r3611, %r3610, %r3562; + xor.b32 %r3612, %r3611, %r3606; + shf.l.wrap.b32 %r3613, %r3612, %r3612, 16; + add.s32 %r3614, %r3613, %r3593; + xor.b32 %r3615, %r3614, %r3581; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 20; + add.s32 %r3617, %r3611, %r2955; + add.s32 %r3618, %r3617, %r3616; + xor.b32 %r3619, %r3618, %r3613; + shf.l.wrap.b32 %r3620, %r3619, %r3619, 24; + add.s32 %r3621, %r3620, %r3614; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 25; + add.s32 %r3624, %r3576, %r2976; + add.s32 %r3625, %r3624, %r3595; + xor.b32 %r3626, %r3564, %r3625; + shf.l.wrap.b32 %r3627, %r3626, %r3626, 16; + add.s32 %r3628, %r3627, %r3607; + xor.b32 %r3629, %r3628, %r3595; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 20; + add.s32 %r3631, %r3625, %r2962; + add.s32 %r3632, %r3631, %r3630; + xor.b32 %r3633, %r3632, %r3627; + shf.l.wrap.b32 %r3634, %r3633, %r3633, 24; + add.s32 %r3635, %r3634, %r3628; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 25; + add.s32 %r3638, %r3590, %r2941; + add.s32 %r3639, %r3638, %r3609; + xor.b32 %r3640, %r3639, %r3578; + shf.l.wrap.b32 %r3641, %r3640, %r3640, 16; + add.s32 %r3642, %r3641, %r3565; + xor.b32 %r3643, %r3642, %r3609; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 20; + add.s32 %r3645, %r3639, %r2948; + add.s32 %r3646, %r3645, %r3644; + xor.b32 %r3647, %r3646, %r3641; + shf.l.wrap.b32 %r3648, %r3647, %r3647, 24; + add.s32 %r3649, %r3648, %r3642; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 25; + add.s32 %r3652, %r3604, %r2983; + add.s32 %r3653, %r3652, %r3567; + xor.b32 %r3654, 
%r3653, %r3592; + shf.l.wrap.b32 %r3655, %r3654, %r3654, 16; + add.s32 %r3656, %r3655, %r3579; + xor.b32 %r3657, %r3656, %r3567; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 20; + add.s32 %r3659, %r3653, %r2969; + add.s32 %r3660, %r3659, %r3658; + xor.b32 %r3661, %r3660, %r3655; + shf.l.wrap.b32 %r3662, %r3661, %r3661, 24; + add.s32 %r3663, %r3662, %r3656; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 25; + add.s32 %r3666, %r3618, %r3004; + add.s32 %r3667, %r3666, %r3665; + xor.b32 %r3668, %r3667, %r3634; + shf.l.wrap.b32 %r3669, %r3668, %r3668, 16; + add.s32 %r3670, %r3669, %r3649; + xor.b32 %r3671, %r3670, %r3665; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 20; + add.s32 %r3673, %r3667, %r3039; + add.s32 %r3674, %r3673, %r3672; + xor.b32 %r3675, %r3674, %r3669; + shf.l.wrap.b32 %r3676, %r3675, %r3675, 24; + add.s32 %r3677, %r3676, %r3670; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 25; + add.s32 %r3680, %r3632, %r3018; + add.s32 %r3681, %r3680, %r3623; + xor.b32 %r3682, %r3681, %r3648; + shf.l.wrap.b32 %r3683, %r3682, %r3682, 16; + add.s32 %r3684, %r3683, %r3663; + xor.b32 %r3685, %r3684, %r3623; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 20; + add.s32 %r3687, %r3681, %r2976; + add.s32 %r3688, %r3687, %r3686; + xor.b32 %r3689, %r3688, %r3683; + shf.l.wrap.b32 %r3690, %r3689, %r3689, 24; + add.s32 %r3691, %r3690, %r3684; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 25; + add.s32 %r3694, %r3646, %r2997; + add.s32 %r3695, %r3694, %r3637; + xor.b32 %r3696, %r3662, %r3695; + shf.l.wrap.b32 %r3697, %r3696, %r3696, 16; + add.s32 %r3698, %r3697, %r3621; + xor.b32 %r3699, %r3698, %r3637; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 20; + add.s32 %r3701, %r3695, %r3025; + add.s32 %r3702, %r3701, %r3700; + xor.b32 %r3703, %r3702, %r3697; + shf.l.wrap.b32 %r3704, %r3703, %r3703, 24; + add.s32 %r3705, %r3704, %r3698; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 25; + add.s32 %r3708, %r3651, %r3046; + add.s32 %r3709, %r3708, %r3660; + xor.b32 %r3710, %r3709, %r3620; + shf.l.wrap.b32 %r3711, %r3710, %r3710, 16; + add.s32 %r3712, %r3711, %r3635; + xor.b32 %r3713, %r3712, %r3651; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 20; + add.s32 %r3715, %r3709, %r2948; + add.s32 %r3716, %r3715, %r3714; + xor.b32 %r3717, %r3716, %r3711; + shf.l.wrap.b32 %r3718, %r3717, %r3717, 24; + add.s32 %r3719, %r3718, %r3712; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 25; + add.s32 %r3722, %r3693, %r3032; + add.s32 %r3723, %r3722, %r3674; + xor.b32 %r3724, %r3723, %r3718; + shf.l.wrap.b32 %r3725, %r3724, %r3724, 16; + add.s32 %r3726, %r3725, %r3705; + xor.b32 %r3727, %r3726, %r3693; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 20; + add.s32 %r3729, %r3723, %r2962; + add.s32 %r3730, %r3729, %r3728; + xor.b32 %r3731, %r3730, %r3725; + shf.l.wrap.b32 %r3732, %r3731, %r3731, 24; + add.s32 %r3733, %r3732, %r3726; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 25; + add.s32 %r3736, %r3688, %r2941; + add.s32 %r3737, %r3736, %r3707; + xor.b32 %r3738, %r3676, %r3737; + shf.l.wrap.b32 %r3739, %r3738, %r3738, 16; + add.s32 %r3740, %r3739, %r3719; + xor.b32 %r3741, %r3740, %r3707; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 20; + add.s32 %r3743, %r3737, %r3011; + add.s32 %r3744, %r3743, %r3742; + xor.b32 %r3745, %r3744, %r3739; + shf.l.wrap.b32 %r3746, %r3745, %r3745, 24; + add.s32 %r3747, %r3746, %r3740; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 25; 
+ add.s32 %r3750, %r3702, %r2955; + add.s32 %r3751, %r3750, %r3721; + xor.b32 %r3752, %r3751, %r3690; + shf.l.wrap.b32 %r3753, %r3752, %r3752, 16; + add.s32 %r3754, %r3753, %r3677; + xor.b32 %r3755, %r3754, %r3721; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 20; + add.s32 %r3757, %r3751, %r2983; + add.s32 %r3758, %r3757, %r3756; + xor.b32 %r3759, %r3758, %r3753; + shf.l.wrap.b32 %r3760, %r3759, %r3759, 24; + add.s32 %r3761, %r3760, %r3754; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 25; + add.s32 %r3764, %r3716, %r2969; + add.s32 %r3765, %r3764, %r3679; + xor.b32 %r3766, %r3765, %r3704; + shf.l.wrap.b32 %r3767, %r3766, %r3766, 16; + add.s32 %r3768, %r3767, %r3691; + xor.b32 %r3769, %r3768, %r3679; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 20; + add.s32 %r3771, %r3765, %r2990; + add.s32 %r3772, %r3771, %r3770; + xor.b32 %r3773, %r3772, %r3767; + shf.l.wrap.b32 %r3774, %r3773, %r3773, 24; + add.s32 %r3775, %r3774, %r3768; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 25; + add.s32 %r3778, %r3730, %r3018; + add.s32 %r3779, %r3778, %r3777; + xor.b32 %r3780, %r3779, %r3746; + shf.l.wrap.b32 %r3781, %r3780, %r3780, 16; + add.s32 %r3782, %r3781, %r3761; + xor.b32 %r3783, %r3782, %r3777; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 20; + add.s32 %r3785, %r3779, %r3046; + add.s32 %r3786, %r3785, %r3784; + xor.b32 %r3787, %r3786, %r3781; + shf.l.wrap.b32 %r3788, %r3787, %r3787, 24; + add.s32 %r3789, %r3788, %r3782; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 25; + add.s32 %r3792, %r3744, %r2976; + add.s32 %r3793, %r3792, %r3735; + xor.b32 %r3794, %r3793, %r3760; + shf.l.wrap.b32 %r3795, %r3794, %r3794, 16; + add.s32 %r3796, %r3795, %r3775; + xor.b32 %r3797, %r3796, %r3735; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 20; + add.s32 %r3799, %r3793, %r2941; + add.s32 %r3800, %r3799, %r3798; + xor.b32 %r3801, %r3800, %r3795; + shf.l.wrap.b32 %r3802, %r3801, %r3801, 24; + add.s32 %r3803, %r3802, %r3796; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 25; + add.s32 %r3806, %r3758, %r2948; + add.s32 %r3807, %r3806, %r3749; + xor.b32 %r3808, %r3774, %r3807; + shf.l.wrap.b32 %r3809, %r3808, %r3808, 16; + add.s32 %r3810, %r3809, %r3733; + xor.b32 %r3811, %r3810, %r3749; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 20; + add.s32 %r3813, %r3807, %r3004; + add.s32 %r3814, %r3813, %r3812; + xor.b32 %r3815, %r3814, %r3809; + shf.l.wrap.b32 %r3816, %r3815, %r3815, 24; + add.s32 %r3817, %r3816, %r3810; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 25; + add.s32 %r3820, %r3763, %r2997; + add.s32 %r3821, %r3820, %r3772; + xor.b32 %r3822, %r3821, %r3732; + shf.l.wrap.b32 %r3823, %r3822, %r3822, 16; + add.s32 %r3824, %r3823, %r3747; + xor.b32 %r3825, %r3824, %r3763; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 20; + add.s32 %r3827, %r3821, %r2983; + add.s32 %r3828, %r3827, %r3826; + xor.b32 %r3829, %r3828, %r3823; + shf.l.wrap.b32 %r3830, %r3829, %r3829, 24; + add.s32 %r3831, %r3830, %r3824; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 25; + add.s32 %r3834, %r3805, %r3039; + add.s32 %r3835, %r3834, %r3786; + xor.b32 %r3836, %r3835, %r3830; + shf.l.wrap.b32 %r3837, %r3836, %r3836, 16; + add.s32 %r3838, %r3837, %r3817; + xor.b32 %r3839, %r3838, %r3805; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 20; + add.s32 %r3841, %r3835, %r3011; + add.s32 %r3842, %r3841, %r3840; + xor.b32 %r3843, %r3842, %r3837; + shf.l.wrap.b32 %r3844, %r3843, %r3843, 24; + add.s32 %r3845, %r3844, 
%r3838; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 25; + add.s32 %r3848, %r3800, %r2955; + add.s32 %r3849, %r3848, %r3819; + xor.b32 %r3850, %r3788, %r3849; + shf.l.wrap.b32 %r3851, %r3850, %r3850, 16; + add.s32 %r3852, %r3851, %r3831; + xor.b32 %r3853, %r3852, %r3819; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 20; + add.s32 %r3855, %r3849, %r3025; + add.s32 %r3856, %r3855, %r3854; + xor.b32 %r3857, %r3856, %r3851; + shf.l.wrap.b32 %r3858, %r3857, %r3857, 24; + add.s32 %r3859, %r3858, %r3852; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 25; + add.s32 %r3862, %r3814, %r2962; + add.s32 %r3863, %r3862, %r3833; + xor.b32 %r3864, %r3863, %r3802; + shf.l.wrap.b32 %r3865, %r3864, %r3864, 16; + add.s32 %r3866, %r3865, %r3789; + xor.b32 %r3867, %r3866, %r3833; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 20; + add.s32 %r3869, %r3863, %r2969; + add.s32 %r3870, %r3869, %r3868; + xor.b32 %r3871, %r3870, %r3865; + shf.l.wrap.b32 %r3872, %r3871, %r3871, 24; + add.s32 %r3873, %r3872, %r3866; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 25; + add.s32 %r3876, %r3828, %r2990; + add.s32 %r3877, %r3876, %r3791; + xor.b32 %r3878, %r3877, %r3816; + shf.l.wrap.b32 %r3879, %r3878, %r3878, 16; + add.s32 %r3880, %r3879, %r3803; + xor.b32 %r3881, %r3880, %r3791; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 20; + add.s32 %r3883, %r3877, %r3032; + add.s32 %r3884, %r3883, %r3882; + xor.b32 %r3885, %r3884, %r3879; + shf.l.wrap.b32 %r3886, %r3885, %r3885, 24; + add.s32 %r3887, %r3886, %r3880; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 25; + xor.b32 %r3890, %r3873, %r3842; + xor.b32 %r3891, %r3887, %r3856; + xor.b32 %r3892, %r3845, %r3870; + xor.b32 %r3893, %r3884, %r3859; + xor.b32 %r3894, %r3889, %r3858; + xor.b32 %r3895, %r3847, %r3872; + xor.b32 %r3896, %r3886, %r3861; + xor.b32 %r3897, %r3875, %r3844; + st.local.u8 [%rd132+145], %r3890; + shr.u32 %r3898, %r3890, 8; + st.local.u8 [%rd132+146], %r3898; + shr.u32 %r3899, %r3890, 16; + st.local.u8 [%rd132+147], %r3899; + shr.u32 %r3900, %r3890, 24; + st.local.u8 [%rd132+148], %r3900; + st.local.u8 [%rd132+149], %r3891; + shr.u32 %r3901, %r3891, 8; + st.local.u8 [%rd132+150], %r3901; + shr.u32 %r3902, %r3891, 16; + st.local.u8 [%rd132+151], %r3902; + shr.u32 %r3903, %r3891, 24; + st.local.u8 [%rd132+152], %r3903; + st.local.u8 [%rd132+153], %r3892; + shr.u32 %r3904, %r3892, 8; + st.local.u8 [%rd132+154], %r3904; + shr.u32 %r3905, %r3892, 16; + st.local.u8 [%rd132+155], %r3905; + shr.u32 %r3906, %r3892, 24; + st.local.u8 [%rd132+156], %r3906; + st.local.u8 [%rd132+157], %r3893; + shr.u32 %r3907, %r3893, 8; + st.local.u8 [%rd132+158], %r3907; + shr.u32 %r3908, %r3893, 16; + st.local.u8 [%rd132+159], %r3908; + shr.u32 %r3909, %r3893, 24; + st.local.u8 [%rd132+160], %r3909; + st.local.u8 [%rd132+161], %r3894; + shr.u32 %r3910, %r3894, 8; + st.local.u8 [%rd132+162], %r3910; + shr.u32 %r3911, %r3894, 16; + st.local.u8 [%rd132+163], %r3911; + shr.u32 %r3912, %r3894, 24; + st.local.u8 [%rd132+164], %r3912; + st.local.u8 [%rd132+165], %r3895; + shr.u32 %r3913, %r3895, 8; + st.local.u8 [%rd132+166], %r3913; + shr.u32 %r3914, %r3895, 16; + st.local.u8 [%rd132+167], %r3914; + shr.u32 %r3915, %r3895, 24; + st.local.u8 [%rd132+168], %r3915; + st.local.u8 [%rd132+169], %r3896; + shr.u32 %r3916, %r3896, 8; + st.local.u8 [%rd132+170], %r3916; + shr.u32 %r3917, %r3896, 16; + st.local.u8 [%rd132+171], %r3917; + shr.u32 %r3918, %r3896, 24; + st.local.u8 [%rd132+172], %r3918; + 
st.local.u8 [%rd132+173], %r3897; + shr.u32 %r3919, %r3897, 8; + st.local.u8 [%rd132+174], %r3919; + shr.u32 %r3920, %r3897, 16; + st.local.u8 [%rd132+175], %r3920; + shr.u32 %r3921, %r3897, 24; + st.local.u8 [%rd132+176], %r3921; + ld.local.u8 %rs128, [%rd3+8]; + add.s16 %rs129, %rs128, -1; + st.local.u8 [%rd3+8], %rs129; + cvt.u64.u16 %rd133, %rs129; + and.b64 %rd134, %rd133, 255; + setp.lt.u64 %p16, %rd30, %rd134; + and.b16 %rs130, %rs129, 255; + mul.wide.u16 %r11659, %rs130, 32; + @%p16 bra $L__BB1_18; + +$L__BB1_19: + ld.param.u64 %rd223, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvt.s64.s32 %rd136, %r11659; + add.s64 %rd137, %rd2, %rd136; + mov.u64 %rd246, 0; + st.local.u8 [%rd137+145], %r27; + shr.u32 %r3922, %r27, 8; + st.local.u8 [%rd137+146], %r3922; + shr.u32 %r3923, %r27, 16; + st.local.u8 [%rd137+147], %r3923; + shr.u32 %r3924, %r27, 24; + st.local.u8 [%rd137+148], %r3924; + st.local.u8 [%rd137+149], %r28; + shr.u32 %r3925, %r28, 8; + st.local.u8 [%rd137+150], %r3925; + shr.u32 %r3926, %r28, 16; + st.local.u8 [%rd137+151], %r3926; + shr.u32 %r3927, %r28, 24; + st.local.u8 [%rd137+152], %r3927; + st.local.u8 [%rd137+153], %r29; + shr.u32 %r3928, %r29, 8; + st.local.u8 [%rd137+154], %r3928; + shr.u32 %r3929, %r29, 16; + st.local.u8 [%rd137+155], %r3929; + shr.u32 %r3930, %r29, 24; + st.local.u8 [%rd137+156], %r3930; + st.local.u8 [%rd137+157], %r30; + shr.u32 %r3931, %r30, 8; + st.local.u8 [%rd137+158], %r3931; + shr.u32 %r3932, %r30, 16; + st.local.u8 [%rd137+159], %r3932; + shr.u32 %r3933, %r30, 24; + st.local.u8 [%rd137+160], %r3933; + st.local.u8 [%rd137+161], %r31; + shr.u32 %r3934, %r31, 8; + st.local.u8 [%rd137+162], %r3934; + shr.u32 %r3935, %r31, 16; + st.local.u8 [%rd137+163], %r3935; + shr.u32 %r3936, %r31, 24; + st.local.u8 [%rd137+164], %r3936; + st.local.u8 [%rd137+165], %r32; + shr.u32 %r3937, %r32, 8; + st.local.u8 [%rd137+166], %r3937; + shr.u32 %r3938, %r32, 16; + st.local.u8 [%rd137+167], %r3938; + shr.u32 %r3939, %r32, 24; + st.local.u8 [%rd137+168], %r3939; + st.local.u8 [%rd137+169], %r33; + shr.u32 %r3940, %r33, 8; + st.local.u8 [%rd137+170], %r3940; + shr.u32 %r3941, %r33, 16; + st.local.u8 [%rd137+171], %r3941; + shr.u32 %r3942, %r33, 24; + st.local.u8 [%rd137+172], %r3942; + st.local.u8 [%rd137+173], %r34; + shr.u32 %r3943, %r34, 8; + st.local.u8 [%rd137+174], %r3943; + shr.u32 %r3944, %r34, 16; + st.local.u8 [%rd137+175], %r3944; + shr.u32 %r3945, %r34, 24; + st.local.u8 [%rd137+176], %r3945; + ld.local.u8 %rs131, [%rd3+8]; + add.s16 %rs132, %rs131, 1; + st.local.u8 [%rd3+8], %rs132; + ld.local.u64 %rd138, [%rd3+-72]; + add.s64 %rd32, %rd138, 1; + add.s64 %rd254, %rd223, %rd6; + +$L__BB1_20: + add.s64 %rd139, %rd2, %rd246; + ld.local.u8 %rs133, [%rd139]; + st.local.u8 [%rd139+32], %rs133; + add.s64 %rd246, %rd246, 1; + setp.lt.u64 %p17, %rd246, 32; + @%p17 bra $L__BB1_20; + + mov.u64 %rd247, 0; + st.local.u64 [%rd3+-72], %rd32; + mov.u16 %rs134, 0; + st.local.u8 [%rd3+1], %rs134; + +$L__BB1_22: + add.s64 %rd141, %rd2, %rd247; + st.local.u8 [%rd141+72], %rs134; + add.s64 %rd247, %rd247, 1; + setp.lt.u64 %p18, %rd247, 64; + @%p18 bra $L__BB1_22; + + ld.param.u64 %rd236, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd235, %rd236; + add.s64 %rd261, %rd235, %rd6; + mov.u64 %rd225, 80; + sub.s64 %rd262, %rd225, %rd6; + mov.u16 %rs136, 0; + st.local.u8 [%rd3], %rs136; + +$L__BB1_24: + setp.lt.u64 %p19, %rd262, 1025; + @%p19 bra $L__BB1_48; + + ld.local.u64 %rd251, [%rd3+-72]; + add.u64 %rd142, %SP, 0; + add.u64 
%rd42, %SPL, 0; + +$L__BB1_26: + or.b64 %rd143, %rd262, 1; + mov.u64 %rd144, 1; + setp.gt.u64 %p20, %rd143, 4294967295; + shr.u64 %rd145, %rd262, 32; + selp.b64 %rd146, %rd145, %rd143, %p20; + selp.b32 %r3946, 32, 0, %p20; + and.b64 %rd147, %rd146, 4294901760; + setp.ne.s64 %p21, %rd147, 0; + shr.u64 %rd148, %rd146, 16; + or.b32 %r3947, %r3946, 16; + selp.b64 %rd149, %rd148, %rd146, %p21; + selp.b32 %r3948, %r3947, %r3946, %p21; + and.b64 %rd150, %rd149, 65280; + setp.ne.s64 %p22, %rd150, 0; + shr.u64 %rd151, %rd149, 8; + or.b32 %r3949, %r3948, 8; + selp.b64 %rd152, %rd151, %rd149, %p22; + selp.b32 %r3950, %r3949, %r3948, %p22; + and.b64 %rd153, %rd152, 240; + setp.ne.s64 %p23, %rd153, 0; + shr.u64 %rd154, %rd152, 4; + or.b32 %r3951, %r3950, 4; + selp.b64 %rd155, %rd154, %rd152, %p23; + selp.b32 %r3952, %r3951, %r3950, %p23; + and.b64 %rd156, %rd155, 12; + setp.ne.s64 %p24, %rd156, 0; + shr.u64 %rd157, %rd155, 2; + add.s32 %r3953, %r3952, 2; + selp.b64 %rd158, %rd157, %rd155, %p24; + selp.b32 %r3954, %r3953, %r3952, %p24; + and.b64 %rd159, %rd158, 2; + shr.u64 %rd160, %rd159, 1; + cvt.u32.u64 %r3955, %rd160; + add.s32 %r3956, %r3954, %r3955; + shl.b64 %rd255, %rd144, %r3956; + shl.b64 %rd48, %rd251, 10; + +$L__BB1_27: + mov.u64 %rd49, %rd255; + add.s64 %rd161, %rd49, -1; + and.b64 %rd162, %rd161, %rd48; + setp.ne.s64 %p25, %rd162, 0; + shr.u64 %rd255, %rd49, 1; + @%p25 bra $L__BB1_27; + + ld.local.u8 %rs14, [%rd3+2]; + setp.lt.u64 %p26, %rd49, 1025; + @%p26 bra $L__BB1_36; + bra.uni $L__BB1_29; + +$L__BB1_36: + ld.local.u8 %r5955, [%rd3+-136]; + ld.local.u8 %r5956, [%rd3+-135]; + prmt.b32 %r5957, %r5956, %r5955, 30212; + ld.local.u8 %r5958, [%rd3+-134]; + ld.local.u8 %r5959, [%rd3+-133]; + prmt.b32 %r5960, %r5959, %r5958, 30212; + prmt.b32 %r11679, %r5960, %r5957, 4180; + ld.local.u8 %r5961, [%rd3+-132]; + ld.local.u8 %r5962, [%rd3+-131]; + prmt.b32 %r5963, %r5962, %r5961, 30212; + ld.local.u8 %r5964, [%rd3+-130]; + ld.local.u8 %r5965, [%rd3+-129]; + prmt.b32 %r5966, %r5965, %r5964, 30212; + prmt.b32 %r11678, %r5966, %r5963, 4180; + ld.local.u8 %r5967, [%rd3+-128]; + ld.local.u8 %r5968, [%rd3+-127]; + prmt.b32 %r5969, %r5968, %r5967, 30212; + ld.local.u8 %r5970, [%rd3+-126]; + ld.local.u8 %r5971, [%rd3+-125]; + prmt.b32 %r5972, %r5971, %r5970, 30212; + prmt.b32 %r11677, %r5972, %r5969, 4180; + ld.local.u8 %r5973, [%rd3+-124]; + ld.local.u8 %r5974, [%rd3+-123]; + prmt.b32 %r5975, %r5974, %r5973, 30212; + ld.local.u8 %r5976, [%rd3+-122]; + ld.local.u8 %r5977, [%rd3+-121]; + prmt.b32 %r5978, %r5977, %r5976, 30212; + prmt.b32 %r11676, %r5978, %r5975, 4180; + ld.local.u8 %r5979, [%rd3+-120]; + ld.local.u8 %r5980, [%rd3+-119]; + prmt.b32 %r5981, %r5980, %r5979, 30212; + ld.local.u8 %r5982, [%rd3+-118]; + ld.local.u8 %r5983, [%rd3+-117]; + prmt.b32 %r5984, %r5983, %r5982, 30212; + prmt.b32 %r11675, %r5984, %r5981, 4180; + ld.local.u8 %r5985, [%rd3+-116]; + ld.local.u8 %r5986, [%rd3+-115]; + prmt.b32 %r5987, %r5986, %r5985, 30212; + ld.local.u8 %r5988, [%rd3+-114]; + ld.local.u8 %r5989, [%rd3+-113]; + prmt.b32 %r5990, %r5989, %r5988, 30212; + prmt.b32 %r11674, %r5990, %r5987, 4180; + ld.local.u8 %r5991, [%rd3+-112]; + ld.local.u8 %r5992, [%rd3+-111]; + prmt.b32 %r5993, %r5992, %r5991, 30212; + ld.local.u8 %r5994, [%rd3+-110]; + ld.local.u8 %r5995, [%rd3+-109]; + prmt.b32 %r5996, %r5995, %r5994, 30212; + prmt.b32 %r11673, %r5996, %r5993, 4180; + ld.local.u8 %r5997, [%rd3+-108]; + ld.local.u8 %r5998, [%rd3+-107]; + prmt.b32 %r5999, %r5998, %r5997, 30212; + ld.local.u8 %r6000, [%rd3+-106]; + 
ld.local.u8 %r6001, [%rd3+-105]; + prmt.b32 %r6002, %r6001, %r6000, 30212; + prmt.b32 %r11672, %r6002, %r5999, 4180; + add.u64 %rd53, %SPL, 64; + mov.u32 %r6003, 0; + st.local.v2.u32 [%rd53], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+8], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+16], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+24], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+32], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+40], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+48], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+56], {%r6003, %r6003}; + mov.u16 %rs354, 0; + st.local.v2.u8 [%rd53+64], {%rs354, %rs354}; + st.local.u8 [%rd53+66], %rs14; + cvt.u32.u64 %r71, %rd251; + shr.u64 %rd185, %rd251, 32; + cvt.u32.u64 %r72, %rd185; + setp.lt.u64 %p31, %rd49, 65; + mov.u64 %rd258, %rd261; + mov.u64 %rd259, %rd49; + @%p31 bra $L__BB1_39; + + add.s64 %rd54, %rd53, 64; + mov.u16 %rs353, 0; + mov.u64 %rd259, %rd49; + mov.u64 %rd258, %rd261; + +$L__BB1_38: + and.b16 %rs213, %rs353, 255; + setp.eq.s16 %p32, %rs213, 0; + selp.u16 %rs214, 1, 0, %p32; + or.b16 %rs215, %rs14, %rs214; + ld.local.u8 %r6004, [%rd258]; + ld.local.u8 %r6005, [%rd258+1]; + prmt.b32 %r6006, %r6005, %r6004, 30212; + ld.local.u8 %r6007, [%rd258+2]; + prmt.b32 %r6008, %r6007, %r6006, 28756; + ld.local.u8 %r6009, [%rd258+3]; + prmt.b32 %r6010, %r6009, %r6008, 1620; + ld.local.u8 %r6011, [%rd258+4]; + ld.local.u8 %r6012, [%rd258+5]; + prmt.b32 %r6013, %r6012, %r6011, 30212; + ld.local.u8 %r6014, [%rd258+6]; + prmt.b32 %r6015, %r6014, %r6013, 28756; + ld.local.u8 %r6016, [%rd258+7]; + prmt.b32 %r6017, %r6016, %r6015, 1620; + ld.local.u8 %r6018, [%rd258+8]; + ld.local.u8 %r6019, [%rd258+9]; + prmt.b32 %r6020, %r6019, %r6018, 30212; + ld.local.u8 %r6021, [%rd258+10]; + prmt.b32 %r6022, %r6021, %r6020, 28756; + ld.local.u8 %r6023, [%rd258+11]; + prmt.b32 %r6024, %r6023, %r6022, 1620; + ld.local.u8 %r6025, [%rd258+12]; + ld.local.u8 %r6026, [%rd258+13]; + prmt.b32 %r6027, %r6026, %r6025, 30212; + ld.local.u8 %r6028, [%rd258+14]; + prmt.b32 %r6029, %r6028, %r6027, 28756; + ld.local.u8 %r6030, [%rd258+15]; + prmt.b32 %r6031, %r6030, %r6029, 1620; + ld.local.u8 %r6032, [%rd258+16]; + ld.local.u8 %r6033, [%rd258+17]; + prmt.b32 %r6034, %r6033, %r6032, 30212; + ld.local.u8 %r6035, [%rd258+18]; + prmt.b32 %r6036, %r6035, %r6034, 28756; + ld.local.u8 %r6037, [%rd258+19]; + prmt.b32 %r6038, %r6037, %r6036, 1620; + ld.local.u8 %r6039, [%rd258+20]; + ld.local.u8 %r6040, [%rd258+21]; + prmt.b32 %r6041, %r6040, %r6039, 30212; + ld.local.u8 %r6042, [%rd258+22]; + prmt.b32 %r6043, %r6042, %r6041, 28756; + ld.local.u8 %r6044, [%rd258+23]; + prmt.b32 %r6045, %r6044, %r6043, 1620; + ld.local.u8 %r6046, [%rd258+24]; + ld.local.u8 %r6047, [%rd258+25]; + prmt.b32 %r6048, %r6047, %r6046, 30212; + ld.local.u8 %r6049, [%rd258+26]; + prmt.b32 %r6050, %r6049, %r6048, 28756; + ld.local.u8 %r6051, [%rd258+27]; + prmt.b32 %r6052, %r6051, %r6050, 1620; + ld.local.u8 %r6053, [%rd258+28]; + ld.local.u8 %r6054, [%rd258+29]; + prmt.b32 %r6055, %r6054, %r6053, 30212; + ld.local.u8 %r6056, [%rd258+30]; + prmt.b32 %r6057, %r6056, %r6055, 28756; + ld.local.u8 %r6058, [%rd258+31]; + prmt.b32 %r6059, %r6058, %r6057, 1620; + ld.local.u8 %r6060, [%rd258+32]; + ld.local.u8 %r6061, [%rd258+33]; + prmt.b32 %r6062, %r6061, %r6060, 30212; + ld.local.u8 %r6063, [%rd258+34]; + prmt.b32 %r6064, %r6063, %r6062, 28756; + ld.local.u8 %r6065, [%rd258+35]; + prmt.b32 %r6066, %r6065, %r6064, 1620; + ld.local.u8 %r6067, [%rd258+36]; + ld.local.u8 %r6068, [%rd258+37]; + prmt.b32 %r6069, %r6068, 
%r6067, 30212; + ld.local.u8 %r6070, [%rd258+38]; + prmt.b32 %r6071, %r6070, %r6069, 28756; + ld.local.u8 %r6072, [%rd258+39]; + prmt.b32 %r6073, %r6072, %r6071, 1620; + ld.local.u8 %r6074, [%rd258+40]; + ld.local.u8 %r6075, [%rd258+41]; + prmt.b32 %r6076, %r6075, %r6074, 30212; + ld.local.u8 %r6077, [%rd258+42]; + prmt.b32 %r6078, %r6077, %r6076, 28756; + ld.local.u8 %r6079, [%rd258+43]; + prmt.b32 %r6080, %r6079, %r6078, 1620; + ld.local.u8 %r6081, [%rd258+44]; + ld.local.u8 %r6082, [%rd258+45]; + prmt.b32 %r6083, %r6082, %r6081, 30212; + ld.local.u8 %r6084, [%rd258+46]; + prmt.b32 %r6085, %r6084, %r6083, 28756; + ld.local.u8 %r6086, [%rd258+47]; + prmt.b32 %r6087, %r6086, %r6085, 1620; + ld.local.u8 %r6088, [%rd258+48]; + ld.local.u8 %r6089, [%rd258+49]; + prmt.b32 %r6090, %r6089, %r6088, 30212; + ld.local.u8 %r6091, [%rd258+50]; + prmt.b32 %r6092, %r6091, %r6090, 28756; + ld.local.u8 %r6093, [%rd258+51]; + prmt.b32 %r6094, %r6093, %r6092, 1620; + ld.local.u8 %r6095, [%rd258+52]; + ld.local.u8 %r6096, [%rd258+53]; + prmt.b32 %r6097, %r6096, %r6095, 30212; + ld.local.u8 %r6098, [%rd258+54]; + prmt.b32 %r6099, %r6098, %r6097, 28756; + ld.local.u8 %r6100, [%rd258+55]; + prmt.b32 %r6101, %r6100, %r6099, 1620; + ld.local.u8 %r6102, [%rd258+56]; + ld.local.u8 %r6103, [%rd258+57]; + prmt.b32 %r6104, %r6103, %r6102, 30212; + ld.local.u8 %r6105, [%rd258+58]; + prmt.b32 %r6106, %r6105, %r6104, 28756; + ld.local.u8 %r6107, [%rd258+59]; + prmt.b32 %r6108, %r6107, %r6106, 1620; + ld.local.u8 %r6109, [%rd258+60]; + ld.local.u8 %r6110, [%rd258+61]; + prmt.b32 %r6111, %r6110, %r6109, 30212; + ld.local.u8 %r6112, [%rd258+62]; + prmt.b32 %r6113, %r6112, %r6111, 28756; + ld.local.u8 %r6114, [%rd258+63]; + prmt.b32 %r6115, %r6114, %r6113, 1620; + cvt.u32.u16 %r6116, %rs215; + and.b32 %r6117, %r6116, 255; + add.s32 %r6118, %r11679, %r11675; + add.s32 %r6119, %r6118, %r6010; + xor.b32 %r6120, %r6119, %r71; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 16; + add.s32 %r6122, %r6121, 1779033703; + xor.b32 %r6123, %r6122, %r11675; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 20; + add.s32 %r6125, %r6017, %r6119; + add.s32 %r6126, %r6125, %r6124; + xor.b32 %r6127, %r6126, %r6121; + shf.l.wrap.b32 %r6128, %r6127, %r6127, 24; + add.s32 %r6129, %r6128, %r6122; + xor.b32 %r6130, %r6129, %r6124; + shf.l.wrap.b32 %r6131, %r6130, %r6130, 25; + add.s32 %r6132, %r11678, %r11674; + add.s32 %r6133, %r6132, %r6024; + xor.b32 %r6134, %r6133, %r72; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 16; + add.s32 %r6136, %r6135, -1150833019; + xor.b32 %r6137, %r6136, %r11674; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 20; + add.s32 %r6139, %r6031, %r6133; + add.s32 %r6140, %r6139, %r6138; + xor.b32 %r6141, %r6140, %r6135; + shf.l.wrap.b32 %r6142, %r6141, %r6141, 24; + add.s32 %r6143, %r6142, %r6136; + xor.b32 %r6144, %r6143, %r6138; + shf.l.wrap.b32 %r6145, %r6144, %r6144, 25; + add.s32 %r6146, %r11677, %r11673; + add.s32 %r6147, %r6146, %r6038; + shr.u32 %r6148, %r6147, 16; + shl.b32 %r6149, %r6147, 16; + xor.b32 %r6150, %r6149, 4194304; + or.b32 %r6151, %r6150, %r6148; + add.s32 %r6152, %r6151, 1013904242; + xor.b32 %r6153, %r6152, %r11673; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6045, %r6147; + add.s32 %r6156, %r6155, %r6154; + xor.b32 %r6157, %r6156, %r6151; + shf.l.wrap.b32 %r6158, %r6157, %r6157, 24; + add.s32 %r6159, %r6158, %r6152; + xor.b32 %r6160, %r6159, %r6154; + shf.l.wrap.b32 %r6161, %r6160, %r6160, 25; + add.s32 %r6162, %r11676, %r11672; + add.s32 %r6163, %r6162, %r6052; + xor.b32 %r6164, %r6163, %r6117; + 
shr.u32 %r6165, %r6163, 16; + shl.b32 %r6166, %r6164, 16; + or.b32 %r6167, %r6166, %r6165; + add.s32 %r6168, %r6167, -1521486534; + xor.b32 %r6169, %r6168, %r11672; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6059, %r6163; + add.s32 %r6172, %r6171, %r6170; + xor.b32 %r6173, %r6172, %r6167; + shf.l.wrap.b32 %r6174, %r6173, %r6173, 24; + add.s32 %r6175, %r6174, %r6168; + xor.b32 %r6176, %r6175, %r6170; + shf.l.wrap.b32 %r6177, %r6176, %r6176, 25; + add.s32 %r6178, %r6145, %r6126; + add.s32 %r6179, %r6178, %r6066; + xor.b32 %r6180, %r6174, %r6179; + shf.l.wrap.b32 %r6181, %r6180, %r6180, 16; + add.s32 %r6182, %r6181, %r6159; + xor.b32 %r6183, %r6182, %r6145; + shf.l.wrap.b32 %r6184, %r6183, %r6183, 20; + add.s32 %r6185, %r6073, %r6179; + add.s32 %r6186, %r6185, %r6184; + xor.b32 %r6187, %r6186, %r6181; + shf.l.wrap.b32 %r6188, %r6187, %r6187, 24; + add.s32 %r6189, %r6188, %r6182; + xor.b32 %r6190, %r6189, %r6184; + shf.l.wrap.b32 %r6191, %r6190, %r6190, 25; + add.s32 %r6192, %r6161, %r6140; + add.s32 %r6193, %r6192, %r6080; + xor.b32 %r6194, %r6193, %r6128; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 16; + add.s32 %r6196, %r6195, %r6175; + xor.b32 %r6197, %r6196, %r6161; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 20; + add.s32 %r6199, %r6087, %r6193; + add.s32 %r6200, %r6199, %r6198; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 24; + add.s32 %r6203, %r6202, %r6196; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 25; + add.s32 %r6206, %r6177, %r6156; + add.s32 %r6207, %r6206, %r6094; + xor.b32 %r6208, %r6207, %r6142; + shf.l.wrap.b32 %r6209, %r6208, %r6208, 16; + add.s32 %r6210, %r6209, %r6129; + xor.b32 %r6211, %r6210, %r6177; + shf.l.wrap.b32 %r6212, %r6211, %r6211, 20; + add.s32 %r6213, %r6101, %r6207; + add.s32 %r6214, %r6213, %r6212; + xor.b32 %r6215, %r6214, %r6209; + shf.l.wrap.b32 %r6216, %r6215, %r6215, 24; + add.s32 %r6217, %r6216, %r6210; + xor.b32 %r6218, %r6217, %r6212; + shf.l.wrap.b32 %r6219, %r6218, %r6218, 25; + add.s32 %r6220, %r6172, %r6131; + add.s32 %r6221, %r6220, %r6108; + xor.b32 %r6222, %r6221, %r6158; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 16; + add.s32 %r6224, %r6223, %r6143; + xor.b32 %r6225, %r6224, %r6131; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 20; + add.s32 %r6227, %r6115, %r6221; + add.s32 %r6228, %r6227, %r6226; + xor.b32 %r6229, %r6228, %r6223; + shf.l.wrap.b32 %r6230, %r6229, %r6229, 24; + add.s32 %r6231, %r6230, %r6224; + xor.b32 %r6232, %r6231, %r6226; + shf.l.wrap.b32 %r6233, %r6232, %r6232, 25; + add.s32 %r6234, %r6186, %r6024; + add.s32 %r6235, %r6234, %r6233; + xor.b32 %r6236, %r6235, %r6202; + shf.l.wrap.b32 %r6237, %r6236, %r6236, 16; + add.s32 %r6238, %r6237, %r6217; + xor.b32 %r6239, %r6238, %r6233; + shf.l.wrap.b32 %r6240, %r6239, %r6239, 20; + add.s32 %r6241, %r6235, %r6052; + add.s32 %r6242, %r6241, %r6240; + xor.b32 %r6243, %r6242, %r6237; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6238; + xor.b32 %r6246, %r6245, %r6240; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6200, %r6031; + add.s32 %r6249, %r6248, %r6191; + xor.b32 %r6250, %r6216, %r6249; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6231, %r6251; + xor.b32 %r6253, %r6252, %r6191; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6080; + add.s32 %r6256, %r6255, %r6254; + xor.b32 %r6257, %r6256, %r6251; + shf.l.wrap.b32 %r6258, %r6257, %r6257, 24; + add.s32 %r6259, %r6258, %r6252; + xor.b32 %r6260, %r6259, %r6254; + 
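+ // NOTE (annotation): the seven compression rounds are fully unrolled here; each
+ // round re-feeds the 16 message words in a different order (the MSG_SCHEDULE
+ // permutation declared at the top of this file) rather than permuting the state.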
shf.l.wrap.b32 %r6261, %r6260, %r6260, 25; + add.s32 %r6262, %r6205, %r6059; + add.s32 %r6263, %r6262, %r6214; + xor.b32 %r6264, %r6230, %r6263; + shf.l.wrap.b32 %r6265, %r6264, %r6264, 16; + add.s32 %r6266, %r6265, %r6189; + xor.b32 %r6267, %r6266, %r6205; + shf.l.wrap.b32 %r6268, %r6267, %r6267, 20; + add.s32 %r6269, %r6263, %r6010; + add.s32 %r6270, %r6269, %r6268; + xor.b32 %r6271, %r6270, %r6265; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6266; + xor.b32 %r6274, %r6273, %r6268; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6219, %r6038; + add.s32 %r6277, %r6276, %r6228; + xor.b32 %r6278, %r6277, %r6188; + shf.l.wrap.b32 %r6279, %r6278, %r6278, 16; + add.s32 %r6280, %r6279, %r6203; + xor.b32 %r6281, %r6280, %r6219; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 20; + add.s32 %r6283, %r6277, %r6101; + add.s32 %r6284, %r6283, %r6282; + xor.b32 %r6285, %r6284, %r6279; + shf.l.wrap.b32 %r6286, %r6285, %r6285, 24; + add.s32 %r6287, %r6286, %r6280; + xor.b32 %r6288, %r6287, %r6282; + shf.l.wrap.b32 %r6289, %r6288, %r6288, 25; + add.s32 %r6290, %r6242, %r6017; + add.s32 %r6291, %r6290, %r6261; + xor.b32 %r6292, %r6291, %r6286; + shf.l.wrap.b32 %r6293, %r6292, %r6292, 16; + add.s32 %r6294, %r6293, %r6273; + xor.b32 %r6295, %r6294, %r6261; + shf.l.wrap.b32 %r6296, %r6295, %r6295, 20; + add.s32 %r6297, %r6291, %r6087; + add.s32 %r6298, %r6297, %r6296; + xor.b32 %r6299, %r6298, %r6293; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 24; + add.s32 %r6301, %r6300, %r6294; + xor.b32 %r6302, %r6301, %r6296; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 25; + add.s32 %r6304, %r6256, %r6094; + add.s32 %r6305, %r6304, %r6275; + xor.b32 %r6306, %r6305, %r6244; + shf.l.wrap.b32 %r6307, %r6306, %r6306, 16; + add.s32 %r6308, %r6307, %r6287; + xor.b32 %r6309, %r6308, %r6275; + shf.l.wrap.b32 %r6310, %r6309, %r6309, 20; + add.s32 %r6311, %r6305, %r6045; + add.s32 %r6312, %r6311, %r6310; + xor.b32 %r6313, %r6312, %r6307; + shf.l.wrap.b32 %r6314, %r6313, %r6313, 24; + add.s32 %r6315, %r6314, %r6308; + xor.b32 %r6316, %r6315, %r6310; + shf.l.wrap.b32 %r6317, %r6316, %r6316, 25; + add.s32 %r6318, %r6270, %r6073; + add.s32 %r6319, %r6318, %r6289; + xor.b32 %r6320, %r6319, %r6258; + shf.l.wrap.b32 %r6321, %r6320, %r6320, 16; + add.s32 %r6322, %r6321, %r6245; + xor.b32 %r6323, %r6322, %r6289; + shf.l.wrap.b32 %r6324, %r6323, %r6323, 20; + add.s32 %r6325, %r6319, %r6108; + add.s32 %r6326, %r6325, %r6324; + xor.b32 %r6327, %r6326, %r6321; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 24; + add.s32 %r6329, %r6328, %r6322; + xor.b32 %r6330, %r6329, %r6324; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 25; + add.s32 %r6332, %r6284, %r6115; + add.s32 %r6333, %r6332, %r6247; + xor.b32 %r6334, %r6333, %r6272; + shf.l.wrap.b32 %r6335, %r6334, %r6334, 16; + add.s32 %r6336, %r6335, %r6259; + xor.b32 %r6337, %r6336, %r6247; + shf.l.wrap.b32 %r6338, %r6337, %r6337, 20; + add.s32 %r6339, %r6333, %r6066; + add.s32 %r6340, %r6339, %r6338; + xor.b32 %r6341, %r6340, %r6335; + shf.l.wrap.b32 %r6342, %r6341, %r6341, 24; + add.s32 %r6343, %r6342, %r6336; + xor.b32 %r6344, %r6343, %r6338; + shf.l.wrap.b32 %r6345, %r6344, %r6344, 25; + add.s32 %r6346, %r6298, %r6031; + add.s32 %r6347, %r6346, %r6345; + xor.b32 %r6348, %r6347, %r6314; + shf.l.wrap.b32 %r6349, %r6348, %r6348, 16; + add.s32 %r6350, %r6349, %r6329; + xor.b32 %r6351, %r6350, %r6345; + shf.l.wrap.b32 %r6352, %r6351, %r6351, 20; + add.s32 %r6353, %r6347, %r6038; + add.s32 %r6354, %r6353, %r6352; + xor.b32 %r6355, %r6354, %r6349; + shf.l.wrap.b32 %r6356, 
%r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6350; + xor.b32 %r6358, %r6357, %r6352; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6312, %r6080; + add.s32 %r6361, %r6360, %r6303; + xor.b32 %r6362, %r6361, %r6328; + shf.l.wrap.b32 %r6363, %r6362, %r6362, 16; + add.s32 %r6364, %r6363, %r6343; + xor.b32 %r6365, %r6364, %r6303; + shf.l.wrap.b32 %r6366, %r6365, %r6365, 20; + add.s32 %r6367, %r6361, %r6094; + add.s32 %r6368, %r6367, %r6366; + xor.b32 %r6369, %r6368, %r6363; + shf.l.wrap.b32 %r6370, %r6369, %r6369, 24; + add.s32 %r6371, %r6370, %r6364; + xor.b32 %r6372, %r6371, %r6366; + shf.l.wrap.b32 %r6373, %r6372, %r6372, 25; + add.s32 %r6374, %r6326, %r6101; + add.s32 %r6375, %r6374, %r6317; + xor.b32 %r6376, %r6375, %r6342; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6301; + xor.b32 %r6379, %r6378, %r6317; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6024; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6340, %r6059; + add.s32 %r6389, %r6388, %r6331; + xor.b32 %r6390, %r6389, %r6300; + shf.l.wrap.b32 %r6391, %r6390, %r6390, 16; + add.s32 %r6392, %r6391, %r6315; + xor.b32 %r6393, %r6392, %r6331; + shf.l.wrap.b32 %r6394, %r6393, %r6393, 20; + add.s32 %r6395, %r6389, %r6108; + add.s32 %r6396, %r6395, %r6394; + xor.b32 %r6397, %r6396, %r6391; + shf.l.wrap.b32 %r6398, %r6397, %r6397, 24; + add.s32 %r6399, %r6398, %r6392; + xor.b32 %r6400, %r6399, %r6394; + shf.l.wrap.b32 %r6401, %r6400, %r6400, 25; + add.s32 %r6402, %r6354, %r6052; + add.s32 %r6403, %r6402, %r6373; + xor.b32 %r6404, %r6403, %r6398; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 16; + add.s32 %r6406, %r6405, %r6385; + xor.b32 %r6407, %r6406, %r6373; + shf.l.wrap.b32 %r6408, %r6407, %r6407, 20; + add.s32 %r6409, %r6403, %r6045; + add.s32 %r6410, %r6409, %r6408; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 24; + add.s32 %r6413, %r6412, %r6406; + xor.b32 %r6414, %r6413, %r6408; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 25; + add.s32 %r6416, %r6368, %r6073; + add.s32 %r6417, %r6416, %r6387; + xor.b32 %r6418, %r6417, %r6356; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 16; + add.s32 %r6420, %r6419, %r6399; + xor.b32 %r6421, %r6420, %r6387; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 20; + add.s32 %r6423, %r6417, %r6010; + add.s32 %r6424, %r6423, %r6422; + xor.b32 %r6425, %r6424, %r6419; + shf.l.wrap.b32 %r6426, %r6425, %r6425, 24; + add.s32 %r6427, %r6426, %r6420; + xor.b32 %r6428, %r6427, %r6422; + shf.l.wrap.b32 %r6429, %r6428, %r6428, 25; + add.s32 %r6430, %r6382, %r6087; + add.s32 %r6431, %r6430, %r6401; + xor.b32 %r6432, %r6431, %r6370; + shf.l.wrap.b32 %r6433, %r6432, %r6432, 16; + add.s32 %r6434, %r6433, %r6357; + xor.b32 %r6435, %r6434, %r6401; + shf.l.wrap.b32 %r6436, %r6435, %r6435, 20; + add.s32 %r6437, %r6431, %r6115; + add.s32 %r6438, %r6437, %r6436; + xor.b32 %r6439, %r6438, %r6433; + shf.l.wrap.b32 %r6440, %r6439, %r6439, 24; + add.s32 %r6441, %r6440, %r6434; + xor.b32 %r6442, %r6441, %r6436; + shf.l.wrap.b32 %r6443, %r6442, %r6442, 25; + add.s32 %r6444, %r6396, %r6066; + add.s32 %r6445, %r6444, %r6359; + xor.b32 %r6446, %r6445, %r6384; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 16; + add.s32 %r6448, %r6447, %r6371; + xor.b32 %r6449, %r6448, %r6359; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 20; + add.s32 %r6451, %r6445, %r6017; + 
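+ // NOTE (annotation): each 16-instruction group above appears to be one G
+ // application on a state column/diagonal (a,b,c,d) with two message words mx,my:
+ //   a += b + mx; d = rotr32(d ^ a, 16); c += d; b = rotr32(b ^ c, 12);
+ //   a += b + my; d = rotr32(d ^ a,  8); c += d; b = rotr32(b ^ c,  7);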
add.s32 %r6452, %r6451, %r6450; + xor.b32 %r6453, %r6452, %r6447; + shf.l.wrap.b32 %r6454, %r6453, %r6453, 24; + add.s32 %r6455, %r6454, %r6448; + xor.b32 %r6456, %r6455, %r6450; + shf.l.wrap.b32 %r6457, %r6456, %r6456, 25; + add.s32 %r6458, %r6410, %r6080; + add.s32 %r6459, %r6458, %r6457; + xor.b32 %r6460, %r6459, %r6426; + shf.l.wrap.b32 %r6461, %r6460, %r6460, 16; + add.s32 %r6462, %r6461, %r6441; + xor.b32 %r6463, %r6462, %r6457; + shf.l.wrap.b32 %r6464, %r6463, %r6463, 20; + add.s32 %r6465, %r6459, %r6059; + add.s32 %r6466, %r6465, %r6464; + xor.b32 %r6467, %r6466, %r6461; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6462; + xor.b32 %r6470, %r6469, %r6464; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6424, %r6094; + add.s32 %r6473, %r6472, %r6415; + xor.b32 %r6474, %r6473, %r6440; + shf.l.wrap.b32 %r6475, %r6474, %r6474, 16; + add.s32 %r6476, %r6475, %r6455; + xor.b32 %r6477, %r6476, %r6415; + shf.l.wrap.b32 %r6478, %r6477, %r6477, 20; + add.s32 %r6479, %r6473, %r6073; + add.s32 %r6480, %r6479, %r6478; + xor.b32 %r6481, %r6480, %r6475; + shf.l.wrap.b32 %r6482, %r6481, %r6481, 24; + add.s32 %r6483, %r6482, %r6476; + xor.b32 %r6484, %r6483, %r6478; + shf.l.wrap.b32 %r6485, %r6484, %r6484, 25; + add.s32 %r6486, %r6438, %r6108; + add.s32 %r6487, %r6486, %r6429; + xor.b32 %r6488, %r6487, %r6454; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6413; + xor.b32 %r6491, %r6490, %r6429; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6031; + add.s32 %r6494, %r6493, %r6492; + xor.b32 %r6495, %r6494, %r6489; + shf.l.wrap.b32 %r6496, %r6495, %r6495, 24; + add.s32 %r6497, %r6496, %r6490; + xor.b32 %r6498, %r6497, %r6492; + shf.l.wrap.b32 %r6499, %r6498, %r6498, 25; + add.s32 %r6500, %r6452, %r6101; + add.s32 %r6501, %r6500, %r6443; + xor.b32 %r6502, %r6501, %r6412; + shf.l.wrap.b32 %r6503, %r6502, %r6502, 16; + add.s32 %r6504, %r6503, %r6427; + xor.b32 %r6505, %r6504, %r6443; + shf.l.wrap.b32 %r6506, %r6505, %r6505, 20; + add.s32 %r6507, %r6501, %r6115; + add.s32 %r6508, %r6507, %r6506; + xor.b32 %r6509, %r6508, %r6503; + shf.l.wrap.b32 %r6510, %r6509, %r6509, 24; + add.s32 %r6511, %r6510, %r6504; + xor.b32 %r6512, %r6511, %r6506; + shf.l.wrap.b32 %r6513, %r6512, %r6512, 25; + add.s32 %r6514, %r6466, %r6038; + add.s32 %r6515, %r6514, %r6485; + xor.b32 %r6516, %r6515, %r6510; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 16; + add.s32 %r6518, %r6517, %r6497; + xor.b32 %r6519, %r6518, %r6485; + shf.l.wrap.b32 %r6520, %r6519, %r6519, 20; + add.s32 %r6521, %r6515, %r6010; + add.s32 %r6522, %r6521, %r6520; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 24; + add.s32 %r6525, %r6524, %r6518; + xor.b32 %r6526, %r6525, %r6520; + shf.l.wrap.b32 %r6527, %r6526, %r6526, 25; + add.s32 %r6528, %r6480, %r6087; + add.s32 %r6529, %r6528, %r6499; + xor.b32 %r6530, %r6529, %r6468; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 16; + add.s32 %r6532, %r6531, %r6511; + xor.b32 %r6533, %r6532, %r6499; + shf.l.wrap.b32 %r6534, %r6533, %r6533, 20; + add.s32 %r6535, %r6529, %r6024; + add.s32 %r6536, %r6535, %r6534; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 24; + add.s32 %r6539, %r6538, %r6532; + xor.b32 %r6540, %r6539, %r6534; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 25; + add.s32 %r6542, %r6494, %r6045; + add.s32 %r6543, %r6542, %r6513; + xor.b32 %r6544, %r6543, %r6482; + shf.l.wrap.b32 %r6545, %r6544, %r6544, 16; + add.s32 %r6546, %r6545, %r6469; + xor.b32 %r6547, %r6546, 
%r6513; + shf.l.wrap.b32 %r6548, %r6547, %r6547, 20; + add.s32 %r6549, %r6543, %r6066; + add.s32 %r6550, %r6549, %r6548; + xor.b32 %r6551, %r6550, %r6545; + shf.l.wrap.b32 %r6552, %r6551, %r6551, 24; + add.s32 %r6553, %r6552, %r6546; + xor.b32 %r6554, %r6553, %r6548; + shf.l.wrap.b32 %r6555, %r6554, %r6554, 25; + add.s32 %r6556, %r6508, %r6017; + add.s32 %r6557, %r6556, %r6471; + xor.b32 %r6558, %r6557, %r6496; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 16; + add.s32 %r6560, %r6559, %r6483; + xor.b32 %r6561, %r6560, %r6471; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 20; + add.s32 %r6563, %r6557, %r6052; + add.s32 %r6564, %r6563, %r6562; + xor.b32 %r6565, %r6564, %r6559; + shf.l.wrap.b32 %r6566, %r6565, %r6565, 24; + add.s32 %r6567, %r6566, %r6560; + xor.b32 %r6568, %r6567, %r6562; + shf.l.wrap.b32 %r6569, %r6568, %r6568, 25; + add.s32 %r6570, %r6522, %r6094; + add.s32 %r6571, %r6570, %r6569; + xor.b32 %r6572, %r6571, %r6538; + shf.l.wrap.b32 %r6573, %r6572, %r6572, 16; + add.s32 %r6574, %r6573, %r6553; + xor.b32 %r6575, %r6574, %r6569; + shf.l.wrap.b32 %r6576, %r6575, %r6575, 20; + add.s32 %r6577, %r6571, %r6101; + add.s32 %r6578, %r6577, %r6576; + xor.b32 %r6579, %r6578, %r6573; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6574; + xor.b32 %r6582, %r6581, %r6576; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6536, %r6073; + add.s32 %r6585, %r6584, %r6527; + xor.b32 %r6586, %r6585, %r6552; + shf.l.wrap.b32 %r6587, %r6586, %r6586, 16; + add.s32 %r6588, %r6587, %r6567; + xor.b32 %r6589, %r6588, %r6527; + shf.l.wrap.b32 %r6590, %r6589, %r6589, 20; + add.s32 %r6591, %r6585, %r6087; + add.s32 %r6592, %r6591, %r6590; + xor.b32 %r6593, %r6592, %r6587; + shf.l.wrap.b32 %r6594, %r6593, %r6593, 24; + add.s32 %r6595, %r6594, %r6588; + xor.b32 %r6596, %r6595, %r6590; + shf.l.wrap.b32 %r6597, %r6596, %r6596, 25; + add.s32 %r6598, %r6550, %r6115; + add.s32 %r6599, %r6598, %r6541; + xor.b32 %r6600, %r6599, %r6566; + shf.l.wrap.b32 %r6601, %r6600, %r6600, 16; + add.s32 %r6602, %r6601, %r6525; + xor.b32 %r6603, %r6602, %r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, %r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, %r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 %r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 
%r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 %r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 %r6720, %r6719, %r6719, 24; + add.s32 %r6721, %r6720, %r6714; + xor.b32 %r6722, %r6721, %r6716; + shf.l.wrap.b32 %r6723, %r6722, %r6722, 25; + add.s32 %r6724, %r6676, %r6115; + add.s32 %r6725, %r6724, %r6667; + xor.b32 %r6726, %r6725, %r6636; + shf.l.wrap.b32 %r6727, %r6726, %r6726, 16; + add.s32 %r6728, %r6727, %r6651; + xor.b32 %r6729, %r6728, %r6667; + shf.l.wrap.b32 %r6730, %r6729, %r6729, 20; + add.s32 %r6731, %r6725, %r6017; + add.s32 %r6732, %r6731, %r6730; + xor.b32 %r6733, %r6732, %r6727; + shf.l.wrap.b32 %r6734, %r6733, %r6733, 24; + add.s32 %r6735, %r6734, %r6728; + xor.b32 %r6736, %r6735, %r6730; + shf.l.wrap.b32 %r6737, %r6736, %r6736, 25; + add.s32 %r6738, %r6690, %r6101; 
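+ // NOTE (annotation): the remaining rounds repeat the same column/diagonal
+ // pattern; after the last round the new chaining value is taken as
+ // v[i] ^ v[i+8] (the block of eight xor.b32 into %r11672..%r11679 near the
+ // loop tail), the input pointer advances 64 bytes, and the loop repeats while
+ // more than 64 bytes of input remain.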
+ add.s32 %r6739, %r6738, %r6709; + xor.b32 %r6740, %r6739, %r6734; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6721; + xor.b32 %r6743, %r6742, %r6709; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6031; + add.s32 %r6746, %r6745, %r6744; + xor.b32 %r6747, %r6746, %r6741; + shf.l.wrap.b32 %r6748, %r6747, %r6747, 24; + add.s32 %r6749, %r6748, %r6742; + xor.b32 %r6750, %r6749, %r6744; + shf.l.wrap.b32 %r6751, %r6750, %r6750, 25; + add.s32 %r6752, %r6704, %r6010; + add.s32 %r6753, %r6752, %r6723; + xor.b32 %r6754, %r6753, %r6692; + shf.l.wrap.b32 %r6755, %r6754, %r6754, 16; + add.s32 %r6756, %r6755, %r6735; + xor.b32 %r6757, %r6756, %r6723; + shf.l.wrap.b32 %r6758, %r6757, %r6757, 20; + add.s32 %r6759, %r6753, %r6080; + add.s32 %r6760, %r6759, %r6758; + xor.b32 %r6761, %r6760, %r6755; + shf.l.wrap.b32 %r6762, %r6761, %r6761, 24; + add.s32 %r6763, %r6762, %r6756; + xor.b32 %r6764, %r6763, %r6758; + shf.l.wrap.b32 %r6765, %r6764, %r6764, 25; + add.s32 %r6766, %r6718, %r6024; + add.s32 %r6767, %r6766, %r6737; + xor.b32 %r6768, %r6767, %r6706; + shf.l.wrap.b32 %r6769, %r6768, %r6768, 16; + add.s32 %r6770, %r6769, %r6693; + xor.b32 %r6771, %r6770, %r6737; + shf.l.wrap.b32 %r6772, %r6771, %r6771, 20; + add.s32 %r6773, %r6767, %r6052; + add.s32 %r6774, %r6773, %r6772; + xor.b32 %r6775, %r6774, %r6769; + shf.l.wrap.b32 %r6776, %r6775, %r6775, 24; + add.s32 %r6777, %r6776, %r6770; + xor.b32 %r6778, %r6777, %r6772; + shf.l.wrap.b32 %r6779, %r6778, %r6778, 25; + add.s32 %r6780, %r6732, %r6038; + add.s32 %r6781, %r6780, %r6695; + xor.b32 %r6782, %r6781, %r6720; + shf.l.wrap.b32 %r6783, %r6782, %r6782, 16; + add.s32 %r6784, %r6783, %r6707; + xor.b32 %r6785, %r6784, %r6695; + shf.l.wrap.b32 %r6786, %r6785, %r6785, 20; + add.s32 %r6787, %r6781, %r6059; + add.s32 %r6788, %r6787, %r6786; + xor.b32 %r6789, %r6788, %r6783; + shf.l.wrap.b32 %r6790, %r6789, %r6789, 24; + add.s32 %r6791, %r6790, %r6784; + xor.b32 %r6792, %r6791, %r6786; + shf.l.wrap.b32 %r6793, %r6792, %r6792, 25; + add.s32 %r6794, %r6746, %r6087; + add.s32 %r6795, %r6794, %r6793; + xor.b32 %r6796, %r6795, %r6762; + shf.l.wrap.b32 %r6797, %r6796, %r6796, 16; + add.s32 %r6798, %r6797, %r6777; + xor.b32 %r6799, %r6798, %r6793; + shf.l.wrap.b32 %r6800, %r6799, %r6799, 20; + add.s32 %r6801, %r6795, %r6115; + add.s32 %r6802, %r6801, %r6800; + xor.b32 %r6803, %r6802, %r6797; + shf.l.wrap.b32 %r6804, %r6803, %r6803, 24; + add.s32 %r6805, %r6804, %r6798; + xor.b32 %r6806, %r6805, %r6800; + shf.l.wrap.b32 %r6807, %r6806, %r6806, 25; + add.s32 %r6808, %r6760, %r6045; + add.s32 %r6809, %r6808, %r6751; + xor.b32 %r6810, %r6809, %r6776; + shf.l.wrap.b32 %r6811, %r6810, %r6810, 16; + add.s32 %r6812, %r6811, %r6791; + xor.b32 %r6813, %r6812, %r6751; + shf.l.wrap.b32 %r6814, %r6813, %r6813, 20; + add.s32 %r6815, %r6809, %r6010; + add.s32 %r6816, %r6815, %r6814; + xor.b32 %r6817, %r6816, %r6811; + shf.l.wrap.b32 %r6818, %r6817, %r6817, 24; + add.s32 %r6819, %r6818, %r6812; + xor.b32 %r6820, %r6819, %r6814; + shf.l.wrap.b32 %r6821, %r6820, %r6820, 25; + add.s32 %r6822, %r6774, %r6017; + add.s32 %r6823, %r6822, %r6765; + xor.b32 %r6824, %r6823, %r6790; + shf.l.wrap.b32 %r6825, %r6824, %r6824, 16; + add.s32 %r6826, %r6825, %r6749; + xor.b32 %r6827, %r6826, %r6765; + shf.l.wrap.b32 %r6828, %r6827, %r6827, 20; + add.s32 %r6829, %r6823, %r6073; + add.s32 %r6830, %r6829, %r6828; + xor.b32 %r6831, %r6830, %r6825; + shf.l.wrap.b32 %r6832, %r6831, %r6831, 24; + add.s32 %r6833, %r6832, %r6826; + xor.b32 %r6834, %r6833, 
%r6828; + shf.l.wrap.b32 %r6835, %r6834, %r6834, 25; + add.s32 %r6836, %r6788, %r6066; + add.s32 %r6837, %r6836, %r6779; + xor.b32 %r6838, %r6837, %r6748; + shf.l.wrap.b32 %r6839, %r6838, %r6838, 16; + add.s32 %r6840, %r6839, %r6763; + xor.b32 %r6841, %r6840, %r6779; + shf.l.wrap.b32 %r6842, %r6841, %r6841, 20; + add.s32 %r6843, %r6837, %r6052; + add.s32 %r6844, %r6843, %r6842; + xor.b32 %r6845, %r6844, %r6839; + shf.l.wrap.b32 %r6846, %r6845, %r6845, 24; + add.s32 %r6847, %r6846, %r6840; + xor.b32 %r6848, %r6847, %r6842; + shf.l.wrap.b32 %r6849, %r6848, %r6848, 25; + add.s32 %r6850, %r6802, %r6108; + add.s32 %r6851, %r6850, %r6821; + xor.b32 %r6852, %r6851, %r6846; + shf.l.wrap.b32 %r6853, %r6852, %r6852, 16; + add.s32 %r6854, %r6853, %r6833; + xor.b32 %r6855, %r6854, %r6821; + shf.l.wrap.b32 %r6856, %r6855, %r6855, 20; + add.s32 %r6857, %r6851, %r6080; + add.s32 %r6858, %r6857, %r6856; + xor.b32 %r6859, %r6858, %r6853; + shf.l.wrap.b32 %r6860, %r6859, %r6859, 24; + add.s32 %r6861, %r6860, %r6854; + xor.b32 %r6862, %r6861, %r6856; + shf.l.wrap.b32 %r6863, %r6862, %r6862, 25; + add.s32 %r6864, %r6816, %r6024; + add.s32 %r6865, %r6864, %r6835; + xor.b32 %r6866, %r6865, %r6804; + shf.l.wrap.b32 %r6867, %r6866, %r6866, 16; + add.s32 %r6868, %r6867, %r6847; + xor.b32 %r6869, %r6868, %r6835; + shf.l.wrap.b32 %r6870, %r6869, %r6869, 20; + add.s32 %r6871, %r6865, %r6094; + add.s32 %r6872, %r6871, %r6870; + xor.b32 %r6873, %r6872, %r6867; + shf.l.wrap.b32 %r6874, %r6873, %r6873, 24; + add.s32 %r6875, %r6874, %r6868; + xor.b32 %r6876, %r6875, %r6870; + shf.l.wrap.b32 %r6877, %r6876, %r6876, 25; + add.s32 %r6878, %r6830, %r6031; + add.s32 %r6879, %r6878, %r6849; + xor.b32 %r6880, %r6879, %r6818; + shf.l.wrap.b32 %r6881, %r6880, %r6880, 16; + add.s32 %r6882, %r6881, %r6805; + xor.b32 %r6883, %r6882, %r6849; + shf.l.wrap.b32 %r6884, %r6883, %r6883, 20; + add.s32 %r6885, %r6879, %r6038; + add.s32 %r6886, %r6885, %r6884; + xor.b32 %r6887, %r6886, %r6881; + shf.l.wrap.b32 %r6888, %r6887, %r6887, 24; + add.s32 %r6889, %r6888, %r6882; + xor.b32 %r6890, %r6889, %r6884; + shf.l.wrap.b32 %r6891, %r6890, %r6890, 25; + add.s32 %r6892, %r6844, %r6059; + add.s32 %r6893, %r6892, %r6807; + xor.b32 %r6894, %r6893, %r6832; + shf.l.wrap.b32 %r6895, %r6894, %r6894, 16; + add.s32 %r6896, %r6895, %r6819; + xor.b32 %r6897, %r6896, %r6807; + shf.l.wrap.b32 %r6898, %r6897, %r6897, 20; + add.s32 %r6899, %r6893, %r6101; + add.s32 %r6900, %r6899, %r6898; + xor.b32 %r6901, %r6900, %r6895; + shf.l.wrap.b32 %r6902, %r6901, %r6901, 24; + add.s32 %r6903, %r6902, %r6896; + xor.b32 %r6904, %r6903, %r6898; + shf.l.wrap.b32 %r6905, %r6904, %r6904, 25; + xor.b32 %r11679, %r6889, %r6858; + xor.b32 %r11678, %r6903, %r6872; + xor.b32 %r11677, %r6861, %r6886; + xor.b32 %r11676, %r6900, %r6875; + xor.b32 %r11675, %r6905, %r6874; + xor.b32 %r11674, %r6863, %r6888; + xor.b32 %r11673, %r6902, %r6877; + xor.b32 %r11672, %r6891, %r6860; + add.s16 %rs353, %rs353, 1; + st.local.u8 [%rd54+1], %rs353; + add.s64 %rd258, %rd258, 64; + add.s64 %rd259, %rd259, -64; + setp.gt.u64 %p33, %rd259, 64; + @%p33 bra $L__BB1_38; + +$L__BB1_39: + min.u64 %rd61, %rd259, 64; + setp.eq.s64 %p34, %rd61, 0; + mov.u16 %rs355, %rs354; + mov.u16 %rs356, %rs354; + mov.u16 %rs357, %rs354; + mov.u16 %rs358, %rs354; + mov.u16 %rs359, %rs354; + mov.u16 %rs360, %rs354; + mov.u16 %rs361, %rs354; + mov.u16 %rs362, %rs354; + mov.u16 %rs363, %rs354; + mov.u16 %rs364, %rs354; + mov.u16 %rs365, %rs354; + mov.u16 %rs366, %rs354; + mov.u16 %rs367, %rs354; + mov.u16 %rs368, %rs354; + 
mov.u16 %rs369, %rs354; + mov.u16 %rs370, %rs354; + mov.u16 %rs371, %rs354; + mov.u16 %rs372, %rs354; + mov.u16 %rs373, %rs354; + mov.u16 %rs374, %rs354; + mov.u16 %rs375, %rs354; + mov.u16 %rs376, %rs354; + mov.u16 %rs377, %rs354; + mov.u16 %rs378, %rs354; + mov.u16 %rs379, %rs354; + mov.u16 %rs380, %rs354; + mov.u16 %rs381, %rs354; + mov.u16 %rs382, %rs354; + mov.u16 %rs383, %rs354; + mov.u16 %rs384, %rs354; + mov.u16 %rs385, %rs354; + mov.u16 %rs386, %rs354; + mov.u16 %rs387, %rs354; + @%p34 bra $L__BB1_43; + + mov.u64 %rd260, 0; + +$L__BB1_41: + add.s64 %rd187, %rd258, %rd260; + ld.local.u8 %rs251, [%rd187]; + add.s64 %rd188, %rd53, %rd260; + st.local.u8 [%rd188], %rs251; + add.s64 %rd260, %rd260, 1; + setp.lt.u64 %p35, %rd260, %rd61; + @%p35 bra $L__BB1_41; + + ld.local.v4.u16 {%rs384, %rs385, %rs386, %rs387}, [%rd53]; + ld.local.v4.u16 {%rs380, %rs381, %rs382, %rs383}, [%rd53+8]; + ld.local.v4.u16 {%rs376, %rs377, %rs378, %rs379}, [%rd53+16]; + ld.local.v4.u16 {%rs372, %rs373, %rs374, %rs375}, [%rd53+24]; + ld.local.v4.u16 {%rs368, %rs369, %rs370, %rs371}, [%rd53+32]; + ld.local.v4.u16 {%rs364, %rs365, %rs366, %rs367}, [%rd53+40]; + ld.local.v4.u16 {%rs360, %rs361, %rs362, %rs363}, [%rd53+48]; + ld.local.v4.u16 {%rs357, %rs358, %rs359, %rs283}, [%rd53+56]; + ld.local.u8 %rs356, [%rd53+61]; + ld.local.v2.u8 {%rs354, %rs355}, [%rd53+62]; + +$L__BB1_43: + ld.local.v4.u8 {%rs286, %rs287, %rs288, %rs289}, [%rd53+64]; + cvt.u16.u64 %rs292, %rd61; + add.s16 %rs293, %rs286, %rs292; + st.local.u8 [%rd53+64], %rs293; + setp.eq.s16 %p36, %rs287, 0; + selp.u16 %rs294, 1, 0, %p36; + or.b16 %rs295, %rs288, %rs294; + or.b16 %rs296, %rs295, 2; + shr.u16 %rs297, %rs384, 8; + shr.u16 %rs298, %rs385, 8; + shr.u16 %rs299, %rs386, 8; + shr.u16 %rs300, %rs387, 8; + shr.u16 %rs301, %rs380, 8; + shr.u16 %rs302, %rs381, 8; + shr.u16 %rs303, %rs382, 8; + shr.u16 %rs304, %rs383, 8; + shr.u16 %rs305, %rs376, 8; + shr.u16 %rs306, %rs377, 8; + shr.u16 %rs307, %rs378, 8; + shr.u16 %rs308, %rs379, 8; + shr.u16 %rs309, %rs372, 8; + shr.u16 %rs310, %rs373, 8; + shr.u16 %rs311, %rs374, 8; + shr.u16 %rs312, %rs375, 8; + shr.u16 %rs313, %rs368, 8; + shr.u16 %rs314, %rs369, 8; + shr.u16 %rs315, %rs370, 8; + shr.u16 %rs316, %rs371, 8; + shr.u16 %rs317, %rs364, 8; + shr.u16 %rs318, %rs365, 8; + shr.u16 %rs319, %rs366, 8; + shr.u16 %rs320, %rs367, 8; + shr.u16 %rs321, %rs360, 8; + shr.u16 %rs322, %rs361, 8; + shr.u16 %rs323, %rs362, 8; + shr.u16 %rs324, %rs363, 8; + shr.u16 %rs325, %rs357, 8; + shr.u16 %rs326, %rs358, 8; + cvt.u32.u16 %r6906, %rs384; + and.b32 %r6907, %r6906, 255; + cvt.u32.u16 %r6908, %rs297; + prmt.b32 %r6909, %r6908, %r6907, 30212; + cvt.u32.u16 %r6910, %rs385; + prmt.b32 %r6911, %r6910, %r6909, 28756; + cvt.u32.u16 %r6912, %rs298; + prmt.b32 %r6913, %r6912, %r6911, 1620; + cvt.u32.u16 %r6914, %rs386; + and.b32 %r6915, %r6914, 255; + cvt.u32.u16 %r6916, %rs299; + prmt.b32 %r6917, %r6916, %r6915, 30212; + cvt.u32.u16 %r6918, %rs387; + prmt.b32 %r6919, %r6918, %r6917, 28756; + cvt.u32.u16 %r6920, %rs300; + prmt.b32 %r6921, %r6920, %r6919, 1620; + cvt.u32.u16 %r6922, %rs380; + and.b32 %r6923, %r6922, 255; + cvt.u32.u16 %r6924, %rs301; + prmt.b32 %r6925, %r6924, %r6923, 30212; + cvt.u32.u16 %r6926, %rs381; + prmt.b32 %r6927, %r6926, %r6925, 28756; + cvt.u32.u16 %r6928, %rs302; + prmt.b32 %r6929, %r6928, %r6927, 1620; + cvt.u32.u16 %r6930, %rs382; + and.b32 %r6931, %r6930, 255; + cvt.u32.u16 %r6932, %rs303; + prmt.b32 %r6933, %r6932, %r6931, 30212; + cvt.u32.u16 %r6934, %rs383; + prmt.b32 %r6935, %r6934, 
%r6933, 28756; + cvt.u32.u16 %r6936, %rs304; + prmt.b32 %r6937, %r6936, %r6935, 1620; + cvt.u32.u16 %r6938, %rs376; + and.b32 %r6939, %r6938, 255; + cvt.u32.u16 %r6940, %rs305; + prmt.b32 %r6941, %r6940, %r6939, 30212; + cvt.u32.u16 %r6942, %rs377; + prmt.b32 %r6943, %r6942, %r6941, 28756; + cvt.u32.u16 %r6944, %rs306; + prmt.b32 %r6945, %r6944, %r6943, 1620; + cvt.u32.u16 %r6946, %rs378; + and.b32 %r6947, %r6946, 255; + cvt.u32.u16 %r6948, %rs307; + prmt.b32 %r6949, %r6948, %r6947, 30212; + cvt.u32.u16 %r6950, %rs379; + prmt.b32 %r6951, %r6950, %r6949, 28756; + cvt.u32.u16 %r6952, %rs308; + prmt.b32 %r6953, %r6952, %r6951, 1620; + cvt.u32.u16 %r6954, %rs372; + and.b32 %r6955, %r6954, 255; + cvt.u32.u16 %r6956, %rs309; + prmt.b32 %r6957, %r6956, %r6955, 30212; + cvt.u32.u16 %r6958, %rs373; + prmt.b32 %r6959, %r6958, %r6957, 28756; + cvt.u32.u16 %r6960, %rs310; + prmt.b32 %r6961, %r6960, %r6959, 1620; + cvt.u32.u16 %r6962, %rs374; + and.b32 %r6963, %r6962, 255; + cvt.u32.u16 %r6964, %rs311; + prmt.b32 %r6965, %r6964, %r6963, 30212; + cvt.u32.u16 %r6966, %rs375; + prmt.b32 %r6967, %r6966, %r6965, 28756; + cvt.u32.u16 %r6968, %rs312; + prmt.b32 %r6969, %r6968, %r6967, 1620; + cvt.u32.u16 %r6970, %rs368; + and.b32 %r6971, %r6970, 255; + cvt.u32.u16 %r6972, %rs313; + prmt.b32 %r6973, %r6972, %r6971, 30212; + cvt.u32.u16 %r6974, %rs369; + prmt.b32 %r6975, %r6974, %r6973, 28756; + cvt.u32.u16 %r6976, %rs314; + prmt.b32 %r6977, %r6976, %r6975, 1620; + cvt.u32.u16 %r6978, %rs370; + and.b32 %r6979, %r6978, 255; + cvt.u32.u16 %r6980, %rs315; + prmt.b32 %r6981, %r6980, %r6979, 30212; + cvt.u32.u16 %r6982, %rs371; + prmt.b32 %r6983, %r6982, %r6981, 28756; + cvt.u32.u16 %r6984, %rs316; + prmt.b32 %r6985, %r6984, %r6983, 1620; + cvt.u32.u16 %r6986, %rs364; + and.b32 %r6987, %r6986, 255; + cvt.u32.u16 %r6988, %rs317; + prmt.b32 %r6989, %r6988, %r6987, 30212; + cvt.u32.u16 %r6990, %rs365; + prmt.b32 %r6991, %r6990, %r6989, 28756; + cvt.u32.u16 %r6992, %rs318; + prmt.b32 %r6993, %r6992, %r6991, 1620; + cvt.u32.u16 %r6994, %rs366; + and.b32 %r6995, %r6994, 255; + cvt.u32.u16 %r6996, %rs319; + prmt.b32 %r6997, %r6996, %r6995, 30212; + cvt.u32.u16 %r6998, %rs367; + prmt.b32 %r6999, %r6998, %r6997, 28756; + cvt.u32.u16 %r7000, %rs320; + prmt.b32 %r7001, %r7000, %r6999, 1620; + cvt.u32.u16 %r7002, %rs360; + and.b32 %r7003, %r7002, 255; + cvt.u32.u16 %r7004, %rs321; + prmt.b32 %r7005, %r7004, %r7003, 30212; + cvt.u32.u16 %r7006, %rs361; + prmt.b32 %r7007, %r7006, %r7005, 28756; + cvt.u32.u16 %r7008, %rs322; + prmt.b32 %r7009, %r7008, %r7007, 1620; + cvt.u32.u16 %r7010, %rs362; + and.b32 %r7011, %r7010, 255; + cvt.u32.u16 %r7012, %rs323; + prmt.b32 %r7013, %r7012, %r7011, 30212; + cvt.u32.u16 %r7014, %rs363; + prmt.b32 %r7015, %r7014, %r7013, 28756; + cvt.u32.u16 %r7016, %rs324; + prmt.b32 %r7017, %r7016, %r7015, 1620; + cvt.u32.u16 %r7018, %rs357; + and.b32 %r7019, %r7018, 255; + cvt.u32.u16 %r7020, %rs325; + prmt.b32 %r7021, %r7020, %r7019, 30212; + cvt.u32.u16 %r7022, %rs358; + prmt.b32 %r7023, %r7022, %r7021, 28756; + cvt.u32.u16 %r7024, %rs326; + prmt.b32 %r7025, %r7024, %r7023, 1620; + cvt.u32.u16 %r7026, %rs359; + and.b32 %r7027, %r7026, 255; + cvt.u32.u16 %r7028, %rs356; + prmt.b32 %r7029, %r7028, %r7027, 30212; + cvt.u32.u16 %r7030, %rs354; + shl.b32 %r7031, %r7030, 16; + and.b32 %r7032, %r7031, 16711680; + or.b32 %r7033, %r7029, %r7032; + cvt.u32.u16 %r7034, %rs355; + shl.b32 %r7035, %r7034, 24; + or.b32 %r7036, %r7033, %r7035; + cvt.u32.u16 %r7037, %rs293; + and.b32 %r7038, %r7037, 255; + cvt.u32.u16 
%r7039, %rs296; + and.b32 %r7040, %r7039, 255; + add.s32 %r7041, %r11675, %r11679; + add.s32 %r7042, %r7041, %r6913; + xor.b32 %r7043, %r7042, %r71; + shf.l.wrap.b32 %r7044, %r7043, %r7043, 16; + add.s32 %r7045, %r7044, 1779033703; + xor.b32 %r7046, %r7045, %r11675; + shf.l.wrap.b32 %r7047, %r7046, %r7046, 20; + add.s32 %r7048, %r6921, %r7042; + add.s32 %r7049, %r7048, %r7047; + xor.b32 %r7050, %r7049, %r7044; + shf.l.wrap.b32 %r7051, %r7050, %r7050, 24; + add.s32 %r7052, %r7051, %r7045; + xor.b32 %r7053, %r7052, %r7047; + shf.l.wrap.b32 %r7054, %r7053, %r7053, 25; + add.s32 %r7055, %r11674, %r11678; + add.s32 %r7056, %r7055, %r6929; + xor.b32 %r7057, %r7056, %r72; + shf.l.wrap.b32 %r7058, %r7057, %r7057, 16; + add.s32 %r7059, %r7058, -1150833019; + xor.b32 %r7060, %r7059, %r11674; + shf.l.wrap.b32 %r7061, %r7060, %r7060, 20; + add.s32 %r7062, %r6937, %r7056; + add.s32 %r7063, %r7062, %r7061; + xor.b32 %r7064, %r7063, %r7058; + shf.l.wrap.b32 %r7065, %r7064, %r7064, 24; + add.s32 %r7066, %r7065, %r7059; + xor.b32 %r7067, %r7066, %r7061; + shf.l.wrap.b32 %r7068, %r7067, %r7067, 25; + add.s32 %r7069, %r11673, %r11677; + add.s32 %r7070, %r7069, %r6945; + xor.b32 %r7071, %r7070, %r7038; + shr.u32 %r7072, %r7070, 16; + shl.b32 %r7073, %r7071, 16; + or.b32 %r7074, %r7073, %r7072; + add.s32 %r7075, %r7074, 1013904242; + xor.b32 %r7076, %r7075, %r11673; + shf.l.wrap.b32 %r7077, %r7076, %r7076, 20; + add.s32 %r7078, %r6953, %r7070; + add.s32 %r7079, %r7078, %r7077; + xor.b32 %r7080, %r7079, %r7074; + shf.l.wrap.b32 %r7081, %r7080, %r7080, 24; + add.s32 %r7082, %r7081, %r7075; + xor.b32 %r7083, %r7082, %r7077; + shf.l.wrap.b32 %r7084, %r7083, %r7083, 25; + add.s32 %r7085, %r11672, %r11676; + add.s32 %r7086, %r7085, %r6961; + xor.b32 %r7087, %r7086, %r7040; + shr.u32 %r7088, %r7086, 16; + shl.b32 %r7089, %r7087, 16; + or.b32 %r7090, %r7089, %r7088; + add.s32 %r7091, %r7090, -1521486534; + xor.b32 %r7092, %r7091, %r11672; + shf.l.wrap.b32 %r7093, %r7092, %r7092, 20; + add.s32 %r7094, %r6969, %r7086; + add.s32 %r7095, %r7094, %r7093; + xor.b32 %r7096, %r7095, %r7090; + shf.l.wrap.b32 %r7097, %r7096, %r7096, 24; + add.s32 %r7098, %r7097, %r7091; + xor.b32 %r7099, %r7098, %r7093; + shf.l.wrap.b32 %r7100, %r7099, %r7099, 25; + add.s32 %r7101, %r7068, %r7049; + add.s32 %r7102, %r7101, %r6977; + xor.b32 %r7103, %r7097, %r7102; + shf.l.wrap.b32 %r7104, %r7103, %r7103, 16; + add.s32 %r7105, %r7104, %r7082; + xor.b32 %r7106, %r7105, %r7068; + shf.l.wrap.b32 %r7107, %r7106, %r7106, 20; + add.s32 %r7108, %r6985, %r7102; + add.s32 %r7109, %r7108, %r7107; + xor.b32 %r7110, %r7109, %r7104; + shf.l.wrap.b32 %r7111, %r7110, %r7110, 24; + add.s32 %r7112, %r7111, %r7105; + xor.b32 %r7113, %r7112, %r7107; + shf.l.wrap.b32 %r7114, %r7113, %r7113, 25; + add.s32 %r7115, %r7084, %r7063; + add.s32 %r7116, %r7115, %r6993; + xor.b32 %r7117, %r7116, %r7051; + shf.l.wrap.b32 %r7118, %r7117, %r7117, 16; + add.s32 %r7119, %r7118, %r7098; + xor.b32 %r7120, %r7119, %r7084; + shf.l.wrap.b32 %r7121, %r7120, %r7120, 20; + add.s32 %r7122, %r7001, %r7116; + add.s32 %r7123, %r7122, %r7121; + xor.b32 %r7124, %r7123, %r7118; + shf.l.wrap.b32 %r7125, %r7124, %r7124, 24; + add.s32 %r7126, %r7125, %r7119; + xor.b32 %r7127, %r7126, %r7121; + shf.l.wrap.b32 %r7128, %r7127, %r7127, 25; + add.s32 %r7129, %r7100, %r7079; + add.s32 %r7130, %r7129, %r7009; + xor.b32 %r7131, %r7130, %r7065; + shf.l.wrap.b32 %r7132, %r7131, %r7131, 16; + add.s32 %r7133, %r7132, %r7052; + xor.b32 %r7134, %r7133, %r7100; + shf.l.wrap.b32 %r7135, %r7134, %r7134, 20; + 
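+ // NOTE (annotation): this is the final (tail) block compression. The partial
+ // block was copied into the zeroed 64-byte local buffer in the $L__BB1_41 loop;
+ // its actual byte count (%rs293) and the flag byte with bit 1 set
+ // (or.b16 ... 2 above) are xor'd into the length/flags lanes, in place of the
+ // fixed 64-byte length constant (4194304 = 64 << 16) used in the full-block loop.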
add.s32 %r7136, %r7017, %r7130; + add.s32 %r7137, %r7136, %r7135; + xor.b32 %r7138, %r7137, %r7132; + shf.l.wrap.b32 %r7139, %r7138, %r7138, 24; + add.s32 %r7140, %r7139, %r7133; + xor.b32 %r7141, %r7140, %r7135; + shf.l.wrap.b32 %r7142, %r7141, %r7141, 25; + add.s32 %r7143, %r7095, %r7054; + add.s32 %r7144, %r7143, %r7025; + xor.b32 %r7145, %r7144, %r7081; + shf.l.wrap.b32 %r7146, %r7145, %r7145, 16; + add.s32 %r7147, %r7146, %r7066; + xor.b32 %r7148, %r7147, %r7054; + shf.l.wrap.b32 %r7149, %r7148, %r7148, 20; + add.s32 %r7150, %r7036, %r7144; + add.s32 %r7151, %r7150, %r7149; + xor.b32 %r7152, %r7151, %r7146; + shf.l.wrap.b32 %r7153, %r7152, %r7152, 24; + add.s32 %r7154, %r7153, %r7147; + xor.b32 %r7155, %r7154, %r7149; + shf.l.wrap.b32 %r7156, %r7155, %r7155, 25; + add.s32 %r7157, %r7109, %r6929; + add.s32 %r7158, %r7157, %r7156; + xor.b32 %r7159, %r7158, %r7125; + shf.l.wrap.b32 %r7160, %r7159, %r7159, 16; + add.s32 %r7161, %r7160, %r7140; + xor.b32 %r7162, %r7161, %r7156; + shf.l.wrap.b32 %r7163, %r7162, %r7162, 20; + add.s32 %r7164, %r7158, %r6961; + add.s32 %r7165, %r7164, %r7163; + xor.b32 %r7166, %r7165, %r7160; + shf.l.wrap.b32 %r7167, %r7166, %r7166, 24; + add.s32 %r7168, %r7167, %r7161; + xor.b32 %r7169, %r7168, %r7163; + shf.l.wrap.b32 %r7170, %r7169, %r7169, 25; + add.s32 %r7171, %r7123, %r6937; + add.s32 %r7172, %r7171, %r7114; + xor.b32 %r7173, %r7139, %r7172; + shf.l.wrap.b32 %r7174, %r7173, %r7173, 16; + add.s32 %r7175, %r7154, %r7174; + xor.b32 %r7176, %r7175, %r7114; + shf.l.wrap.b32 %r7177, %r7176, %r7176, 20; + add.s32 %r7178, %r7172, %r6993; + add.s32 %r7179, %r7178, %r7177; + xor.b32 %r7180, %r7179, %r7174; + shf.l.wrap.b32 %r7181, %r7180, %r7180, 24; + add.s32 %r7182, %r7181, %r7175; + xor.b32 %r7183, %r7182, %r7177; + shf.l.wrap.b32 %r7184, %r7183, %r7183, 25; + add.s32 %r7185, %r7128, %r6969; + add.s32 %r7186, %r7185, %r7137; + xor.b32 %r7187, %r7153, %r7186; + shf.l.wrap.b32 %r7188, %r7187, %r7187, 16; + add.s32 %r7189, %r7188, %r7112; + xor.b32 %r7190, %r7189, %r7128; + shf.l.wrap.b32 %r7191, %r7190, %r7190, 20; + add.s32 %r7192, %r7186, %r6913; + add.s32 %r7193, %r7192, %r7191; + xor.b32 %r7194, %r7193, %r7188; + shf.l.wrap.b32 %r7195, %r7194, %r7194, 24; + add.s32 %r7196, %r7195, %r7189; + xor.b32 %r7197, %r7196, %r7191; + shf.l.wrap.b32 %r7198, %r7197, %r7197, 25; + add.s32 %r7199, %r7142, %r6945; + add.s32 %r7200, %r7199, %r7151; + xor.b32 %r7201, %r7200, %r7111; + shf.l.wrap.b32 %r7202, %r7201, %r7201, 16; + add.s32 %r7203, %r7202, %r7126; + xor.b32 %r7204, %r7203, %r7142; + shf.l.wrap.b32 %r7205, %r7204, %r7204, 20; + add.s32 %r7206, %r7200, %r7017; + add.s32 %r7207, %r7206, %r7205; + xor.b32 %r7208, %r7207, %r7202; + shf.l.wrap.b32 %r7209, %r7208, %r7208, 24; + add.s32 %r7210, %r7209, %r7203; + xor.b32 %r7211, %r7210, %r7205; + shf.l.wrap.b32 %r7212, %r7211, %r7211, 25; + add.s32 %r7213, %r7184, %r6921; + add.s32 %r7214, %r7213, %r7165; + xor.b32 %r7215, %r7214, %r7209; + shf.l.wrap.b32 %r7216, %r7215, %r7215, 16; + add.s32 %r7217, %r7216, %r7196; + xor.b32 %r7218, %r7217, %r7184; + shf.l.wrap.b32 %r7219, %r7218, %r7218, 20; + add.s32 %r7220, %r7214, %r7001; + add.s32 %r7221, %r7220, %r7219; + xor.b32 %r7222, %r7221, %r7216; + shf.l.wrap.b32 %r7223, %r7222, %r7222, 24; + add.s32 %r7224, %r7223, %r7217; + xor.b32 %r7225, %r7224, %r7219; + shf.l.wrap.b32 %r7226, %r7225, %r7225, 25; + add.s32 %r7227, %r7179, %r7009; + add.s32 %r7228, %r7227, %r7198; + xor.b32 %r7229, %r7167, %r7228; + shf.l.wrap.b32 %r7230, %r7229, %r7229, 16; + add.s32 %r7231, %r7230, 
%r7210; + xor.b32 %r7232, %r7231, %r7198; + shf.l.wrap.b32 %r7233, %r7232, %r7232, 20; + add.s32 %r7234, %r7228, %r6953; + add.s32 %r7235, %r7234, %r7233; + xor.b32 %r7236, %r7235, %r7230; + shf.l.wrap.b32 %r7237, %r7236, %r7236, 24; + add.s32 %r7238, %r7237, %r7231; + xor.b32 %r7239, %r7238, %r7233; + shf.l.wrap.b32 %r7240, %r7239, %r7239, 25; + add.s32 %r7241, %r7193, %r6985; + add.s32 %r7242, %r7241, %r7212; + xor.b32 %r7243, %r7242, %r7181; + shf.l.wrap.b32 %r7244, %r7243, %r7243, 16; + add.s32 %r7245, %r7244, %r7168; + xor.b32 %r7246, %r7245, %r7212; + shf.l.wrap.b32 %r7247, %r7246, %r7246, 20; + add.s32 %r7248, %r7242, %r7025; + add.s32 %r7249, %r7248, %r7247; + xor.b32 %r7250, %r7249, %r7244; + shf.l.wrap.b32 %r7251, %r7250, %r7250, 24; + add.s32 %r7252, %r7251, %r7245; + xor.b32 %r7253, %r7252, %r7247; + shf.l.wrap.b32 %r7254, %r7253, %r7253, 25; + add.s32 %r7255, %r7207, %r7036; + add.s32 %r7256, %r7255, %r7170; + xor.b32 %r7257, %r7256, %r7195; + shf.l.wrap.b32 %r7258, %r7257, %r7257, 16; + add.s32 %r7259, %r7258, %r7182; + xor.b32 %r7260, %r7259, %r7170; + shf.l.wrap.b32 %r7261, %r7260, %r7260, 20; + add.s32 %r7262, %r7256, %r6977; + add.s32 %r7263, %r7262, %r7261; + xor.b32 %r7264, %r7263, %r7258; + shf.l.wrap.b32 %r7265, %r7264, %r7264, 24; + add.s32 %r7266, %r7265, %r7259; + xor.b32 %r7267, %r7266, %r7261; + shf.l.wrap.b32 %r7268, %r7267, %r7267, 25; + add.s32 %r7269, %r7221, %r6937; + add.s32 %r7270, %r7269, %r7268; + xor.b32 %r7271, %r7270, %r7237; + shf.l.wrap.b32 %r7272, %r7271, %r7271, 16; + add.s32 %r7273, %r7272, %r7252; + xor.b32 %r7274, %r7273, %r7268; + shf.l.wrap.b32 %r7275, %r7274, %r7274, 20; + add.s32 %r7276, %r7270, %r6945; + add.s32 %r7277, %r7276, %r7275; + xor.b32 %r7278, %r7277, %r7272; + shf.l.wrap.b32 %r7279, %r7278, %r7278, 24; + add.s32 %r7280, %r7279, %r7273; + xor.b32 %r7281, %r7280, %r7275; + shf.l.wrap.b32 %r7282, %r7281, %r7281, 25; + add.s32 %r7283, %r7235, %r6993; + add.s32 %r7284, %r7283, %r7226; + xor.b32 %r7285, %r7284, %r7251; + shf.l.wrap.b32 %r7286, %r7285, %r7285, 16; + add.s32 %r7287, %r7286, %r7266; + xor.b32 %r7288, %r7287, %r7226; + shf.l.wrap.b32 %r7289, %r7288, %r7288, 20; + add.s32 %r7290, %r7284, %r7009; + add.s32 %r7291, %r7290, %r7289; + xor.b32 %r7292, %r7291, %r7286; + shf.l.wrap.b32 %r7293, %r7292, %r7292, 24; + add.s32 %r7294, %r7293, %r7287; + xor.b32 %r7295, %r7294, %r7289; + shf.l.wrap.b32 %r7296, %r7295, %r7295, 25; + add.s32 %r7297, %r7249, %r7017; + add.s32 %r7298, %r7297, %r7240; + xor.b32 %r7299, %r7265, %r7298; + shf.l.wrap.b32 %r7300, %r7299, %r7299, 16; + add.s32 %r7301, %r7300, %r7224; + xor.b32 %r7302, %r7301, %r7240; + shf.l.wrap.b32 %r7303, %r7302, %r7302, 20; + add.s32 %r7304, %r7298, %r6929; + add.s32 %r7305, %r7304, %r7303; + xor.b32 %r7306, %r7305, %r7300; + shf.l.wrap.b32 %r7307, %r7306, %r7306, 24; + add.s32 %r7308, %r7307, %r7301; + xor.b32 %r7309, %r7308, %r7303; + shf.l.wrap.b32 %r7310, %r7309, %r7309, 25; + add.s32 %r7311, %r7254, %r6969; + add.s32 %r7312, %r7311, %r7263; + xor.b32 %r7313, %r7312, %r7223; + shf.l.wrap.b32 %r7314, %r7313, %r7313, 16; + add.s32 %r7315, %r7314, %r7238; + xor.b32 %r7316, %r7315, %r7254; + shf.l.wrap.b32 %r7317, %r7316, %r7316, 20; + add.s32 %r7318, %r7312, %r7025; + add.s32 %r7319, %r7318, %r7317; + xor.b32 %r7320, %r7319, %r7314; + shf.l.wrap.b32 %r7321, %r7320, %r7320, 24; + add.s32 %r7322, %r7321, %r7315; + xor.b32 %r7323, %r7322, %r7317; + shf.l.wrap.b32 %r7324, %r7323, %r7323, 25; + add.s32 %r7325, %r7277, %r6961; + add.s32 %r7326, %r7325, %r7296; + xor.b32 %r7327, 
%r7326, %r7321; + shf.l.wrap.b32 %r7328, %r7327, %r7327, 16; + add.s32 %r7329, %r7328, %r7308; + xor.b32 %r7330, %r7329, %r7296; + shf.l.wrap.b32 %r7331, %r7330, %r7330, 20; + add.s32 %r7332, %r7326, %r6953; + add.s32 %r7333, %r7332, %r7331; + xor.b32 %r7334, %r7333, %r7328; + shf.l.wrap.b32 %r7335, %r7334, %r7334, 24; + add.s32 %r7336, %r7335, %r7329; + xor.b32 %r7337, %r7336, %r7331; + shf.l.wrap.b32 %r7338, %r7337, %r7337, 25; + add.s32 %r7339, %r7291, %r6985; + add.s32 %r7340, %r7339, %r7310; + xor.b32 %r7341, %r7279, %r7340; + shf.l.wrap.b32 %r7342, %r7341, %r7341, 16; + add.s32 %r7343, %r7342, %r7322; + xor.b32 %r7344, %r7343, %r7310; + shf.l.wrap.b32 %r7345, %r7344, %r7344, 20; + add.s32 %r7346, %r7340, %r6913; + add.s32 %r7347, %r7346, %r7345; + xor.b32 %r7348, %r7347, %r7342; + shf.l.wrap.b32 %r7349, %r7348, %r7348, 24; + add.s32 %r7350, %r7349, %r7343; + xor.b32 %r7351, %r7350, %r7345; + shf.l.wrap.b32 %r7352, %r7351, %r7351, 25; + add.s32 %r7353, %r7305, %r7001; + add.s32 %r7354, %r7353, %r7324; + xor.b32 %r7355, %r7354, %r7293; + shf.l.wrap.b32 %r7356, %r7355, %r7355, 16; + add.s32 %r7357, %r7356, %r7280; + xor.b32 %r7358, %r7357, %r7324; + shf.l.wrap.b32 %r7359, %r7358, %r7358, 20; + add.s32 %r7360, %r7354, %r7036; + add.s32 %r7361, %r7360, %r7359; + xor.b32 %r7362, %r7361, %r7356; + shf.l.wrap.b32 %r7363, %r7362, %r7362, 24; + add.s32 %r7364, %r7363, %r7357; + xor.b32 %r7365, %r7364, %r7359; + shf.l.wrap.b32 %r7366, %r7365, %r7365, 25; + add.s32 %r7367, %r7319, %r6977; + add.s32 %r7368, %r7367, %r7282; + xor.b32 %r7369, %r7368, %r7307; + shf.l.wrap.b32 %r7370, %r7369, %r7369, 16; + add.s32 %r7371, %r7370, %r7294; + xor.b32 %r7372, %r7371, %r7282; + shf.l.wrap.b32 %r7373, %r7372, %r7372, 20; + add.s32 %r7374, %r7368, %r6921; + add.s32 %r7375, %r7374, %r7373; + xor.b32 %r7376, %r7375, %r7370; + shf.l.wrap.b32 %r7377, %r7376, %r7376, 24; + add.s32 %r7378, %r7377, %r7371; + xor.b32 %r7379, %r7378, %r7373; + shf.l.wrap.b32 %r7380, %r7379, %r7379, 25; + add.s32 %r7381, %r7333, %r6993; + add.s32 %r7382, %r7381, %r7380; + xor.b32 %r7383, %r7382, %r7349; + shf.l.wrap.b32 %r7384, %r7383, %r7383, 16; + add.s32 %r7385, %r7384, %r7364; + xor.b32 %r7386, %r7385, %r7380; + shf.l.wrap.b32 %r7387, %r7386, %r7386, 20; + add.s32 %r7388, %r7382, %r6969; + add.s32 %r7389, %r7388, %r7387; + xor.b32 %r7390, %r7389, %r7384; + shf.l.wrap.b32 %r7391, %r7390, %r7390, 24; + add.s32 %r7392, %r7391, %r7385; + xor.b32 %r7393, %r7392, %r7387; + shf.l.wrap.b32 %r7394, %r7393, %r7393, 25; + add.s32 %r7395, %r7347, %r7009; + add.s32 %r7396, %r7395, %r7338; + xor.b32 %r7397, %r7396, %r7363; + shf.l.wrap.b32 %r7398, %r7397, %r7397, 16; + add.s32 %r7399, %r7398, %r7378; + xor.b32 %r7400, %r7399, %r7338; + shf.l.wrap.b32 %r7401, %r7400, %r7400, 20; + add.s32 %r7402, %r7396, %r6985; + add.s32 %r7403, %r7402, %r7401; + xor.b32 %r7404, %r7403, %r7398; + shf.l.wrap.b32 %r7405, %r7404, %r7404, 24; + add.s32 %r7406, %r7405, %r7399; + xor.b32 %r7407, %r7406, %r7401; + shf.l.wrap.b32 %r7408, %r7407, %r7407, 25; + add.s32 %r7409, %r7361, %r7025; + add.s32 %r7410, %r7409, %r7352; + xor.b32 %r7411, %r7377, %r7410; + shf.l.wrap.b32 %r7412, %r7411, %r7411, 16; + add.s32 %r7413, %r7412, %r7336; + xor.b32 %r7414, %r7413, %r7352; + shf.l.wrap.b32 %r7415, %r7414, %r7414, 20; + add.s32 %r7416, %r7410, %r6937; + add.s32 %r7417, %r7416, %r7415; + xor.b32 %r7418, %r7417, %r7412; + shf.l.wrap.b32 %r7419, %r7418, %r7418, 24; + add.s32 %r7420, %r7419, %r7413; + xor.b32 %r7421, %r7420, %r7415; + shf.l.wrap.b32 %r7422, %r7421, %r7421, 25; 
+ add.s32 %r7423, %r7375, %r7017; + add.s32 %r7424, %r7423, %r7366; + xor.b32 %r7425, %r7424, %r7335; + shf.l.wrap.b32 %r7426, %r7425, %r7425, 16; + add.s32 %r7427, %r7426, %r7350; + xor.b32 %r7428, %r7427, %r7366; + shf.l.wrap.b32 %r7429, %r7428, %r7428, 20; + add.s32 %r7430, %r7424, %r7036; + add.s32 %r7431, %r7430, %r7429; + xor.b32 %r7432, %r7431, %r7426; + shf.l.wrap.b32 %r7433, %r7432, %r7432, 24; + add.s32 %r7434, %r7433, %r7427; + xor.b32 %r7435, %r7434, %r7429; + shf.l.wrap.b32 %r7436, %r7435, %r7435, 25; + add.s32 %r7437, %r7389, %r6945; + add.s32 %r7438, %r7437, %r7408; + xor.b32 %r7439, %r7438, %r7433; + shf.l.wrap.b32 %r7440, %r7439, %r7439, 16; + add.s32 %r7441, %r7440, %r7420; + xor.b32 %r7442, %r7441, %r7408; + shf.l.wrap.b32 %r7443, %r7442, %r7442, 20; + add.s32 %r7444, %r7438, %r6913; + add.s32 %r7445, %r7444, %r7443; + xor.b32 %r7446, %r7445, %r7440; + shf.l.wrap.b32 %r7447, %r7446, %r7446, 24; + add.s32 %r7448, %r7447, %r7441; + xor.b32 %r7449, %r7448, %r7443; + shf.l.wrap.b32 %r7450, %r7449, %r7449, 25; + add.s32 %r7451, %r7403, %r7001; + add.s32 %r7452, %r7451, %r7422; + xor.b32 %r7453, %r7391, %r7452; + shf.l.wrap.b32 %r7454, %r7453, %r7453, 16; + add.s32 %r7455, %r7454, %r7434; + xor.b32 %r7456, %r7455, %r7422; + shf.l.wrap.b32 %r7457, %r7456, %r7456, 20; + add.s32 %r7458, %r7452, %r6929; + add.s32 %r7459, %r7458, %r7457; + xor.b32 %r7460, %r7459, %r7454; + shf.l.wrap.b32 %r7461, %r7460, %r7460, 24; + add.s32 %r7462, %r7461, %r7455; + xor.b32 %r7463, %r7462, %r7457; + shf.l.wrap.b32 %r7464, %r7463, %r7463, 25; + add.s32 %r7465, %r7417, %r6953; + add.s32 %r7466, %r7465, %r7436; + xor.b32 %r7467, %r7466, %r7405; + shf.l.wrap.b32 %r7468, %r7467, %r7467, 16; + add.s32 %r7469, %r7468, %r7392; + xor.b32 %r7470, %r7469, %r7436; + shf.l.wrap.b32 %r7471, %r7470, %r7470, 20; + add.s32 %r7472, %r7466, %r6977; + add.s32 %r7473, %r7472, %r7471; + xor.b32 %r7474, %r7473, %r7468; + shf.l.wrap.b32 %r7475, %r7474, %r7474, 24; + add.s32 %r7476, %r7475, %r7469; + xor.b32 %r7477, %r7476, %r7471; + shf.l.wrap.b32 %r7478, %r7477, %r7477, 25; + add.s32 %r7479, %r7431, %r6921; + add.s32 %r7480, %r7479, %r7394; + xor.b32 %r7481, %r7480, %r7419; + shf.l.wrap.b32 %r7482, %r7481, %r7481, 16; + add.s32 %r7483, %r7482, %r7406; + xor.b32 %r7484, %r7483, %r7394; + shf.l.wrap.b32 %r7485, %r7484, %r7484, 20; + add.s32 %r7486, %r7480, %r6961; + add.s32 %r7487, %r7486, %r7485; + xor.b32 %r7488, %r7487, %r7482; + shf.l.wrap.b32 %r7489, %r7488, %r7488, 24; + add.s32 %r7490, %r7489, %r7483; + xor.b32 %r7491, %r7490, %r7485; + shf.l.wrap.b32 %r7492, %r7491, %r7491, 25; + add.s32 %r7493, %r7445, %r7009; + add.s32 %r7494, %r7493, %r7492; + xor.b32 %r7495, %r7494, %r7461; + shf.l.wrap.b32 %r7496, %r7495, %r7495, 16; + add.s32 %r7497, %r7496, %r7476; + xor.b32 %r7498, %r7497, %r7492; + shf.l.wrap.b32 %r7499, %r7498, %r7498, 20; + add.s32 %r7500, %r7494, %r7017; + add.s32 %r7501, %r7500, %r7499; + xor.b32 %r7502, %r7501, %r7496; + shf.l.wrap.b32 %r7503, %r7502, %r7502, 24; + add.s32 %r7504, %r7503, %r7497; + xor.b32 %r7505, %r7504, %r7499; + shf.l.wrap.b32 %r7506, %r7505, %r7505, 25; + add.s32 %r7507, %r7459, %r6985; + add.s32 %r7508, %r7507, %r7450; + xor.b32 %r7509, %r7508, %r7475; + shf.l.wrap.b32 %r7510, %r7509, %r7509, 16; + add.s32 %r7511, %r7510, %r7490; + xor.b32 %r7512, %r7511, %r7450; + shf.l.wrap.b32 %r7513, %r7512, %r7512, 20; + add.s32 %r7514, %r7508, %r7001; + add.s32 %r7515, %r7514, %r7513; + xor.b32 %r7516, %r7515, %r7510; + shf.l.wrap.b32 %r7517, %r7516, %r7516, 24; + add.s32 %r7518, %r7517, 
%r7511; + xor.b32 %r7519, %r7518, %r7513; + shf.l.wrap.b32 %r7520, %r7519, %r7519, 25; + add.s32 %r7521, %r7473, %r7036; + add.s32 %r7522, %r7521, %r7464; + xor.b32 %r7523, %r7489, %r7522; + shf.l.wrap.b32 %r7524, %r7523, %r7523, 16; + add.s32 %r7525, %r7524, %r7448; + xor.b32 %r7526, %r7525, %r7464; + shf.l.wrap.b32 %r7527, %r7526, %r7526, 20; + add.s32 %r7528, %r7522, %r6993; + add.s32 %r7529, %r7528, %r7527; + xor.b32 %r7530, %r7529, %r7524; + shf.l.wrap.b32 %r7531, %r7530, %r7530, 24; + add.s32 %r7532, %r7531, %r7525; + xor.b32 %r7533, %r7532, %r7527; + shf.l.wrap.b32 %r7534, %r7533, %r7533, 25; + add.s32 %r7535, %r7487, %r7025; + add.s32 %r7536, %r7535, %r7478; + xor.b32 %r7537, %r7536, %r7447; + shf.l.wrap.b32 %r7538, %r7537, %r7537, 16; + add.s32 %r7539, %r7538, %r7462; + xor.b32 %r7540, %r7539, %r7478; + shf.l.wrap.b32 %r7541, %r7540, %r7540, 20; + add.s32 %r7542, %r7536, %r6977; + add.s32 %r7543, %r7542, %r7541; + xor.b32 %r7544, %r7543, %r7538; + shf.l.wrap.b32 %r7545, %r7544, %r7544, 24; + add.s32 %r7546, %r7545, %r7539; + xor.b32 %r7547, %r7546, %r7541; + shf.l.wrap.b32 %r7548, %r7547, %r7547, 25; + add.s32 %r7549, %r7501, %r6969; + add.s32 %r7550, %r7549, %r7520; + xor.b32 %r7551, %r7550, %r7545; + shf.l.wrap.b32 %r7552, %r7551, %r7551, 16; + add.s32 %r7553, %r7552, %r7532; + xor.b32 %r7554, %r7553, %r7520; + shf.l.wrap.b32 %r7555, %r7554, %r7554, 20; + add.s32 %r7556, %r7550, %r6929; + add.s32 %r7557, %r7556, %r7555; + xor.b32 %r7558, %r7557, %r7552; + shf.l.wrap.b32 %r7559, %r7558, %r7558, 24; + add.s32 %r7560, %r7559, %r7553; + xor.b32 %r7561, %r7560, %r7555; + shf.l.wrap.b32 %r7562, %r7561, %r7561, 25; + add.s32 %r7563, %r7515, %r6953; + add.s32 %r7564, %r7563, %r7534; + xor.b32 %r7565, %r7503, %r7564; + shf.l.wrap.b32 %r7566, %r7565, %r7565, 16; + add.s32 %r7567, %r7566, %r7546; + xor.b32 %r7568, %r7567, %r7534; + shf.l.wrap.b32 %r7569, %r7568, %r7568, 20; + add.s32 %r7570, %r7564, %r6937; + add.s32 %r7571, %r7570, %r7569; + xor.b32 %r7572, %r7571, %r7566; + shf.l.wrap.b32 %r7573, %r7572, %r7572, 24; + add.s32 %r7574, %r7573, %r7567; + xor.b32 %r7575, %r7574, %r7569; + shf.l.wrap.b32 %r7576, %r7575, %r7575, 25; + add.s32 %r7577, %r7529, %r6913; + add.s32 %r7578, %r7577, %r7548; + xor.b32 %r7579, %r7578, %r7517; + shf.l.wrap.b32 %r7580, %r7579, %r7579, 16; + add.s32 %r7581, %r7580, %r7504; + xor.b32 %r7582, %r7581, %r7548; + shf.l.wrap.b32 %r7583, %r7582, %r7582, 20; + add.s32 %r7584, %r7578, %r6921; + add.s32 %r7585, %r7584, %r7583; + xor.b32 %r7586, %r7585, %r7580; + shf.l.wrap.b32 %r7587, %r7586, %r7586, 24; + add.s32 %r7588, %r7587, %r7581; + xor.b32 %r7589, %r7588, %r7583; + shf.l.wrap.b32 %r7590, %r7589, %r7589, 25; + add.s32 %r7591, %r7543, %r6961; + add.s32 %r7592, %r7591, %r7506; + xor.b32 %r7593, %r7592, %r7531; + shf.l.wrap.b32 %r7594, %r7593, %r7593, 16; + add.s32 %r7595, %r7594, %r7518; + xor.b32 %r7596, %r7595, %r7506; + shf.l.wrap.b32 %r7597, %r7596, %r7596, 20; + add.s32 %r7598, %r7592, %r6945; + add.s32 %r7599, %r7598, %r7597; + xor.b32 %r7600, %r7599, %r7594; + shf.l.wrap.b32 %r7601, %r7600, %r7600, 24; + add.s32 %r7602, %r7601, %r7595; + xor.b32 %r7603, %r7602, %r7597; + shf.l.wrap.b32 %r7604, %r7603, %r7603, 25; + add.s32 %r7605, %r7557, %r6985; + add.s32 %r7606, %r7605, %r7604; + xor.b32 %r7607, %r7606, %r7573; + shf.l.wrap.b32 %r7608, %r7607, %r7607, 16; + add.s32 %r7609, %r7608, %r7588; + xor.b32 %r7610, %r7609, %r7604; + shf.l.wrap.b32 %r7611, %r7610, %r7610, 20; + add.s32 %r7612, %r7606, %r7025; + add.s32 %r7613, %r7612, %r7611; + xor.b32 %r7614, 
%r7613, %r7608; + shf.l.wrap.b32 %r7615, %r7614, %r7614, 24; + add.s32 %r7616, %r7615, %r7609; + xor.b32 %r7617, %r7616, %r7611; + shf.l.wrap.b32 %r7618, %r7617, %r7617, 25; + add.s32 %r7619, %r7571, %r7001; + add.s32 %r7620, %r7619, %r7562; + xor.b32 %r7621, %r7620, %r7587; + shf.l.wrap.b32 %r7622, %r7621, %r7621, 16; + add.s32 %r7623, %r7622, %r7602; + xor.b32 %r7624, %r7623, %r7562; + shf.l.wrap.b32 %r7625, %r7624, %r7624, 20; + add.s32 %r7626, %r7620, %r6953; + add.s32 %r7627, %r7626, %r7625; + xor.b32 %r7628, %r7627, %r7622; + shf.l.wrap.b32 %r7629, %r7628, %r7628, 24; + add.s32 %r7630, %r7629, %r7623; + xor.b32 %r7631, %r7630, %r7625; + shf.l.wrap.b32 %r7632, %r7631, %r7631, 25; + add.s32 %r7633, %r7585, %r6977; + add.s32 %r7634, %r7633, %r7576; + xor.b32 %r7635, %r7601, %r7634; + shf.l.wrap.b32 %r7636, %r7635, %r7635, 16; + add.s32 %r7637, %r7636, %r7560; + xor.b32 %r7638, %r7637, %r7576; + shf.l.wrap.b32 %r7639, %r7638, %r7638, 20; + add.s32 %r7640, %r7634, %r7009; + add.s32 %r7641, %r7640, %r7639; + xor.b32 %r7642, %r7641, %r7636; + shf.l.wrap.b32 %r7643, %r7642, %r7642, 24; + add.s32 %r7644, %r7643, %r7637; + xor.b32 %r7645, %r7644, %r7639; + shf.l.wrap.b32 %r7646, %r7645, %r7645, 25; + add.s32 %r7647, %r7599, %r7036; + add.s32 %r7648, %r7647, %r7590; + xor.b32 %r7649, %r7648, %r7559; + shf.l.wrap.b32 %r7650, %r7649, %r7649, 16; + add.s32 %r7651, %r7650, %r7574; + xor.b32 %r7652, %r7651, %r7590; + shf.l.wrap.b32 %r7653, %r7652, %r7652, 20; + add.s32 %r7654, %r7648, %r6921; + add.s32 %r7655, %r7654, %r7653; + xor.b32 %r7656, %r7655, %r7650; + shf.l.wrap.b32 %r7657, %r7656, %r7656, 24; + add.s32 %r7658, %r7657, %r7651; + xor.b32 %r7659, %r7658, %r7653; + shf.l.wrap.b32 %r7660, %r7659, %r7659, 25; + add.s32 %r7661, %r7613, %r7017; + add.s32 %r7662, %r7661, %r7632; + xor.b32 %r7663, %r7662, %r7657; + shf.l.wrap.b32 %r7664, %r7663, %r7663, 16; + add.s32 %r7665, %r7664, %r7644; + xor.b32 %r7666, %r7665, %r7632; + shf.l.wrap.b32 %r7667, %r7666, %r7666, 20; + add.s32 %r7668, %r7662, %r6937; + add.s32 %r7669, %r7668, %r7667; + xor.b32 %r7670, %r7669, %r7664; + shf.l.wrap.b32 %r7671, %r7670, %r7670, 24; + add.s32 %r7672, %r7671, %r7665; + xor.b32 %r7673, %r7672, %r7667; + shf.l.wrap.b32 %r7674, %r7673, %r7673, 25; + add.s32 %r7675, %r7627, %r6913; + add.s32 %r7676, %r7675, %r7646; + xor.b32 %r7677, %r7615, %r7676; + shf.l.wrap.b32 %r7678, %r7677, %r7677, 16; + add.s32 %r7679, %r7678, %r7658; + xor.b32 %r7680, %r7679, %r7646; + shf.l.wrap.b32 %r7681, %r7680, %r7680, 20; + add.s32 %r7682, %r7676, %r6993; + add.s32 %r7683, %r7682, %r7681; + xor.b32 %r7684, %r7683, %r7678; + shf.l.wrap.b32 %r7685, %r7684, %r7684, 24; + add.s32 %r7686, %r7685, %r7679; + xor.b32 %r7687, %r7686, %r7681; + shf.l.wrap.b32 %r7688, %r7687, %r7687, 25; + add.s32 %r7689, %r7641, %r6929; + add.s32 %r7690, %r7689, %r7660; + xor.b32 %r7691, %r7690, %r7629; + shf.l.wrap.b32 %r7692, %r7691, %r7691, 16; + add.s32 %r7693, %r7692, %r7616; + xor.b32 %r7694, %r7693, %r7660; + shf.l.wrap.b32 %r7695, %r7694, %r7694, 20; + add.s32 %r7696, %r7690, %r6961; + add.s32 %r7697, %r7696, %r7695; + xor.b32 %r7698, %r7697, %r7692; + shf.l.wrap.b32 %r7699, %r7698, %r7698, 24; + add.s32 %r7700, %r7699, %r7693; + xor.b32 %r7701, %r7700, %r7695; + shf.l.wrap.b32 %r7702, %r7701, %r7701, 25; + add.s32 %r7703, %r7655, %r6945; + add.s32 %r7704, %r7703, %r7618; + xor.b32 %r7705, %r7704, %r7643; + shf.l.wrap.b32 %r7706, %r7705, %r7705, 16; + add.s32 %r7707, %r7706, %r7630; + xor.b32 %r7708, %r7707, %r7618; + shf.l.wrap.b32 %r7709, %r7708, %r7708, 20; 
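+ // NOTE (annotation): closing rounds of the tail compression; the eight xor.b32
+ // folds into %r97..%r104 just below produce the output chaining value. The
+ // popc.b64 test that follows (population count of %rd251, apparently a chunk
+ // counter, against the stored stack length) drives the merge loop at
+ // $L__BB1_45, which looks like a BLAKE3-style chaining-value stack merge.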
+ add.s32 %r7710, %r7704, %r6969; + add.s32 %r7711, %r7710, %r7709; + xor.b32 %r7712, %r7711, %r7706; + shf.l.wrap.b32 %r7713, %r7712, %r7712, 24; + add.s32 %r7714, %r7713, %r7707; + xor.b32 %r7715, %r7714, %r7709; + shf.l.wrap.b32 %r7716, %r7715, %r7715, 25; + add.s32 %r7717, %r7669, %r7001; + add.s32 %r7718, %r7717, %r7716; + xor.b32 %r7719, %r7718, %r7685; + shf.l.wrap.b32 %r7720, %r7719, %r7719, 16; + add.s32 %r7721, %r7720, %r7700; + xor.b32 %r7722, %r7721, %r7716; + shf.l.wrap.b32 %r7723, %r7722, %r7722, 20; + add.s32 %r7724, %r7718, %r7036; + add.s32 %r7725, %r7724, %r7723; + xor.b32 %r7726, %r7725, %r7720; + shf.l.wrap.b32 %r7727, %r7726, %r7726, 24; + add.s32 %r7728, %r7727, %r7721; + xor.b32 %r7729, %r7728, %r7723; + shf.l.wrap.b32 %r7730, %r7729, %r7729, 25; + add.s32 %r7731, %r7683, %r6953; + add.s32 %r7732, %r7731, %r7674; + xor.b32 %r7733, %r7732, %r7699; + shf.l.wrap.b32 %r7734, %r7733, %r7733, 16; + add.s32 %r7735, %r7734, %r7714; + xor.b32 %r7736, %r7735, %r7674; + shf.l.wrap.b32 %r7737, %r7736, %r7736, 20; + add.s32 %r7738, %r7732, %r6913; + add.s32 %r7739, %r7738, %r7737; + xor.b32 %r7740, %r7739, %r7734; + shf.l.wrap.b32 %r7741, %r7740, %r7740, 24; + add.s32 %r7742, %r7741, %r7735; + xor.b32 %r7743, %r7742, %r7737; + shf.l.wrap.b32 %r7744, %r7743, %r7743, 25; + add.s32 %r7745, %r7697, %r6921; + add.s32 %r7746, %r7745, %r7688; + xor.b32 %r7747, %r7713, %r7746; + shf.l.wrap.b32 %r7748, %r7747, %r7747, 16; + add.s32 %r7749, %r7748, %r7672; + xor.b32 %r7750, %r7749, %r7688; + shf.l.wrap.b32 %r7751, %r7750, %r7750, 20; + add.s32 %r7752, %r7746, %r6985; + add.s32 %r7753, %r7752, %r7751; + xor.b32 %r7754, %r7753, %r7748; + shf.l.wrap.b32 %r7755, %r7754, %r7754, 24; + add.s32 %r7756, %r7755, %r7749; + xor.b32 %r7757, %r7756, %r7751; + shf.l.wrap.b32 %r7758, %r7757, %r7757, 25; + add.s32 %r7759, %r7711, %r6977; + add.s32 %r7760, %r7759, %r7702; + xor.b32 %r7761, %r7760, %r7671; + shf.l.wrap.b32 %r7762, %r7761, %r7761, 16; + add.s32 %r7763, %r7762, %r7686; + xor.b32 %r7764, %r7763, %r7702; + shf.l.wrap.b32 %r7765, %r7764, %r7764, 20; + add.s32 %r7766, %r7760, %r6961; + add.s32 %r7767, %r7766, %r7765; + xor.b32 %r7768, %r7767, %r7762; + shf.l.wrap.b32 %r7769, %r7768, %r7768, 24; + add.s32 %r7770, %r7769, %r7763; + xor.b32 %r7771, %r7770, %r7765; + shf.l.wrap.b32 %r7772, %r7771, %r7771, 25; + add.s32 %r7773, %r7725, %r7025; + add.s32 %r7774, %r7773, %r7744; + xor.b32 %r7775, %r7774, %r7769; + shf.l.wrap.b32 %r7776, %r7775, %r7775, 16; + add.s32 %r7777, %r7776, %r7756; + xor.b32 %r7778, %r7777, %r7744; + shf.l.wrap.b32 %r7779, %r7778, %r7778, 20; + add.s32 %r7780, %r7774, %r6993; + add.s32 %r7781, %r7780, %r7779; + xor.b32 %r7782, %r7781, %r7776; + shf.l.wrap.b32 %r7783, %r7782, %r7782, 24; + add.s32 %r7784, %r7783, %r7777; + xor.b32 %r7785, %r7784, %r7779; + shf.l.wrap.b32 %r7786, %r7785, %r7785, 25; + add.s32 %r7787, %r7739, %r6929; + add.s32 %r7788, %r7787, %r7758; + xor.b32 %r7789, %r7727, %r7788; + shf.l.wrap.b32 %r7790, %r7789, %r7789, 16; + add.s32 %r7791, %r7790, %r7770; + xor.b32 %r7792, %r7791, %r7758; + shf.l.wrap.b32 %r7793, %r7792, %r7792, 20; + add.s32 %r7794, %r7788, %r7009; + add.s32 %r7795, %r7794, %r7793; + xor.b32 %r7796, %r7795, %r7790; + shf.l.wrap.b32 %r7797, %r7796, %r7796, 24; + add.s32 %r7798, %r7797, %r7791; + xor.b32 %r7799, %r7798, %r7793; + shf.l.wrap.b32 %r7800, %r7799, %r7799, 25; + add.s32 %r7801, %r7753, %r6937; + add.s32 %r7802, %r7801, %r7772; + xor.b32 %r7803, %r7802, %r7741; + shf.l.wrap.b32 %r7804, %r7803, %r7803, 16; + add.s32 %r7805, %r7804, 
%r7728; + xor.b32 %r7806, %r7805, %r7772; + shf.l.wrap.b32 %r7807, %r7806, %r7806, 20; + add.s32 %r7808, %r7802, %r6945; + add.s32 %r7809, %r7808, %r7807; + xor.b32 %r7810, %r7809, %r7804; + shf.l.wrap.b32 %r7811, %r7810, %r7810, 24; + add.s32 %r7812, %r7811, %r7805; + xor.b32 %r7813, %r7812, %r7807; + shf.l.wrap.b32 %r7814, %r7813, %r7813, 25; + add.s32 %r7815, %r7767, %r6969; + add.s32 %r7816, %r7815, %r7730; + xor.b32 %r7817, %r7816, %r7755; + shf.l.wrap.b32 %r7818, %r7817, %r7817, 16; + add.s32 %r7819, %r7818, %r7742; + xor.b32 %r7820, %r7819, %r7730; + shf.l.wrap.b32 %r7821, %r7820, %r7820, 20; + add.s32 %r7822, %r7816, %r7017; + add.s32 %r7823, %r7822, %r7821; + xor.b32 %r7824, %r7823, %r7818; + shf.l.wrap.b32 %r7825, %r7824, %r7824, 24; + add.s32 %r7826, %r7825, %r7819; + xor.b32 %r7827, %r7826, %r7821; + shf.l.wrap.b32 %r7828, %r7827, %r7827, 25; + xor.b32 %r97, %r7812, %r7781; + xor.b32 %r98, %r7826, %r7795; + xor.b32 %r99, %r7784, %r7809; + xor.b32 %r100, %r7823, %r7798; + xor.b32 %r101, %r7828, %r7797; + xor.b32 %r102, %r7786, %r7811; + xor.b32 %r103, %r7825, %r7800; + xor.b32 %r104, %r7814, %r7783; + ld.local.u8 %rs327, [%rd3+8]; + cvt.u64.u16 %rd189, %rs327; + popc.b64 %r7829, %rd251; + cvt.u64.u32 %rd64, %r7829; + setp.ge.u64 %p37, %rd64, %rd189; + mul.wide.u16 %r11681, %rs327, 32; + @%p37 bra $L__BB1_46; + +$L__BB1_45: + popc.b64 %r11649, %rd251; + cvt.u64.u32 %rd230, %r11649; + add.s32 %r7830, %r11681, -64; + cvt.s64.s32 %rd190, %r7830; + add.s64 %rd191, %rd2, %rd190; + ld.local.u8 %r7831, [%rd3+2]; + ld.local.u8 %r7832, [%rd191+145]; + ld.local.u8 %r7833, [%rd191+146]; + prmt.b32 %r7834, %r7833, %r7832, 30212; + ld.local.u8 %r7835, [%rd191+147]; + prmt.b32 %r7836, %r7835, %r7834, 28756; + ld.local.u8 %r7837, [%rd191+148]; + prmt.b32 %r7838, %r7837, %r7836, 1620; + ld.local.u8 %r7839, [%rd191+149]; + ld.local.u8 %r7840, [%rd191+150]; + prmt.b32 %r7841, %r7840, %r7839, 30212; + ld.local.u8 %r7842, [%rd191+151]; + prmt.b32 %r7843, %r7842, %r7841, 28756; + ld.local.u8 %r7844, [%rd191+152]; + prmt.b32 %r7845, %r7844, %r7843, 1620; + ld.local.u8 %r7846, [%rd191+153]; + ld.local.u8 %r7847, [%rd191+154]; + prmt.b32 %r7848, %r7847, %r7846, 30212; + ld.local.u8 %r7849, [%rd191+155]; + prmt.b32 %r7850, %r7849, %r7848, 28756; + ld.local.u8 %r7851, [%rd191+156]; + prmt.b32 %r7852, %r7851, %r7850, 1620; + ld.local.u8 %r7853, [%rd191+157]; + ld.local.u8 %r7854, [%rd191+158]; + prmt.b32 %r7855, %r7854, %r7853, 30212; + ld.local.u8 %r7856, [%rd191+159]; + prmt.b32 %r7857, %r7856, %r7855, 28756; + ld.local.u8 %r7858, [%rd191+160]; + prmt.b32 %r7859, %r7858, %r7857, 1620; + ld.local.u8 %r7860, [%rd191+161]; + ld.local.u8 %r7861, [%rd191+162]; + prmt.b32 %r7862, %r7861, %r7860, 30212; + ld.local.u8 %r7863, [%rd191+163]; + prmt.b32 %r7864, %r7863, %r7862, 28756; + ld.local.u8 %r7865, [%rd191+164]; + prmt.b32 %r7866, %r7865, %r7864, 1620; + ld.local.u8 %r7867, [%rd191+165]; + ld.local.u8 %r7868, [%rd191+166]; + prmt.b32 %r7869, %r7868, %r7867, 30212; + ld.local.u8 %r7870, [%rd191+167]; + prmt.b32 %r7871, %r7870, %r7869, 28756; + ld.local.u8 %r7872, [%rd191+168]; + prmt.b32 %r7873, %r7872, %r7871, 1620; + ld.local.u8 %r7874, [%rd191+169]; + ld.local.u8 %r7875, [%rd191+170]; + prmt.b32 %r7876, %r7875, %r7874, 30212; + ld.local.u8 %r7877, [%rd191+171]; + prmt.b32 %r7878, %r7877, %r7876, 28756; + ld.local.u8 %r7879, [%rd191+172]; + prmt.b32 %r7880, %r7879, %r7878, 1620; + ld.local.u8 %r7881, [%rd191+173]; + ld.local.u8 %r7882, [%rd191+174]; + prmt.b32 %r7883, %r7882, %r7881, 30212; + ld.local.u8 
%r7884, [%rd191+175]; + prmt.b32 %r7885, %r7884, %r7883, 28756; + ld.local.u8 %r7886, [%rd191+176]; + prmt.b32 %r7887, %r7886, %r7885, 1620; + ld.local.u8 %r7888, [%rd191+177]; + ld.local.u8 %r7889, [%rd191+178]; + prmt.b32 %r7890, %r7889, %r7888, 30212; + ld.local.u8 %r7891, [%rd191+179]; + prmt.b32 %r7892, %r7891, %r7890, 28756; + ld.local.u8 %r7893, [%rd191+180]; + prmt.b32 %r7894, %r7893, %r7892, 1620; + ld.local.u8 %r7895, [%rd191+181]; + ld.local.u8 %r7896, [%rd191+182]; + prmt.b32 %r7897, %r7896, %r7895, 30212; + ld.local.u8 %r7898, [%rd191+183]; + prmt.b32 %r7899, %r7898, %r7897, 28756; + ld.local.u8 %r7900, [%rd191+184]; + prmt.b32 %r7901, %r7900, %r7899, 1620; + ld.local.u8 %r7902, [%rd191+185]; + ld.local.u8 %r7903, [%rd191+186]; + prmt.b32 %r7904, %r7903, %r7902, 30212; + ld.local.u8 %r7905, [%rd191+187]; + prmt.b32 %r7906, %r7905, %r7904, 28756; + ld.local.u8 %r7907, [%rd191+188]; + prmt.b32 %r7908, %r7907, %r7906, 1620; + ld.local.u8 %r7909, [%rd191+189]; + ld.local.u8 %r7910, [%rd191+190]; + prmt.b32 %r7911, %r7910, %r7909, 30212; + ld.local.u8 %r7912, [%rd191+191]; + prmt.b32 %r7913, %r7912, %r7911, 28756; + ld.local.u8 %r7914, [%rd191+192]; + prmt.b32 %r7915, %r7914, %r7913, 1620; + ld.local.u8 %r7916, [%rd191+193]; + ld.local.u8 %r7917, [%rd191+194]; + prmt.b32 %r7918, %r7917, %r7916, 30212; + ld.local.u8 %r7919, [%rd191+195]; + prmt.b32 %r7920, %r7919, %r7918, 28756; + ld.local.u8 %r7921, [%rd191+196]; + prmt.b32 %r7922, %r7921, %r7920, 1620; + ld.local.u8 %r7923, [%rd191+197]; + ld.local.u8 %r7924, [%rd191+198]; + prmt.b32 %r7925, %r7924, %r7923, 30212; + ld.local.u8 %r7926, [%rd191+199]; + prmt.b32 %r7927, %r7926, %r7925, 28756; + ld.local.u8 %r7928, [%rd191+200]; + prmt.b32 %r7929, %r7928, %r7927, 1620; + ld.local.u8 %r7930, [%rd191+201]; + ld.local.u8 %r7931, [%rd191+202]; + prmt.b32 %r7932, %r7931, %r7930, 30212; + ld.local.u8 %r7933, [%rd191+203]; + prmt.b32 %r7934, %r7933, %r7932, 28756; + ld.local.u8 %r7935, [%rd191+204]; + prmt.b32 %r7936, %r7935, %r7934, 1620; + ld.local.u8 %r7937, [%rd191+205]; + ld.local.u8 %r7938, [%rd191+206]; + prmt.b32 %r7939, %r7938, %r7937, 30212; + ld.local.u8 %r7940, [%rd191+207]; + prmt.b32 %r7941, %r7940, %r7939, 28756; + ld.local.u8 %r7942, [%rd191+208]; + prmt.b32 %r7943, %r7942, %r7941, 1620; + or.b32 %r7944, %r7831, 4; + ld.local.u8 %r7945, [%rd3+-120]; + ld.local.u8 %r7946, [%rd3+-119]; + prmt.b32 %r7947, %r7946, %r7945, 30212; + ld.local.u8 %r7948, [%rd3+-118]; + ld.local.u8 %r7949, [%rd3+-117]; + prmt.b32 %r7950, %r7949, %r7948, 30212; + prmt.b32 %r7951, %r7950, %r7947, 4180; + ld.local.u8 %r7952, [%rd3+-136]; + ld.local.u8 %r7953, [%rd3+-135]; + prmt.b32 %r7954, %r7953, %r7952, 30212; + ld.local.u8 %r7955, [%rd3+-134]; + ld.local.u8 %r7956, [%rd3+-133]; + prmt.b32 %r7957, %r7956, %r7955, 30212; + prmt.b32 %r7958, %r7957, %r7954, 4180; + add.s32 %r7959, %r7951, %r7958; + add.s32 %r7960, %r7959, %r7838; + shf.l.wrap.b32 %r7961, %r7960, %r7960, 16; + add.s32 %r7962, %r7961, 1779033703; + xor.b32 %r7963, %r7962, %r7951; + shf.l.wrap.b32 %r7964, %r7963, %r7963, 20; + add.s32 %r7965, %r7845, %r7960; + add.s32 %r7966, %r7965, %r7964; + xor.b32 %r7967, %r7966, %r7961; + shf.l.wrap.b32 %r7968, %r7967, %r7967, 24; + add.s32 %r7969, %r7968, %r7962; + xor.b32 %r7970, %r7969, %r7964; + shf.l.wrap.b32 %r7971, %r7970, %r7970, 25; + ld.local.u8 %r7972, [%rd3+-116]; + ld.local.u8 %r7973, [%rd3+-115]; + prmt.b32 %r7974, %r7973, %r7972, 30212; + ld.local.u8 %r7975, [%rd3+-114]; + ld.local.u8 %r7976, [%rd3+-113]; + prmt.b32 %r7977, %r7976, 
%r7975, 30212; + prmt.b32 %r7978, %r7977, %r7974, 4180; + ld.local.u8 %r7979, [%rd3+-132]; + ld.local.u8 %r7980, [%rd3+-131]; + prmt.b32 %r7981, %r7980, %r7979, 30212; + ld.local.u8 %r7982, [%rd3+-130]; + ld.local.u8 %r7983, [%rd3+-129]; + prmt.b32 %r7984, %r7983, %r7982, 30212; + prmt.b32 %r7985, %r7984, %r7981, 4180; + add.s32 %r7986, %r7978, %r7985; + add.s32 %r7987, %r7986, %r7852; + shf.l.wrap.b32 %r7988, %r7987, %r7987, 16; + add.s32 %r7989, %r7988, -1150833019; + xor.b32 %r7990, %r7989, %r7978; + shf.l.wrap.b32 %r7991, %r7990, %r7990, 20; + add.s32 %r7992, %r7859, %r7987; + add.s32 %r7993, %r7992, %r7991; + xor.b32 %r7994, %r7993, %r7988; + shf.l.wrap.b32 %r7995, %r7994, %r7994, 24; + add.s32 %r7996, %r7995, %r7989; + xor.b32 %r7997, %r7996, %r7991; + shf.l.wrap.b32 %r7998, %r7997, %r7997, 25; + ld.local.u8 %r7999, [%rd3+-112]; + ld.local.u8 %r8000, [%rd3+-111]; + prmt.b32 %r8001, %r8000, %r7999, 30212; + ld.local.u8 %r8002, [%rd3+-110]; + ld.local.u8 %r8003, [%rd3+-109]; + prmt.b32 %r8004, %r8003, %r8002, 30212; + prmt.b32 %r8005, %r8004, %r8001, 4180; + ld.local.u8 %r8006, [%rd3+-128]; + ld.local.u8 %r8007, [%rd3+-127]; + prmt.b32 %r8008, %r8007, %r8006, 30212; + ld.local.u8 %r8009, [%rd3+-126]; + ld.local.u8 %r8010, [%rd3+-125]; + prmt.b32 %r8011, %r8010, %r8009, 30212; + prmt.b32 %r8012, %r8011, %r8008, 4180; + add.s32 %r8013, %r8005, %r8012; + add.s32 %r8014, %r8013, %r7866; + shr.u32 %r8015, %r8014, 16; + shl.b32 %r8016, %r8014, 16; + xor.b32 %r8017, %r8016, 4194304; + or.b32 %r8018, %r8017, %r8015; + add.s32 %r8019, %r8018, 1013904242; + xor.b32 %r8020, %r8019, %r8005; + shf.l.wrap.b32 %r8021, %r8020, %r8020, 20; + add.s32 %r8022, %r7873, %r8014; + add.s32 %r8023, %r8022, %r8021; + xor.b32 %r8024, %r8023, %r8018; + shf.l.wrap.b32 %r8025, %r8024, %r8024, 24; + add.s32 %r8026, %r8025, %r8019; + xor.b32 %r8027, %r8026, %r8021; + shf.l.wrap.b32 %r8028, %r8027, %r8027, 25; + ld.local.u8 %r8029, [%rd3+-108]; + ld.local.u8 %r8030, [%rd3+-107]; + prmt.b32 %r8031, %r8030, %r8029, 30212; + ld.local.u8 %r8032, [%rd3+-106]; + ld.local.u8 %r8033, [%rd3+-105]; + prmt.b32 %r8034, %r8033, %r8032, 30212; + prmt.b32 %r8035, %r8034, %r8031, 4180; + ld.local.u8 %r8036, [%rd3+-124]; + ld.local.u8 %r8037, [%rd3+-123]; + prmt.b32 %r8038, %r8037, %r8036, 30212; + ld.local.u8 %r8039, [%rd3+-122]; + ld.local.u8 %r8040, [%rd3+-121]; + prmt.b32 %r8041, %r8040, %r8039, 30212; + prmt.b32 %r8042, %r8041, %r8038, 4180; + add.s32 %r8043, %r8035, %r8042; + add.s32 %r8044, %r8043, %r7880; + xor.b32 %r8045, %r8044, %r7944; + shr.u32 %r8046, %r8044, 16; + shl.b32 %r8047, %r8045, 16; + or.b32 %r8048, %r8047, %r8046; + add.s32 %r8049, %r8048, -1521486534; + xor.b32 %r8050, %r8049, %r8035; + shf.l.wrap.b32 %r8051, %r8050, %r8050, 20; + add.s32 %r8052, %r7887, %r8044; + add.s32 %r8053, %r8052, %r8051; + xor.b32 %r8054, %r8053, %r8048; + shf.l.wrap.b32 %r8055, %r8054, %r8054, 24; + add.s32 %r8056, %r8055, %r8049; + xor.b32 %r8057, %r8056, %r8051; + shf.l.wrap.b32 %r8058, %r8057, %r8057, 25; + add.s32 %r8059, %r7998, %r7966; + add.s32 %r8060, %r8059, %r7894; + xor.b32 %r8061, %r8055, %r8060; + shf.l.wrap.b32 %r8062, %r8061, %r8061, 16; + add.s32 %r8063, %r8062, %r8026; + xor.b32 %r8064, %r8063, %r7998; + shf.l.wrap.b32 %r8065, %r8064, %r8064, 20; + add.s32 %r8066, %r7901, %r8060; + add.s32 %r8067, %r8066, %r8065; + xor.b32 %r8068, %r8067, %r8062; + shf.l.wrap.b32 %r8069, %r8068, %r8068, 24; + add.s32 %r8070, %r8069, %r8063; + xor.b32 %r8071, %r8070, %r8065; + shf.l.wrap.b32 %r8072, %r8071, %r8071, 25; + add.s32 
%r8073, %r8028, %r7993; + add.s32 %r8074, %r8073, %r7908; + xor.b32 %r8075, %r8074, %r7968; + shf.l.wrap.b32 %r8076, %r8075, %r8075, 16; + add.s32 %r8077, %r8076, %r8056; + xor.b32 %r8078, %r8077, %r8028; + shf.l.wrap.b32 %r8079, %r8078, %r8078, 20; + add.s32 %r8080, %r7915, %r8074; + add.s32 %r8081, %r8080, %r8079; + xor.b32 %r8082, %r8081, %r8076; + shf.l.wrap.b32 %r8083, %r8082, %r8082, 24; + add.s32 %r8084, %r8083, %r8077; + xor.b32 %r8085, %r8084, %r8079; + shf.l.wrap.b32 %r8086, %r8085, %r8085, 25; + add.s32 %r8087, %r8058, %r8023; + add.s32 %r8088, %r8087, %r7922; + xor.b32 %r8089, %r8088, %r7995; + shf.l.wrap.b32 %r8090, %r8089, %r8089, 16; + add.s32 %r8091, %r8090, %r7969; + xor.b32 %r8092, %r8091, %r8058; + shf.l.wrap.b32 %r8093, %r8092, %r8092, 20; + add.s32 %r8094, %r7929, %r8088; + add.s32 %r8095, %r8094, %r8093; + xor.b32 %r8096, %r8095, %r8090; + shf.l.wrap.b32 %r8097, %r8096, %r8096, 24; + add.s32 %r8098, %r8097, %r8091; + xor.b32 %r8099, %r8098, %r8093; + shf.l.wrap.b32 %r8100, %r8099, %r8099, 25; + add.s32 %r8101, %r8053, %r7971; + add.s32 %r8102, %r8101, %r7936; + xor.b32 %r8103, %r8102, %r8025; + shf.l.wrap.b32 %r8104, %r8103, %r8103, 16; + add.s32 %r8105, %r8104, %r7996; + xor.b32 %r8106, %r8105, %r7971; + shf.l.wrap.b32 %r8107, %r8106, %r8106, 20; + add.s32 %r8108, %r7943, %r8102; + add.s32 %r8109, %r8108, %r8107; + xor.b32 %r8110, %r8109, %r8104; + shf.l.wrap.b32 %r8111, %r8110, %r8110, 24; + add.s32 %r8112, %r8111, %r8105; + xor.b32 %r8113, %r8112, %r8107; + shf.l.wrap.b32 %r8114, %r8113, %r8113, 25; + add.s32 %r8115, %r8067, %r7852; + add.s32 %r8116, %r8115, %r8114; + xor.b32 %r8117, %r8116, %r8083; + shf.l.wrap.b32 %r8118, %r8117, %r8117, 16; + add.s32 %r8119, %r8118, %r8098; + xor.b32 %r8120, %r8119, %r8114; + shf.l.wrap.b32 %r8121, %r8120, %r8120, 20; + add.s32 %r8122, %r8116, %r7880; + add.s32 %r8123, %r8122, %r8121; + xor.b32 %r8124, %r8123, %r8118; + shf.l.wrap.b32 %r8125, %r8124, %r8124, 24; + add.s32 %r8126, %r8125, %r8119; + xor.b32 %r8127, %r8126, %r8121; + shf.l.wrap.b32 %r8128, %r8127, %r8127, 25; + add.s32 %r8129, %r8081, %r7859; + add.s32 %r8130, %r8129, %r8072; + xor.b32 %r8131, %r8097, %r8130; + shf.l.wrap.b32 %r8132, %r8131, %r8131, 16; + add.s32 %r8133, %r8112, %r8132; + xor.b32 %r8134, %r8133, %r8072; + shf.l.wrap.b32 %r8135, %r8134, %r8134, 20; + add.s32 %r8136, %r8130, %r7908; + add.s32 %r8137, %r8136, %r8135; + xor.b32 %r8138, %r8137, %r8132; + shf.l.wrap.b32 %r8139, %r8138, %r8138, 24; + add.s32 %r8140, %r8139, %r8133; + xor.b32 %r8141, %r8140, %r8135; + shf.l.wrap.b32 %r8142, %r8141, %r8141, 25; + add.s32 %r8143, %r8086, %r7887; + add.s32 %r8144, %r8143, %r8095; + xor.b32 %r8145, %r8111, %r8144; + shf.l.wrap.b32 %r8146, %r8145, %r8145, 16; + add.s32 %r8147, %r8146, %r8070; + xor.b32 %r8148, %r8147, %r8086; + shf.l.wrap.b32 %r8149, %r8148, %r8148, 20; + add.s32 %r8150, %r8144, %r7838; + add.s32 %r8151, %r8150, %r8149; + xor.b32 %r8152, %r8151, %r8146; + shf.l.wrap.b32 %r8153, %r8152, %r8152, 24; + add.s32 %r8154, %r8153, %r8147; + xor.b32 %r8155, %r8154, %r8149; + shf.l.wrap.b32 %r8156, %r8155, %r8155, 25; + add.s32 %r8157, %r8100, %r7866; + add.s32 %r8158, %r8157, %r8109; + xor.b32 %r8159, %r8158, %r8069; + shf.l.wrap.b32 %r8160, %r8159, %r8159, 16; + add.s32 %r8161, %r8160, %r8084; + xor.b32 %r8162, %r8161, %r8100; + shf.l.wrap.b32 %r8163, %r8162, %r8162, 20; + add.s32 %r8164, %r8158, %r7929; + add.s32 %r8165, %r8164, %r8163; + xor.b32 %r8166, %r8165, %r8160; + shf.l.wrap.b32 %r8167, %r8166, %r8166, 24; + add.s32 %r8168, %r8167, %r8161; + 
xor.b32 %r8169, %r8168, %r8163; + shf.l.wrap.b32 %r8170, %r8169, %r8169, 25; + add.s32 %r8171, %r8142, %r7845; + add.s32 %r8172, %r8171, %r8123; + xor.b32 %r8173, %r8172, %r8167; + shf.l.wrap.b32 %r8174, %r8173, %r8173, 16; + add.s32 %r8175, %r8174, %r8154; + xor.b32 %r8176, %r8175, %r8142; + shf.l.wrap.b32 %r8177, %r8176, %r8176, 20; + add.s32 %r8178, %r8172, %r7915; + add.s32 %r8179, %r8178, %r8177; + xor.b32 %r8180, %r8179, %r8174; + shf.l.wrap.b32 %r8181, %r8180, %r8180, 24; + add.s32 %r8182, %r8181, %r8175; + xor.b32 %r8183, %r8182, %r8177; + shf.l.wrap.b32 %r8184, %r8183, %r8183, 25; + add.s32 %r8185, %r8137, %r7922; + add.s32 %r8186, %r8185, %r8156; + xor.b32 %r8187, %r8125, %r8186; + shf.l.wrap.b32 %r8188, %r8187, %r8187, 16; + add.s32 %r8189, %r8188, %r8168; + xor.b32 %r8190, %r8189, %r8156; + shf.l.wrap.b32 %r8191, %r8190, %r8190, 20; + add.s32 %r8192, %r8186, %r7873; + add.s32 %r8193, %r8192, %r8191; + xor.b32 %r8194, %r8193, %r8188; + shf.l.wrap.b32 %r8195, %r8194, %r8194, 24; + add.s32 %r8196, %r8195, %r8189; + xor.b32 %r8197, %r8196, %r8191; + shf.l.wrap.b32 %r8198, %r8197, %r8197, 25; + add.s32 %r8199, %r8151, %r7901; + add.s32 %r8200, %r8199, %r8170; + xor.b32 %r8201, %r8200, %r8139; + shf.l.wrap.b32 %r8202, %r8201, %r8201, 16; + add.s32 %r8203, %r8202, %r8126; + xor.b32 %r8204, %r8203, %r8170; + shf.l.wrap.b32 %r8205, %r8204, %r8204, 20; + add.s32 %r8206, %r8200, %r7936; + add.s32 %r8207, %r8206, %r8205; + xor.b32 %r8208, %r8207, %r8202; + shf.l.wrap.b32 %r8209, %r8208, %r8208, 24; + add.s32 %r8210, %r8209, %r8203; + xor.b32 %r8211, %r8210, %r8205; + shf.l.wrap.b32 %r8212, %r8211, %r8211, 25; + add.s32 %r8213, %r8165, %r7943; + add.s32 %r8214, %r8213, %r8128; + xor.b32 %r8215, %r8214, %r8153; + shf.l.wrap.b32 %r8216, %r8215, %r8215, 16; + add.s32 %r8217, %r8216, %r8140; + xor.b32 %r8218, %r8217, %r8128; + shf.l.wrap.b32 %r8219, %r8218, %r8218, 20; + add.s32 %r8220, %r8214, %r7894; + add.s32 %r8221, %r8220, %r8219; + xor.b32 %r8222, %r8221, %r8216; + shf.l.wrap.b32 %r8223, %r8222, %r8222, 24; + add.s32 %r8224, %r8223, %r8217; + xor.b32 %r8225, %r8224, %r8219; + shf.l.wrap.b32 %r8226, %r8225, %r8225, 25; + add.s32 %r8227, %r8179, %r7859; + add.s32 %r8228, %r8227, %r8226; + xor.b32 %r8229, %r8228, %r8195; + shf.l.wrap.b32 %r8230, %r8229, %r8229, 16; + add.s32 %r8231, %r8230, %r8210; + xor.b32 %r8232, %r8231, %r8226; + shf.l.wrap.b32 %r8233, %r8232, %r8232, 20; + add.s32 %r8234, %r8228, %r7866; + add.s32 %r8235, %r8234, %r8233; + xor.b32 %r8236, %r8235, %r8230; + shf.l.wrap.b32 %r8237, %r8236, %r8236, 24; + add.s32 %r8238, %r8237, %r8231; + xor.b32 %r8239, %r8238, %r8233; + shf.l.wrap.b32 %r8240, %r8239, %r8239, 25; + add.s32 %r8241, %r8193, %r7908; + add.s32 %r8242, %r8241, %r8184; + xor.b32 %r8243, %r8242, %r8209; + shf.l.wrap.b32 %r8244, %r8243, %r8243, 16; + add.s32 %r8245, %r8244, %r8224; + xor.b32 %r8246, %r8245, %r8184; + shf.l.wrap.b32 %r8247, %r8246, %r8246, 20; + add.s32 %r8248, %r8242, %r7922; + add.s32 %r8249, %r8248, %r8247; + xor.b32 %r8250, %r8249, %r8244; + shf.l.wrap.b32 %r8251, %r8250, %r8250, 24; + add.s32 %r8252, %r8251, %r8245; + xor.b32 %r8253, %r8252, %r8247; + shf.l.wrap.b32 %r8254, %r8253, %r8253, 25; + add.s32 %r8255, %r8207, %r7929; + add.s32 %r8256, %r8255, %r8198; + xor.b32 %r8257, %r8223, %r8256; + shf.l.wrap.b32 %r8258, %r8257, %r8257, 16; + add.s32 %r8259, %r8258, %r8182; + xor.b32 %r8260, %r8259, %r8198; + shf.l.wrap.b32 %r8261, %r8260, %r8260, 20; + add.s32 %r8262, %r8256, %r7852; + add.s32 %r8263, %r8262, %r8261; + xor.b32 %r8264, %r8263, 
%r8258; + shf.l.wrap.b32 %r8265, %r8264, %r8264, 24; + add.s32 %r8266, %r8265, %r8259; + xor.b32 %r8267, %r8266, %r8261; + shf.l.wrap.b32 %r8268, %r8267, %r8267, 25; + add.s32 %r8269, %r8212, %r7887; + add.s32 %r8270, %r8269, %r8221; + xor.b32 %r8271, %r8270, %r8181; + shf.l.wrap.b32 %r8272, %r8271, %r8271, 16; + add.s32 %r8273, %r8272, %r8196; + xor.b32 %r8274, %r8273, %r8212; + shf.l.wrap.b32 %r8275, %r8274, %r8274, 20; + add.s32 %r8276, %r8270, %r7936; + add.s32 %r8277, %r8276, %r8275; + xor.b32 %r8278, %r8277, %r8272; + shf.l.wrap.b32 %r8279, %r8278, %r8278, 24; + add.s32 %r8280, %r8279, %r8273; + xor.b32 %r8281, %r8280, %r8275; + shf.l.wrap.b32 %r8282, %r8281, %r8281, 25; + add.s32 %r8283, %r8235, %r7880; + add.s32 %r8284, %r8283, %r8254; + xor.b32 %r8285, %r8284, %r8279; + shf.l.wrap.b32 %r8286, %r8285, %r8285, 16; + add.s32 %r8287, %r8286, %r8266; + xor.b32 %r8288, %r8287, %r8254; + shf.l.wrap.b32 %r8289, %r8288, %r8288, 20; + add.s32 %r8290, %r8284, %r7873; + add.s32 %r8291, %r8290, %r8289; + xor.b32 %r8292, %r8291, %r8286; + shf.l.wrap.b32 %r8293, %r8292, %r8292, 24; + add.s32 %r8294, %r8293, %r8287; + xor.b32 %r8295, %r8294, %r8289; + shf.l.wrap.b32 %r8296, %r8295, %r8295, 25; + add.s32 %r8297, %r8249, %r7901; + add.s32 %r8298, %r8297, %r8268; + xor.b32 %r8299, %r8237, %r8298; + shf.l.wrap.b32 %r8300, %r8299, %r8299, 16; + add.s32 %r8301, %r8300, %r8280; + xor.b32 %r8302, %r8301, %r8268; + shf.l.wrap.b32 %r8303, %r8302, %r8302, 20; + add.s32 %r8304, %r8298, %r7838; + add.s32 %r8305, %r8304, %r8303; + xor.b32 %r8306, %r8305, %r8300; + shf.l.wrap.b32 %r8307, %r8306, %r8306, 24; + add.s32 %r8308, %r8307, %r8301; + xor.b32 %r8309, %r8308, %r8303; + shf.l.wrap.b32 %r8310, %r8309, %r8309, 25; + add.s32 %r8311, %r8263, %r7915; + add.s32 %r8312, %r8311, %r8282; + xor.b32 %r8313, %r8312, %r8251; + shf.l.wrap.b32 %r8314, %r8313, %r8313, 16; + add.s32 %r8315, %r8314, %r8238; + xor.b32 %r8316, %r8315, %r8282; + shf.l.wrap.b32 %r8317, %r8316, %r8316, 20; + add.s32 %r8318, %r8312, %r7943; + add.s32 %r8319, %r8318, %r8317; + xor.b32 %r8320, %r8319, %r8314; + shf.l.wrap.b32 %r8321, %r8320, %r8320, 24; + add.s32 %r8322, %r8321, %r8315; + xor.b32 %r8323, %r8322, %r8317; + shf.l.wrap.b32 %r8324, %r8323, %r8323, 25; + add.s32 %r8325, %r8277, %r7894; + add.s32 %r8326, %r8325, %r8240; + xor.b32 %r8327, %r8326, %r8265; + shf.l.wrap.b32 %r8328, %r8327, %r8327, 16; + add.s32 %r8329, %r8328, %r8252; + xor.b32 %r8330, %r8329, %r8240; + shf.l.wrap.b32 %r8331, %r8330, %r8330, 20; + add.s32 %r8332, %r8326, %r7845; + add.s32 %r8333, %r8332, %r8331; + xor.b32 %r8334, %r8333, %r8328; + shf.l.wrap.b32 %r8335, %r8334, %r8334, 24; + add.s32 %r8336, %r8335, %r8329; + xor.b32 %r8337, %r8336, %r8331; + shf.l.wrap.b32 %r8338, %r8337, %r8337, 25; + add.s32 %r8339, %r8291, %r7908; + add.s32 %r8340, %r8339, %r8338; + xor.b32 %r8341, %r8340, %r8307; + shf.l.wrap.b32 %r8342, %r8341, %r8341, 16; + add.s32 %r8343, %r8342, %r8322; + xor.b32 %r8344, %r8343, %r8338; + shf.l.wrap.b32 %r8345, %r8344, %r8344, 20; + add.s32 %r8346, %r8340, %r7887; + add.s32 %r8347, %r8346, %r8345; + xor.b32 %r8348, %r8347, %r8342; + shf.l.wrap.b32 %r8349, %r8348, %r8348, 24; + add.s32 %r8350, %r8349, %r8343; + xor.b32 %r8351, %r8350, %r8345; + shf.l.wrap.b32 %r8352, %r8351, %r8351, 25; + add.s32 %r8353, %r8305, %r7922; + add.s32 %r8354, %r8353, %r8296; + xor.b32 %r8355, %r8354, %r8321; + shf.l.wrap.b32 %r8356, %r8355, %r8355, 16; + add.s32 %r8357, %r8356, %r8336; + xor.b32 %r8358, %r8357, %r8296; + shf.l.wrap.b32 %r8359, %r8358, %r8358, 20; + 
add.s32 %r8360, %r8354, %r7901; + add.s32 %r8361, %r8360, %r8359; + xor.b32 %r8362, %r8361, %r8356; + shf.l.wrap.b32 %r8363, %r8362, %r8362, 24; + add.s32 %r8364, %r8363, %r8357; + xor.b32 %r8365, %r8364, %r8359; + shf.l.wrap.b32 %r8366, %r8365, %r8365, 25; + add.s32 %r8367, %r8319, %r7936; + add.s32 %r8368, %r8367, %r8310; + xor.b32 %r8369, %r8335, %r8368; + shf.l.wrap.b32 %r8370, %r8369, %r8369, 16; + add.s32 %r8371, %r8370, %r8294; + xor.b32 %r8372, %r8371, %r8310; + shf.l.wrap.b32 %r8373, %r8372, %r8372, 20; + add.s32 %r8374, %r8368, %r7859; + add.s32 %r8375, %r8374, %r8373; + xor.b32 %r8376, %r8375, %r8370; + shf.l.wrap.b32 %r8377, %r8376, %r8376, 24; + add.s32 %r8378, %r8377, %r8371; + xor.b32 %r8379, %r8378, %r8373; + shf.l.wrap.b32 %r8380, %r8379, %r8379, 25; + add.s32 %r8381, %r8333, %r7929; + add.s32 %r8382, %r8381, %r8324; + xor.b32 %r8383, %r8382, %r8293; + shf.l.wrap.b32 %r8384, %r8383, %r8383, 16; + add.s32 %r8385, %r8384, %r8308; + xor.b32 %r8386, %r8385, %r8324; + shf.l.wrap.b32 %r8387, %r8386, %r8386, 20; + add.s32 %r8388, %r8382, %r7943; + add.s32 %r8389, %r8388, %r8387; + xor.b32 %r8390, %r8389, %r8384; + shf.l.wrap.b32 %r8391, %r8390, %r8390, 24; + add.s32 %r8392, %r8391, %r8385; + xor.b32 %r8393, %r8392, %r8387; + shf.l.wrap.b32 %r8394, %r8393, %r8393, 25; + add.s32 %r8395, %r8347, %r7866; + add.s32 %r8396, %r8395, %r8366; + xor.b32 %r8397, %r8396, %r8391; + shf.l.wrap.b32 %r8398, %r8397, %r8397, 16; + add.s32 %r8399, %r8398, %r8378; + xor.b32 %r8400, %r8399, %r8366; + shf.l.wrap.b32 %r8401, %r8400, %r8400, 20; + add.s32 %r8402, %r8396, %r7838; + add.s32 %r8403, %r8402, %r8401; + xor.b32 %r8404, %r8403, %r8398; + shf.l.wrap.b32 %r8405, %r8404, %r8404, 24; + add.s32 %r8406, %r8405, %r8399; + xor.b32 %r8407, %r8406, %r8401; + shf.l.wrap.b32 %r8408, %r8407, %r8407, 25; + add.s32 %r8409, %r8361, %r7915; + add.s32 %r8410, %r8409, %r8380; + xor.b32 %r8411, %r8349, %r8410; + shf.l.wrap.b32 %r8412, %r8411, %r8411, 16; + add.s32 %r8413, %r8412, %r8392; + xor.b32 %r8414, %r8413, %r8380; + shf.l.wrap.b32 %r8415, %r8414, %r8414, 20; + add.s32 %r8416, %r8410, %r7852; + add.s32 %r8417, %r8416, %r8415; + xor.b32 %r8418, %r8417, %r8412; + shf.l.wrap.b32 %r8419, %r8418, %r8418, 24; + add.s32 %r8420, %r8419, %r8413; + xor.b32 %r8421, %r8420, %r8415; + shf.l.wrap.b32 %r8422, %r8421, %r8421, 25; + add.s32 %r8423, %r8375, %r7873; + add.s32 %r8424, %r8423, %r8394; + xor.b32 %r8425, %r8424, %r8363; + shf.l.wrap.b32 %r8426, %r8425, %r8425, 16; + add.s32 %r8427, %r8426, %r8350; + xor.b32 %r8428, %r8427, %r8394; + shf.l.wrap.b32 %r8429, %r8428, %r8428, 20; + add.s32 %r8430, %r8424, %r7894; + add.s32 %r8431, %r8430, %r8429; + xor.b32 %r8432, %r8431, %r8426; + shf.l.wrap.b32 %r8433, %r8432, %r8432, 24; + add.s32 %r8434, %r8433, %r8427; + xor.b32 %r8435, %r8434, %r8429; + shf.l.wrap.b32 %r8436, %r8435, %r8435, 25; + add.s32 %r8437, %r8389, %r7845; + add.s32 %r8438, %r8437, %r8352; + xor.b32 %r8439, %r8438, %r8377; + shf.l.wrap.b32 %r8440, %r8439, %r8439, 16; + add.s32 %r8441, %r8440, %r8364; + xor.b32 %r8442, %r8441, %r8352; + shf.l.wrap.b32 %r8443, %r8442, %r8442, 20; + add.s32 %r8444, %r8438, %r7880; + add.s32 %r8445, %r8444, %r8443; + xor.b32 %r8446, %r8445, %r8440; + shf.l.wrap.b32 %r8447, %r8446, %r8446, 24; + add.s32 %r8448, %r8447, %r8441; + xor.b32 %r8449, %r8448, %r8443; + shf.l.wrap.b32 %r8450, %r8449, %r8449, 25; + add.s32 %r8451, %r8403, %r7922; + add.s32 %r8452, %r8451, %r8450; + xor.b32 %r8453, %r8452, %r8419; + shf.l.wrap.b32 %r8454, %r8453, %r8453, 16; + add.s32 %r8455, %r8454, 
%r8434; + xor.b32 %r8456, %r8455, %r8450; + shf.l.wrap.b32 %r8457, %r8456, %r8456, 20; + add.s32 %r8458, %r8452, %r7929; + add.s32 %r8459, %r8458, %r8457; + xor.b32 %r8460, %r8459, %r8454; + shf.l.wrap.b32 %r8461, %r8460, %r8460, 24; + add.s32 %r8462, %r8461, %r8455; + xor.b32 %r8463, %r8462, %r8457; + shf.l.wrap.b32 %r8464, %r8463, %r8463, 25; + add.s32 %r8465, %r8417, %r7901; + add.s32 %r8466, %r8465, %r8408; + xor.b32 %r8467, %r8466, %r8433; + shf.l.wrap.b32 %r8468, %r8467, %r8467, 16; + add.s32 %r8469, %r8468, %r8448; + xor.b32 %r8470, %r8469, %r8408; + shf.l.wrap.b32 %r8471, %r8470, %r8470, 20; + add.s32 %r8472, %r8466, %r7915; + add.s32 %r8473, %r8472, %r8471; + xor.b32 %r8474, %r8473, %r8468; + shf.l.wrap.b32 %r8475, %r8474, %r8474, 24; + add.s32 %r8476, %r8475, %r8469; + xor.b32 %r8477, %r8476, %r8471; + shf.l.wrap.b32 %r8478, %r8477, %r8477, 25; + add.s32 %r8479, %r8431, %r7943; + add.s32 %r8480, %r8479, %r8422; + xor.b32 %r8481, %r8447, %r8480; + shf.l.wrap.b32 %r8482, %r8481, %r8481, 16; + add.s32 %r8483, %r8482, %r8406; + xor.b32 %r8484, %r8483, %r8422; + shf.l.wrap.b32 %r8485, %r8484, %r8484, 20; + add.s32 %r8486, %r8480, %r7908; + add.s32 %r8487, %r8486, %r8485; + xor.b32 %r8488, %r8487, %r8482; + shf.l.wrap.b32 %r8489, %r8488, %r8488, 24; + add.s32 %r8490, %r8489, %r8483; + xor.b32 %r8491, %r8490, %r8485; + shf.l.wrap.b32 %r8492, %r8491, %r8491, 25; + add.s32 %r8493, %r8445, %r7936; + add.s32 %r8494, %r8493, %r8436; + xor.b32 %r8495, %r8494, %r8405; + shf.l.wrap.b32 %r8496, %r8495, %r8495, 16; + add.s32 %r8497, %r8496, %r8420; + xor.b32 %r8498, %r8497, %r8436; + shf.l.wrap.b32 %r8499, %r8498, %r8498, 20; + add.s32 %r8500, %r8494, %r7894; + add.s32 %r8501, %r8500, %r8499; + xor.b32 %r8502, %r8501, %r8496; + shf.l.wrap.b32 %r8503, %r8502, %r8502, 24; + add.s32 %r8504, %r8503, %r8497; + xor.b32 %r8505, %r8504, %r8499; + shf.l.wrap.b32 %r8506, %r8505, %r8505, 25; + add.s32 %r8507, %r8459, %r7887; + add.s32 %r8508, %r8507, %r8478; + xor.b32 %r8509, %r8508, %r8503; + shf.l.wrap.b32 %r8510, %r8509, %r8509, 16; + add.s32 %r8511, %r8510, %r8490; + xor.b32 %r8512, %r8511, %r8478; + shf.l.wrap.b32 %r8513, %r8512, %r8512, 20; + add.s32 %r8514, %r8508, %r7852; + add.s32 %r8515, %r8514, %r8513; + xor.b32 %r8516, %r8515, %r8510; + shf.l.wrap.b32 %r8517, %r8516, %r8516, 24; + add.s32 %r8518, %r8517, %r8511; + xor.b32 %r8519, %r8518, %r8513; + shf.l.wrap.b32 %r8520, %r8519, %r8519, 25; + add.s32 %r8521, %r8473, %r7873; + add.s32 %r8522, %r8521, %r8492; + xor.b32 %r8523, %r8461, %r8522; + shf.l.wrap.b32 %r8524, %r8523, %r8523, 16; + add.s32 %r8525, %r8524, %r8504; + xor.b32 %r8526, %r8525, %r8492; + shf.l.wrap.b32 %r8527, %r8526, %r8526, 20; + add.s32 %r8528, %r8522, %r7859; + add.s32 %r8529, %r8528, %r8527; + xor.b32 %r8530, %r8529, %r8524; + shf.l.wrap.b32 %r8531, %r8530, %r8530, 24; + add.s32 %r8532, %r8531, %r8525; + xor.b32 %r8533, %r8532, %r8527; + shf.l.wrap.b32 %r8534, %r8533, %r8533, 25; + add.s32 %r8535, %r8487, %r7838; + add.s32 %r8536, %r8535, %r8506; + xor.b32 %r8537, %r8536, %r8475; + shf.l.wrap.b32 %r8538, %r8537, %r8537, 16; + add.s32 %r8539, %r8538, %r8462; + xor.b32 %r8540, %r8539, %r8506; + shf.l.wrap.b32 %r8541, %r8540, %r8540, 20; + add.s32 %r8542, %r8536, %r7845; + add.s32 %r8543, %r8542, %r8541; + xor.b32 %r8544, %r8543, %r8538; + shf.l.wrap.b32 %r8545, %r8544, %r8544, 24; + add.s32 %r8546, %r8545, %r8539; + xor.b32 %r8547, %r8546, %r8541; + shf.l.wrap.b32 %r8548, %r8547, %r8547, 25; + add.s32 %r8549, %r8501, %r7880; + add.s32 %r8550, %r8549, %r8464; + xor.b32 %r8551, 
%r8550, %r8489; + shf.l.wrap.b32 %r8552, %r8551, %r8551, 16; + add.s32 %r8553, %r8552, %r8476; + xor.b32 %r8554, %r8553, %r8464; + shf.l.wrap.b32 %r8555, %r8554, %r8554, 20; + add.s32 %r8556, %r8550, %r7866; + add.s32 %r8557, %r8556, %r8555; + xor.b32 %r8558, %r8557, %r8552; + shf.l.wrap.b32 %r8559, %r8558, %r8558, 24; + add.s32 %r8560, %r8559, %r8553; + xor.b32 %r8561, %r8560, %r8555; + shf.l.wrap.b32 %r8562, %r8561, %r8561, 25; + add.s32 %r8563, %r8515, %r7901; + add.s32 %r8564, %r8563, %r8562; + xor.b32 %r8565, %r8564, %r8531; + shf.l.wrap.b32 %r8566, %r8565, %r8565, 16; + add.s32 %r8567, %r8566, %r8546; + xor.b32 %r8568, %r8567, %r8562; + shf.l.wrap.b32 %r8569, %r8568, %r8568, 20; + add.s32 %r8570, %r8564, %r7936; + add.s32 %r8571, %r8570, %r8569; + xor.b32 %r8572, %r8571, %r8566; + shf.l.wrap.b32 %r8573, %r8572, %r8572, 24; + add.s32 %r8574, %r8573, %r8567; + xor.b32 %r8575, %r8574, %r8569; + shf.l.wrap.b32 %r8576, %r8575, %r8575, 25; + add.s32 %r8577, %r8529, %r7915; + add.s32 %r8578, %r8577, %r8520; + xor.b32 %r8579, %r8578, %r8545; + shf.l.wrap.b32 %r8580, %r8579, %r8579, 16; + add.s32 %r8581, %r8580, %r8560; + xor.b32 %r8582, %r8581, %r8520; + shf.l.wrap.b32 %r8583, %r8582, %r8582, 20; + add.s32 %r8584, %r8578, %r7873; + add.s32 %r8585, %r8584, %r8583; + xor.b32 %r8586, %r8585, %r8580; + shf.l.wrap.b32 %r8587, %r8586, %r8586, 24; + add.s32 %r8588, %r8587, %r8581; + xor.b32 %r8589, %r8588, %r8583; + shf.l.wrap.b32 %r8590, %r8589, %r8589, 25; + add.s32 %r8591, %r8543, %r7894; + add.s32 %r8592, %r8591, %r8534; + xor.b32 %r8593, %r8559, %r8592; + shf.l.wrap.b32 %r8594, %r8593, %r8593, 16; + add.s32 %r8595, %r8594, %r8518; + xor.b32 %r8596, %r8595, %r8534; + shf.l.wrap.b32 %r8597, %r8596, %r8596, 20; + add.s32 %r8598, %r8592, %r7922; + add.s32 %r8599, %r8598, %r8597; + xor.b32 %r8600, %r8599, %r8594; + shf.l.wrap.b32 %r8601, %r8600, %r8600, 24; + add.s32 %r8602, %r8601, %r8595; + xor.b32 %r8603, %r8602, %r8597; + shf.l.wrap.b32 %r8604, %r8603, %r8603, 25; + add.s32 %r8605, %r8557, %r7943; + add.s32 %r8606, %r8605, %r8548; + xor.b32 %r8607, %r8606, %r8517; + shf.l.wrap.b32 %r8608, %r8607, %r8607, 16; + add.s32 %r8609, %r8608, %r8532; + xor.b32 %r8610, %r8609, %r8548; + shf.l.wrap.b32 %r8611, %r8610, %r8610, 20; + add.s32 %r8612, %r8606, %r7845; + add.s32 %r8613, %r8612, %r8611; + xor.b32 %r8614, %r8613, %r8608; + shf.l.wrap.b32 %r8615, %r8614, %r8614, 24; + add.s32 %r8616, %r8615, %r8609; + xor.b32 %r8617, %r8616, %r8611; + shf.l.wrap.b32 %r8618, %r8617, %r8617, 25; + add.s32 %r8619, %r8571, %r7929; + add.s32 %r8620, %r8619, %r8590; + xor.b32 %r8621, %r8620, %r8615; + shf.l.wrap.b32 %r8622, %r8621, %r8621, 16; + add.s32 %r8623, %r8622, %r8602; + xor.b32 %r8624, %r8623, %r8590; + shf.l.wrap.b32 %r8625, %r8624, %r8624, 20; + add.s32 %r8626, %r8620, %r7859; + add.s32 %r8627, %r8626, %r8625; + xor.b32 %r8628, %r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; 
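+ // Annotation: a full BLAKE3 round applies G to the four columns and then the four
+ // diagonals of the 4x4 state (8 G calls); seven such rounds are unrolled per 64-byte
+ // block. The 32-bit constants folded in at the start of each block (1779033703,
+ // -1150833019, 1013904242, -1521486534) are the IV words 0x6A09E667, 0xBB67AE85,
+ // 0x3C6EF372, 0xA54FF53A.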
+ add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, %r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 %r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; + add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, %r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, 
%r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd191+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd191+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd191+147], %r8796; + shr.u32 %r8797, %r8787, 24; + st.local.u8 [%rd191+148], %r8797; + st.local.u8 [%rd191+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd191+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd191+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd191+152], %r8800; + st.local.u8 [%rd191+153], %r8789; + shr.u32 %r8801, %r8789, 8; + st.local.u8 [%rd191+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd191+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd191+156], %r8803; + st.local.u8 [%rd191+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd191+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd191+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd191+160], %r8806; + st.local.u8 [%rd191+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd191+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd191+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd191+164], %r8809; + st.local.u8 [%rd191+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd191+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd191+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd191+168], %r8812; + st.local.u8 [%rd191+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 [%rd191+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd191+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd191+172], %r8815; + 
st.local.u8 [%rd191+173], %r8794;
+ shr.u32 %r8816, %r8794, 8;
+ st.local.u8 [%rd191+174], %r8816;
+ shr.u32 %r8817, %r8794, 16;
+ st.local.u8 [%rd191+175], %r8817;
+ shr.u32 %r8818, %r8794, 24;
+ st.local.u8 [%rd191+176], %r8818;
+ ld.local.u8 %rs328, [%rd3+8];
+ add.s16 %rs329, %rs328, -1;
+ st.local.u8 [%rd3+8], %rs329;
+ cvt.u64.u16 %rd192, %rs329;
+ and.b64 %rd193, %rd192, 255;
+ setp.lt.u64 %p38, %rd230, %rd193;
+ and.b16 %rs330, %rs329, 255;
+ mul.wide.u16 %r11681, %rs330, 32;
+ @%p38 bra $L__BB1_45;
+
+$L__BB1_46:
+ cvt.s64.s32 %rd194, %r11681;
+ add.s64 %rd195, %rd2, %rd194;
+ st.local.u8 [%rd195+145], %r97;
+ shr.u32 %r8819, %r97, 8;
+ st.local.u8 [%rd195+146], %r8819;
+ shr.u32 %r8820, %r97, 16;
+ st.local.u8 [%rd195+147], %r8820;
+ shr.u32 %r8821, %r97, 24;
+ st.local.u8 [%rd195+148], %r8821;
+ st.local.u8 [%rd195+149], %r98;
+ shr.u32 %r8822, %r98, 8;
+ st.local.u8 [%rd195+150], %r8822;
+ shr.u32 %r8823, %r98, 16;
+ st.local.u8 [%rd195+151], %r8823;
+ shr.u32 %r8824, %r98, 24;
+ st.local.u8 [%rd195+152], %r8824;
+ st.local.u8 [%rd195+153], %r99;
+ shr.u32 %r8825, %r99, 8;
+ st.local.u8 [%rd195+154], %r8825;
+ shr.u32 %r8826, %r99, 16;
+ st.local.u8 [%rd195+155], %r8826;
+ shr.u32 %r8827, %r99, 24;
+ st.local.u8 [%rd195+156], %r8827;
+ st.local.u8 [%rd195+157], %r100;
+ shr.u32 %r8828, %r100, 8;
+ st.local.u8 [%rd195+158], %r8828;
+ shr.u32 %r8829, %r100, 16;
+ st.local.u8 [%rd195+159], %r8829;
+ shr.u32 %r8830, %r100, 24;
+ st.local.u8 [%rd195+160], %r8830;
+ st.local.u8 [%rd195+161], %r101;
+ shr.u32 %r8831, %r101, 8;
+ st.local.u8 [%rd195+162], %r8831;
+ shr.u32 %r8832, %r101, 16;
+ st.local.u8 [%rd195+163], %r8832;
+ shr.u32 %r8833, %r101, 24;
+ st.local.u8 [%rd195+164], %r8833;
+ st.local.u8 [%rd195+165], %r102;
+ shr.u32 %r8834, %r102, 8;
+ st.local.u8 [%rd195+166], %r8834;
+ shr.u32 %r8835, %r102, 16;
+ st.local.u8 [%rd195+167], %r8835;
+ shr.u32 %r8836, %r102, 24;
+ st.local.u8 [%rd195+168], %r8836;
+ st.local.u8 [%rd195+169], %r103;
+ shr.u32 %r8837, %r103, 8;
+ st.local.u8 [%rd195+170], %r8837;
+ shr.u32 %r8838, %r103, 16;
+ st.local.u8 [%rd195+171], %r8838;
+ shr.u32 %r8839, %r103, 24;
+ st.local.u8 [%rd195+172], %r8839;
+ st.local.u8 [%rd195+173], %r104;
+ shr.u32 %r8840, %r104, 8;
+ st.local.u8 [%rd195+174], %r8840;
+ shr.u32 %r8841, %r104, 16;
+ st.local.u8 [%rd195+175], %r8841;
+ shr.u32 %r8842, %r104, 24;
+ st.local.u8 [%rd195+176], %r8842;
+ ld.local.u8 %rs388, [%rd3+8];
+ bra.uni $L__BB1_47;
+
+$L__BB1_29:
+ cvt.u32.u16 %r3957, %rs14;
+ and.b32 %r3958, %r3957, 255;
+ { // callseq 2, 0
+ .reg .b32 temp_param_reg;
+ .param .b64 param0;
+ st.param.b64 [param0+0], %rd254;
+ .param .b64 param1;
+ st.param.b64 [param1+0], %rd49;
+ .param .b64 param2;
+ st.param.b64 [param2+0], %rd98;
+ .param .b64 param3;
+ st.param.b64 [param3+0], %rd251;
+ .param .b32 param4;
+ st.param.b32 [param4+0], %r3958;
+ .param .b64 param5;
+ st.param.b64 [param5+0], %rd142;
+ .param .b64 retval0;
+ call.uni (retval0),
+ _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh,
+ (
+ param0,
+ param1,
+ param2,
+ param3,
+ param4,
+ param5
+ );
+ ld.param.b64 %rd164, [retval0+0];
+ } // callseq 2
+ ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42];
+ ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16];
+ ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32];
+ ld.local.v4.u32 {%r3971, %r3972, %r3973, %r3974}, [%rd42+48];
+ ld.local.u64 %rd165, [%rd3+-72];
+ popc.b64 %r3975, %rd165;
+ cvt.u64.u32 %rd51, %r3975;
+ ld.local.u8 %rs137,
[%rd3+8]; + cvt.u64.u16 %rd166, %rs137; + setp.ge.u64 %p27, %rd51, %rd166; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd165; + cvt.u64.u32 %rd226, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd167, %r3976; + add.s64 %rd168, %rd2, %rd167; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd168+145]; + ld.local.u8 %r3979, [%rd168+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd168+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd168+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd168+149]; + ld.local.u8 %r3986, [%rd168+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd168+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd168+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd168+153]; + ld.local.u8 %r3993, [%rd168+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd168+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd168+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd168+157]; + ld.local.u8 %r4000, [%rd168+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd168+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd168+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd168+161]; + ld.local.u8 %r4007, [%rd168+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd168+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd168+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd168+165]; + ld.local.u8 %r4014, [%rd168+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd168+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd168+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd168+169]; + ld.local.u8 %r4021, [%rd168+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, [%rd168+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd168+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd168+173]; + ld.local.u8 %r4028, [%rd168+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd168+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd168+176]; + prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd168+177]; + ld.local.u8 %r4035, [%rd168+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd168+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd168+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd168+181]; + ld.local.u8 %r4042, [%rd168+182]; + prmt.b32 %r4043, %r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd168+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd168+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd168+185]; + ld.local.u8 %r4049, [%rd168+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd168+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd168+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd168+189]; + ld.local.u8 %r4056, [%rd168+190]; + prmt.b32 %r4057, %r4056, %r4055, 30212; + ld.local.u8 %r4058, [%rd168+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd168+192]; + prmt.b32 %r4061, %r4060, 
%r4059, 1620; + ld.local.u8 %r4062, [%rd168+193]; + ld.local.u8 %r4063, [%rd168+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd168+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd168+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd168+197]; + ld.local.u8 %r4070, [%rd168+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd168+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd168+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd168+201]; + ld.local.u8 %r4077, [%rd168+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd168+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd168+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd168+205]; + ld.local.u8 %r4084, [%rd168+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd168+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd168+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, %r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 %r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, %r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 %r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 
%r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 %r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 %r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, %r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, 
%r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; + add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, %r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 %r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, 
%r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, %r4402; + shf.l.wrap.b32 %r4404, %r4403, %r4403, 16; + add.s32 %r4405, %r4404, %r4328; + xor.b32 %r4406, %r4405, %r4344; + shf.l.wrap.b32 %r4407, %r4406, %r4406, 20; + add.s32 %r4408, %r4402, %r3998; + add.s32 %r4409, %r4408, %r4407; + xor.b32 %r4410, %r4409, %r4404; + shf.l.wrap.b32 %r4411, %r4410, %r4410, 24; + add.s32 %r4412, %r4411, %r4405; + xor.b32 %r4413, %r4412, %r4407; + shf.l.wrap.b32 %r4414, %r4413, %r4413, 25; + add.s32 %r4415, %r4358, %r4033; + add.s32 %r4416, %r4415, %r4367; + xor.b32 %r4417, %r4416, %r4327; + shf.l.wrap.b32 %r4418, %r4417, %r4417, 16; + add.s32 %r4419, %r4418, %r4342; + xor.b32 %r4420, %r4419, %r4358; + shf.l.wrap.b32 %r4421, %r4420, %r4420, 20; + add.s32 %r4422, %r4416, %r4082; + add.s32 %r4423, %r4422, %r4421; + xor.b32 %r4424, %r4423, %r4418; + shf.l.wrap.b32 %r4425, %r4424, %r4424, 24; + add.s32 %r4426, %r4425, %r4419; + xor.b32 %r4427, %r4426, %r4421; + shf.l.wrap.b32 %r4428, %r4427, %r4427, 25; + add.s32 %r4429, %r4381, %r4026; + add.s32 %r4430, %r4429, %r4400; + xor.b32 %r4431, %r4430, %r4425; + shf.l.wrap.b32 %r4432, %r4431, %r4431, 16; + add.s32 %r4433, %r4432, %r4412; + xor.b32 %r4434, %r4433, %r4400; + shf.l.wrap.b32 %r4435, %r4434, %r4434, 20; + add.s32 %r4436, %r4430, %r4019; + add.s32 %r4437, %r4436, %r4435; + xor.b32 %r4438, %r4437, %r4432; + shf.l.wrap.b32 %r4439, %r4438, %r4438, 24; + add.s32 %r4440, %r4439, %r4433; + xor.b32 %r4441, %r4440, %r4435; + shf.l.wrap.b32 %r4442, %r4441, %r4441, 25; 
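+ // Annotation (not compiler output): each 15-instruction group here is one
+ // machine-unrolled BLAKE3 G-mix. As a sketch:
+ //   a += b + mx;  d = rotr32(d ^ a, 16);  c += d;  b = rotr32(b ^ c, 12);
+ //   a += b + my;  d = rotr32(d ^ a,  8);  c += d;  b = rotr32(b ^ c,  7);
+ // PTX has no rotate-right, so rotr32(x, n) is emitted as shf.l.wrap.b32 x, x, 32-n,
+ // which is why the left-rotation amounts 16, 20, 24 and 25 appear throughout.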
+ add.s32 %r4443, %r4395, %r4047; + add.s32 %r4444, %r4443, %r4414; + xor.b32 %r4445, %r4383, %r4444; + shf.l.wrap.b32 %r4446, %r4445, %r4445, 16; + add.s32 %r4447, %r4446, %r4426; + xor.b32 %r4448, %r4447, %r4414; + shf.l.wrap.b32 %r4449, %r4448, %r4448, 20; + add.s32 %r4450, %r4444, %r3984; + add.s32 %r4451, %r4450, %r4449; + xor.b32 %r4452, %r4451, %r4446; + shf.l.wrap.b32 %r4453, %r4452, %r4452, 24; + add.s32 %r4454, %r4453, %r4447; + xor.b32 %r4455, %r4454, %r4449; + shf.l.wrap.b32 %r4456, %r4455, %r4455, 25; + add.s32 %r4457, %r4409, %r4061; + add.s32 %r4458, %r4457, %r4428; + xor.b32 %r4459, %r4458, %r4397; + shf.l.wrap.b32 %r4460, %r4459, %r4459, 16; + add.s32 %r4461, %r4460, %r4384; + xor.b32 %r4462, %r4461, %r4428; + shf.l.wrap.b32 %r4463, %r4462, %r4462, 20; + add.s32 %r4464, %r4458, %r4089; + add.s32 %r4465, %r4464, %r4463; + xor.b32 %r4466, %r4465, %r4460; + shf.l.wrap.b32 %r4467, %r4466, %r4466, 24; + add.s32 %r4468, %r4467, %r4461; + xor.b32 %r4469, %r4468, %r4463; + shf.l.wrap.b32 %r4470, %r4469, %r4469, 25; + add.s32 %r4471, %r4423, %r4040; + add.s32 %r4472, %r4471, %r4386; + xor.b32 %r4473, %r4472, %r4411; + shf.l.wrap.b32 %r4474, %r4473, %r4473, 16; + add.s32 %r4475, %r4474, %r4398; + xor.b32 %r4476, %r4475, %r4386; + shf.l.wrap.b32 %r4477, %r4476, %r4476, 20; + add.s32 %r4478, %r4472, %r3991; + add.s32 %r4479, %r4478, %r4477; + xor.b32 %r4480, %r4479, %r4474; + shf.l.wrap.b32 %r4481, %r4480, %r4480, 24; + add.s32 %r4482, %r4481, %r4475; + xor.b32 %r4483, %r4482, %r4477; + shf.l.wrap.b32 %r4484, %r4483, %r4483, 25; + add.s32 %r4485, %r4437, %r4054; + add.s32 %r4486, %r4485, %r4484; + xor.b32 %r4487, %r4486, %r4453; + shf.l.wrap.b32 %r4488, %r4487, %r4487, 16; + add.s32 %r4489, %r4488, %r4468; + xor.b32 %r4490, %r4489, %r4484; + shf.l.wrap.b32 %r4491, %r4490, %r4490, 20; + add.s32 %r4492, %r4486, %r4033; + add.s32 %r4493, %r4492, %r4491; + xor.b32 %r4494, %r4493, %r4488; + shf.l.wrap.b32 %r4495, %r4494, %r4494, 24; + add.s32 %r4496, %r4495, %r4489; + xor.b32 %r4497, %r4496, %r4491; + shf.l.wrap.b32 %r4498, %r4497, %r4497, 25; + add.s32 %r4499, %r4451, %r4068; + add.s32 %r4500, %r4499, %r4442; + xor.b32 %r4501, %r4500, %r4467; + shf.l.wrap.b32 %r4502, %r4501, %r4501, 16; + add.s32 %r4503, %r4502, %r4482; + xor.b32 %r4504, %r4503, %r4442; + shf.l.wrap.b32 %r4505, %r4504, %r4504, 20; + add.s32 %r4506, %r4500, %r4047; + add.s32 %r4507, %r4506, %r4505; + xor.b32 %r4508, %r4507, %r4502; + shf.l.wrap.b32 %r4509, %r4508, %r4508, 24; + add.s32 %r4510, %r4509, %r4503; + xor.b32 %r4511, %r4510, %r4505; + shf.l.wrap.b32 %r4512, %r4511, %r4511, 25; + add.s32 %r4513, %r4465, %r4082; + add.s32 %r4514, %r4513, %r4456; + xor.b32 %r4515, %r4481, %r4514; + shf.l.wrap.b32 %r4516, %r4515, %r4515, 16; + add.s32 %r4517, %r4516, %r4440; + xor.b32 %r4518, %r4517, %r4456; + shf.l.wrap.b32 %r4519, %r4518, %r4518, 20; + add.s32 %r4520, %r4514, %r4005; + add.s32 %r4521, %r4520, %r4519; + xor.b32 %r4522, %r4521, %r4516; + shf.l.wrap.b32 %r4523, %r4522, %r4522, 24; + add.s32 %r4524, %r4523, %r4517; + xor.b32 %r4525, %r4524, %r4519; + shf.l.wrap.b32 %r4526, %r4525, %r4525, 25; + add.s32 %r4527, %r4479, %r4075; + add.s32 %r4528, %r4527, %r4470; + xor.b32 %r4529, %r4528, %r4439; + shf.l.wrap.b32 %r4530, %r4529, %r4529, 16; + add.s32 %r4531, %r4530, %r4454; + xor.b32 %r4532, %r4531, %r4470; + shf.l.wrap.b32 %r4533, %r4532, %r4532, 20; + add.s32 %r4534, %r4528, %r4089; + add.s32 %r4535, %r4534, %r4533; + xor.b32 %r4536, %r4535, %r4530; + shf.l.wrap.b32 %r4537, %r4536, %r4536, 24; + add.s32 %r4538, %r4537, 
%r4531; + xor.b32 %r4539, %r4538, %r4533; + shf.l.wrap.b32 %r4540, %r4539, %r4539, 25; + add.s32 %r4541, %r4493, %r4012; + add.s32 %r4542, %r4541, %r4512; + xor.b32 %r4543, %r4542, %r4537; + shf.l.wrap.b32 %r4544, %r4543, %r4543, 16; + add.s32 %r4545, %r4544, %r4524; + xor.b32 %r4546, %r4545, %r4512; + shf.l.wrap.b32 %r4547, %r4546, %r4546, 20; + add.s32 %r4548, %r4542, %r3984; + add.s32 %r4549, %r4548, %r4547; + xor.b32 %r4550, %r4549, %r4544; + shf.l.wrap.b32 %r4551, %r4550, %r4550, 24; + add.s32 %r4552, %r4551, %r4545; + xor.b32 %r4553, %r4552, %r4547; + shf.l.wrap.b32 %r4554, %r4553, %r4553, 25; + add.s32 %r4555, %r4507, %r4061; + add.s32 %r4556, %r4555, %r4526; + xor.b32 %r4557, %r4495, %r4556; + shf.l.wrap.b32 %r4558, %r4557, %r4557, 16; + add.s32 %r4559, %r4558, %r4538; + xor.b32 %r4560, %r4559, %r4526; + shf.l.wrap.b32 %r4561, %r4560, %r4560, 20; + add.s32 %r4562, %r4556, %r3998; + add.s32 %r4563, %r4562, %r4561; + xor.b32 %r4564, %r4563, %r4558; + shf.l.wrap.b32 %r4565, %r4564, %r4564, 24; + add.s32 %r4566, %r4565, %r4559; + xor.b32 %r4567, %r4566, %r4561; + shf.l.wrap.b32 %r4568, %r4567, %r4567, 25; + add.s32 %r4569, %r4521, %r4019; + add.s32 %r4570, %r4569, %r4540; + xor.b32 %r4571, %r4570, %r4509; + shf.l.wrap.b32 %r4572, %r4571, %r4571, 16; + add.s32 %r4573, %r4572, %r4496; + xor.b32 %r4574, %r4573, %r4540; + shf.l.wrap.b32 %r4575, %r4574, %r4574, 20; + add.s32 %r4576, %r4570, %r4040; + add.s32 %r4577, %r4576, %r4575; + xor.b32 %r4578, %r4577, %r4572; + shf.l.wrap.b32 %r4579, %r4578, %r4578, 24; + add.s32 %r4580, %r4579, %r4573; + xor.b32 %r4581, %r4580, %r4575; + shf.l.wrap.b32 %r4582, %r4581, %r4581, 25; + add.s32 %r4583, %r4535, %r3991; + add.s32 %r4584, %r4583, %r4498; + xor.b32 %r4585, %r4584, %r4523; + shf.l.wrap.b32 %r4586, %r4585, %r4585, 16; + add.s32 %r4587, %r4586, %r4510; + xor.b32 %r4588, %r4587, %r4498; + shf.l.wrap.b32 %r4589, %r4588, %r4588, 20; + add.s32 %r4590, %r4584, %r4026; + add.s32 %r4591, %r4590, %r4589; + xor.b32 %r4592, %r4591, %r4586; + shf.l.wrap.b32 %r4593, %r4592, %r4592, 24; + add.s32 %r4594, %r4593, %r4587; + xor.b32 %r4595, %r4594, %r4589; + shf.l.wrap.b32 %r4596, %r4595, %r4595, 25; + add.s32 %r4597, %r4549, %r4068; + add.s32 %r4598, %r4597, %r4596; + xor.b32 %r4599, %r4598, %r4565; + shf.l.wrap.b32 %r4600, %r4599, %r4599, 16; + add.s32 %r4601, %r4600, %r4580; + xor.b32 %r4602, %r4601, %r4596; + shf.l.wrap.b32 %r4603, %r4602, %r4602, 20; + add.s32 %r4604, %r4598, %r4075; + add.s32 %r4605, %r4604, %r4603; + xor.b32 %r4606, %r4605, %r4600; + shf.l.wrap.b32 %r4607, %r4606, %r4606, 24; + add.s32 %r4608, %r4607, %r4601; + xor.b32 %r4609, %r4608, %r4603; + shf.l.wrap.b32 %r4610, %r4609, %r4609, 25; + add.s32 %r4611, %r4563, %r4047; + add.s32 %r4612, %r4611, %r4554; + xor.b32 %r4613, %r4612, %r4579; + shf.l.wrap.b32 %r4614, %r4613, %r4613, 16; + add.s32 %r4615, %r4614, %r4594; + xor.b32 %r4616, %r4615, %r4554; + shf.l.wrap.b32 %r4617, %r4616, %r4616, 20; + add.s32 %r4618, %r4612, %r4061; + add.s32 %r4619, %r4618, %r4617; + xor.b32 %r4620, %r4619, %r4614; + shf.l.wrap.b32 %r4621, %r4620, %r4620, 24; + add.s32 %r4622, %r4621, %r4615; + xor.b32 %r4623, %r4622, %r4617; + shf.l.wrap.b32 %r4624, %r4623, %r4623, 25; + add.s32 %r4625, %r4577, %r4089; + add.s32 %r4626, %r4625, %r4568; + xor.b32 %r4627, %r4593, %r4626; + shf.l.wrap.b32 %r4628, %r4627, %r4627, 16; + add.s32 %r4629, %r4628, %r4552; + xor.b32 %r4630, %r4629, %r4568; + shf.l.wrap.b32 %r4631, %r4630, %r4630, 20; + add.s32 %r4632, %r4626, %r4054; + add.s32 %r4633, %r4632, %r4631; + xor.b32 %r4634, 
%r4633, %r4628; + shf.l.wrap.b32 %r4635, %r4634, %r4634, 24; + add.s32 %r4636, %r4635, %r4629; + xor.b32 %r4637, %r4636, %r4631; + shf.l.wrap.b32 %r4638, %r4637, %r4637, 25; + add.s32 %r4639, %r4591, %r4082; + add.s32 %r4640, %r4639, %r4582; + xor.b32 %r4641, %r4640, %r4551; + shf.l.wrap.b32 %r4642, %r4641, %r4641, 16; + add.s32 %r4643, %r4642, %r4566; + xor.b32 %r4644, %r4643, %r4582; + shf.l.wrap.b32 %r4645, %r4644, %r4644, 20; + add.s32 %r4646, %r4640, %r4040; + add.s32 %r4647, %r4646, %r4645; + xor.b32 %r4648, %r4647, %r4642; + shf.l.wrap.b32 %r4649, %r4648, %r4648, 24; + add.s32 %r4650, %r4649, %r4643; + xor.b32 %r4651, %r4650, %r4645; + shf.l.wrap.b32 %r4652, %r4651, %r4651, 25; + add.s32 %r4653, %r4605, %r4033; + add.s32 %r4654, %r4653, %r4624; + xor.b32 %r4655, %r4654, %r4649; + shf.l.wrap.b32 %r4656, %r4655, %r4655, 16; + add.s32 %r4657, %r4656, %r4636; + xor.b32 %r4658, %r4657, %r4624; + shf.l.wrap.b32 %r4659, %r4658, %r4658, 20; + add.s32 %r4660, %r4654, %r3998; + add.s32 %r4661, %r4660, %r4659; + xor.b32 %r4662, %r4661, %r4656; + shf.l.wrap.b32 %r4663, %r4662, %r4662, 24; + add.s32 %r4664, %r4663, %r4657; + xor.b32 %r4665, %r4664, %r4659; + shf.l.wrap.b32 %r4666, %r4665, %r4665, 25; + add.s32 %r4667, %r4619, %r4019; + add.s32 %r4668, %r4667, %r4638; + xor.b32 %r4669, %r4607, %r4668; + shf.l.wrap.b32 %r4670, %r4669, %r4669, 16; + add.s32 %r4671, %r4670, %r4650; + xor.b32 %r4672, %r4671, %r4638; + shf.l.wrap.b32 %r4673, %r4672, %r4672, 20; + add.s32 %r4674, %r4668, %r4005; + add.s32 %r4675, %r4674, %r4673; + xor.b32 %r4676, %r4675, %r4670; + shf.l.wrap.b32 %r4677, %r4676, %r4676, 24; + add.s32 %r4678, %r4677, %r4671; + xor.b32 %r4679, %r4678, %r4673; + shf.l.wrap.b32 %r4680, %r4679, %r4679, 25; + add.s32 %r4681, %r4633, %r3984; + add.s32 %r4682, %r4681, %r4652; + xor.b32 %r4683, %r4682, %r4621; + shf.l.wrap.b32 %r4684, %r4683, %r4683, 16; + add.s32 %r4685, %r4684, %r4608; + xor.b32 %r4686, %r4685, %r4652; + shf.l.wrap.b32 %r4687, %r4686, %r4686, 20; + add.s32 %r4688, %r4682, %r3991; + add.s32 %r4689, %r4688, %r4687; + xor.b32 %r4690, %r4689, %r4684; + shf.l.wrap.b32 %r4691, %r4690, %r4690, 24; + add.s32 %r4692, %r4691, %r4685; + xor.b32 %r4693, %r4692, %r4687; + shf.l.wrap.b32 %r4694, %r4693, %r4693, 25; + add.s32 %r4695, %r4647, %r4026; + add.s32 %r4696, %r4695, %r4610; + xor.b32 %r4697, %r4696, %r4635; + shf.l.wrap.b32 %r4698, %r4697, %r4697, 16; + add.s32 %r4699, %r4698, %r4622; + xor.b32 %r4700, %r4699, %r4610; + shf.l.wrap.b32 %r4701, %r4700, %r4700, 20; + add.s32 %r4702, %r4696, %r4012; + add.s32 %r4703, %r4702, %r4701; + xor.b32 %r4704, %r4703, %r4698; + shf.l.wrap.b32 %r4705, %r4704, %r4704, 24; + add.s32 %r4706, %r4705, %r4699; + xor.b32 %r4707, %r4706, %r4701; + shf.l.wrap.b32 %r4708, %r4707, %r4707, 25; + add.s32 %r4709, %r4661, %r4047; + add.s32 %r4710, %r4709, %r4708; + xor.b32 %r4711, %r4710, %r4677; + shf.l.wrap.b32 %r4712, %r4711, %r4711, 16; + add.s32 %r4713, %r4712, %r4692; + xor.b32 %r4714, %r4713, %r4708; + shf.l.wrap.b32 %r4715, %r4714, %r4714, 20; + add.s32 %r4716, %r4710, %r4082; + add.s32 %r4717, %r4716, %r4715; + xor.b32 %r4718, %r4717, %r4712; + shf.l.wrap.b32 %r4719, %r4718, %r4718, 24; + add.s32 %r4720, %r4719, %r4713; + xor.b32 %r4721, %r4720, %r4715; + shf.l.wrap.b32 %r4722, %r4721, %r4721, 25; + add.s32 %r4723, %r4675, %r4061; + add.s32 %r4724, %r4723, %r4666; + xor.b32 %r4725, %r4724, %r4691; + shf.l.wrap.b32 %r4726, %r4725, %r4725, 16; + add.s32 %r4727, %r4726, %r4706; + xor.b32 %r4728, %r4727, %r4666; + shf.l.wrap.b32 %r4729, %r4728, %r4728, 20; 
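+ // The sixteen message words of this compression were assembled earlier into
+ // %r3984-%r4089; each unrolled round re-reads them in the permuted order given
+ // by MSG_SCHEDULE instead of shuffling the block in memory.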
+ add.s32 %r4730, %r4724, %r4019; + add.s32 %r4731, %r4730, %r4729; + xor.b32 %r4732, %r4731, %r4726; + shf.l.wrap.b32 %r4733, %r4732, %r4732, 24; + add.s32 %r4734, %r4733, %r4727; + xor.b32 %r4735, %r4734, %r4729; + shf.l.wrap.b32 %r4736, %r4735, %r4735, 25; + add.s32 %r4737, %r4689, %r4040; + add.s32 %r4738, %r4737, %r4680; + xor.b32 %r4739, %r4705, %r4738; + shf.l.wrap.b32 %r4740, %r4739, %r4739, 16; + add.s32 %r4741, %r4740, %r4664; + xor.b32 %r4742, %r4741, %r4680; + shf.l.wrap.b32 %r4743, %r4742, %r4742, 20; + add.s32 %r4744, %r4738, %r4068; + add.s32 %r4745, %r4744, %r4743; + xor.b32 %r4746, %r4745, %r4740; + shf.l.wrap.b32 %r4747, %r4746, %r4746, 24; + add.s32 %r4748, %r4747, %r4741; + xor.b32 %r4749, %r4748, %r4743; + shf.l.wrap.b32 %r4750, %r4749, %r4749, 25; + add.s32 %r4751, %r4703, %r4089; + add.s32 %r4752, %r4751, %r4694; + xor.b32 %r4753, %r4752, %r4663; + shf.l.wrap.b32 %r4754, %r4753, %r4753, 16; + add.s32 %r4755, %r4754, %r4678; + xor.b32 %r4756, %r4755, %r4694; + shf.l.wrap.b32 %r4757, %r4756, %r4756, 20; + add.s32 %r4758, %r4752, %r3991; + add.s32 %r4759, %r4758, %r4757; + xor.b32 %r4760, %r4759, %r4754; + shf.l.wrap.b32 %r4761, %r4760, %r4760, 24; + add.s32 %r4762, %r4761, %r4755; + xor.b32 %r4763, %r4762, %r4757; + shf.l.wrap.b32 %r4764, %r4763, %r4763, 25; + add.s32 %r4765, %r4717, %r4075; + add.s32 %r4766, %r4765, %r4736; + xor.b32 %r4767, %r4766, %r4761; + shf.l.wrap.b32 %r4768, %r4767, %r4767, 16; + add.s32 %r4769, %r4768, %r4748; + xor.b32 %r4770, %r4769, %r4736; + shf.l.wrap.b32 %r4771, %r4770, %r4770, 20; + add.s32 %r4772, %r4766, %r4005; + add.s32 %r4773, %r4772, %r4771; + xor.b32 %r4774, %r4773, %r4768; + shf.l.wrap.b32 %r4775, %r4774, %r4774, 24; + add.s32 %r4776, %r4775, %r4769; + xor.b32 %r4777, %r4776, %r4771; + shf.l.wrap.b32 %r4778, %r4777, %r4777, 25; + add.s32 %r4779, %r4731, %r3984; + add.s32 %r4780, %r4779, %r4750; + xor.b32 %r4781, %r4719, %r4780; + shf.l.wrap.b32 %r4782, %r4781, %r4781, 16; + add.s32 %r4783, %r4782, %r4762; + xor.b32 %r4784, %r4783, %r4750; + shf.l.wrap.b32 %r4785, %r4784, %r4784, 20; + add.s32 %r4786, %r4780, %r4054; + add.s32 %r4787, %r4786, %r4785; + xor.b32 %r4788, %r4787, %r4782; + shf.l.wrap.b32 %r4789, %r4788, %r4788, 24; + add.s32 %r4790, %r4789, %r4783; + xor.b32 %r4791, %r4790, %r4785; + shf.l.wrap.b32 %r4792, %r4791, %r4791, 25; + add.s32 %r4793, %r4745, %r3998; + add.s32 %r4794, %r4793, %r4764; + xor.b32 %r4795, %r4794, %r4733; + shf.l.wrap.b32 %r4796, %r4795, %r4795, 16; + add.s32 %r4797, %r4796, %r4720; + xor.b32 %r4798, %r4797, %r4764; + shf.l.wrap.b32 %r4799, %r4798, %r4798, 20; + add.s32 %r4800, %r4794, %r4026; + add.s32 %r4801, %r4800, %r4799; + xor.b32 %r4802, %r4801, %r4796; + shf.l.wrap.b32 %r4803, %r4802, %r4802, 24; + add.s32 %r4804, %r4803, %r4797; + xor.b32 %r4805, %r4804, %r4799; + shf.l.wrap.b32 %r4806, %r4805, %r4805, 25; + add.s32 %r4807, %r4759, %r4012; + add.s32 %r4808, %r4807, %r4722; + xor.b32 %r4809, %r4808, %r4747; + shf.l.wrap.b32 %r4810, %r4809, %r4809, 16; + add.s32 %r4811, %r4810, %r4734; + xor.b32 %r4812, %r4811, %r4722; + shf.l.wrap.b32 %r4813, %r4812, %r4812, 20; + add.s32 %r4814, %r4808, %r4033; + add.s32 %r4815, %r4814, %r4813; + xor.b32 %r4816, %r4815, %r4810; + shf.l.wrap.b32 %r4817, %r4816, %r4816, 24; + add.s32 %r4818, %r4817, %r4811; + xor.b32 %r4819, %r4818, %r4813; + shf.l.wrap.b32 %r4820, %r4819, %r4819, 25; + add.s32 %r4821, %r4773, %r4061; + add.s32 %r4822, %r4821, %r4820; + xor.b32 %r4823, %r4822, %r4789; + shf.l.wrap.b32 %r4824, %r4823, %r4823, 16; + add.s32 %r4825, %r4824, 
%r4804; + xor.b32 %r4826, %r4825, %r4820; + shf.l.wrap.b32 %r4827, %r4826, %r4826, 20; + add.s32 %r4828, %r4822, %r4089; + add.s32 %r4829, %r4828, %r4827; + xor.b32 %r4830, %r4829, %r4824; + shf.l.wrap.b32 %r4831, %r4830, %r4830, 24; + add.s32 %r4832, %r4831, %r4825; + xor.b32 %r4833, %r4832, %r4827; + shf.l.wrap.b32 %r4834, %r4833, %r4833, 25; + add.s32 %r4835, %r4787, %r4019; + add.s32 %r4836, %r4835, %r4778; + xor.b32 %r4837, %r4836, %r4803; + shf.l.wrap.b32 %r4838, %r4837, %r4837, 16; + add.s32 %r4839, %r4838, %r4818; + xor.b32 %r4840, %r4839, %r4778; + shf.l.wrap.b32 %r4841, %r4840, %r4840, 20; + add.s32 %r4842, %r4836, %r3984; + add.s32 %r4843, %r4842, %r4841; + xor.b32 %r4844, %r4843, %r4838; + shf.l.wrap.b32 %r4845, %r4844, %r4844, 24; + add.s32 %r4846, %r4845, %r4839; + xor.b32 %r4847, %r4846, %r4841; + shf.l.wrap.b32 %r4848, %r4847, %r4847, 25; + add.s32 %r4849, %r4801, %r3991; + add.s32 %r4850, %r4849, %r4792; + xor.b32 %r4851, %r4817, %r4850; + shf.l.wrap.b32 %r4852, %r4851, %r4851, 16; + add.s32 %r4853, %r4852, %r4776; + xor.b32 %r4854, %r4853, %r4792; + shf.l.wrap.b32 %r4855, %r4854, %r4854, 20; + add.s32 %r4856, %r4850, %r4047; + add.s32 %r4857, %r4856, %r4855; + xor.b32 %r4858, %r4857, %r4852; + shf.l.wrap.b32 %r4859, %r4858, %r4858, 24; + add.s32 %r4860, %r4859, %r4853; + xor.b32 %r4861, %r4860, %r4855; + shf.l.wrap.b32 %r4862, %r4861, %r4861, 25; + add.s32 %r4863, %r4815, %r4040; + add.s32 %r4864, %r4863, %r4806; + xor.b32 %r4865, %r4864, %r4775; + shf.l.wrap.b32 %r4866, %r4865, %r4865, 16; + add.s32 %r4867, %r4866, %r4790; + xor.b32 %r4868, %r4867, %r4806; + shf.l.wrap.b32 %r4869, %r4868, %r4868, 20; + add.s32 %r4870, %r4864, %r4026; + add.s32 %r4871, %r4870, %r4869; + xor.b32 %r4872, %r4871, %r4866; + shf.l.wrap.b32 %r4873, %r4872, %r4872, 24; + add.s32 %r4874, %r4873, %r4867; + xor.b32 %r4875, %r4874, %r4869; + shf.l.wrap.b32 %r4876, %r4875, %r4875, 25; + add.s32 %r4877, %r4829, %r4082; + add.s32 %r4878, %r4877, %r4848; + xor.b32 %r4879, %r4878, %r4873; + shf.l.wrap.b32 %r4880, %r4879, %r4879, 16; + add.s32 %r4881, %r4880, %r4860; + xor.b32 %r4882, %r4881, %r4848; + shf.l.wrap.b32 %r4883, %r4882, %r4882, 20; + add.s32 %r4884, %r4878, %r4054; + add.s32 %r4885, %r4884, %r4883; + xor.b32 %r4886, %r4885, %r4880; + shf.l.wrap.b32 %r4887, %r4886, %r4886, 24; + add.s32 %r4888, %r4887, %r4881; + xor.b32 %r4889, %r4888, %r4883; + shf.l.wrap.b32 %r4890, %r4889, %r4889, 25; + add.s32 %r4891, %r4843, %r3998; + add.s32 %r4892, %r4891, %r4862; + xor.b32 %r4893, %r4831, %r4892; + shf.l.wrap.b32 %r4894, %r4893, %r4893, 16; + add.s32 %r4895, %r4894, %r4874; + xor.b32 %r4896, %r4895, %r4862; + shf.l.wrap.b32 %r4897, %r4896, %r4896, 20; + add.s32 %r4898, %r4892, %r4068; + add.s32 %r4899, %r4898, %r4897; + xor.b32 %r4900, %r4899, %r4894; + shf.l.wrap.b32 %r4901, %r4900, %r4900, 24; + add.s32 %r4902, %r4901, %r4895; + xor.b32 %r4903, %r4902, %r4897; + shf.l.wrap.b32 %r4904, %r4903, %r4903, 25; + add.s32 %r4905, %r4857, %r4005; + add.s32 %r4906, %r4905, %r4876; + xor.b32 %r4907, %r4906, %r4845; + shf.l.wrap.b32 %r4908, %r4907, %r4907, 16; + add.s32 %r4909, %r4908, %r4832; + xor.b32 %r4910, %r4909, %r4876; + shf.l.wrap.b32 %r4911, %r4910, %r4910, 20; + add.s32 %r4912, %r4906, %r4012; + add.s32 %r4913, %r4912, %r4911; + xor.b32 %r4914, %r4913, %r4908; + shf.l.wrap.b32 %r4915, %r4914, %r4914, 24; + add.s32 %r4916, %r4915, %r4909; + xor.b32 %r4917, %r4916, %r4911; + shf.l.wrap.b32 %r4918, %r4917, %r4917, 25; + add.s32 %r4919, %r4871, %r4033; + add.s32 %r4920, %r4919, %r4834; + xor.b32 %r4921, 
%r4920, %r4859; + shf.l.wrap.b32 %r4922, %r4921, %r4921, 16; + add.s32 %r4923, %r4922, %r4846; + xor.b32 %r4924, %r4923, %r4834; + shf.l.wrap.b32 %r4925, %r4924, %r4924, 20; + add.s32 %r4926, %r4920, %r4075; + add.s32 %r4927, %r4926, %r4925; + xor.b32 %r4928, %r4927, %r4922; + shf.l.wrap.b32 %r4929, %r4928, %r4928, 24; + add.s32 %r4930, %r4929, %r4923; + xor.b32 %r4931, %r4930, %r4925; + shf.l.wrap.b32 %r4932, %r4931, %r4931, 25; + xor.b32 %r4933, %r4916, %r4885; + xor.b32 %r4934, %r4930, %r4899; + xor.b32 %r4935, %r4888, %r4913; + xor.b32 %r4936, %r4927, %r4902; + xor.b32 %r4937, %r4932, %r4901; + xor.b32 %r4938, %r4890, %r4915; + xor.b32 %r4939, %r4929, %r4904; + xor.b32 %r4940, %r4918, %r4887; + st.local.u8 [%rd168+145], %r4933; + shr.u32 %r4941, %r4933, 8; + st.local.u8 [%rd168+146], %r4941; + shr.u32 %r4942, %r4933, 16; + st.local.u8 [%rd168+147], %r4942; + shr.u32 %r4943, %r4933, 24; + st.local.u8 [%rd168+148], %r4943; + st.local.u8 [%rd168+149], %r4934; + shr.u32 %r4944, %r4934, 8; + st.local.u8 [%rd168+150], %r4944; + shr.u32 %r4945, %r4934, 16; + st.local.u8 [%rd168+151], %r4945; + shr.u32 %r4946, %r4934, 24; + st.local.u8 [%rd168+152], %r4946; + st.local.u8 [%rd168+153], %r4935; + shr.u32 %r4947, %r4935, 8; + st.local.u8 [%rd168+154], %r4947; + shr.u32 %r4948, %r4935, 16; + st.local.u8 [%rd168+155], %r4948; + shr.u32 %r4949, %r4935, 24; + st.local.u8 [%rd168+156], %r4949; + st.local.u8 [%rd168+157], %r4936; + shr.u32 %r4950, %r4936, 8; + st.local.u8 [%rd168+158], %r4950; + shr.u32 %r4951, %r4936, 16; + st.local.u8 [%rd168+159], %r4951; + shr.u32 %r4952, %r4936, 24; + st.local.u8 [%rd168+160], %r4952; + st.local.u8 [%rd168+161], %r4937; + shr.u32 %r4953, %r4937, 8; + st.local.u8 [%rd168+162], %r4953; + shr.u32 %r4954, %r4937, 16; + st.local.u8 [%rd168+163], %r4954; + shr.u32 %r4955, %r4937, 24; + st.local.u8 [%rd168+164], %r4955; + st.local.u8 [%rd168+165], %r4938; + shr.u32 %r4956, %r4938, 8; + st.local.u8 [%rd168+166], %r4956; + shr.u32 %r4957, %r4938, 16; + st.local.u8 [%rd168+167], %r4957; + shr.u32 %r4958, %r4938, 24; + st.local.u8 [%rd168+168], %r4958; + st.local.u8 [%rd168+169], %r4939; + shr.u32 %r4959, %r4939, 8; + st.local.u8 [%rd168+170], %r4959; + shr.u32 %r4960, %r4939, 16; + st.local.u8 [%rd168+171], %r4960; + shr.u32 %r4961, %r4939, 24; + st.local.u8 [%rd168+172], %r4961; + st.local.u8 [%rd168+173], %r4940; + shr.u32 %r4962, %r4940, 8; + st.local.u8 [%rd168+174], %r4962; + shr.u32 %r4963, %r4940, 16; + st.local.u8 [%rd168+175], %r4963; + shr.u32 %r4964, %r4940, 24; + st.local.u8 [%rd168+176], %r4964; + ld.local.u8 %rs138, [%rd3+8]; + add.s16 %rs139, %rs138, -1; + st.local.u8 [%rd3+8], %rs139; + cvt.u64.u16 %rd169, %rs139; + and.b64 %rd170, %rd169, 255; + setp.lt.u64 %p28, %rd226, %rd170; + and.b16 %rs140, %rs139, 255; + mul.wide.u16 %r11661, %rs140, 32; + @%p28 bra $L__BB1_31; + +$L__BB1_32: + cvt.s64.s32 %rd171, %r11661; + add.s64 %rd172, %rd2, %rd171; + mov.b32 {%rs141, %rs142}, %r3959; + st.local.u8 [%rd172+145], %rs141; + shr.u16 %rs143, %rs141, 8; + st.local.u8 [%rd172+146], %rs143; + st.local.u8 [%rd172+147], %rs142; + shr.u16 %rs144, %rs142, 8; + st.local.u8 [%rd172+148], %rs144; + mov.b32 {%rs145, %rs146}, %r3960; + st.local.u8 [%rd172+149], %rs145; + shr.u16 %rs147, %rs145, 8; + st.local.u8 [%rd172+150], %rs147; + st.local.u8 [%rd172+151], %rs146; + shr.u16 %rs148, %rs146, 8; + st.local.u8 [%rd172+152], %rs148; + mov.b32 {%rs149, %rs150}, %r3961; + st.local.u8 [%rd172+153], %rs149; + shr.u16 %rs151, %rs149, 8; + st.local.u8 [%rd172+154], %rs151; + 
st.local.u8 [%rd172+155], %rs150; + shr.u16 %rs152, %rs150, 8; + st.local.u8 [%rd172+156], %rs152; + mov.b32 {%rs153, %rs154}, %r3962; + st.local.u8 [%rd172+157], %rs153; + shr.u16 %rs155, %rs153, 8; + st.local.u8 [%rd172+158], %rs155; + st.local.u8 [%rd172+159], %rs154; + shr.u16 %rs156, %rs154, 8; + st.local.u8 [%rd172+160], %rs156; + mov.b32 {%rs157, %rs158}, %r3963; + st.local.u8 [%rd172+161], %rs157; + shr.u16 %rs159, %rs157, 8; + st.local.u8 [%rd172+162], %rs159; + st.local.u8 [%rd172+163], %rs158; + shr.u16 %rs160, %rs158, 8; + st.local.u8 [%rd172+164], %rs160; + mov.b32 {%rs161, %rs162}, %r3964; + st.local.u8 [%rd172+165], %rs161; + shr.u16 %rs163, %rs161, 8; + st.local.u8 [%rd172+166], %rs163; + st.local.u8 [%rd172+167], %rs162; + shr.u16 %rs164, %rs162, 8; + st.local.u8 [%rd172+168], %rs164; + mov.b32 {%rs165, %rs166}, %r3965; + st.local.u8 [%rd172+169], %rs165; + shr.u16 %rs167, %rs165, 8; + st.local.u8 [%rd172+170], %rs167; + st.local.u8 [%rd172+171], %rs166; + shr.u16 %rs168, %rs166, 8; + st.local.u8 [%rd172+172], %rs168; + mov.b32 {%rs169, %rs170}, %r3966; + st.local.u8 [%rd172+173], %rs169; + shr.u16 %rs171, %rs169, 8; + st.local.u8 [%rd172+174], %rs171; + st.local.u8 [%rd172+175], %rs170; + shr.u16 %rs172, %rs170, 8; + st.local.u8 [%rd172+176], %rs172; + ld.local.u8 %rs173, [%rd3+8]; + add.s16 %rs174, %rs173, 1; + st.local.u8 [%rd3+8], %rs174; + shr.u64 %rd173, %rd49, 11; + ld.local.u64 %rd174, [%rd3+-72]; + add.s64 %rd175, %rd174, %rd173; + popc.b64 %r4965, %rd175; + cvt.u64.u32 %rd52, %r4965; + cvt.u64.u16 %rd176, %rs174; + and.b64 %rd177, %rd176, 255; + setp.ge.u64 %p29, %rd52, %rd177; + and.b16 %rs175, %rs174, 255; + mul.wide.u16 %r11663, %rs175, 32; + @%p29 bra $L__BB1_35; + +$L__BB1_34: + shr.u64 %rd229, %rd49, 11; + add.s64 %rd228, %rd174, %rd229; + popc.b64 %r11648, %rd228; + cvt.u64.u32 %rd227, %r11648; + add.s32 %r4966, %r11663, -64; + cvt.s64.s32 %rd178, %r4966; + add.s64 %rd179, %rd2, %rd178; + ld.local.u8 %r4967, [%rd3+2]; + ld.local.u8 %r4968, [%rd179+145]; + ld.local.u8 %r4969, [%rd179+146]; + prmt.b32 %r4970, %r4969, %r4968, 30212; + ld.local.u8 %r4971, [%rd179+147]; + prmt.b32 %r4972, %r4971, %r4970, 28756; + ld.local.u8 %r4973, [%rd179+148]; + prmt.b32 %r4974, %r4973, %r4972, 1620; + ld.local.u8 %r4975, [%rd179+149]; + ld.local.u8 %r4976, [%rd179+150]; + prmt.b32 %r4977, %r4976, %r4975, 30212; + ld.local.u8 %r4978, [%rd179+151]; + prmt.b32 %r4979, %r4978, %r4977, 28756; + ld.local.u8 %r4980, [%rd179+152]; + prmt.b32 %r4981, %r4980, %r4979, 1620; + ld.local.u8 %r4982, [%rd179+153]; + ld.local.u8 %r4983, [%rd179+154]; + prmt.b32 %r4984, %r4983, %r4982, 30212; + ld.local.u8 %r4985, [%rd179+155]; + prmt.b32 %r4986, %r4985, %r4984, 28756; + ld.local.u8 %r4987, [%rd179+156]; + prmt.b32 %r4988, %r4987, %r4986, 1620; + ld.local.u8 %r4989, [%rd179+157]; + ld.local.u8 %r4990, [%rd179+158]; + prmt.b32 %r4991, %r4990, %r4989, 30212; + ld.local.u8 %r4992, [%rd179+159]; + prmt.b32 %r4993, %r4992, %r4991, 28756; + ld.local.u8 %r4994, [%rd179+160]; + prmt.b32 %r4995, %r4994, %r4993, 1620; + ld.local.u8 %r4996, [%rd179+161]; + ld.local.u8 %r4997, [%rd179+162]; + prmt.b32 %r4998, %r4997, %r4996, 30212; + ld.local.u8 %r4999, [%rd179+163]; + prmt.b32 %r5000, %r4999, %r4998, 28756; + ld.local.u8 %r5001, [%rd179+164]; + prmt.b32 %r5002, %r5001, %r5000, 1620; + ld.local.u8 %r5003, [%rd179+165]; + ld.local.u8 %r5004, [%rd179+166]; + prmt.b32 %r5005, %r5004, %r5003, 30212; + ld.local.u8 %r5006, [%rd179+167]; + prmt.b32 %r5007, %r5006, %r5005, 28756; + ld.local.u8 %r5008, 
[%rd179+168]; + prmt.b32 %r5009, %r5008, %r5007, 1620; + ld.local.u8 %r5010, [%rd179+169]; + ld.local.u8 %r5011, [%rd179+170]; + prmt.b32 %r5012, %r5011, %r5010, 30212; + ld.local.u8 %r5013, [%rd179+171]; + prmt.b32 %r5014, %r5013, %r5012, 28756; + ld.local.u8 %r5015, [%rd179+172]; + prmt.b32 %r5016, %r5015, %r5014, 1620; + ld.local.u8 %r5017, [%rd179+173]; + ld.local.u8 %r5018, [%rd179+174]; + prmt.b32 %r5019, %r5018, %r5017, 30212; + ld.local.u8 %r5020, [%rd179+175]; + prmt.b32 %r5021, %r5020, %r5019, 28756; + ld.local.u8 %r5022, [%rd179+176]; + prmt.b32 %r5023, %r5022, %r5021, 1620; + ld.local.u8 %r5024, [%rd179+177]; + ld.local.u8 %r5025, [%rd179+178]; + prmt.b32 %r5026, %r5025, %r5024, 30212; + ld.local.u8 %r5027, [%rd179+179]; + prmt.b32 %r5028, %r5027, %r5026, 28756; + ld.local.u8 %r5029, [%rd179+180]; + prmt.b32 %r5030, %r5029, %r5028, 1620; + ld.local.u8 %r5031, [%rd179+181]; + ld.local.u8 %r5032, [%rd179+182]; + prmt.b32 %r5033, %r5032, %r5031, 30212; + ld.local.u8 %r5034, [%rd179+183]; + prmt.b32 %r5035, %r5034, %r5033, 28756; + ld.local.u8 %r5036, [%rd179+184]; + prmt.b32 %r5037, %r5036, %r5035, 1620; + ld.local.u8 %r5038, [%rd179+185]; + ld.local.u8 %r5039, [%rd179+186]; + prmt.b32 %r5040, %r5039, %r5038, 30212; + ld.local.u8 %r5041, [%rd179+187]; + prmt.b32 %r5042, %r5041, %r5040, 28756; + ld.local.u8 %r5043, [%rd179+188]; + prmt.b32 %r5044, %r5043, %r5042, 1620; + ld.local.u8 %r5045, [%rd179+189]; + ld.local.u8 %r5046, [%rd179+190]; + prmt.b32 %r5047, %r5046, %r5045, 30212; + ld.local.u8 %r5048, [%rd179+191]; + prmt.b32 %r5049, %r5048, %r5047, 28756; + ld.local.u8 %r5050, [%rd179+192]; + prmt.b32 %r5051, %r5050, %r5049, 1620; + ld.local.u8 %r5052, [%rd179+193]; + ld.local.u8 %r5053, [%rd179+194]; + prmt.b32 %r5054, %r5053, %r5052, 30212; + ld.local.u8 %r5055, [%rd179+195]; + prmt.b32 %r5056, %r5055, %r5054, 28756; + ld.local.u8 %r5057, [%rd179+196]; + prmt.b32 %r5058, %r5057, %r5056, 1620; + ld.local.u8 %r5059, [%rd179+197]; + ld.local.u8 %r5060, [%rd179+198]; + prmt.b32 %r5061, %r5060, %r5059, 30212; + ld.local.u8 %r5062, [%rd179+199]; + prmt.b32 %r5063, %r5062, %r5061, 28756; + ld.local.u8 %r5064, [%rd179+200]; + prmt.b32 %r5065, %r5064, %r5063, 1620; + ld.local.u8 %r5066, [%rd179+201]; + ld.local.u8 %r5067, [%rd179+202]; + prmt.b32 %r5068, %r5067, %r5066, 30212; + ld.local.u8 %r5069, [%rd179+203]; + prmt.b32 %r5070, %r5069, %r5068, 28756; + ld.local.u8 %r5071, [%rd179+204]; + prmt.b32 %r5072, %r5071, %r5070, 1620; + ld.local.u8 %r5073, [%rd179+205]; + ld.local.u8 %r5074, [%rd179+206]; + prmt.b32 %r5075, %r5074, %r5073, 30212; + ld.local.u8 %r5076, [%rd179+207]; + prmt.b32 %r5077, %r5076, %r5075, 28756; + ld.local.u8 %r5078, [%rd179+208]; + prmt.b32 %r5079, %r5078, %r5077, 1620; + or.b32 %r5080, %r4967, 4; + ld.local.u8 %r5081, [%rd3+-120]; + ld.local.u8 %r5082, [%rd3+-119]; + prmt.b32 %r5083, %r5082, %r5081, 30212; + ld.local.u8 %r5084, [%rd3+-118]; + ld.local.u8 %r5085, [%rd3+-117]; + prmt.b32 %r5086, %r5085, %r5084, 30212; + prmt.b32 %r5087, %r5086, %r5083, 4180; + ld.local.u8 %r5088, [%rd3+-136]; + ld.local.u8 %r5089, [%rd3+-135]; + prmt.b32 %r5090, %r5089, %r5088, 30212; + ld.local.u8 %r5091, [%rd3+-134]; + ld.local.u8 %r5092, [%rd3+-133]; + prmt.b32 %r5093, %r5092, %r5091, 30212; + prmt.b32 %r5094, %r5093, %r5090, 4180; + add.s32 %r5095, %r5087, %r5094; + add.s32 %r5096, %r5095, %r4974; + shf.l.wrap.b32 %r5097, %r5096, %r5096, 16; + add.s32 %r5098, %r5097, 1779033703; + xor.b32 %r5099, %r5098, %r5087; + shf.l.wrap.b32 %r5100, %r5099, %r5099, 20; + add.s32 %r5101, 
%r4981, %r5096; + add.s32 %r5102, %r5101, %r5100; + xor.b32 %r5103, %r5102, %r5097; + shf.l.wrap.b32 %r5104, %r5103, %r5103, 24; + add.s32 %r5105, %r5104, %r5098; + xor.b32 %r5106, %r5105, %r5100; + shf.l.wrap.b32 %r5107, %r5106, %r5106, 25; + ld.local.u8 %r5108, [%rd3+-116]; + ld.local.u8 %r5109, [%rd3+-115]; + prmt.b32 %r5110, %r5109, %r5108, 30212; + ld.local.u8 %r5111, [%rd3+-114]; + ld.local.u8 %r5112, [%rd3+-113]; + prmt.b32 %r5113, %r5112, %r5111, 30212; + prmt.b32 %r5114, %r5113, %r5110, 4180; + ld.local.u8 %r5115, [%rd3+-132]; + ld.local.u8 %r5116, [%rd3+-131]; + prmt.b32 %r5117, %r5116, %r5115, 30212; + ld.local.u8 %r5118, [%rd3+-130]; + ld.local.u8 %r5119, [%rd3+-129]; + prmt.b32 %r5120, %r5119, %r5118, 30212; + prmt.b32 %r5121, %r5120, %r5117, 4180; + add.s32 %r5122, %r5114, %r5121; + add.s32 %r5123, %r5122, %r4988; + shf.l.wrap.b32 %r5124, %r5123, %r5123, 16; + add.s32 %r5125, %r5124, -1150833019; + xor.b32 %r5126, %r5125, %r5114; + shf.l.wrap.b32 %r5127, %r5126, %r5126, 20; + add.s32 %r5128, %r4995, %r5123; + add.s32 %r5129, %r5128, %r5127; + xor.b32 %r5130, %r5129, %r5124; + shf.l.wrap.b32 %r5131, %r5130, %r5130, 24; + add.s32 %r5132, %r5131, %r5125; + xor.b32 %r5133, %r5132, %r5127; + shf.l.wrap.b32 %r5134, %r5133, %r5133, 25; + ld.local.u8 %r5135, [%rd3+-112]; + ld.local.u8 %r5136, [%rd3+-111]; + prmt.b32 %r5137, %r5136, %r5135, 30212; + ld.local.u8 %r5138, [%rd3+-110]; + ld.local.u8 %r5139, [%rd3+-109]; + prmt.b32 %r5140, %r5139, %r5138, 30212; + prmt.b32 %r5141, %r5140, %r5137, 4180; + ld.local.u8 %r5142, [%rd3+-128]; + ld.local.u8 %r5143, [%rd3+-127]; + prmt.b32 %r5144, %r5143, %r5142, 30212; + ld.local.u8 %r5145, [%rd3+-126]; + ld.local.u8 %r5146, [%rd3+-125]; + prmt.b32 %r5147, %r5146, %r5145, 30212; + prmt.b32 %r5148, %r5147, %r5144, 4180; + add.s32 %r5149, %r5141, %r5148; + add.s32 %r5150, %r5149, %r5002; + shr.u32 %r5151, %r5150, 16; + shl.b32 %r5152, %r5150, 16; + xor.b32 %r5153, %r5152, 4194304; + or.b32 %r5154, %r5153, %r5151; + add.s32 %r5155, %r5154, 1013904242; + xor.b32 %r5156, %r5155, %r5141; + shf.l.wrap.b32 %r5157, %r5156, %r5156, 20; + add.s32 %r5158, %r5009, %r5150; + add.s32 %r5159, %r5158, %r5157; + xor.b32 %r5160, %r5159, %r5154; + shf.l.wrap.b32 %r5161, %r5160, %r5160, 24; + add.s32 %r5162, %r5161, %r5155; + xor.b32 %r5163, %r5162, %r5157; + shf.l.wrap.b32 %r5164, %r5163, %r5163, 25; + ld.local.u8 %r5165, [%rd3+-108]; + ld.local.u8 %r5166, [%rd3+-107]; + prmt.b32 %r5167, %r5166, %r5165, 30212; + ld.local.u8 %r5168, [%rd3+-106]; + ld.local.u8 %r5169, [%rd3+-105]; + prmt.b32 %r5170, %r5169, %r5168, 30212; + prmt.b32 %r5171, %r5170, %r5167, 4180; + ld.local.u8 %r5172, [%rd3+-124]; + ld.local.u8 %r5173, [%rd3+-123]; + prmt.b32 %r5174, %r5173, %r5172, 30212; + ld.local.u8 %r5175, [%rd3+-122]; + ld.local.u8 %r5176, [%rd3+-121]; + prmt.b32 %r5177, %r5176, %r5175, 30212; + prmt.b32 %r5178, %r5177, %r5174, 4180; + add.s32 %r5179, %r5171, %r5178; + add.s32 %r5180, %r5179, %r5016; + xor.b32 %r5181, %r5180, %r5080; + shr.u32 %r5182, %r5180, 16; + shl.b32 %r5183, %r5181, 16; + or.b32 %r5184, %r5183, %r5182; + add.s32 %r5185, %r5184, -1521486534; + xor.b32 %r5186, %r5185, %r5171; + shf.l.wrap.b32 %r5187, %r5186, %r5186, 20; + add.s32 %r5188, %r5023, %r5180; + add.s32 %r5189, %r5188, %r5187; + xor.b32 %r5190, %r5189, %r5184; + shf.l.wrap.b32 %r5191, %r5190, %r5190, 24; + add.s32 %r5192, %r5191, %r5185; + xor.b32 %r5193, %r5192, %r5187; + shf.l.wrap.b32 %r5194, %r5193, %r5193, 25; + add.s32 %r5195, %r5134, %r5102; + add.s32 %r5196, %r5195, %r5030; + xor.b32 %r5197, 
%r5191, %r5196; + shf.l.wrap.b32 %r5198, %r5197, %r5197, 16; + add.s32 %r5199, %r5198, %r5162; + xor.b32 %r5200, %r5199, %r5134; + shf.l.wrap.b32 %r5201, %r5200, %r5200, 20; + add.s32 %r5202, %r5037, %r5196; + add.s32 %r5203, %r5202, %r5201; + xor.b32 %r5204, %r5203, %r5198; + shf.l.wrap.b32 %r5205, %r5204, %r5204, 24; + add.s32 %r5206, %r5205, %r5199; + xor.b32 %r5207, %r5206, %r5201; + shf.l.wrap.b32 %r5208, %r5207, %r5207, 25; + add.s32 %r5209, %r5164, %r5129; + add.s32 %r5210, %r5209, %r5044; + xor.b32 %r5211, %r5210, %r5104; + shf.l.wrap.b32 %r5212, %r5211, %r5211, 16; + add.s32 %r5213, %r5212, %r5192; + xor.b32 %r5214, %r5213, %r5164; + shf.l.wrap.b32 %r5215, %r5214, %r5214, 20; + add.s32 %r5216, %r5051, %r5210; + add.s32 %r5217, %r5216, %r5215; + xor.b32 %r5218, %r5217, %r5212; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 24; + add.s32 %r5220, %r5219, %r5213; + xor.b32 %r5221, %r5220, %r5215; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 25; + add.s32 %r5223, %r5194, %r5159; + add.s32 %r5224, %r5223, %r5058; + xor.b32 %r5225, %r5224, %r5131; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 16; + add.s32 %r5227, %r5226, %r5105; + xor.b32 %r5228, %r5227, %r5194; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 20; + add.s32 %r5230, %r5065, %r5224; + add.s32 %r5231, %r5230, %r5229; + xor.b32 %r5232, %r5231, %r5226; + shf.l.wrap.b32 %r5233, %r5232, %r5232, 24; + add.s32 %r5234, %r5233, %r5227; + xor.b32 %r5235, %r5234, %r5229; + shf.l.wrap.b32 %r5236, %r5235, %r5235, 25; + add.s32 %r5237, %r5189, %r5107; + add.s32 %r5238, %r5237, %r5072; + xor.b32 %r5239, %r5238, %r5161; + shf.l.wrap.b32 %r5240, %r5239, %r5239, 16; + add.s32 %r5241, %r5240, %r5132; + xor.b32 %r5242, %r5241, %r5107; + shf.l.wrap.b32 %r5243, %r5242, %r5242, 20; + add.s32 %r5244, %r5079, %r5238; + add.s32 %r5245, %r5244, %r5243; + xor.b32 %r5246, %r5245, %r5240; + shf.l.wrap.b32 %r5247, %r5246, %r5246, 24; + add.s32 %r5248, %r5247, %r5241; + xor.b32 %r5249, %r5248, %r5243; + shf.l.wrap.b32 %r5250, %r5249, %r5249, 25; + add.s32 %r5251, %r5203, %r4988; + add.s32 %r5252, %r5251, %r5250; + xor.b32 %r5253, %r5252, %r5219; + shf.l.wrap.b32 %r5254, %r5253, %r5253, 16; + add.s32 %r5255, %r5254, %r5234; + xor.b32 %r5256, %r5255, %r5250; + shf.l.wrap.b32 %r5257, %r5256, %r5256, 20; + add.s32 %r5258, %r5252, %r5016; + add.s32 %r5259, %r5258, %r5257; + xor.b32 %r5260, %r5259, %r5254; + shf.l.wrap.b32 %r5261, %r5260, %r5260, 24; + add.s32 %r5262, %r5261, %r5255; + xor.b32 %r5263, %r5262, %r5257; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 25; + add.s32 %r5265, %r5217, %r4995; + add.s32 %r5266, %r5265, %r5208; + xor.b32 %r5267, %r5233, %r5266; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 16; + add.s32 %r5269, %r5248, %r5268; + xor.b32 %r5270, %r5269, %r5208; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 20; + add.s32 %r5272, %r5266, %r5044; + add.s32 %r5273, %r5272, %r5271; + xor.b32 %r5274, %r5273, %r5268; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 24; + add.s32 %r5276, %r5275, %r5269; + xor.b32 %r5277, %r5276, %r5271; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 25; + add.s32 %r5279, %r5222, %r5023; + add.s32 %r5280, %r5279, %r5231; + xor.b32 %r5281, %r5247, %r5280; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 16; + add.s32 %r5283, %r5282, %r5206; + xor.b32 %r5284, %r5283, %r5222; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 20; + add.s32 %r5286, %r5280, %r4974; + add.s32 %r5287, %r5286, %r5285; + xor.b32 %r5288, %r5287, %r5282; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 24; + add.s32 %r5290, %r5289, %r5283; + xor.b32 %r5291, %r5290, %r5285; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 25; 
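+ // Second compression (under $L__BB1_34): this merges two child chaining values.
+ // Its message words %r4974-%r5079 are the 64 stacked CV bytes reassembled by the
+ // prmt.b32 loads above, and or.b32 %r5080, %r4967, 4 sets BLAKE3's PARENT flag.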
+ add.s32 %r5293, %r5236, %r5002; + add.s32 %r5294, %r5293, %r5245; + xor.b32 %r5295, %r5294, %r5205; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 16; + add.s32 %r5297, %r5296, %r5220; + xor.b32 %r5298, %r5297, %r5236; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 20; + add.s32 %r5300, %r5294, %r5065; + add.s32 %r5301, %r5300, %r5299; + xor.b32 %r5302, %r5301, %r5296; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 24; + add.s32 %r5304, %r5303, %r5297; + xor.b32 %r5305, %r5304, %r5299; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 25; + add.s32 %r5307, %r5278, %r4981; + add.s32 %r5308, %r5307, %r5259; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 16; + add.s32 %r5311, %r5310, %r5290; + xor.b32 %r5312, %r5311, %r5278; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 20; + add.s32 %r5314, %r5308, %r5051; + add.s32 %r5315, %r5314, %r5313; + xor.b32 %r5316, %r5315, %r5310; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 24; + add.s32 %r5318, %r5317, %r5311; + xor.b32 %r5319, %r5318, %r5313; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 25; + add.s32 %r5321, %r5273, %r5058; + add.s32 %r5322, %r5321, %r5292; + xor.b32 %r5323, %r5261, %r5322; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 16; + add.s32 %r5325, %r5324, %r5304; + xor.b32 %r5326, %r5325, %r5292; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 20; + add.s32 %r5328, %r5322, %r5009; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5324; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 24; + add.s32 %r5332, %r5331, %r5325; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 25; + add.s32 %r5335, %r5287, %r5037; + add.s32 %r5336, %r5335, %r5306; + xor.b32 %r5337, %r5336, %r5275; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 16; + add.s32 %r5339, %r5338, %r5262; + xor.b32 %r5340, %r5339, %r5306; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 20; + add.s32 %r5342, %r5336, %r5072; + add.s32 %r5343, %r5342, %r5341; + xor.b32 %r5344, %r5343, %r5338; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 24; + add.s32 %r5346, %r5345, %r5339; + xor.b32 %r5347, %r5346, %r5341; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 25; + add.s32 %r5349, %r5301, %r5079; + add.s32 %r5350, %r5349, %r5264; + xor.b32 %r5351, %r5350, %r5289; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 16; + add.s32 %r5353, %r5352, %r5276; + xor.b32 %r5354, %r5353, %r5264; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 20; + add.s32 %r5356, %r5350, %r5030; + add.s32 %r5357, %r5356, %r5355; + xor.b32 %r5358, %r5357, %r5352; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 24; + add.s32 %r5360, %r5359, %r5353; + xor.b32 %r5361, %r5360, %r5355; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 25; + add.s32 %r5363, %r5315, %r4995; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5331; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 16; + add.s32 %r5367, %r5366, %r5346; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 20; + add.s32 %r5370, %r5364, %r5002; + add.s32 %r5371, %r5370, %r5369; + xor.b32 %r5372, %r5371, %r5366; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 24; + add.s32 %r5374, %r5373, %r5367; + xor.b32 %r5375, %r5374, %r5369; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 25; + add.s32 %r5377, %r5329, %r5044; + add.s32 %r5378, %r5377, %r5320; + xor.b32 %r5379, %r5378, %r5345; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 16; + add.s32 %r5381, %r5380, %r5360; + xor.b32 %r5382, %r5381, %r5320; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 20; + add.s32 %r5384, %r5378, %r5058; + add.s32 %r5385, %r5384, %r5383; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 24; + add.s32 %r5388, %r5387, 
%r5381; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 25; + add.s32 %r5391, %r5343, %r5065; + add.s32 %r5392, %r5391, %r5334; + xor.b32 %r5393, %r5359, %r5392; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 16; + add.s32 %r5395, %r5394, %r5318; + xor.b32 %r5396, %r5395, %r5334; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 20; + add.s32 %r5398, %r5392, %r4988; + add.s32 %r5399, %r5398, %r5397; + xor.b32 %r5400, %r5399, %r5394; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 24; + add.s32 %r5402, %r5401, %r5395; + xor.b32 %r5403, %r5402, %r5397; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 25; + add.s32 %r5405, %r5348, %r5023; + add.s32 %r5406, %r5405, %r5357; + xor.b32 %r5407, %r5406, %r5317; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 16; + add.s32 %r5409, %r5408, %r5332; + xor.b32 %r5410, %r5409, %r5348; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 20; + add.s32 %r5412, %r5406, %r5072; + add.s32 %r5413, %r5412, %r5411; + xor.b32 %r5414, %r5413, %r5408; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 24; + add.s32 %r5416, %r5415, %r5409; + xor.b32 %r5417, %r5416, %r5411; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 25; + add.s32 %r5419, %r5371, %r5016; + add.s32 %r5420, %r5419, %r5390; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 16; + add.s32 %r5423, %r5422, %r5402; + xor.b32 %r5424, %r5423, %r5390; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 20; + add.s32 %r5426, %r5420, %r5009; + add.s32 %r5427, %r5426, %r5425; + xor.b32 %r5428, %r5427, %r5422; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 24; + add.s32 %r5430, %r5429, %r5423; + xor.b32 %r5431, %r5430, %r5425; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 25; + add.s32 %r5433, %r5385, %r5037; + add.s32 %r5434, %r5433, %r5404; + xor.b32 %r5435, %r5373, %r5434; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 16; + add.s32 %r5437, %r5436, %r5416; + xor.b32 %r5438, %r5437, %r5404; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 20; + add.s32 %r5440, %r5434, %r4974; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5436; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 24; + add.s32 %r5444, %r5443, %r5437; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 25; + add.s32 %r5447, %r5399, %r5051; + add.s32 %r5448, %r5447, %r5418; + xor.b32 %r5449, %r5448, %r5387; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 16; + add.s32 %r5451, %r5450, %r5374; + xor.b32 %r5452, %r5451, %r5418; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 20; + add.s32 %r5454, %r5448, %r5079; + add.s32 %r5455, %r5454, %r5453; + xor.b32 %r5456, %r5455, %r5450; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 24; + add.s32 %r5458, %r5457, %r5451; + xor.b32 %r5459, %r5458, %r5453; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 25; + add.s32 %r5461, %r5413, %r5030; + add.s32 %r5462, %r5461, %r5376; + xor.b32 %r5463, %r5462, %r5401; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 16; + add.s32 %r5465, %r5464, %r5388; + xor.b32 %r5466, %r5465, %r5376; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 20; + add.s32 %r5468, %r5462, %r4981; + add.s32 %r5469, %r5468, %r5467; + xor.b32 %r5470, %r5469, %r5464; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 24; + add.s32 %r5472, %r5471, %r5465; + xor.b32 %r5473, %r5472, %r5467; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 25; + add.s32 %r5475, %r5427, %r5044; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5443; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 16; + add.s32 %r5479, %r5478, %r5458; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 20; + add.s32 %r5482, %r5476, %r5023; + add.s32 %r5483, %r5482, %r5481; + xor.b32 %r5484, 
%r5483, %r5478; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 24; + add.s32 %r5486, %r5485, %r5479; + xor.b32 %r5487, %r5486, %r5481; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 25; + add.s32 %r5489, %r5441, %r5058; + add.s32 %r5490, %r5489, %r5432; + xor.b32 %r5491, %r5490, %r5457; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 16; + add.s32 %r5493, %r5492, %r5472; + xor.b32 %r5494, %r5493, %r5432; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 20; + add.s32 %r5496, %r5490, %r5037; + add.s32 %r5497, %r5496, %r5495; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 24; + add.s32 %r5500, %r5499, %r5493; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 25; + add.s32 %r5503, %r5455, %r5072; + add.s32 %r5504, %r5503, %r5446; + xor.b32 %r5505, %r5471, %r5504; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 16; + add.s32 %r5507, %r5506, %r5430; + xor.b32 %r5508, %r5507, %r5446; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 20; + add.s32 %r5510, %r5504, %r4995; + add.s32 %r5511, %r5510, %r5509; + xor.b32 %r5512, %r5511, %r5506; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 24; + add.s32 %r5514, %r5513, %r5507; + xor.b32 %r5515, %r5514, %r5509; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 25; + add.s32 %r5517, %r5469, %r5065; + add.s32 %r5518, %r5517, %r5460; + xor.b32 %r5519, %r5518, %r5429; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 16; + add.s32 %r5521, %r5520, %r5444; + xor.b32 %r5522, %r5521, %r5460; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 20; + add.s32 %r5524, %r5518, %r5079; + add.s32 %r5525, %r5524, %r5523; + xor.b32 %r5526, %r5525, %r5520; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 24; + add.s32 %r5528, %r5527, %r5521; + xor.b32 %r5529, %r5528, %r5523; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 25; + add.s32 %r5531, %r5483, %r5002; + add.s32 %r5532, %r5531, %r5502; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 16; + add.s32 %r5535, %r5534, %r5514; + xor.b32 %r5536, %r5535, %r5502; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 20; + add.s32 %r5538, %r5532, %r4974; + add.s32 %r5539, %r5538, %r5537; + xor.b32 %r5540, %r5539, %r5534; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 24; + add.s32 %r5542, %r5541, %r5535; + xor.b32 %r5543, %r5542, %r5537; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 25; + add.s32 %r5545, %r5497, %r5051; + add.s32 %r5546, %r5545, %r5516; + xor.b32 %r5547, %r5485, %r5546; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 16; + add.s32 %r5549, %r5548, %r5528; + xor.b32 %r5550, %r5549, %r5516; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 20; + add.s32 %r5552, %r5546, %r4988; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5548; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 24; + add.s32 %r5556, %r5555, %r5549; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 25; + add.s32 %r5559, %r5511, %r5009; + add.s32 %r5560, %r5559, %r5530; + xor.b32 %r5561, %r5560, %r5499; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 16; + add.s32 %r5563, %r5562, %r5486; + xor.b32 %r5564, %r5563, %r5530; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 20; + add.s32 %r5566, %r5560, %r5030; + add.s32 %r5567, %r5566, %r5565; + xor.b32 %r5568, %r5567, %r5562; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 24; + add.s32 %r5570, %r5569, %r5563; + xor.b32 %r5571, %r5570, %r5565; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 25; + add.s32 %r5573, %r5525, %r4981; + add.s32 %r5574, %r5573, %r5488; + xor.b32 %r5575, %r5574, %r5513; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 16; + add.s32 %r5577, %r5576, %r5500; + xor.b32 %r5578, %r5577, %r5488; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 20; 
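+ // The first-round additions of this compression used the BLAKE3 IV words
+ // 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and 0xA54FF53A, printed by the compiler in
+ // decimal as 1779033703, -1150833019, 1013904242 and -1521486534.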
+ add.s32 %r5580, %r5574, %r5016; + add.s32 %r5581, %r5580, %r5579; + xor.b32 %r5582, %r5581, %r5576; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 24; + add.s32 %r5584, %r5583, %r5577; + xor.b32 %r5585, %r5584, %r5579; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 25; + add.s32 %r5587, %r5539, %r5058; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5555; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 16; + add.s32 %r5591, %r5590, %r5570; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 20; + add.s32 %r5594, %r5588, %r5065; + add.s32 %r5595, %r5594, %r5593; + xor.b32 %r5596, %r5595, %r5590; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 24; + add.s32 %r5598, %r5597, %r5591; + xor.b32 %r5599, %r5598, %r5593; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 25; + add.s32 %r5601, %r5553, %r5037; + add.s32 %r5602, %r5601, %r5544; + xor.b32 %r5603, %r5602, %r5569; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 16; + add.s32 %r5605, %r5604, %r5584; + xor.b32 %r5606, %r5605, %r5544; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 20; + add.s32 %r5608, %r5602, %r5051; + add.s32 %r5609, %r5608, %r5607; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 24; + add.s32 %r5612, %r5611, %r5605; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 25; + add.s32 %r5615, %r5567, %r5079; + add.s32 %r5616, %r5615, %r5558; + xor.b32 %r5617, %r5583, %r5616; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 16; + add.s32 %r5619, %r5618, %r5542; + xor.b32 %r5620, %r5619, %r5558; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 20; + add.s32 %r5622, %r5616, %r5044; + add.s32 %r5623, %r5622, %r5621; + xor.b32 %r5624, %r5623, %r5618; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 24; + add.s32 %r5626, %r5625, %r5619; + xor.b32 %r5627, %r5626, %r5621; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 25; + add.s32 %r5629, %r5581, %r5072; + add.s32 %r5630, %r5629, %r5572; + xor.b32 %r5631, %r5630, %r5541; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 16; + add.s32 %r5633, %r5632, %r5556; + xor.b32 %r5634, %r5633, %r5572; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 20; + add.s32 %r5636, %r5630, %r5030; + add.s32 %r5637, %r5636, %r5635; + xor.b32 %r5638, %r5637, %r5632; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 24; + add.s32 %r5640, %r5639, %r5633; + xor.b32 %r5641, %r5640, %r5635; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 25; + add.s32 %r5643, %r5595, %r5023; + add.s32 %r5644, %r5643, %r5614; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 16; + add.s32 %r5647, %r5646, %r5626; + xor.b32 %r5648, %r5647, %r5614; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 20; + add.s32 %r5650, %r5644, %r4988; + add.s32 %r5651, %r5650, %r5649; + xor.b32 %r5652, %r5651, %r5646; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 24; + add.s32 %r5654, %r5653, %r5647; + xor.b32 %r5655, %r5654, %r5649; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 25; + add.s32 %r5657, %r5609, %r5009; + add.s32 %r5658, %r5657, %r5628; + xor.b32 %r5659, %r5597, %r5658; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 16; + add.s32 %r5661, %r5660, %r5640; + xor.b32 %r5662, %r5661, %r5628; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 20; + add.s32 %r5664, %r5658, %r4995; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5660; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 24; + add.s32 %r5668, %r5667, %r5661; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 25; + add.s32 %r5671, %r5623, %r4974; + add.s32 %r5672, %r5671, %r5642; + xor.b32 %r5673, %r5672, %r5611; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 16; + add.s32 %r5675, %r5674, 
%r5598; + xor.b32 %r5676, %r5675, %r5642; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 20; + add.s32 %r5678, %r5672, %r4981; + add.s32 %r5679, %r5678, %r5677; + xor.b32 %r5680, %r5679, %r5674; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 24; + add.s32 %r5682, %r5681, %r5675; + xor.b32 %r5683, %r5682, %r5677; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 25; + add.s32 %r5685, %r5637, %r5016; + add.s32 %r5686, %r5685, %r5600; + xor.b32 %r5687, %r5686, %r5625; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 16; + add.s32 %r5689, %r5688, %r5612; + xor.b32 %r5690, %r5689, %r5600; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 20; + add.s32 %r5692, %r5686, %r5002; + add.s32 %r5693, %r5692, %r5691; + xor.b32 %r5694, %r5693, %r5688; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 24; + add.s32 %r5696, %r5695, %r5689; + xor.b32 %r5697, %r5696, %r5691; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 25; + add.s32 %r5699, %r5651, %r5037; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5667; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 16; + add.s32 %r5703, %r5702, %r5682; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 20; + add.s32 %r5706, %r5700, %r5072; + add.s32 %r5707, %r5706, %r5705; + xor.b32 %r5708, %r5707, %r5702; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 24; + add.s32 %r5710, %r5709, %r5703; + xor.b32 %r5711, %r5710, %r5705; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 25; + add.s32 %r5713, %r5665, %r5051; + add.s32 %r5714, %r5713, %r5656; + xor.b32 %r5715, %r5714, %r5681; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 16; + add.s32 %r5717, %r5716, %r5696; + xor.b32 %r5718, %r5717, %r5656; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 20; + add.s32 %r5720, %r5714, %r5009; + add.s32 %r5721, %r5720, %r5719; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 24; + add.s32 %r5724, %r5723, %r5717; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 25; + add.s32 %r5727, %r5679, %r5030; + add.s32 %r5728, %r5727, %r5670; + xor.b32 %r5729, %r5695, %r5728; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 16; + add.s32 %r5731, %r5730, %r5654; + xor.b32 %r5732, %r5731, %r5670; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 20; + add.s32 %r5734, %r5728, %r5058; + add.s32 %r5735, %r5734, %r5733; + xor.b32 %r5736, %r5735, %r5730; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 24; + add.s32 %r5738, %r5737, %r5731; + xor.b32 %r5739, %r5738, %r5733; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 25; + add.s32 %r5741, %r5693, %r5079; + add.s32 %r5742, %r5741, %r5684; + xor.b32 %r5743, %r5742, %r5653; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 16; + add.s32 %r5745, %r5744, %r5668; + xor.b32 %r5746, %r5745, %r5684; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 20; + add.s32 %r5748, %r5742, %r4981; + add.s32 %r5749, %r5748, %r5747; + xor.b32 %r5750, %r5749, %r5744; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 24; + add.s32 %r5752, %r5751, %r5745; + xor.b32 %r5753, %r5752, %r5747; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 25; + add.s32 %r5755, %r5707, %r5065; + add.s32 %r5756, %r5755, %r5726; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 16; + add.s32 %r5759, %r5758, %r5738; + xor.b32 %r5760, %r5759, %r5726; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 20; + add.s32 %r5762, %r5756, %r4995; + add.s32 %r5763, %r5762, %r5761; + xor.b32 %r5764, %r5763, %r5758; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 24; + add.s32 %r5766, %r5765, %r5759; + xor.b32 %r5767, %r5766, %r5761; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 25; + add.s32 %r5769, %r5721, %r4974; + add.s32 %r5770, %r5769, %r5740; + xor.b32 %r5771, 
%r5709, %r5770; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 16; + add.s32 %r5773, %r5772, %r5752; + xor.b32 %r5774, %r5773, %r5740; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 20; + add.s32 %r5776, %r5770, %r5044; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5772; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 24; + add.s32 %r5780, %r5779, %r5773; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 25; + add.s32 %r5783, %r5735, %r4988; + add.s32 %r5784, %r5783, %r5754; + xor.b32 %r5785, %r5784, %r5723; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 16; + add.s32 %r5787, %r5786, %r5710; + xor.b32 %r5788, %r5787, %r5754; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 20; + add.s32 %r5790, %r5784, %r5016; + add.s32 %r5791, %r5790, %r5789; + xor.b32 %r5792, %r5791, %r5786; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 24; + add.s32 %r5794, %r5793, %r5787; + xor.b32 %r5795, %r5794, %r5789; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 25; + add.s32 %r5797, %r5749, %r5002; + add.s32 %r5798, %r5797, %r5712; + xor.b32 %r5799, %r5798, %r5737; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 16; + add.s32 %r5801, %r5800, %r5724; + xor.b32 %r5802, %r5801, %r5712; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 20; + add.s32 %r5804, %r5798, %r5023; + add.s32 %r5805, %r5804, %r5803; + xor.b32 %r5806, %r5805, %r5800; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 24; + add.s32 %r5808, %r5807, %r5801; + xor.b32 %r5809, %r5808, %r5803; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 25; + add.s32 %r5811, %r5763, %r5051; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5779; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 16; + add.s32 %r5815, %r5814, %r5794; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 20; + add.s32 %r5818, %r5812, %r5079; + add.s32 %r5819, %r5818, %r5817; + xor.b32 %r5820, %r5819, %r5814; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 24; + add.s32 %r5822, %r5821, %r5815; + xor.b32 %r5823, %r5822, %r5817; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 25; + add.s32 %r5825, %r5777, %r5009; + add.s32 %r5826, %r5825, %r5768; + xor.b32 %r5827, %r5826, %r5793; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 16; + add.s32 %r5829, %r5828, %r5808; + xor.b32 %r5830, %r5829, %r5768; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 20; + add.s32 %r5832, %r5826, %r4974; + add.s32 %r5833, %r5832, %r5831; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 24; + add.s32 %r5836, %r5835, %r5829; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 25; + add.s32 %r5839, %r5791, %r4981; + add.s32 %r5840, %r5839, %r5782; + xor.b32 %r5841, %r5807, %r5840; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 16; + add.s32 %r5843, %r5842, %r5766; + xor.b32 %r5844, %r5843, %r5782; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 20; + add.s32 %r5846, %r5840, %r5037; + add.s32 %r5847, %r5846, %r5845; + xor.b32 %r5848, %r5847, %r5842; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 24; + add.s32 %r5850, %r5849, %r5843; + xor.b32 %r5851, %r5850, %r5845; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 25; + add.s32 %r5853, %r5805, %r5030; + add.s32 %r5854, %r5853, %r5796; + xor.b32 %r5855, %r5854, %r5765; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 16; + add.s32 %r5857, %r5856, %r5780; + xor.b32 %r5858, %r5857, %r5796; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 20; + add.s32 %r5860, %r5854, %r5016; + add.s32 %r5861, %r5860, %r5859; + xor.b32 %r5862, %r5861, %r5856; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 24; + add.s32 %r5864, %r5863, %r5857; + xor.b32 %r5865, %r5864, %r5859; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 25; 
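+ // After the final G-mixes below, the two state halves are XORed (v[i] ^ v[i+8])
+ // and the resulting 32-byte chaining value is stored byte-wise at
+ // [%rd179+145..176], overwriting the pair of child CVs it was computed from.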
+ add.s32 %r5867, %r5819, %r5072; + add.s32 %r5868, %r5867, %r5838; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 16; + add.s32 %r5871, %r5870, %r5850; + xor.b32 %r5872, %r5871, %r5838; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 20; + add.s32 %r5874, %r5868, %r5044; + add.s32 %r5875, %r5874, %r5873; + xor.b32 %r5876, %r5875, %r5870; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 24; + add.s32 %r5878, %r5877, %r5871; + xor.b32 %r5879, %r5878, %r5873; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 25; + add.s32 %r5881, %r5833, %r4988; + add.s32 %r5882, %r5881, %r5852; + xor.b32 %r5883, %r5821, %r5882; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 16; + add.s32 %r5885, %r5884, %r5864; + xor.b32 %r5886, %r5885, %r5852; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 20; + add.s32 %r5888, %r5882, %r5058; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5884; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 24; + add.s32 %r5892, %r5891, %r5885; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 25; + add.s32 %r5895, %r5847, %r4995; + add.s32 %r5896, %r5895, %r5866; + xor.b32 %r5897, %r5896, %r5835; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 16; + add.s32 %r5899, %r5898, %r5822; + xor.b32 %r5900, %r5899, %r5866; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 20; + add.s32 %r5902, %r5896, %r5002; + add.s32 %r5903, %r5902, %r5901; + xor.b32 %r5904, %r5903, %r5898; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 24; + add.s32 %r5906, %r5905, %r5899; + xor.b32 %r5907, %r5906, %r5901; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 25; + add.s32 %r5909, %r5861, %r5023; + add.s32 %r5910, %r5909, %r5824; + xor.b32 %r5911, %r5910, %r5849; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 16; + add.s32 %r5913, %r5912, %r5836; + xor.b32 %r5914, %r5913, %r5824; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 20; + add.s32 %r5916, %r5910, %r5065; + add.s32 %r5917, %r5916, %r5915; + xor.b32 %r5918, %r5917, %r5912; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 24; + add.s32 %r5920, %r5919, %r5913; + xor.b32 %r5921, %r5920, %r5915; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 25; + xor.b32 %r5923, %r5906, %r5875; + xor.b32 %r5924, %r5920, %r5889; + xor.b32 %r5925, %r5878, %r5903; + xor.b32 %r5926, %r5917, %r5892; + xor.b32 %r5927, %r5922, %r5891; + xor.b32 %r5928, %r5880, %r5905; + xor.b32 %r5929, %r5919, %r5894; + xor.b32 %r5930, %r5908, %r5877; + st.local.u8 [%rd179+145], %r5923; + shr.u32 %r5931, %r5923, 8; + st.local.u8 [%rd179+146], %r5931; + shr.u32 %r5932, %r5923, 16; + st.local.u8 [%rd179+147], %r5932; + shr.u32 %r5933, %r5923, 24; + st.local.u8 [%rd179+148], %r5933; + st.local.u8 [%rd179+149], %r5924; + shr.u32 %r5934, %r5924, 8; + st.local.u8 [%rd179+150], %r5934; + shr.u32 %r5935, %r5924, 16; + st.local.u8 [%rd179+151], %r5935; + shr.u32 %r5936, %r5924, 24; + st.local.u8 [%rd179+152], %r5936; + st.local.u8 [%rd179+153], %r5925; + shr.u32 %r5937, %r5925, 8; + st.local.u8 [%rd179+154], %r5937; + shr.u32 %r5938, %r5925, 16; + st.local.u8 [%rd179+155], %r5938; + shr.u32 %r5939, %r5925, 24; + st.local.u8 [%rd179+156], %r5939; + st.local.u8 [%rd179+157], %r5926; + shr.u32 %r5940, %r5926, 8; + st.local.u8 [%rd179+158], %r5940; + shr.u32 %r5941, %r5926, 16; + st.local.u8 [%rd179+159], %r5941; + shr.u32 %r5942, %r5926, 24; + st.local.u8 [%rd179+160], %r5942; + st.local.u8 [%rd179+161], %r5927; + shr.u32 %r5943, %r5927, 8; + st.local.u8 [%rd179+162], %r5943; + shr.u32 %r5944, %r5927, 16; + st.local.u8 [%rd179+163], %r5944; + shr.u32 %r5945, %r5927, 24; + st.local.u8 [%rd179+164], %r5945; + st.local.u8 [%rd179+165], %r5928; + 
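+ // [annotation] The xor.b32 folds just above (pattern v[i] ^ v[i+8]) appear to form
+ // the eight output chaining-value words, which these byte stores push onto the
+ // local CV stack at [%rd179+145..176].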
shr.u32 %r5946, %r5928, 8; + st.local.u8 [%rd179+166], %r5946; + shr.u32 %r5947, %r5928, 16; + st.local.u8 [%rd179+167], %r5947; + shr.u32 %r5948, %r5928, 24; + st.local.u8 [%rd179+168], %r5948; + st.local.u8 [%rd179+169], %r5929; + shr.u32 %r5949, %r5929, 8; + st.local.u8 [%rd179+170], %r5949; + shr.u32 %r5950, %r5929, 16; + st.local.u8 [%rd179+171], %r5950; + shr.u32 %r5951, %r5929, 24; + st.local.u8 [%rd179+172], %r5951; + st.local.u8 [%rd179+173], %r5930; + shr.u32 %r5952, %r5930, 8; + st.local.u8 [%rd179+174], %r5952; + shr.u32 %r5953, %r5930, 16; + st.local.u8 [%rd179+175], %r5953; + shr.u32 %r5954, %r5930, 24; + st.local.u8 [%rd179+176], %r5954; + ld.local.u8 %rs176, [%rd3+8]; + add.s16 %rs177, %rs176, -1; + st.local.u8 [%rd3+8], %rs177; + cvt.u64.u16 %rd180, %rs177; + and.b64 %rd181, %rd180, 255; + setp.lt.u64 %p30, %rd227, %rd181; + and.b16 %rs178, %rs177, 255; + mul.wide.u16 %r11663, %rs178, 32; + @%p30 bra $L__BB1_34; + +$L__BB1_35: + cvt.s64.s32 %rd182, %r11663; + add.s64 %rd183, %rd2, %rd182; + mov.b32 {%rs179, %rs180}, %r3967; + st.local.u8 [%rd183+145], %rs179; + shr.u16 %rs181, %rs179, 8; + st.local.u8 [%rd183+146], %rs181; + st.local.u8 [%rd183+147], %rs180; + shr.u16 %rs182, %rs180, 8; + st.local.u8 [%rd183+148], %rs182; + mov.b32 {%rs183, %rs184}, %r3968; + st.local.u8 [%rd183+149], %rs183; + shr.u16 %rs185, %rs183, 8; + st.local.u8 [%rd183+150], %rs185; + st.local.u8 [%rd183+151], %rs184; + shr.u16 %rs186, %rs184, 8; + st.local.u8 [%rd183+152], %rs186; + mov.b32 {%rs187, %rs188}, %r3969; + st.local.u8 [%rd183+153], %rs187; + shr.u16 %rs189, %rs187, 8; + st.local.u8 [%rd183+154], %rs189; + st.local.u8 [%rd183+155], %rs188; + shr.u16 %rs190, %rs188, 8; + st.local.u8 [%rd183+156], %rs190; + mov.b32 {%rs191, %rs192}, %r3970; + st.local.u8 [%rd183+157], %rs191; + shr.u16 %rs193, %rs191, 8; + st.local.u8 [%rd183+158], %rs193; + st.local.u8 [%rd183+159], %rs192; + shr.u16 %rs194, %rs192, 8; + st.local.u8 [%rd183+160], %rs194; + mov.b32 {%rs195, %rs196}, %r3971; + st.local.u8 [%rd183+161], %rs195; + shr.u16 %rs197, %rs195, 8; + st.local.u8 [%rd183+162], %rs197; + st.local.u8 [%rd183+163], %rs196; + shr.u16 %rs198, %rs196, 8; + st.local.u8 [%rd183+164], %rs198; + mov.b32 {%rs199, %rs200}, %r3972; + st.local.u8 [%rd183+165], %rs199; + shr.u16 %rs201, %rs199, 8; + st.local.u8 [%rd183+166], %rs201; + st.local.u8 [%rd183+167], %rs200; + shr.u16 %rs202, %rs200, 8; + st.local.u8 [%rd183+168], %rs202; + mov.b32 {%rs203, %rs204}, %r3973; + st.local.u8 [%rd183+169], %rs203; + shr.u16 %rs205, %rs203, 8; + st.local.u8 [%rd183+170], %rs205; + st.local.u8 [%rd183+171], %rs204; + shr.u16 %rs206, %rs204, 8; + st.local.u8 [%rd183+172], %rs206; + mov.b32 {%rs207, %rs208}, %r3974; + st.local.u8 [%rd183+173], %rs207; + shr.u16 %rs209, %rs207, 8; + st.local.u8 [%rd183+174], %rs209; + st.local.u8 [%rd183+175], %rs208; + shr.u16 %rs210, %rs208, 8; + st.local.u8 [%rd183+176], %rs210; + ld.local.u8 %rs388, [%rd3+8]; + +$L__BB1_47: + add.s16 %rs331, %rs388, 1; + st.local.u8 [%rd3+8], %rs331; + ld.local.u64 %rd196, [%rd3+-72]; + shr.u64 %rd197, %rd49, 10; + add.s64 %rd251, %rd196, %rd197; + st.local.u64 [%rd3+-72], %rd251; + add.s64 %rd261, %rd261, %rd49; + add.s64 %rd254, %rd254, %rd49; + sub.s64 %rd262, %rd262, %rd49; + setp.gt.u64 %p39, %rd262, 1024; + @%p39 bra $L__BB1_26; + +$L__BB1_48: + setp.eq.s64 %p40, %rd262, 0; + @%p40 bra $L__BB1_68; + + ld.local.u8 %rs389, [%rd3]; + cvt.u64.u16 %rd71, %rs389; + setp.eq.s16 %p41, %rs389, 0; + mov.u16 %rs390, 0; + mov.u64 %rd271, %rd262; + @%p41 bra 
$L__BB1_57; + + mov.u64 %rd198, 64; + sub.s64 %rd199, %rd198, %rd71; + min.u64 %rd72, %rd199, %rd262; + setp.eq.s64 %p42, %rd72, 0; + @%p42 bra $L__BB1_54; + + add.s64 %rd201, %rd2, %rd71; + add.s64 %rd73, %rd201, 72; + mov.u64 %rd263, 0; + +$L__BB1_52: + add.s64 %rd202, %rd261, %rd263; + ld.local.u8 %rs333, [%rd202]; + add.s64 %rd203, %rd73, %rd263; + st.local.u8 [%rd203], %rs333; + add.s64 %rd263, %rd263, 1; + setp.lt.u64 %p43, %rd263, %rd72; + @%p43 bra $L__BB1_52; + + ld.local.u8 %rs389, [%rd3]; + +$L__BB1_54: + cvt.u16.u64 %rs334, %rd72; + add.s16 %rs390, %rs389, %rs334; + mov.u64 %rd271, 0; + st.local.u8 [%rd3], %rs390; + add.s64 %rd261, %rd261, %rd72; + sub.s64 %rd77, %rd262, %rd72; + setp.eq.s64 %p44, %rd77, 0; + @%p44 bra $L__BB1_57; + + add.s64 %rd78, %rd2, 72; + ld.local.u8 %rs335, [%rd3+1]; + mov.u64 %rd264, 0; + setp.eq.s16 %p45, %rs335, 0; + mov.u16 %rs390, 0; + selp.u16 %rs337, 1, 0, %p45; + ld.local.u8 %rs338, [%rd3+2]; + or.b16 %rs339, %rs338, %rs337; + ld.local.u8 %r8843, [%rd3+-64]; + ld.local.u8 %r8844, [%rd3+-63]; + prmt.b32 %r8845, %r8844, %r8843, 30212; + ld.local.u8 %r8846, [%rd3+-62]; + prmt.b32 %r8847, %r8846, %r8845, 28756; + ld.local.u8 %r8848, [%rd3+-61]; + prmt.b32 %r8849, %r8848, %r8847, 1620; + ld.local.u8 %r8850, [%rd3+-60]; + ld.local.u8 %r8851, [%rd3+-59]; + prmt.b32 %r8852, %r8851, %r8850, 30212; + ld.local.u8 %r8853, [%rd3+-58]; + prmt.b32 %r8854, %r8853, %r8852, 28756; + ld.local.u8 %r8855, [%rd3+-57]; + prmt.b32 %r8856, %r8855, %r8854, 1620; + ld.local.u8 %r8857, [%rd3+-56]; + ld.local.u8 %r8858, [%rd3+-55]; + prmt.b32 %r8859, %r8858, %r8857, 30212; + ld.local.u8 %r8860, [%rd3+-54]; + prmt.b32 %r8861, %r8860, %r8859, 28756; + ld.local.u8 %r8862, [%rd3+-53]; + prmt.b32 %r8863, %r8862, %r8861, 1620; + ld.local.u8 %r8864, [%rd3+-52]; + ld.local.u8 %r8865, [%rd3+-51]; + prmt.b32 %r8866, %r8865, %r8864, 30212; + ld.local.u8 %r8867, [%rd3+-50]; + prmt.b32 %r8868, %r8867, %r8866, 28756; + ld.local.u8 %r8869, [%rd3+-49]; + prmt.b32 %r8870, %r8869, %r8868, 1620; + ld.local.u8 %r8871, [%rd3+-48]; + ld.local.u8 %r8872, [%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, %r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, [%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, %r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + 
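+ // [annotation] The ld.local.u8 + prmt.b32 idiom below (selectors 30212/28756/1620)
+ // gathers four consecutive bytes into one little-endian 32-bit message word; the
+ // subsequent adds of 1779033703 (0x6A09E667) etc. fold in the standard BLAKE3 IV
+ // words.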
prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd206, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd206; + shr.u64 %rd207, %rd206, 32; + cvt.u32.u64 %r8956, %rd207; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 %r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, %r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, 
%r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 %r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, 
%r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, %r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, %r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, %r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 
%r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, %r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 %r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + 
xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, %r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, %r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, 
%r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 %r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, %r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, %r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, %r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + 
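+ // [annotation] Eight G mixes (four column, four diagonal) form one round; the
+ // rounds are fully unrolled here, consistent with the 7-round MSG_SCHEDULE table
+ // declared at the top of this file.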
add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; + add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; + add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 %r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 %r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, 
%r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, %r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, %r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; + add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 
%r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 %r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, %r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, %r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 [%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd208, %rd78, 
%rd264; + st.local.u8 [%rd208], %rs390; + add.s64 %rd264, %rd264, 1; + setp.lt.u64 %p46, %rd264, 64; + mov.u64 %rd271, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd271, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd269, [%rd3+-72]; + cvt.u32.u64 %r117, %rd269; + shr.u64 %rd209, %rd269, 32; + cvt.u32.u64 %r118, %rd209; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd261]; + ld.local.u8 %r9764, [%rd261+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd261+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd261+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd261+4]; + ld.local.u8 %r9771, [%rd261+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd261+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd261+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd261+8]; + ld.local.u8 %r9778, [%rd261+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd261+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd261+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd261+12]; + ld.local.u8 %r9785, [%rd261+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd261+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd261+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd261+16]; + ld.local.u8 %r9792, [%rd261+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd261+18]; + prmt.b32 %r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd261+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd261+20]; + ld.local.u8 %r9799, [%rd261+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd261+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd261+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + ld.local.u8 %r9805, [%rd261+24]; + ld.local.u8 %r9806, [%rd261+25]; + prmt.b32 %r9807, %r9806, %r9805, 30212; + ld.local.u8 %r9808, [%rd261+26]; + prmt.b32 %r9809, %r9808, %r9807, 28756; + ld.local.u8 %r9810, [%rd261+27]; + prmt.b32 %r9811, %r9810, %r9809, 1620; + ld.local.u8 %r9812, [%rd261+28]; + ld.local.u8 %r9813, [%rd261+29]; + prmt.b32 %r9814, %r9813, %r9812, 30212; + ld.local.u8 %r9815, [%rd261+30]; + prmt.b32 %r9816, %r9815, %r9814, 28756; + ld.local.u8 %r9817, [%rd261+31]; + prmt.b32 %r9818, %r9817, %r9816, 1620; + ld.local.u8 %r9819, [%rd261+32]; + ld.local.u8 %r9820, [%rd261+33]; + prmt.b32 %r9821, %r9820, %r9819, 30212; + ld.local.u8 %r9822, [%rd261+34]; + prmt.b32 %r9823, %r9822, %r9821, 28756; + ld.local.u8 %r9824, [%rd261+35]; + prmt.b32 %r9825, %r9824, %r9823, 1620; + ld.local.u8 %r9826, [%rd261+36]; + ld.local.u8 %r9827, [%rd261+37]; + prmt.b32 %r9828, %r9827, %r9826, 30212; + ld.local.u8 %r9829, [%rd261+38]; + prmt.b32 %r9830, %r9829, %r9828, 28756; + ld.local.u8 %r9831, [%rd261+39]; + prmt.b32 %r9832, %r9831, %r9830, 1620; + ld.local.u8 %r9833, [%rd261+40]; + 
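+ // [annotation] This block (from $L__BB1_60) loads a full 64-byte message block
+ // from the buffer addressed by %rd261 and compresses it into the eight 32-bit
+ // chaining-value words kept at [%rd3-104] .. [%rd3-76].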
ld.local.u8 %r9834, [%rd261+41]; + prmt.b32 %r9835, %r9834, %r9833, 30212; + ld.local.u8 %r9836, [%rd261+42]; + prmt.b32 %r9837, %r9836, %r9835, 28756; + ld.local.u8 %r9838, [%rd261+43]; + prmt.b32 %r9839, %r9838, %r9837, 1620; + ld.local.u8 %r9840, [%rd261+44]; + ld.local.u8 %r9841, [%rd261+45]; + prmt.b32 %r9842, %r9841, %r9840, 30212; + ld.local.u8 %r9843, [%rd261+46]; + prmt.b32 %r9844, %r9843, %r9842, 28756; + ld.local.u8 %r9845, [%rd261+47]; + prmt.b32 %r9846, %r9845, %r9844, 1620; + ld.local.u8 %r9847, [%rd261+48]; + ld.local.u8 %r9848, [%rd261+49]; + prmt.b32 %r9849, %r9848, %r9847, 30212; + ld.local.u8 %r9850, [%rd261+50]; + prmt.b32 %r9851, %r9850, %r9849, 28756; + ld.local.u8 %r9852, [%rd261+51]; + prmt.b32 %r9853, %r9852, %r9851, 1620; + ld.local.u8 %r9854, [%rd261+52]; + ld.local.u8 %r9855, [%rd261+53]; + prmt.b32 %r9856, %r9855, %r9854, 30212; + ld.local.u8 %r9857, [%rd261+54]; + prmt.b32 %r9858, %r9857, %r9856, 28756; + ld.local.u8 %r9859, [%rd261+55]; + prmt.b32 %r9860, %r9859, %r9858, 1620; + ld.local.u8 %r9861, [%rd261+56]; + ld.local.u8 %r9862, [%rd261+57]; + prmt.b32 %r9863, %r9862, %r9861, 30212; + ld.local.u8 %r9864, [%rd261+58]; + prmt.b32 %r9865, %r9864, %r9863, 28756; + ld.local.u8 %r9866, [%rd261+59]; + prmt.b32 %r9867, %r9866, %r9865, 1620; + ld.local.u8 %r9868, [%rd261+60]; + ld.local.u8 %r9869, [%rd261+61]; + prmt.b32 %r9870, %r9869, %r9868, 30212; + ld.local.u8 %r9871, [%rd261+62]; + prmt.b32 %r9872, %r9871, %r9870, 28756; + ld.local.u8 %r9873, [%rd261+63]; + prmt.b32 %r9874, %r9873, %r9872, 1620; + cvt.u32.u16 %r9875, %rs344; + and.b32 %r9876, %r9875, 255; + add.s32 %r9877, %r11689, %r11685; + add.s32 %r9878, %r9877, %r9769; + xor.b32 %r9879, %r9878, %r117; + shf.l.wrap.b32 %r9880, %r9879, %r9879, 16; + add.s32 %r9881, %r9880, 1779033703; + xor.b32 %r9882, %r9881, %r11685; + shf.l.wrap.b32 %r9883, %r9882, %r9882, 20; + add.s32 %r9884, %r9776, %r9878; + add.s32 %r9885, %r9884, %r9883; + xor.b32 %r9886, %r9885, %r9880; + shf.l.wrap.b32 %r9887, %r9886, %r9886, 24; + add.s32 %r9888, %r9887, %r9881; + xor.b32 %r9889, %r9888, %r9883; + shf.l.wrap.b32 %r9890, %r9889, %r9889, 25; + add.s32 %r9891, %r11688, %r11684; + add.s32 %r9892, %r9891, %r9783; + xor.b32 %r9893, %r9892, %r118; + shf.l.wrap.b32 %r9894, %r9893, %r9893, 16; + add.s32 %r9895, %r9894, -1150833019; + xor.b32 %r9896, %r9895, %r11684; + shf.l.wrap.b32 %r9897, %r9896, %r9896, 20; + add.s32 %r9898, %r9790, %r9892; + add.s32 %r9899, %r9898, %r9897; + xor.b32 %r9900, %r9899, %r9894; + shf.l.wrap.b32 %r9901, %r9900, %r9900, 24; + add.s32 %r9902, %r9901, %r9895; + xor.b32 %r9903, %r9902, %r9897; + shf.l.wrap.b32 %r9904, %r9903, %r9903, 25; + add.s32 %r9905, %r11687, %r11683; + add.s32 %r9906, %r9905, %r9797; + shr.u32 %r9907, %r9906, 16; + shl.b32 %r9908, %r9906, 16; + xor.b32 %r9909, %r9908, 4194304; + or.b32 %r9910, %r9909, %r9907; + add.s32 %r9911, %r9910, 1013904242; + xor.b32 %r9912, %r9911, %r11683; + shf.l.wrap.b32 %r9913, %r9912, %r9912, 20; + add.s32 %r9914, %r9804, %r9906; + add.s32 %r9915, %r9914, %r9913; + xor.b32 %r9916, %r9915, %r9910; + shf.l.wrap.b32 %r9917, %r9916, %r9916, 24; + add.s32 %r9918, %r9917, %r9911; + xor.b32 %r9919, %r9918, %r9913; + shf.l.wrap.b32 %r9920, %r9919, %r9919, 25; + add.s32 %r9921, %r11686, %r11682; + add.s32 %r9922, %r9921, %r9811; + xor.b32 %r9923, %r9922, %r9876; + shr.u32 %r9924, %r9922, 16; + shl.b32 %r9925, %r9923, 16; + or.b32 %r9926, %r9925, %r9924; + add.s32 %r9927, %r9926, -1521486534; + xor.b32 %r9928, %r9927, %r11682; + shf.l.wrap.b32 %r9929, %r9928, %r9928, 
20; + add.s32 %r9930, %r9818, %r9922; + add.s32 %r9931, %r9930, %r9929; + xor.b32 %r9932, %r9931, %r9926; + shf.l.wrap.b32 %r9933, %r9932, %r9932, 24; + add.s32 %r9934, %r9933, %r9927; + xor.b32 %r9935, %r9934, %r9929; + shf.l.wrap.b32 %r9936, %r9935, %r9935, 25; + add.s32 %r9937, %r9904, %r9885; + add.s32 %r9938, %r9937, %r9825; + xor.b32 %r9939, %r9933, %r9938; + shf.l.wrap.b32 %r9940, %r9939, %r9939, 16; + add.s32 %r9941, %r9940, %r9918; + xor.b32 %r9942, %r9941, %r9904; + shf.l.wrap.b32 %r9943, %r9942, %r9942, 20; + add.s32 %r9944, %r9832, %r9938; + add.s32 %r9945, %r9944, %r9943; + xor.b32 %r9946, %r9945, %r9940; + shf.l.wrap.b32 %r9947, %r9946, %r9946, 24; + add.s32 %r9948, %r9947, %r9941; + xor.b32 %r9949, %r9948, %r9943; + shf.l.wrap.b32 %r9950, %r9949, %r9949, 25; + add.s32 %r9951, %r9920, %r9899; + add.s32 %r9952, %r9951, %r9839; + xor.b32 %r9953, %r9952, %r9887; + shf.l.wrap.b32 %r9954, %r9953, %r9953, 16; + add.s32 %r9955, %r9954, %r9934; + xor.b32 %r9956, %r9955, %r9920; + shf.l.wrap.b32 %r9957, %r9956, %r9956, 20; + add.s32 %r9958, %r9846, %r9952; + add.s32 %r9959, %r9958, %r9957; + xor.b32 %r9960, %r9959, %r9954; + shf.l.wrap.b32 %r9961, %r9960, %r9960, 24; + add.s32 %r9962, %r9961, %r9955; + xor.b32 %r9963, %r9962, %r9957; + shf.l.wrap.b32 %r9964, %r9963, %r9963, 25; + add.s32 %r9965, %r9936, %r9915; + add.s32 %r9966, %r9965, %r9853; + xor.b32 %r9967, %r9966, %r9901; + shf.l.wrap.b32 %r9968, %r9967, %r9967, 16; + add.s32 %r9969, %r9968, %r9888; + xor.b32 %r9970, %r9969, %r9936; + shf.l.wrap.b32 %r9971, %r9970, %r9970, 20; + add.s32 %r9972, %r9860, %r9966; + add.s32 %r9973, %r9972, %r9971; + xor.b32 %r9974, %r9973, %r9968; + shf.l.wrap.b32 %r9975, %r9974, %r9974, 24; + add.s32 %r9976, %r9975, %r9969; + xor.b32 %r9977, %r9976, %r9971; + shf.l.wrap.b32 %r9978, %r9977, %r9977, 25; + add.s32 %r9979, %r9931, %r9890; + add.s32 %r9980, %r9979, %r9867; + xor.b32 %r9981, %r9980, %r9917; + shf.l.wrap.b32 %r9982, %r9981, %r9981, 16; + add.s32 %r9983, %r9982, %r9902; + xor.b32 %r9984, %r9983, %r9890; + shf.l.wrap.b32 %r9985, %r9984, %r9984, 20; + add.s32 %r9986, %r9874, %r9980; + add.s32 %r9987, %r9986, %r9985; + xor.b32 %r9988, %r9987, %r9982; + shf.l.wrap.b32 %r9989, %r9988, %r9988, 24; + add.s32 %r9990, %r9989, %r9983; + xor.b32 %r9991, %r9990, %r9985; + shf.l.wrap.b32 %r9992, %r9991, %r9991, 25; + add.s32 %r9993, %r9945, %r9783; + add.s32 %r9994, %r9993, %r9992; + xor.b32 %r9995, %r9994, %r9961; + shf.l.wrap.b32 %r9996, %r9995, %r9995, 16; + add.s32 %r9997, %r9996, %r9976; + xor.b32 %r9998, %r9997, %r9992; + shf.l.wrap.b32 %r9999, %r9998, %r9998, 20; + add.s32 %r10000, %r9994, %r9811; + add.s32 %r10001, %r10000, %r9999; + xor.b32 %r10002, %r10001, %r9996; + shf.l.wrap.b32 %r10003, %r10002, %r10002, 24; + add.s32 %r10004, %r10003, %r9997; + xor.b32 %r10005, %r10004, %r9999; + shf.l.wrap.b32 %r10006, %r10005, %r10005, 25; + add.s32 %r10007, %r9959, %r9790; + add.s32 %r10008, %r10007, %r9950; + xor.b32 %r10009, %r9975, %r10008; + shf.l.wrap.b32 %r10010, %r10009, %r10009, 16; + add.s32 %r10011, %r9990, %r10010; + xor.b32 %r10012, %r10011, %r9950; + shf.l.wrap.b32 %r10013, %r10012, %r10012, 20; + add.s32 %r10014, %r10008, %r9839; + add.s32 %r10015, %r10014, %r10013; + xor.b32 %r10016, %r10015, %r10010; + shf.l.wrap.b32 %r10017, %r10016, %r10016, 24; + add.s32 %r10018, %r10017, %r10011; + xor.b32 %r10019, %r10018, %r10013; + shf.l.wrap.b32 %r10020, %r10019, %r10019, 25; + add.s32 %r10021, %r9964, %r9818; + add.s32 %r10022, %r10021, %r9973; + xor.b32 %r10023, %r9989, %r10022; + 
shf.l.wrap.b32 %r10024, %r10023, %r10023, 16; + add.s32 %r10025, %r10024, %r9948; + xor.b32 %r10026, %r10025, %r9964; + shf.l.wrap.b32 %r10027, %r10026, %r10026, 20; + add.s32 %r10028, %r10022, %r9769; + add.s32 %r10029, %r10028, %r10027; + xor.b32 %r10030, %r10029, %r10024; + shf.l.wrap.b32 %r10031, %r10030, %r10030, 24; + add.s32 %r10032, %r10031, %r10025; + xor.b32 %r10033, %r10032, %r10027; + shf.l.wrap.b32 %r10034, %r10033, %r10033, 25; + add.s32 %r10035, %r9978, %r9797; + add.s32 %r10036, %r10035, %r9987; + xor.b32 %r10037, %r10036, %r9947; + shf.l.wrap.b32 %r10038, %r10037, %r10037, 16; + add.s32 %r10039, %r10038, %r9962; + xor.b32 %r10040, %r10039, %r9978; + shf.l.wrap.b32 %r10041, %r10040, %r10040, 20; + add.s32 %r10042, %r10036, %r9860; + add.s32 %r10043, %r10042, %r10041; + xor.b32 %r10044, %r10043, %r10038; + shf.l.wrap.b32 %r10045, %r10044, %r10044, 24; + add.s32 %r10046, %r10045, %r10039; + xor.b32 %r10047, %r10046, %r10041; + shf.l.wrap.b32 %r10048, %r10047, %r10047, 25; + add.s32 %r10049, %r10001, %r9776; + add.s32 %r10050, %r10049, %r10020; + xor.b32 %r10051, %r10050, %r10045; + shf.l.wrap.b32 %r10052, %r10051, %r10051, 16; + add.s32 %r10053, %r10052, %r10032; + xor.b32 %r10054, %r10053, %r10020; + shf.l.wrap.b32 %r10055, %r10054, %r10054, 20; + add.s32 %r10056, %r10050, %r9846; + add.s32 %r10057, %r10056, %r10055; + xor.b32 %r10058, %r10057, %r10052; + shf.l.wrap.b32 %r10059, %r10058, %r10058, 24; + add.s32 %r10060, %r10059, %r10053; + xor.b32 %r10061, %r10060, %r10055; + shf.l.wrap.b32 %r10062, %r10061, %r10061, 25; + add.s32 %r10063, %r10015, %r9853; + add.s32 %r10064, %r10063, %r10034; + xor.b32 %r10065, %r10064, %r10003; + shf.l.wrap.b32 %r10066, %r10065, %r10065, 16; + add.s32 %r10067, %r10066, %r10046; + xor.b32 %r10068, %r10067, %r10034; + shf.l.wrap.b32 %r10069, %r10068, %r10068, 20; + add.s32 %r10070, %r10064, %r9804; + add.s32 %r10071, %r10070, %r10069; + xor.b32 %r10072, %r10071, %r10066; + shf.l.wrap.b32 %r10073, %r10072, %r10072, 24; + add.s32 %r10074, %r10073, %r10067; + xor.b32 %r10075, %r10074, %r10069; + shf.l.wrap.b32 %r10076, %r10075, %r10075, 25; + add.s32 %r10077, %r10029, %r9832; + add.s32 %r10078, %r10077, %r10048; + xor.b32 %r10079, %r10078, %r10017; + shf.l.wrap.b32 %r10080, %r10079, %r10079, 16; + add.s32 %r10081, %r10080, %r10004; + xor.b32 %r10082, %r10081, %r10048; + shf.l.wrap.b32 %r10083, %r10082, %r10082, 20; + add.s32 %r10084, %r10078, %r9867; + add.s32 %r10085, %r10084, %r10083; + xor.b32 %r10086, %r10085, %r10080; + shf.l.wrap.b32 %r10087, %r10086, %r10086, 24; + add.s32 %r10088, %r10087, %r10081; + xor.b32 %r10089, %r10088, %r10083; + shf.l.wrap.b32 %r10090, %r10089, %r10089, 25; + add.s32 %r10091, %r10043, %r9874; + add.s32 %r10092, %r10091, %r10006; + xor.b32 %r10093, %r10092, %r10031; + shf.l.wrap.b32 %r10094, %r10093, %r10093, 16; + add.s32 %r10095, %r10094, %r10018; + xor.b32 %r10096, %r10095, %r10006; + shf.l.wrap.b32 %r10097, %r10096, %r10096, 20; + add.s32 %r10098, %r10092, %r9825; + add.s32 %r10099, %r10098, %r10097; + xor.b32 %r10100, %r10099, %r10094; + shf.l.wrap.b32 %r10101, %r10100, %r10100, 24; + add.s32 %r10102, %r10101, %r10095; + xor.b32 %r10103, %r10102, %r10097; + shf.l.wrap.b32 %r10104, %r10103, %r10103, 25; + add.s32 %r10105, %r10057, %r9790; + add.s32 %r10106, %r10105, %r10104; + xor.b32 %r10107, %r10106, %r10073; + shf.l.wrap.b32 %r10108, %r10107, %r10107, 16; + add.s32 %r10109, %r10108, %r10088; + xor.b32 %r10110, %r10109, %r10104; + shf.l.wrap.b32 %r10111, %r10110, %r10110, 20; + add.s32 %r10112, %r10106, 
%r9797; + add.s32 %r10113, %r10112, %r10111; + xor.b32 %r10114, %r10113, %r10108; + shf.l.wrap.b32 %r10115, %r10114, %r10114, 24; + add.s32 %r10116, %r10115, %r10109; + xor.b32 %r10117, %r10116, %r10111; + shf.l.wrap.b32 %r10118, %r10117, %r10117, 25; + add.s32 %r10119, %r10071, %r9839; + add.s32 %r10120, %r10119, %r10062; + xor.b32 %r10121, %r10120, %r10087; + shf.l.wrap.b32 %r10122, %r10121, %r10121, 16; + add.s32 %r10123, %r10122, %r10102; + xor.b32 %r10124, %r10123, %r10062; + shf.l.wrap.b32 %r10125, %r10124, %r10124, 20; + add.s32 %r10126, %r10120, %r9853; + add.s32 %r10127, %r10126, %r10125; + xor.b32 %r10128, %r10127, %r10122; + shf.l.wrap.b32 %r10129, %r10128, %r10128, 24; + add.s32 %r10130, %r10129, %r10123; + xor.b32 %r10131, %r10130, %r10125; + shf.l.wrap.b32 %r10132, %r10131, %r10131, 25; + add.s32 %r10133, %r10085, %r9860; + add.s32 %r10134, %r10133, %r10076; + xor.b32 %r10135, %r10134, %r10101; + shf.l.wrap.b32 %r10136, %r10135, %r10135, 16; + add.s32 %r10137, %r10136, %r10060; + xor.b32 %r10138, %r10137, %r10076; + shf.l.wrap.b32 %r10139, %r10138, %r10138, 20; + add.s32 %r10140, %r10134, %r9783; + add.s32 %r10141, %r10140, %r10139; + xor.b32 %r10142, %r10141, %r10136; + shf.l.wrap.b32 %r10143, %r10142, %r10142, 24; + add.s32 %r10144, %r10143, %r10137; + xor.b32 %r10145, %r10144, %r10139; + shf.l.wrap.b32 %r10146, %r10145, %r10145, 25; + add.s32 %r10147, %r10099, %r9818; + add.s32 %r10148, %r10147, %r10090; + xor.b32 %r10149, %r10148, %r10059; + shf.l.wrap.b32 %r10150, %r10149, %r10149, 16; + add.s32 %r10151, %r10150, %r10074; + xor.b32 %r10152, %r10151, %r10090; + shf.l.wrap.b32 %r10153, %r10152, %r10152, 20; + add.s32 %r10154, %r10148, %r9867; + add.s32 %r10155, %r10154, %r10153; + xor.b32 %r10156, %r10155, %r10150; + shf.l.wrap.b32 %r10157, %r10156, %r10156, 24; + add.s32 %r10158, %r10157, %r10151; + xor.b32 %r10159, %r10158, %r10153; + shf.l.wrap.b32 %r10160, %r10159, %r10159, 25; + add.s32 %r10161, %r10113, %r9811; + add.s32 %r10162, %r10161, %r10132; + xor.b32 %r10163, %r10162, %r10157; + shf.l.wrap.b32 %r10164, %r10163, %r10163, 16; + add.s32 %r10165, %r10164, %r10144; + xor.b32 %r10166, %r10165, %r10132; + shf.l.wrap.b32 %r10167, %r10166, %r10166, 20; + add.s32 %r10168, %r10162, %r9804; + add.s32 %r10169, %r10168, %r10167; + xor.b32 %r10170, %r10169, %r10164; + shf.l.wrap.b32 %r10171, %r10170, %r10170, 24; + add.s32 %r10172, %r10171, %r10165; + xor.b32 %r10173, %r10172, %r10167; + shf.l.wrap.b32 %r10174, %r10173, %r10173, 25; + add.s32 %r10175, %r10127, %r9832; + add.s32 %r10176, %r10175, %r10146; + xor.b32 %r10177, %r10176, %r10115; + shf.l.wrap.b32 %r10178, %r10177, %r10177, 16; + add.s32 %r10179, %r10178, %r10158; + xor.b32 %r10180, %r10179, %r10146; + shf.l.wrap.b32 %r10181, %r10180, %r10180, 20; + add.s32 %r10182, %r10176, %r9769; + add.s32 %r10183, %r10182, %r10181; + xor.b32 %r10184, %r10183, %r10178; + shf.l.wrap.b32 %r10185, %r10184, %r10184, 24; + add.s32 %r10186, %r10185, %r10179; + xor.b32 %r10187, %r10186, %r10181; + shf.l.wrap.b32 %r10188, %r10187, %r10187, 25; + add.s32 %r10189, %r10141, %r9846; + add.s32 %r10190, %r10189, %r10160; + xor.b32 %r10191, %r10190, %r10129; + shf.l.wrap.b32 %r10192, %r10191, %r10191, 16; + add.s32 %r10193, %r10192, %r10116; + xor.b32 %r10194, %r10193, %r10160; + shf.l.wrap.b32 %r10195, %r10194, %r10194, 20; + add.s32 %r10196, %r10190, %r9874; + add.s32 %r10197, %r10196, %r10195; + xor.b32 %r10198, %r10197, %r10192; + shf.l.wrap.b32 %r10199, %r10198, %r10198, 24; + add.s32 %r10200, %r10199, %r10193; + xor.b32 %r10201, %r10200, 
%r10195; + shf.l.wrap.b32 %r10202, %r10201, %r10201, 25; + add.s32 %r10203, %r10155, %r9825; + add.s32 %r10204, %r10203, %r10118; + xor.b32 %r10205, %r10204, %r10143; + shf.l.wrap.b32 %r10206, %r10205, %r10205, 16; + add.s32 %r10207, %r10206, %r10130; + xor.b32 %r10208, %r10207, %r10118; + shf.l.wrap.b32 %r10209, %r10208, %r10208, 20; + add.s32 %r10210, %r10204, %r9776; + add.s32 %r10211, %r10210, %r10209; + xor.b32 %r10212, %r10211, %r10206; + shf.l.wrap.b32 %r10213, %r10212, %r10212, 24; + add.s32 %r10214, %r10213, %r10207; + xor.b32 %r10215, %r10214, %r10209; + shf.l.wrap.b32 %r10216, %r10215, %r10215, 25; + add.s32 %r10217, %r10169, %r9839; + add.s32 %r10218, %r10217, %r10216; + xor.b32 %r10219, %r10218, %r10185; + shf.l.wrap.b32 %r10220, %r10219, %r10219, 16; + add.s32 %r10221, %r10220, %r10200; + xor.b32 %r10222, %r10221, %r10216; + shf.l.wrap.b32 %r10223, %r10222, %r10222, 20; + add.s32 %r10224, %r10218, %r9818; + add.s32 %r10225, %r10224, %r10223; + xor.b32 %r10226, %r10225, %r10220; + shf.l.wrap.b32 %r10227, %r10226, %r10226, 24; + add.s32 %r10228, %r10227, %r10221; + xor.b32 %r10229, %r10228, %r10223; + shf.l.wrap.b32 %r10230, %r10229, %r10229, 25; + add.s32 %r10231, %r10183, %r9853; + add.s32 %r10232, %r10231, %r10174; + xor.b32 %r10233, %r10232, %r10199; + shf.l.wrap.b32 %r10234, %r10233, %r10233, 16; + add.s32 %r10235, %r10234, %r10214; + xor.b32 %r10236, %r10235, %r10174; + shf.l.wrap.b32 %r10237, %r10236, %r10236, 20; + add.s32 %r10238, %r10232, %r9832; + add.s32 %r10239, %r10238, %r10237; + xor.b32 %r10240, %r10239, %r10234; + shf.l.wrap.b32 %r10241, %r10240, %r10240, 24; + add.s32 %r10242, %r10241, %r10235; + xor.b32 %r10243, %r10242, %r10237; + shf.l.wrap.b32 %r10244, %r10243, %r10243, 25; + add.s32 %r10245, %r10197, %r9867; + add.s32 %r10246, %r10245, %r10188; + xor.b32 %r10247, %r10246, %r10213; + shf.l.wrap.b32 %r10248, %r10247, %r10247, 16; + add.s32 %r10249, %r10248, %r10172; + xor.b32 %r10250, %r10249, %r10188; + shf.l.wrap.b32 %r10251, %r10250, %r10250, 20; + add.s32 %r10252, %r10246, %r9790; + add.s32 %r10253, %r10252, %r10251; + xor.b32 %r10254, %r10253, %r10248; + shf.l.wrap.b32 %r10255, %r10254, %r10254, 24; + add.s32 %r10256, %r10255, %r10249; + xor.b32 %r10257, %r10256, %r10251; + shf.l.wrap.b32 %r10258, %r10257, %r10257, 25; + add.s32 %r10259, %r10211, %r9860; + add.s32 %r10260, %r10259, %r10202; + xor.b32 %r10261, %r10260, %r10171; + shf.l.wrap.b32 %r10262, %r10261, %r10261, 16; + add.s32 %r10263, %r10262, %r10186; + xor.b32 %r10264, %r10263, %r10202; + shf.l.wrap.b32 %r10265, %r10264, %r10264, 20; + add.s32 %r10266, %r10260, %r9874; + add.s32 %r10267, %r10266, %r10265; + xor.b32 %r10268, %r10267, %r10262; + shf.l.wrap.b32 %r10269, %r10268, %r10268, 24; + add.s32 %r10270, %r10269, %r10263; + xor.b32 %r10271, %r10270, %r10265; + shf.l.wrap.b32 %r10272, %r10271, %r10271, 25; + add.s32 %r10273, %r10225, %r9797; + add.s32 %r10274, %r10273, %r10244; + xor.b32 %r10275, %r10274, %r10269; + shf.l.wrap.b32 %r10276, %r10275, %r10275, 16; + add.s32 %r10277, %r10276, %r10256; + xor.b32 %r10278, %r10277, %r10244; + shf.l.wrap.b32 %r10279, %r10278, %r10278, 20; + add.s32 %r10280, %r10274, %r9769; + add.s32 %r10281, %r10280, %r10279; + xor.b32 %r10282, %r10281, %r10276; + shf.l.wrap.b32 %r10283, %r10282, %r10282, 24; + add.s32 %r10284, %r10283, %r10277; + xor.b32 %r10285, %r10284, %r10279; + shf.l.wrap.b32 %r10286, %r10285, %r10285, 25; + add.s32 %r10287, %r10239, %r9846; + add.s32 %r10288, %r10287, %r10258; + xor.b32 %r10289, %r10288, %r10227; + shf.l.wrap.b32 %r10290, 
%r10289, %r10289, 16; + add.s32 %r10291, %r10290, %r10270; + xor.b32 %r10292, %r10291, %r10258; + shf.l.wrap.b32 %r10293, %r10292, %r10292, 20; + add.s32 %r10294, %r10288, %r9783; + add.s32 %r10295, %r10294, %r10293; + xor.b32 %r10296, %r10295, %r10290; + shf.l.wrap.b32 %r10297, %r10296, %r10296, 24; + add.s32 %r10298, %r10297, %r10291; + xor.b32 %r10299, %r10298, %r10293; + shf.l.wrap.b32 %r10300, %r10299, %r10299, 25; + add.s32 %r10301, %r10253, %r9804; + add.s32 %r10302, %r10301, %r10272; + xor.b32 %r10303, %r10302, %r10241; + shf.l.wrap.b32 %r10304, %r10303, %r10303, 16; + add.s32 %r10305, %r10304, %r10228; + xor.b32 %r10306, %r10305, %r10272; + shf.l.wrap.b32 %r10307, %r10306, %r10306, 20; + add.s32 %r10308, %r10302, %r9825; + add.s32 %r10309, %r10308, %r10307; + xor.b32 %r10310, %r10309, %r10304; + shf.l.wrap.b32 %r10311, %r10310, %r10310, 24; + add.s32 %r10312, %r10311, %r10305; + xor.b32 %r10313, %r10312, %r10307; + shf.l.wrap.b32 %r10314, %r10313, %r10313, 25; + add.s32 %r10315, %r10267, %r9776; + add.s32 %r10316, %r10315, %r10230; + xor.b32 %r10317, %r10316, %r10255; + shf.l.wrap.b32 %r10318, %r10317, %r10317, 16; + add.s32 %r10319, %r10318, %r10242; + xor.b32 %r10320, %r10319, %r10230; + shf.l.wrap.b32 %r10321, %r10320, %r10320, 20; + add.s32 %r10322, %r10316, %r9811; + add.s32 %r10323, %r10322, %r10321; + xor.b32 %r10324, %r10323, %r10318; + shf.l.wrap.b32 %r10325, %r10324, %r10324, 24; + add.s32 %r10326, %r10325, %r10319; + xor.b32 %r10327, %r10326, %r10321; + shf.l.wrap.b32 %r10328, %r10327, %r10327, 25; + add.s32 %r10329, %r10281, %r9853; + add.s32 %r10330, %r10329, %r10328; + xor.b32 %r10331, %r10330, %r10297; + shf.l.wrap.b32 %r10332, %r10331, %r10331, 16; + add.s32 %r10333, %r10332, %r10312; + xor.b32 %r10334, %r10333, %r10328; + shf.l.wrap.b32 %r10335, %r10334, %r10334, 20; + add.s32 %r10336, %r10330, %r9860; + add.s32 %r10337, %r10336, %r10335; + xor.b32 %r10338, %r10337, %r10332; + shf.l.wrap.b32 %r10339, %r10338, %r10338, 24; + add.s32 %r10340, %r10339, %r10333; + xor.b32 %r10341, %r10340, %r10335; + shf.l.wrap.b32 %r10342, %r10341, %r10341, 25; + add.s32 %r10343, %r10295, %r9832; + add.s32 %r10344, %r10343, %r10286; + xor.b32 %r10345, %r10344, %r10311; + shf.l.wrap.b32 %r10346, %r10345, %r10345, 16; + add.s32 %r10347, %r10346, %r10326; + xor.b32 %r10348, %r10347, %r10286; + shf.l.wrap.b32 %r10349, %r10348, %r10348, 20; + add.s32 %r10350, %r10344, %r9846; + add.s32 %r10351, %r10350, %r10349; + xor.b32 %r10352, %r10351, %r10346; + shf.l.wrap.b32 %r10353, %r10352, %r10352, 24; + add.s32 %r10354, %r10353, %r10347; + xor.b32 %r10355, %r10354, %r10349; + shf.l.wrap.b32 %r10356, %r10355, %r10355, 25; + add.s32 %r10357, %r10309, %r9874; + add.s32 %r10358, %r10357, %r10300; + xor.b32 %r10359, %r10358, %r10325; + shf.l.wrap.b32 %r10360, %r10359, %r10359, 16; + add.s32 %r10361, %r10360, %r10284; + xor.b32 %r10362, %r10361, %r10300; + shf.l.wrap.b32 %r10363, %r10362, %r10362, 20; + add.s32 %r10364, %r10358, %r9839; + add.s32 %r10365, %r10364, %r10363; + xor.b32 %r10366, %r10365, %r10360; + shf.l.wrap.b32 %r10367, %r10366, %r10366, 24; + add.s32 %r10368, %r10367, %r10361; + xor.b32 %r10369, %r10368, %r10363; + shf.l.wrap.b32 %r10370, %r10369, %r10369, 25; + add.s32 %r10371, %r10323, %r9867; + add.s32 %r10372, %r10371, %r10314; + xor.b32 %r10373, %r10372, %r10283; + shf.l.wrap.b32 %r10374, %r10373, %r10373, 16; + add.s32 %r10375, %r10374, %r10298; + xor.b32 %r10376, %r10375, %r10314; + shf.l.wrap.b32 %r10377, %r10376, %r10376, 20; + add.s32 %r10378, %r10372, %r9825; + add.s32 
%r10379, %r10378, %r10377; + xor.b32 %r10380, %r10379, %r10374; + shf.l.wrap.b32 %r10381, %r10380, %r10380, 24; + add.s32 %r10382, %r10381, %r10375; + xor.b32 %r10383, %r10382, %r10377; + shf.l.wrap.b32 %r10384, %r10383, %r10383, 25; + add.s32 %r10385, %r10337, %r9818; + add.s32 %r10386, %r10385, %r10356; + xor.b32 %r10387, %r10386, %r10381; + shf.l.wrap.b32 %r10388, %r10387, %r10387, 16; + add.s32 %r10389, %r10388, %r10368; + xor.b32 %r10390, %r10389, %r10356; + shf.l.wrap.b32 %r10391, %r10390, %r10390, 20; + add.s32 %r10392, %r10386, %r9783; + add.s32 %r10393, %r10392, %r10391; + xor.b32 %r10394, %r10393, %r10388; + shf.l.wrap.b32 %r10395, %r10394, %r10394, 24; + add.s32 %r10396, %r10395, %r10389; + xor.b32 %r10397, %r10396, %r10391; + shf.l.wrap.b32 %r10398, %r10397, %r10397, 25; + add.s32 %r10399, %r10351, %r9804; + add.s32 %r10400, %r10399, %r10370; + xor.b32 %r10401, %r10400, %r10339; + shf.l.wrap.b32 %r10402, %r10401, %r10401, 16; + add.s32 %r10403, %r10402, %r10382; + xor.b32 %r10404, %r10403, %r10370; + shf.l.wrap.b32 %r10405, %r10404, %r10404, 20; + add.s32 %r10406, %r10400, %r9790; + add.s32 %r10407, %r10406, %r10405; + xor.b32 %r10408, %r10407, %r10402; + shf.l.wrap.b32 %r10409, %r10408, %r10408, 24; + add.s32 %r10410, %r10409, %r10403; + xor.b32 %r10411, %r10410, %r10405; + shf.l.wrap.b32 %r10412, %r10411, %r10411, 25; + add.s32 %r10413, %r10365, %r9769; + add.s32 %r10414, %r10413, %r10384; + xor.b32 %r10415, %r10414, %r10353; + shf.l.wrap.b32 %r10416, %r10415, %r10415, 16; + add.s32 %r10417, %r10416, %r10340; + xor.b32 %r10418, %r10417, %r10384; + shf.l.wrap.b32 %r10419, %r10418, %r10418, 20; + add.s32 %r10420, %r10414, %r9776; + add.s32 %r10421, %r10420, %r10419; + xor.b32 %r10422, %r10421, %r10416; + shf.l.wrap.b32 %r10423, %r10422, %r10422, 24; + add.s32 %r10424, %r10423, %r10417; + xor.b32 %r10425, %r10424, %r10419; + shf.l.wrap.b32 %r10426, %r10425, %r10425, 25; + add.s32 %r10427, %r10379, %r9811; + add.s32 %r10428, %r10427, %r10342; + xor.b32 %r10429, %r10428, %r10367; + shf.l.wrap.b32 %r10430, %r10429, %r10429, 16; + add.s32 %r10431, %r10430, %r10354; + xor.b32 %r10432, %r10431, %r10342; + shf.l.wrap.b32 %r10433, %r10432, %r10432, 20; + add.s32 %r10434, %r10428, %r9797; + add.s32 %r10435, %r10434, %r10433; + xor.b32 %r10436, %r10435, %r10430; + shf.l.wrap.b32 %r10437, %r10436, %r10436, 24; + add.s32 %r10438, %r10437, %r10431; + xor.b32 %r10439, %r10438, %r10433; + shf.l.wrap.b32 %r10440, %r10439, %r10439, 25; + add.s32 %r10441, %r10393, %r9832; + add.s32 %r10442, %r10441, %r10440; + xor.b32 %r10443, %r10442, %r10409; + shf.l.wrap.b32 %r10444, %r10443, %r10443, 16; + add.s32 %r10445, %r10444, %r10424; + xor.b32 %r10446, %r10445, %r10440; + shf.l.wrap.b32 %r10447, %r10446, %r10446, 20; + add.s32 %r10448, %r10442, %r9867; + add.s32 %r10449, %r10448, %r10447; + xor.b32 %r10450, %r10449, %r10444; + shf.l.wrap.b32 %r10451, %r10450, %r10450, 24; + add.s32 %r10452, %r10451, %r10445; + xor.b32 %r10453, %r10452, %r10447; + shf.l.wrap.b32 %r10454, %r10453, %r10453, 25; + add.s32 %r10455, %r10407, %r9846; + add.s32 %r10456, %r10455, %r10398; + xor.b32 %r10457, %r10456, %r10423; + shf.l.wrap.b32 %r10458, %r10457, %r10457, 16; + add.s32 %r10459, %r10458, %r10438; + xor.b32 %r10460, %r10459, %r10398; + shf.l.wrap.b32 %r10461, %r10460, %r10460, 20; + add.s32 %r10462, %r10456, %r9804; + add.s32 %r10463, %r10462, %r10461; + xor.b32 %r10464, %r10463, %r10458; + shf.l.wrap.b32 %r10465, %r10464, %r10464, 24; + add.s32 %r10466, %r10465, %r10459; + xor.b32 %r10467, %r10466, %r10461; + 
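// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). The long add.s32 / xor.b32 /
// shf.l.wrap.b32 runs above and below are BLAKE3's compression function with
// its rounds fully unrolled by nvcc. Each quarter-round (the G function)
// appears as two add/xor/rotate triples; shf.l.wrap.b32 by 16, 20, 24 and 25
// is rotate-left, i.e. BLAKE3's rotate-right by 16, 12, 8 and 7. A minimal
// sketch of one G call (CUDA-style pseudocode; the names a, b, c, d, mx, my
// are illustrative, not taken from this file):
//   a += b + mx;  d = rotr32(d ^ a, 16);
//   c += d;       b = rotr32(b ^ c, 12);
//   a += b + my;  d = rotr32(d ^ a, 8);
//   c += d;       b = rotr32(b ^ c, 7);
// ---------------------------------------------------------------------------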
shf.l.wrap.b32 %r10468, %r10467, %r10467, 25; + add.s32 %r10469, %r10421, %r9825; + add.s32 %r10470, %r10469, %r10412; + xor.b32 %r10471, %r10470, %r10437; + shf.l.wrap.b32 %r10472, %r10471, %r10471, 16; + add.s32 %r10473, %r10472, %r10396; + xor.b32 %r10474, %r10473, %r10412; + shf.l.wrap.b32 %r10475, %r10474, %r10474, 20; + add.s32 %r10476, %r10470, %r9853; + add.s32 %r10477, %r10476, %r10475; + xor.b32 %r10478, %r10477, %r10472; + shf.l.wrap.b32 %r10479, %r10478, %r10478, 24; + add.s32 %r10480, %r10479, %r10473; + xor.b32 %r10481, %r10480, %r10475; + shf.l.wrap.b32 %r10482, %r10481, %r10481, 25; + add.s32 %r10483, %r10435, %r9874; + add.s32 %r10484, %r10483, %r10426; + xor.b32 %r10485, %r10484, %r10395; + shf.l.wrap.b32 %r10486, %r10485, %r10485, 16; + add.s32 %r10487, %r10486, %r10410; + xor.b32 %r10488, %r10487, %r10426; + shf.l.wrap.b32 %r10489, %r10488, %r10488, 20; + add.s32 %r10490, %r10484, %r9776; + add.s32 %r10491, %r10490, %r10489; + xor.b32 %r10492, %r10491, %r10486; + shf.l.wrap.b32 %r10493, %r10492, %r10492, 24; + add.s32 %r10494, %r10493, %r10487; + xor.b32 %r10495, %r10494, %r10489; + shf.l.wrap.b32 %r10496, %r10495, %r10495, 25; + add.s32 %r10497, %r10449, %r9860; + add.s32 %r10498, %r10497, %r10468; + xor.b32 %r10499, %r10498, %r10493; + shf.l.wrap.b32 %r10500, %r10499, %r10499, 16; + add.s32 %r10501, %r10500, %r10480; + xor.b32 %r10502, %r10501, %r10468; + shf.l.wrap.b32 %r10503, %r10502, %r10502, 20; + add.s32 %r10504, %r10498, %r9790; + add.s32 %r10505, %r10504, %r10503; + xor.b32 %r10506, %r10505, %r10500; + shf.l.wrap.b32 %r10507, %r10506, %r10506, 24; + add.s32 %r10508, %r10507, %r10501; + xor.b32 %r10509, %r10508, %r10503; + shf.l.wrap.b32 %r10510, %r10509, %r10509, 25; + add.s32 %r10511, %r10463, %r9769; + add.s32 %r10512, %r10511, %r10482; + xor.b32 %r10513, %r10512, %r10451; + shf.l.wrap.b32 %r10514, %r10513, %r10513, 16; + add.s32 %r10515, %r10514, %r10494; + xor.b32 %r10516, %r10515, %r10482; + shf.l.wrap.b32 %r10517, %r10516, %r10516, 20; + add.s32 %r10518, %r10512, %r9839; + add.s32 %r10519, %r10518, %r10517; + xor.b32 %r10520, %r10519, %r10514; + shf.l.wrap.b32 %r10521, %r10520, %r10520, 24; + add.s32 %r10522, %r10521, %r10515; + xor.b32 %r10523, %r10522, %r10517; + shf.l.wrap.b32 %r10524, %r10523, %r10523, 25; + add.s32 %r10525, %r10477, %r9783; + add.s32 %r10526, %r10525, %r10496; + xor.b32 %r10527, %r10526, %r10465; + shf.l.wrap.b32 %r10528, %r10527, %r10527, 16; + add.s32 %r10529, %r10528, %r10452; + xor.b32 %r10530, %r10529, %r10496; + shf.l.wrap.b32 %r10531, %r10530, %r10530, 20; + add.s32 %r10532, %r10526, %r9811; + add.s32 %r10533, %r10532, %r10531; + xor.b32 %r10534, %r10533, %r10528; + shf.l.wrap.b32 %r10535, %r10534, %r10534, 24; + add.s32 %r10536, %r10535, %r10529; + xor.b32 %r10537, %r10536, %r10531; + shf.l.wrap.b32 %r10538, %r10537, %r10537, 25; + add.s32 %r10539, %r10491, %r9797; + add.s32 %r10540, %r10539, %r10454; + xor.b32 %r10541, %r10540, %r10479; + shf.l.wrap.b32 %r10542, %r10541, %r10541, 16; + add.s32 %r10543, %r10542, %r10466; + xor.b32 %r10544, %r10543, %r10454; + shf.l.wrap.b32 %r10545, %r10544, %r10544, 20; + add.s32 %r10546, %r10540, %r9818; + add.s32 %r10547, %r10546, %r10545; + xor.b32 %r10548, %r10547, %r10542; + shf.l.wrap.b32 %r10549, %r10548, %r10548, 24; + add.s32 %r10550, %r10549, %r10543; + xor.b32 %r10551, %r10550, %r10545; + shf.l.wrap.b32 %r10552, %r10551, %r10551, 25; + add.s32 %r10553, %r10505, %r9846; + add.s32 %r10554, %r10553, %r10552; + xor.b32 %r10555, %r10554, %r10521; + shf.l.wrap.b32 %r10556, %r10555, 
%r10555, 16; + add.s32 %r10557, %r10556, %r10536; + xor.b32 %r10558, %r10557, %r10552; + shf.l.wrap.b32 %r10559, %r10558, %r10558, 20; + add.s32 %r10560, %r10554, %r9874; + add.s32 %r10561, %r10560, %r10559; + xor.b32 %r10562, %r10561, %r10556; + shf.l.wrap.b32 %r10563, %r10562, %r10562, 24; + add.s32 %r10564, %r10563, %r10557; + xor.b32 %r10565, %r10564, %r10559; + shf.l.wrap.b32 %r10566, %r10565, %r10565, 25; + add.s32 %r10567, %r10519, %r9804; + add.s32 %r10568, %r10567, %r10510; + xor.b32 %r10569, %r10568, %r10535; + shf.l.wrap.b32 %r10570, %r10569, %r10569, 16; + add.s32 %r10571, %r10570, %r10550; + xor.b32 %r10572, %r10571, %r10510; + shf.l.wrap.b32 %r10573, %r10572, %r10572, 20; + add.s32 %r10574, %r10568, %r9769; + add.s32 %r10575, %r10574, %r10573; + xor.b32 %r10576, %r10575, %r10570; + shf.l.wrap.b32 %r10577, %r10576, %r10576, 24; + add.s32 %r10578, %r10577, %r10571; + xor.b32 %r10579, %r10578, %r10573; + shf.l.wrap.b32 %r10580, %r10579, %r10579, 25; + add.s32 %r10581, %r10533, %r9776; + add.s32 %r10582, %r10581, %r10524; + xor.b32 %r10583, %r10582, %r10549; + shf.l.wrap.b32 %r10584, %r10583, %r10583, 16; + add.s32 %r10585, %r10584, %r10508; + xor.b32 %r10586, %r10585, %r10524; + shf.l.wrap.b32 %r10587, %r10586, %r10586, 20; + add.s32 %r10588, %r10582, %r9832; + add.s32 %r10589, %r10588, %r10587; + xor.b32 %r10590, %r10589, %r10584; + shf.l.wrap.b32 %r10591, %r10590, %r10590, 24; + add.s32 %r10592, %r10591, %r10585; + xor.b32 %r10593, %r10592, %r10587; + shf.l.wrap.b32 %r10594, %r10593, %r10593, 25; + add.s32 %r10595, %r10547, %r9825; + add.s32 %r10596, %r10595, %r10538; + xor.b32 %r10597, %r10596, %r10507; + shf.l.wrap.b32 %r10598, %r10597, %r10597, 16; + add.s32 %r10599, %r10598, %r10522; + xor.b32 %r10600, %r10599, %r10538; + shf.l.wrap.b32 %r10601, %r10600, %r10600, 20; + add.s32 %r10602, %r10596, %r9811; + add.s32 %r10603, %r10602, %r10601; + xor.b32 %r10604, %r10603, %r10598; + shf.l.wrap.b32 %r10605, %r10604, %r10604, 24; + add.s32 %r10606, %r10605, %r10599; + xor.b32 %r10607, %r10606, %r10601; + shf.l.wrap.b32 %r10608, %r10607, %r10607, 25; + add.s32 %r10609, %r10561, %r9867; + add.s32 %r10610, %r10609, %r10580; + xor.b32 %r10611, %r10610, %r10605; + shf.l.wrap.b32 %r10612, %r10611, %r10611, 16; + add.s32 %r10613, %r10612, %r10592; + xor.b32 %r10614, %r10613, %r10580; + shf.l.wrap.b32 %r10615, %r10614, %r10614, 20; + add.s32 %r10616, %r10610, %r9839; + add.s32 %r10617, %r10616, %r10615; + xor.b32 %r10618, %r10617, %r10612; + shf.l.wrap.b32 %r10619, %r10618, %r10618, 24; + add.s32 %r10620, %r10619, %r10613; + xor.b32 %r10621, %r10620, %r10615; + shf.l.wrap.b32 %r10622, %r10621, %r10621, 25; + add.s32 %r10623, %r10575, %r9783; + add.s32 %r10624, %r10623, %r10594; + xor.b32 %r10625, %r10624, %r10563; + shf.l.wrap.b32 %r10626, %r10625, %r10625, 16; + add.s32 %r10627, %r10626, %r10606; + xor.b32 %r10628, %r10627, %r10594; + shf.l.wrap.b32 %r10629, %r10628, %r10628, 20; + add.s32 %r10630, %r10624, %r9853; + add.s32 %r10631, %r10630, %r10629; + xor.b32 %r10632, %r10631, %r10626; + shf.l.wrap.b32 %r10633, %r10632, %r10632, 24; + add.s32 %r10634, %r10633, %r10627; + xor.b32 %r10635, %r10634, %r10629; + shf.l.wrap.b32 %r10636, %r10635, %r10635, 25; + add.s32 %r10637, %r10589, %r9790; + add.s32 %r10638, %r10637, %r10608; + xor.b32 %r10639, %r10638, %r10577; + shf.l.wrap.b32 %r10640, %r10639, %r10639, 16; + add.s32 %r10641, %r10640, %r10564; + xor.b32 %r10642, %r10641, %r10608; + shf.l.wrap.b32 %r10643, %r10642, %r10642, 20; + add.s32 %r10644, %r10638, %r9797; + add.s32 %r10645, 
%r10644, %r10643; + xor.b32 %r10646, %r10645, %r10640; + shf.l.wrap.b32 %r10647, %r10646, %r10646, 24; + add.s32 %r10648, %r10647, %r10641; + xor.b32 %r10649, %r10648, %r10643; + shf.l.wrap.b32 %r10650, %r10649, %r10649, 25; + add.s32 %r10651, %r10603, %r9818; + add.s32 %r10652, %r10651, %r10566; + xor.b32 %r10653, %r10652, %r10591; + shf.l.wrap.b32 %r10654, %r10653, %r10653, 16; + add.s32 %r10655, %r10654, %r10578; + xor.b32 %r10656, %r10655, %r10566; + shf.l.wrap.b32 %r10657, %r10656, %r10656, 20; + add.s32 %r10658, %r10652, %r9860; + add.s32 %r10659, %r10658, %r10657; + xor.b32 %r10660, %r10659, %r10654; + shf.l.wrap.b32 %r10661, %r10660, %r10660, 24; + add.s32 %r10662, %r10661, %r10655; + xor.b32 %r10663, %r10662, %r10657; + shf.l.wrap.b32 %r10664, %r10663, %r10663, 25; + xor.b32 %r11689, %r10648, %r10617; + st.local.u32 [%rd3+-104], %r11689; + xor.b32 %r11688, %r10662, %r10631; + st.local.u32 [%rd3+-100], %r11688; + xor.b32 %r11687, %r10620, %r10645; + st.local.u32 [%rd3+-96], %r11687; + xor.b32 %r11686, %r10659, %r10634; + st.local.u32 [%rd3+-92], %r11686; + xor.b32 %r11685, %r10664, %r10633; + st.local.u32 [%rd3+-88], %r11685; + xor.b32 %r11684, %r10622, %r10647; + st.local.u32 [%rd3+-84], %r11684; + xor.b32 %r11683, %r10661, %r10636; + st.local.u32 [%rd3+-80], %r11683; + xor.b32 %r11682, %r10650, %r10619; + st.local.u32 [%rd3+-76], %r11682; + add.s16 %rs391, %rs391, 1; + st.local.u8 [%rd3+1], %rs391; + add.s64 %rd261, %rd261, 64; + add.s64 %rd271, %rd271, -64; + setp.gt.u64 %p49, %rd271, 64; + @%p49 bra $L__BB1_60; + bra.uni $L__BB1_61; + +$L__BB1_58: + ld.local.u64 %rd269, [%rd3+-72]; + +$L__BB1_61: + cvt.u64.u16 %rd210, %rs390; + and.b64 %rd92, %rd210, 255; + mov.u64 %rd211, 64; + sub.s64 %rd212, %rd211, %rd92; + min.u64 %rd93, %rd212, %rd271; + setp.eq.s64 %p50, %rd93, 0; + @%p50 bra $L__BB1_64; + + add.s64 %rd214, %rd2, %rd92; + add.s64 %rd94, %rd214, 72; + mov.u64 %rd272, 0; + +$L__BB1_63: + add.s64 %rd215, %rd261, %rd272; + ld.local.u8 %rs345, [%rd215]; + add.s64 %rd216, %rd94, %rd272; + st.local.u8 [%rd216], %rs345; + add.s64 %rd272, %rd272, 1; + setp.lt.u64 %p51, %rd272, %rd93; + @%p51 bra $L__BB1_63; + +$L__BB1_64: + cvt.u16.u64 %rs346, %rd93; + ld.local.u8 %rs347, [%rd3]; + add.s16 %rs348, %rs347, %rs346; + st.local.u8 [%rd3], %rs348; + ld.local.u8 %rs392, [%rd3+8]; + cvt.u64.u16 %rd217, %rs392; + and.b64 %rd218, %rd217, 255; + popc.b64 %r10665, %rd269; + cvt.u64.u32 %rd97, %r10665; + setp.ge.u64 %p52, %rd97, %rd218; + @%p52 bra $L__BB1_68; + + ld.local.u8 %r10666, [%rd3+2]; + or.b32 %r135, %r10666, 4; + ld.local.u8 %r10667, [%rd3+-120]; + ld.local.u8 %r10668, [%rd3+-119]; + prmt.b32 %r10669, %r10668, %r10667, 30212; + ld.local.u8 %r10670, [%rd3+-118]; + ld.local.u8 %r10671, [%rd3+-117]; + prmt.b32 %r10672, %r10671, %r10670, 30212; + prmt.b32 %r136, %r10672, %r10669, 4180; + ld.local.u8 %r10673, [%rd3+-136]; + ld.local.u8 %r10674, [%rd3+-135]; + prmt.b32 %r10675, %r10674, %r10673, 30212; + ld.local.u8 %r10676, [%rd3+-134]; + ld.local.u8 %r10677, [%rd3+-133]; + prmt.b32 %r10678, %r10677, %r10676, 30212; + prmt.b32 %r10679, %r10678, %r10675, 4180; + add.s32 %r137, %r136, %r10679; + ld.local.u8 %r10680, [%rd3+-116]; + ld.local.u8 %r10681, [%rd3+-115]; + prmt.b32 %r10682, %r10681, %r10680, 30212; + ld.local.u8 %r10683, [%rd3+-114]; + ld.local.u8 %r10684, [%rd3+-113]; + prmt.b32 %r10685, %r10684, %r10683, 30212; + prmt.b32 %r138, %r10685, %r10682, 4180; + ld.local.u8 %r10686, [%rd3+-132]; + ld.local.u8 %r10687, [%rd3+-131]; + prmt.b32 %r10688, %r10687, %r10686, 30212; + 
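// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). Above: the $L__BB1_60 chunk loop
// stores its eight updated chaining-value words, advances the input pointer
// by 64 bytes, and $L__BB1_61..$L__BB1_64 copy any remaining tail (at most
// 64 - buf_len bytes) into the hasher's block buffer. The popc.b64 on the
// chunk counter, compared against the stored stack length, reads like
// BLAKE3's merge rule: keep compressing parent nodes while the cv stack holds
// more entries than popcount(chunk_counter). $L__BB1_66 below is that
// parent-compression loop, and or.b32 %r135, ..., 4 plausibly sets BLAKE3's
// PARENT flag. This is an inference from the generated code, not a statement
// from the CUDA source.
// ---------------------------------------------------------------------------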
ld.local.u8 %r10689, [%rd3+-130]; + ld.local.u8 %r10690, [%rd3+-129]; + prmt.b32 %r10691, %r10690, %r10689, 30212; + prmt.b32 %r10692, %r10691, %r10688, 4180; + add.s32 %r139, %r138, %r10692; + ld.local.u8 %r10693, [%rd3+-112]; + ld.local.u8 %r10694, [%rd3+-111]; + prmt.b32 %r10695, %r10694, %r10693, 30212; + ld.local.u8 %r10696, [%rd3+-110]; + ld.local.u8 %r10697, [%rd3+-109]; + prmt.b32 %r10698, %r10697, %r10696, 30212; + prmt.b32 %r140, %r10698, %r10695, 4180; + ld.local.u8 %r10699, [%rd3+-128]; + ld.local.u8 %r10700, [%rd3+-127]; + prmt.b32 %r10701, %r10700, %r10699, 30212; + ld.local.u8 %r10702, [%rd3+-126]; + ld.local.u8 %r10703, [%rd3+-125]; + prmt.b32 %r10704, %r10703, %r10702, 30212; + prmt.b32 %r10705, %r10704, %r10701, 4180; + add.s32 %r141, %r140, %r10705; + ld.local.u8 %r10706, [%rd3+-108]; + ld.local.u8 %r10707, [%rd3+-107]; + prmt.b32 %r10708, %r10707, %r10706, 30212; + ld.local.u8 %r10709, [%rd3+-106]; + ld.local.u8 %r10710, [%rd3+-105]; + prmt.b32 %r10711, %r10710, %r10709, 30212; + prmt.b32 %r142, %r10711, %r10708, 4180; + ld.local.u8 %r10712, [%rd3+-124]; + ld.local.u8 %r10713, [%rd3+-123]; + prmt.b32 %r10714, %r10713, %r10712, 30212; + ld.local.u8 %r10715, [%rd3+-122]; + ld.local.u8 %r10716, [%rd3+-121]; + prmt.b32 %r10717, %r10716, %r10715, 30212; + prmt.b32 %r10718, %r10717, %r10714, 4180; + add.s32 %r143, %r142, %r10718; + +$L__BB1_66: + and.b16 %rs349, %rs392, 255; + mul.wide.u16 %r10719, %rs349, 32; + add.s32 %r10720, %r10719, -64; + cvt.s64.s32 %rd219, %r10720; + add.s64 %rd220, %rd2, %rd219; + ld.local.u8 %r10721, [%rd220+145]; + ld.local.u8 %r10722, [%rd220+146]; + prmt.b32 %r10723, %r10722, %r10721, 30212; + ld.local.u8 %r10724, [%rd220+147]; + prmt.b32 %r10725, %r10724, %r10723, 28756; + ld.local.u8 %r10726, [%rd220+148]; + prmt.b32 %r10727, %r10726, %r10725, 1620; + ld.local.u8 %r10728, [%rd220+149]; + ld.local.u8 %r10729, [%rd220+150]; + prmt.b32 %r10730, %r10729, %r10728, 30212; + ld.local.u8 %r10731, [%rd220+151]; + prmt.b32 %r10732, %r10731, %r10730, 28756; + ld.local.u8 %r10733, [%rd220+152]; + prmt.b32 %r10734, %r10733, %r10732, 1620; + ld.local.u8 %r10735, [%rd220+153]; + ld.local.u8 %r10736, [%rd220+154]; + prmt.b32 %r10737, %r10736, %r10735, 30212; + ld.local.u8 %r10738, [%rd220+155]; + prmt.b32 %r10739, %r10738, %r10737, 28756; + ld.local.u8 %r10740, [%rd220+156]; + prmt.b32 %r10741, %r10740, %r10739, 1620; + ld.local.u8 %r10742, [%rd220+157]; + ld.local.u8 %r10743, [%rd220+158]; + prmt.b32 %r10744, %r10743, %r10742, 30212; + ld.local.u8 %r10745, [%rd220+159]; + prmt.b32 %r10746, %r10745, %r10744, 28756; + ld.local.u8 %r10747, [%rd220+160]; + prmt.b32 %r10748, %r10747, %r10746, 1620; + ld.local.u8 %r10749, [%rd220+161]; + ld.local.u8 %r10750, [%rd220+162]; + prmt.b32 %r10751, %r10750, %r10749, 30212; + ld.local.u8 %r10752, [%rd220+163]; + prmt.b32 %r10753, %r10752, %r10751, 28756; + ld.local.u8 %r10754, [%rd220+164]; + prmt.b32 %r10755, %r10754, %r10753, 1620; + ld.local.u8 %r10756, [%rd220+165]; + ld.local.u8 %r10757, [%rd220+166]; + prmt.b32 %r10758, %r10757, %r10756, 30212; + ld.local.u8 %r10759, [%rd220+167]; + prmt.b32 %r10760, %r10759, %r10758, 28756; + ld.local.u8 %r10761, [%rd220+168]; + prmt.b32 %r10762, %r10761, %r10760, 1620; + ld.local.u8 %r10763, [%rd220+169]; + ld.local.u8 %r10764, [%rd220+170]; + prmt.b32 %r10765, %r10764, %r10763, 30212; + ld.local.u8 %r10766, [%rd220+171]; + prmt.b32 %r10767, %r10766, %r10765, 28756; + ld.local.u8 %r10768, [%rd220+172]; + prmt.b32 %r10769, %r10768, %r10767, 1620; + ld.local.u8 %r10770, [%rd220+173]; 
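// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). The surrounding ld.local.u8 /
// prmt.b32 ladders reassemble little-endian 32-bit message words from the
// individual bytes of the cv-stack entry at [%rd220+145..208]: four byte
// loads plus three prmt.b32 byte-permute steps yield one u32. nvcc emits
// this instead of wider loads, presumably because it cannot prove alignment
// for the byte buffer in local memory.
// ---------------------------------------------------------------------------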
+ ld.local.u8 %r10771, [%rd220+174]; + prmt.b32 %r10772, %r10771, %r10770, 30212; + ld.local.u8 %r10773, [%rd220+175]; + prmt.b32 %r10774, %r10773, %r10772, 28756; + ld.local.u8 %r10775, [%rd220+176]; + prmt.b32 %r10776, %r10775, %r10774, 1620; + ld.local.u8 %r10777, [%rd220+177]; + ld.local.u8 %r10778, [%rd220+178]; + prmt.b32 %r10779, %r10778, %r10777, 30212; + ld.local.u8 %r10780, [%rd220+179]; + prmt.b32 %r10781, %r10780, %r10779, 28756; + ld.local.u8 %r10782, [%rd220+180]; + prmt.b32 %r10783, %r10782, %r10781, 1620; + ld.local.u8 %r10784, [%rd220+181]; + ld.local.u8 %r10785, [%rd220+182]; + prmt.b32 %r10786, %r10785, %r10784, 30212; + ld.local.u8 %r10787, [%rd220+183]; + prmt.b32 %r10788, %r10787, %r10786, 28756; + ld.local.u8 %r10789, [%rd220+184]; + prmt.b32 %r10790, %r10789, %r10788, 1620; + ld.local.u8 %r10791, [%rd220+185]; + ld.local.u8 %r10792, [%rd220+186]; + prmt.b32 %r10793, %r10792, %r10791, 30212; + ld.local.u8 %r10794, [%rd220+187]; + prmt.b32 %r10795, %r10794, %r10793, 28756; + ld.local.u8 %r10796, [%rd220+188]; + prmt.b32 %r10797, %r10796, %r10795, 1620; + ld.local.u8 %r10798, [%rd220+189]; + ld.local.u8 %r10799, [%rd220+190]; + prmt.b32 %r10800, %r10799, %r10798, 30212; + ld.local.u8 %r10801, [%rd220+191]; + prmt.b32 %r10802, %r10801, %r10800, 28756; + ld.local.u8 %r10803, [%rd220+192]; + prmt.b32 %r10804, %r10803, %r10802, 1620; + ld.local.u8 %r10805, [%rd220+193]; + ld.local.u8 %r10806, [%rd220+194]; + prmt.b32 %r10807, %r10806, %r10805, 30212; + ld.local.u8 %r10808, [%rd220+195]; + prmt.b32 %r10809, %r10808, %r10807, 28756; + ld.local.u8 %r10810, [%rd220+196]; + prmt.b32 %r10811, %r10810, %r10809, 1620; + ld.local.u8 %r10812, [%rd220+197]; + ld.local.u8 %r10813, [%rd220+198]; + prmt.b32 %r10814, %r10813, %r10812, 30212; + ld.local.u8 %r10815, [%rd220+199]; + prmt.b32 %r10816, %r10815, %r10814, 28756; + ld.local.u8 %r10817, [%rd220+200]; + prmt.b32 %r10818, %r10817, %r10816, 1620; + ld.local.u8 %r10819, [%rd220+201]; + ld.local.u8 %r10820, [%rd220+202]; + prmt.b32 %r10821, %r10820, %r10819, 30212; + ld.local.u8 %r10822, [%rd220+203]; + prmt.b32 %r10823, %r10822, %r10821, 28756; + ld.local.u8 %r10824, [%rd220+204]; + prmt.b32 %r10825, %r10824, %r10823, 1620; + ld.local.u8 %r10826, [%rd220+205]; + ld.local.u8 %r10827, [%rd220+206]; + prmt.b32 %r10828, %r10827, %r10826, 30212; + ld.local.u8 %r10829, [%rd220+207]; + prmt.b32 %r10830, %r10829, %r10828, 28756; + ld.local.u8 %r10831, [%rd220+208]; + prmt.b32 %r10832, %r10831, %r10830, 1620; + add.s32 %r10833, %r137, %r10727; + shf.l.wrap.b32 %r10834, %r10833, %r10833, 16; + add.s32 %r10835, %r10834, 1779033703; + xor.b32 %r10836, %r10835, %r136; + shf.l.wrap.b32 %r10837, %r10836, %r10836, 20; + add.s32 %r10838, %r10734, %r10833; + add.s32 %r10839, %r10838, %r10837; + xor.b32 %r10840, %r10839, %r10834; + shf.l.wrap.b32 %r10841, %r10840, %r10840, 24; + add.s32 %r10842, %r10841, %r10835; + xor.b32 %r10843, %r10842, %r10837; + shf.l.wrap.b32 %r10844, %r10843, %r10843, 25; + add.s32 %r10845, %r139, %r10741; + shf.l.wrap.b32 %r10846, %r10845, %r10845, 16; + add.s32 %r10847, %r10846, -1150833019; + xor.b32 %r10848, %r10847, %r138; + shf.l.wrap.b32 %r10849, %r10848, %r10848, 20; + add.s32 %r10850, %r10748, %r10845; + add.s32 %r10851, %r10850, %r10849; + xor.b32 %r10852, %r10851, %r10846; + shf.l.wrap.b32 %r10853, %r10852, %r10852, 24; + add.s32 %r10854, %r10853, %r10847; + xor.b32 %r10855, %r10854, %r10849; + shf.l.wrap.b32 %r10856, %r10855, %r10855, 25; + add.s32 %r10857, %r141, %r10755; + shr.u32 %r10858, %r10857, 16; + shl.b32 
%r10859, %r10857, 16; + xor.b32 %r10860, %r10859, 4194304; + or.b32 %r10861, %r10860, %r10858; + add.s32 %r10862, %r10861, 1013904242; + xor.b32 %r10863, %r10862, %r140; + shf.l.wrap.b32 %r10864, %r10863, %r10863, 20; + add.s32 %r10865, %r10762, %r10857; + add.s32 %r10866, %r10865, %r10864; + xor.b32 %r10867, %r10866, %r10861; + shf.l.wrap.b32 %r10868, %r10867, %r10867, 24; + add.s32 %r10869, %r10868, %r10862; + xor.b32 %r10870, %r10869, %r10864; + shf.l.wrap.b32 %r10871, %r10870, %r10870, 25; + add.s32 %r10872, %r143, %r10769; + xor.b32 %r10873, %r10872, %r135; + shr.u32 %r10874, %r10872, 16; + shl.b32 %r10875, %r10873, 16; + or.b32 %r10876, %r10875, %r10874; + add.s32 %r10877, %r10876, -1521486534; + xor.b32 %r10878, %r10877, %r142; + shf.l.wrap.b32 %r10879, %r10878, %r10878, 20; + add.s32 %r10880, %r10776, %r10872; + add.s32 %r10881, %r10880, %r10879; + xor.b32 %r10882, %r10881, %r10876; + shf.l.wrap.b32 %r10883, %r10882, %r10882, 24; + add.s32 %r10884, %r10883, %r10877; + xor.b32 %r10885, %r10884, %r10879; + shf.l.wrap.b32 %r10886, %r10885, %r10885, 25; + add.s32 %r10887, %r10856, %r10839; + add.s32 %r10888, %r10887, %r10783; + xor.b32 %r10889, %r10883, %r10888; + shf.l.wrap.b32 %r10890, %r10889, %r10889, 16; + add.s32 %r10891, %r10890, %r10869; + xor.b32 %r10892, %r10891, %r10856; + shf.l.wrap.b32 %r10893, %r10892, %r10892, 20; + add.s32 %r10894, %r10790, %r10888; + add.s32 %r10895, %r10894, %r10893; + xor.b32 %r10896, %r10895, %r10890; + shf.l.wrap.b32 %r10897, %r10896, %r10896, 24; + add.s32 %r10898, %r10897, %r10891; + xor.b32 %r10899, %r10898, %r10893; + shf.l.wrap.b32 %r10900, %r10899, %r10899, 25; + add.s32 %r10901, %r10871, %r10851; + add.s32 %r10902, %r10901, %r10797; + xor.b32 %r10903, %r10902, %r10841; + shf.l.wrap.b32 %r10904, %r10903, %r10903, 16; + add.s32 %r10905, %r10904, %r10884; + xor.b32 %r10906, %r10905, %r10871; + shf.l.wrap.b32 %r10907, %r10906, %r10906, 20; + add.s32 %r10908, %r10804, %r10902; + add.s32 %r10909, %r10908, %r10907; + xor.b32 %r10910, %r10909, %r10904; + shf.l.wrap.b32 %r10911, %r10910, %r10910, 24; + add.s32 %r10912, %r10911, %r10905; + xor.b32 %r10913, %r10912, %r10907; + shf.l.wrap.b32 %r10914, %r10913, %r10913, 25; + add.s32 %r10915, %r10886, %r10866; + add.s32 %r10916, %r10915, %r10811; + xor.b32 %r10917, %r10916, %r10853; + shf.l.wrap.b32 %r10918, %r10917, %r10917, 16; + add.s32 %r10919, %r10918, %r10842; + xor.b32 %r10920, %r10919, %r10886; + shf.l.wrap.b32 %r10921, %r10920, %r10920, 20; + add.s32 %r10922, %r10818, %r10916; + add.s32 %r10923, %r10922, %r10921; + xor.b32 %r10924, %r10923, %r10918; + shf.l.wrap.b32 %r10925, %r10924, %r10924, 24; + add.s32 %r10926, %r10925, %r10919; + xor.b32 %r10927, %r10926, %r10921; + shf.l.wrap.b32 %r10928, %r10927, %r10927, 25; + add.s32 %r10929, %r10881, %r10844; + add.s32 %r10930, %r10929, %r10825; + xor.b32 %r10931, %r10930, %r10868; + shf.l.wrap.b32 %r10932, %r10931, %r10931, 16; + add.s32 %r10933, %r10932, %r10854; + xor.b32 %r10934, %r10933, %r10844; + shf.l.wrap.b32 %r10935, %r10934, %r10934, 20; + add.s32 %r10936, %r10832, %r10930; + add.s32 %r10937, %r10936, %r10935; + xor.b32 %r10938, %r10937, %r10932; + shf.l.wrap.b32 %r10939, %r10938, %r10938, 24; + add.s32 %r10940, %r10939, %r10933; + xor.b32 %r10941, %r10940, %r10935; + shf.l.wrap.b32 %r10942, %r10941, %r10941, 25; + add.s32 %r10943, %r10895, %r10741; + add.s32 %r10944, %r10943, %r10942; + xor.b32 %r10945, %r10944, %r10911; + shf.l.wrap.b32 %r10946, %r10945, %r10945, 16; + add.s32 %r10947, %r10946, %r10926; + xor.b32 %r10948, %r10947, %r10942; 
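// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). The literals woven into the
// first round above are the BLAKE3/SHA-256 IV words (1779033703 = 0x6A09E667,
// -1150833019 = 0xBB67AE85, 1013904242 = 0x3C6EF372, -1521486534 =
// 0xA54FF53A), and 4194304 = 64 << 16 is the 64-byte block length constant-
// folded into a rotate-by-16, since rotl16(x ^ 64) == (x << 16 ^ 0x400000) |
// (x >> 16). The xor with %r135 mixes in the flags word. Together this looks
// like a parent-node compression with counter 0, block length 64, and the
// PARENT flag set -- inferred from the generated code, not from the source.
// ---------------------------------------------------------------------------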
+ shf.l.wrap.b32 %r10949, %r10948, %r10948, 20; + add.s32 %r10950, %r10944, %r10769; + add.s32 %r10951, %r10950, %r10949; + xor.b32 %r10952, %r10951, %r10946; + shf.l.wrap.b32 %r10953, %r10952, %r10952, 24; + add.s32 %r10954, %r10953, %r10947; + xor.b32 %r10955, %r10954, %r10949; + shf.l.wrap.b32 %r10956, %r10955, %r10955, 25; + add.s32 %r10957, %r10909, %r10748; + add.s32 %r10958, %r10957, %r10900; + xor.b32 %r10959, %r10925, %r10958; + shf.l.wrap.b32 %r10960, %r10959, %r10959, 16; + add.s32 %r10961, %r10940, %r10960; + xor.b32 %r10962, %r10961, %r10900; + shf.l.wrap.b32 %r10963, %r10962, %r10962, 20; + add.s32 %r10964, %r10958, %r10797; + add.s32 %r10965, %r10964, %r10963; + xor.b32 %r10966, %r10965, %r10960; + shf.l.wrap.b32 %r10967, %r10966, %r10966, 24; + add.s32 %r10968, %r10967, %r10961; + xor.b32 %r10969, %r10968, %r10963; + shf.l.wrap.b32 %r10970, %r10969, %r10969, 25; + add.s32 %r10971, %r10914, %r10776; + add.s32 %r10972, %r10971, %r10923; + xor.b32 %r10973, %r10939, %r10972; + shf.l.wrap.b32 %r10974, %r10973, %r10973, 16; + add.s32 %r10975, %r10974, %r10898; + xor.b32 %r10976, %r10975, %r10914; + shf.l.wrap.b32 %r10977, %r10976, %r10976, 20; + add.s32 %r10978, %r10972, %r10727; + add.s32 %r10979, %r10978, %r10977; + xor.b32 %r10980, %r10979, %r10974; + shf.l.wrap.b32 %r10981, %r10980, %r10980, 24; + add.s32 %r10982, %r10981, %r10975; + xor.b32 %r10983, %r10982, %r10977; + shf.l.wrap.b32 %r10984, %r10983, %r10983, 25; + add.s32 %r10985, %r10928, %r10755; + add.s32 %r10986, %r10985, %r10937; + xor.b32 %r10987, %r10986, %r10897; + shf.l.wrap.b32 %r10988, %r10987, %r10987, 16; + add.s32 %r10989, %r10988, %r10912; + xor.b32 %r10990, %r10989, %r10928; + shf.l.wrap.b32 %r10991, %r10990, %r10990, 20; + add.s32 %r10992, %r10986, %r10818; + add.s32 %r10993, %r10992, %r10991; + xor.b32 %r10994, %r10993, %r10988; + shf.l.wrap.b32 %r10995, %r10994, %r10994, 24; + add.s32 %r10996, %r10995, %r10989; + xor.b32 %r10997, %r10996, %r10991; + shf.l.wrap.b32 %r10998, %r10997, %r10997, 25; + add.s32 %r10999, %r10970, %r10734; + add.s32 %r11000, %r10999, %r10951; + xor.b32 %r11001, %r11000, %r10995; + shf.l.wrap.b32 %r11002, %r11001, %r11001, 16; + add.s32 %r11003, %r11002, %r10982; + xor.b32 %r11004, %r11003, %r10970; + shf.l.wrap.b32 %r11005, %r11004, %r11004, 20; + add.s32 %r11006, %r11000, %r10804; + add.s32 %r11007, %r11006, %r11005; + xor.b32 %r11008, %r11007, %r11002; + shf.l.wrap.b32 %r11009, %r11008, %r11008, 24; + add.s32 %r11010, %r11009, %r11003; + xor.b32 %r11011, %r11010, %r11005; + shf.l.wrap.b32 %r11012, %r11011, %r11011, 25; + add.s32 %r11013, %r10965, %r10811; + add.s32 %r11014, %r11013, %r10984; + xor.b32 %r11015, %r10953, %r11014; + shf.l.wrap.b32 %r11016, %r11015, %r11015, 16; + add.s32 %r11017, %r11016, %r10996; + xor.b32 %r11018, %r11017, %r10984; + shf.l.wrap.b32 %r11019, %r11018, %r11018, 20; + add.s32 %r11020, %r11014, %r10762; + add.s32 %r11021, %r11020, %r11019; + xor.b32 %r11022, %r11021, %r11016; + shf.l.wrap.b32 %r11023, %r11022, %r11022, 24; + add.s32 %r11024, %r11023, %r11017; + xor.b32 %r11025, %r11024, %r11019; + shf.l.wrap.b32 %r11026, %r11025, %r11025, 25; + add.s32 %r11027, %r10979, %r10790; + add.s32 %r11028, %r11027, %r10998; + xor.b32 %r11029, %r11028, %r10967; + shf.l.wrap.b32 %r11030, %r11029, %r11029, 16; + add.s32 %r11031, %r11030, %r10954; + xor.b32 %r11032, %r11031, %r10998; + shf.l.wrap.b32 %r11033, %r11032, %r11032, 20; + add.s32 %r11034, %r11028, %r10825; + add.s32 %r11035, %r11034, %r11033; + xor.b32 %r11036, %r11035, %r11030; + shf.l.wrap.b32 
%r11037, %r11036, %r11036, 24; + add.s32 %r11038, %r11037, %r11031; + xor.b32 %r11039, %r11038, %r11033; + shf.l.wrap.b32 %r11040, %r11039, %r11039, 25; + add.s32 %r11041, %r10993, %r10832; + add.s32 %r11042, %r11041, %r10956; + xor.b32 %r11043, %r11042, %r10981; + shf.l.wrap.b32 %r11044, %r11043, %r11043, 16; + add.s32 %r11045, %r11044, %r10968; + xor.b32 %r11046, %r11045, %r10956; + shf.l.wrap.b32 %r11047, %r11046, %r11046, 20; + add.s32 %r11048, %r11042, %r10783; + add.s32 %r11049, %r11048, %r11047; + xor.b32 %r11050, %r11049, %r11044; + shf.l.wrap.b32 %r11051, %r11050, %r11050, 24; + add.s32 %r11052, %r11051, %r11045; + xor.b32 %r11053, %r11052, %r11047; + shf.l.wrap.b32 %r11054, %r11053, %r11053, 25; + add.s32 %r11055, %r11007, %r10748; + add.s32 %r11056, %r11055, %r11054; + xor.b32 %r11057, %r11056, %r11023; + shf.l.wrap.b32 %r11058, %r11057, %r11057, 16; + add.s32 %r11059, %r11058, %r11038; + xor.b32 %r11060, %r11059, %r11054; + shf.l.wrap.b32 %r11061, %r11060, %r11060, 20; + add.s32 %r11062, %r11056, %r10755; + add.s32 %r11063, %r11062, %r11061; + xor.b32 %r11064, %r11063, %r11058; + shf.l.wrap.b32 %r11065, %r11064, %r11064, 24; + add.s32 %r11066, %r11065, %r11059; + xor.b32 %r11067, %r11066, %r11061; + shf.l.wrap.b32 %r11068, %r11067, %r11067, 25; + add.s32 %r11069, %r11021, %r10797; + add.s32 %r11070, %r11069, %r11012; + xor.b32 %r11071, %r11070, %r11037; + shf.l.wrap.b32 %r11072, %r11071, %r11071, 16; + add.s32 %r11073, %r11072, %r11052; + xor.b32 %r11074, %r11073, %r11012; + shf.l.wrap.b32 %r11075, %r11074, %r11074, 20; + add.s32 %r11076, %r11070, %r10811; + add.s32 %r11077, %r11076, %r11075; + xor.b32 %r11078, %r11077, %r11072; + shf.l.wrap.b32 %r11079, %r11078, %r11078, 24; + add.s32 %r11080, %r11079, %r11073; + xor.b32 %r11081, %r11080, %r11075; + shf.l.wrap.b32 %r11082, %r11081, %r11081, 25; + add.s32 %r11083, %r11035, %r10818; + add.s32 %r11084, %r11083, %r11026; + xor.b32 %r11085, %r11051, %r11084; + shf.l.wrap.b32 %r11086, %r11085, %r11085, 16; + add.s32 %r11087, %r11086, %r11010; + xor.b32 %r11088, %r11087, %r11026; + shf.l.wrap.b32 %r11089, %r11088, %r11088, 20; + add.s32 %r11090, %r11084, %r10741; + add.s32 %r11091, %r11090, %r11089; + xor.b32 %r11092, %r11091, %r11086; + shf.l.wrap.b32 %r11093, %r11092, %r11092, 24; + add.s32 %r11094, %r11093, %r11087; + xor.b32 %r11095, %r11094, %r11089; + shf.l.wrap.b32 %r11096, %r11095, %r11095, 25; + add.s32 %r11097, %r11040, %r10776; + add.s32 %r11098, %r11097, %r11049; + xor.b32 %r11099, %r11098, %r11009; + shf.l.wrap.b32 %r11100, %r11099, %r11099, 16; + add.s32 %r11101, %r11100, %r11024; + xor.b32 %r11102, %r11101, %r11040; + shf.l.wrap.b32 %r11103, %r11102, %r11102, 20; + add.s32 %r11104, %r11098, %r10825; + add.s32 %r11105, %r11104, %r11103; + xor.b32 %r11106, %r11105, %r11100; + shf.l.wrap.b32 %r11107, %r11106, %r11106, 24; + add.s32 %r11108, %r11107, %r11101; + xor.b32 %r11109, %r11108, %r11103; + shf.l.wrap.b32 %r11110, %r11109, %r11109, 25; + add.s32 %r11111, %r11082, %r10769; + add.s32 %r11112, %r11111, %r11063; + xor.b32 %r11113, %r11112, %r11107; + shf.l.wrap.b32 %r11114, %r11113, %r11113, 16; + add.s32 %r11115, %r11114, %r11094; + xor.b32 %r11116, %r11115, %r11082; + shf.l.wrap.b32 %r11117, %r11116, %r11116, 20; + add.s32 %r11118, %r11112, %r10762; + add.s32 %r11119, %r11118, %r11117; + xor.b32 %r11120, %r11119, %r11114; + shf.l.wrap.b32 %r11121, %r11120, %r11120, 24; + add.s32 %r11122, %r11121, %r11115; + xor.b32 %r11123, %r11122, %r11117; + shf.l.wrap.b32 %r11124, %r11123, %r11123, 25; + add.s32 %r11125, %r11077, 
%r10790; + add.s32 %r11126, %r11125, %r11096; + xor.b32 %r11127, %r11065, %r11126; + shf.l.wrap.b32 %r11128, %r11127, %r11127, 16; + add.s32 %r11129, %r11128, %r11108; + xor.b32 %r11130, %r11129, %r11096; + shf.l.wrap.b32 %r11131, %r11130, %r11130, 20; + add.s32 %r11132, %r11126, %r10727; + add.s32 %r11133, %r11132, %r11131; + xor.b32 %r11134, %r11133, %r11128; + shf.l.wrap.b32 %r11135, %r11134, %r11134, 24; + add.s32 %r11136, %r11135, %r11129; + xor.b32 %r11137, %r11136, %r11131; + shf.l.wrap.b32 %r11138, %r11137, %r11137, 25; + add.s32 %r11139, %r11091, %r10804; + add.s32 %r11140, %r11139, %r11110; + xor.b32 %r11141, %r11140, %r11079; + shf.l.wrap.b32 %r11142, %r11141, %r11141, 16; + add.s32 %r11143, %r11142, %r11066; + xor.b32 %r11144, %r11143, %r11110; + shf.l.wrap.b32 %r11145, %r11144, %r11144, 20; + add.s32 %r11146, %r11140, %r10832; + add.s32 %r11147, %r11146, %r11145; + xor.b32 %r11148, %r11147, %r11142; + shf.l.wrap.b32 %r11149, %r11148, %r11148, 24; + add.s32 %r11150, %r11149, %r11143; + xor.b32 %r11151, %r11150, %r11145; + shf.l.wrap.b32 %r11152, %r11151, %r11151, 25; + add.s32 %r11153, %r11105, %r10783; + add.s32 %r11154, %r11153, %r11068; + xor.b32 %r11155, %r11154, %r11093; + shf.l.wrap.b32 %r11156, %r11155, %r11155, 16; + add.s32 %r11157, %r11156, %r11080; + xor.b32 %r11158, %r11157, %r11068; + shf.l.wrap.b32 %r11159, %r11158, %r11158, 20; + add.s32 %r11160, %r11154, %r10734; + add.s32 %r11161, %r11160, %r11159; + xor.b32 %r11162, %r11161, %r11156; + shf.l.wrap.b32 %r11163, %r11162, %r11162, 24; + add.s32 %r11164, %r11163, %r11157; + xor.b32 %r11165, %r11164, %r11159; + shf.l.wrap.b32 %r11166, %r11165, %r11165, 25; + add.s32 %r11167, %r11119, %r10797; + add.s32 %r11168, %r11167, %r11166; + xor.b32 %r11169, %r11168, %r11135; + shf.l.wrap.b32 %r11170, %r11169, %r11169, 16; + add.s32 %r11171, %r11170, %r11150; + xor.b32 %r11172, %r11171, %r11166; + shf.l.wrap.b32 %r11173, %r11172, %r11172, 20; + add.s32 %r11174, %r11168, %r10776; + add.s32 %r11175, %r11174, %r11173; + xor.b32 %r11176, %r11175, %r11170; + shf.l.wrap.b32 %r11177, %r11176, %r11176, 24; + add.s32 %r11178, %r11177, %r11171; + xor.b32 %r11179, %r11178, %r11173; + shf.l.wrap.b32 %r11180, %r11179, %r11179, 25; + add.s32 %r11181, %r11133, %r10811; + add.s32 %r11182, %r11181, %r11124; + xor.b32 %r11183, %r11182, %r11149; + shf.l.wrap.b32 %r11184, %r11183, %r11183, 16; + add.s32 %r11185, %r11184, %r11164; + xor.b32 %r11186, %r11185, %r11124; + shf.l.wrap.b32 %r11187, %r11186, %r11186, 20; + add.s32 %r11188, %r11182, %r10790; + add.s32 %r11189, %r11188, %r11187; + xor.b32 %r11190, %r11189, %r11184; + shf.l.wrap.b32 %r11191, %r11190, %r11190, 24; + add.s32 %r11192, %r11191, %r11185; + xor.b32 %r11193, %r11192, %r11187; + shf.l.wrap.b32 %r11194, %r11193, %r11193, 25; + add.s32 %r11195, %r11147, %r10825; + add.s32 %r11196, %r11195, %r11138; + xor.b32 %r11197, %r11163, %r11196; + shf.l.wrap.b32 %r11198, %r11197, %r11197, 16; + add.s32 %r11199, %r11198, %r11122; + xor.b32 %r11200, %r11199, %r11138; + shf.l.wrap.b32 %r11201, %r11200, %r11200, 20; + add.s32 %r11202, %r11196, %r10748; + add.s32 %r11203, %r11202, %r11201; + xor.b32 %r11204, %r11203, %r11198; + shf.l.wrap.b32 %r11205, %r11204, %r11204, 24; + add.s32 %r11206, %r11205, %r11199; + xor.b32 %r11207, %r11206, %r11201; + shf.l.wrap.b32 %r11208, %r11207, %r11207, 25; + add.s32 %r11209, %r11152, %r10818; + add.s32 %r11210, %r11209, %r11161; + xor.b32 %r11211, %r11210, %r11121; + shf.l.wrap.b32 %r11212, %r11211, %r11211, 16; + add.s32 %r11213, %r11212, %r11136; + xor.b32 
%r11214, %r11213, %r11152; + shf.l.wrap.b32 %r11215, %r11214, %r11214, 20; + add.s32 %r11216, %r11210, %r10832; + add.s32 %r11217, %r11216, %r11215; + xor.b32 %r11218, %r11217, %r11212; + shf.l.wrap.b32 %r11219, %r11218, %r11218, 24; + add.s32 %r11220, %r11219, %r11213; + xor.b32 %r11221, %r11220, %r11215; + shf.l.wrap.b32 %r11222, %r11221, %r11221, 25; + add.s32 %r11223, %r11194, %r10755; + add.s32 %r11224, %r11223, %r11175; + xor.b32 %r11225, %r11224, %r11219; + shf.l.wrap.b32 %r11226, %r11225, %r11225, 16; + add.s32 %r11227, %r11226, %r11206; + xor.b32 %r11228, %r11227, %r11194; + shf.l.wrap.b32 %r11229, %r11228, %r11228, 20; + add.s32 %r11230, %r11224, %r10727; + add.s32 %r11231, %r11230, %r11229; + xor.b32 %r11232, %r11231, %r11226; + shf.l.wrap.b32 %r11233, %r11232, %r11232, 24; + add.s32 %r11234, %r11233, %r11227; + xor.b32 %r11235, %r11234, %r11229; + shf.l.wrap.b32 %r11236, %r11235, %r11235, 25; + add.s32 %r11237, %r11189, %r10804; + add.s32 %r11238, %r11237, %r11208; + xor.b32 %r11239, %r11177, %r11238; + shf.l.wrap.b32 %r11240, %r11239, %r11239, 16; + add.s32 %r11241, %r11240, %r11220; + xor.b32 %r11242, %r11241, %r11208; + shf.l.wrap.b32 %r11243, %r11242, %r11242, 20; + add.s32 %r11244, %r11238, %r10741; + add.s32 %r11245, %r11244, %r11243; + xor.b32 %r11246, %r11245, %r11240; + shf.l.wrap.b32 %r11247, %r11246, %r11246, 24; + add.s32 %r11248, %r11247, %r11241; + xor.b32 %r11249, %r11248, %r11243; + shf.l.wrap.b32 %r11250, %r11249, %r11249, 25; + add.s32 %r11251, %r11203, %r10762; + add.s32 %r11252, %r11251, %r11222; + xor.b32 %r11253, %r11252, %r11191; + shf.l.wrap.b32 %r11254, %r11253, %r11253, 16; + add.s32 %r11255, %r11254, %r11178; + xor.b32 %r11256, %r11255, %r11222; + shf.l.wrap.b32 %r11257, %r11256, %r11256, 20; + add.s32 %r11258, %r11252, %r10783; + add.s32 %r11259, %r11258, %r11257; + xor.b32 %r11260, %r11259, %r11254; + shf.l.wrap.b32 %r11261, %r11260, %r11260, 24; + add.s32 %r11262, %r11261, %r11255; + xor.b32 %r11263, %r11262, %r11257; + shf.l.wrap.b32 %r11264, %r11263, %r11263, 25; + add.s32 %r11265, %r11217, %r10734; + add.s32 %r11266, %r11265, %r11180; + xor.b32 %r11267, %r11266, %r11205; + shf.l.wrap.b32 %r11268, %r11267, %r11267, 16; + add.s32 %r11269, %r11268, %r11192; + xor.b32 %r11270, %r11269, %r11180; + shf.l.wrap.b32 %r11271, %r11270, %r11270, 20; + add.s32 %r11272, %r11266, %r10769; + add.s32 %r11273, %r11272, %r11271; + xor.b32 %r11274, %r11273, %r11268; + shf.l.wrap.b32 %r11275, %r11274, %r11274, 24; + add.s32 %r11276, %r11275, %r11269; + xor.b32 %r11277, %r11276, %r11271; + shf.l.wrap.b32 %r11278, %r11277, %r11277, 25; + add.s32 %r11279, %r11231, %r10811; + add.s32 %r11280, %r11279, %r11278; + xor.b32 %r11281, %r11280, %r11247; + shf.l.wrap.b32 %r11282, %r11281, %r11281, 16; + add.s32 %r11283, %r11282, %r11262; + xor.b32 %r11284, %r11283, %r11278; + shf.l.wrap.b32 %r11285, %r11284, %r11284, 20; + add.s32 %r11286, %r11280, %r10818; + add.s32 %r11287, %r11286, %r11285; + xor.b32 %r11288, %r11287, %r11282; + shf.l.wrap.b32 %r11289, %r11288, %r11288, 24; + add.s32 %r11290, %r11289, %r11283; + xor.b32 %r11291, %r11290, %r11285; + shf.l.wrap.b32 %r11292, %r11291, %r11291, 25; + add.s32 %r11293, %r11245, %r10790; + add.s32 %r11294, %r11293, %r11236; + xor.b32 %r11295, %r11294, %r11261; + shf.l.wrap.b32 %r11296, %r11295, %r11295, 16; + add.s32 %r11297, %r11296, %r11276; + xor.b32 %r11298, %r11297, %r11236; + shf.l.wrap.b32 %r11299, %r11298, %r11298, 20; + add.s32 %r11300, %r11294, %r10804; + add.s32 %r11301, %r11300, %r11299; + xor.b32 %r11302, %r11301, 
%r11296; + shf.l.wrap.b32 %r11303, %r11302, %r11302, 24; + add.s32 %r11304, %r11303, %r11297; + xor.b32 %r11305, %r11304, %r11299; + shf.l.wrap.b32 %r11306, %r11305, %r11305, 25; + add.s32 %r11307, %r11259, %r10832; + add.s32 %r11308, %r11307, %r11250; + xor.b32 %r11309, %r11275, %r11308; + shf.l.wrap.b32 %r11310, %r11309, %r11309, 16; + add.s32 %r11311, %r11310, %r11234; + xor.b32 %r11312, %r11311, %r11250; + shf.l.wrap.b32 %r11313, %r11312, %r11312, 20; + add.s32 %r11314, %r11308, %r10797; + add.s32 %r11315, %r11314, %r11313; + xor.b32 %r11316, %r11315, %r11310; + shf.l.wrap.b32 %r11317, %r11316, %r11316, 24; + add.s32 %r11318, %r11317, %r11311; + xor.b32 %r11319, %r11318, %r11313; + shf.l.wrap.b32 %r11320, %r11319, %r11319, 25; + add.s32 %r11321, %r11264, %r10825; + add.s32 %r11322, %r11321, %r11273; + xor.b32 %r11323, %r11322, %r11233; + shf.l.wrap.b32 %r11324, %r11323, %r11323, 16; + add.s32 %r11325, %r11324, %r11248; + xor.b32 %r11326, %r11325, %r11264; + shf.l.wrap.b32 %r11327, %r11326, %r11326, 20; + add.s32 %r11328, %r11322, %r10783; + add.s32 %r11329, %r11328, %r11327; + xor.b32 %r11330, %r11329, %r11324; + shf.l.wrap.b32 %r11331, %r11330, %r11330, 24; + add.s32 %r11332, %r11331, %r11325; + xor.b32 %r11333, %r11332, %r11327; + shf.l.wrap.b32 %r11334, %r11333, %r11333, 25; + add.s32 %r11335, %r11306, %r10776; + add.s32 %r11336, %r11335, %r11287; + xor.b32 %r11337, %r11336, %r11331; + shf.l.wrap.b32 %r11338, %r11337, %r11337, 16; + add.s32 %r11339, %r11338, %r11318; + xor.b32 %r11340, %r11339, %r11306; + shf.l.wrap.b32 %r11341, %r11340, %r11340, 20; + add.s32 %r11342, %r11336, %r10741; + add.s32 %r11343, %r11342, %r11341; + xor.b32 %r11344, %r11343, %r11338; + shf.l.wrap.b32 %r11345, %r11344, %r11344, 24; + add.s32 %r11346, %r11345, %r11339; + xor.b32 %r11347, %r11346, %r11341; + shf.l.wrap.b32 %r11348, %r11347, %r11347, 25; + add.s32 %r11349, %r11301, %r10762; + add.s32 %r11350, %r11349, %r11320; + xor.b32 %r11351, %r11289, %r11350; + shf.l.wrap.b32 %r11352, %r11351, %r11351, 16; + add.s32 %r11353, %r11352, %r11332; + xor.b32 %r11354, %r11353, %r11320; + shf.l.wrap.b32 %r11355, %r11354, %r11354, 20; + add.s32 %r11356, %r11350, %r10748; + add.s32 %r11357, %r11356, %r11355; + xor.b32 %r11358, %r11357, %r11352; + shf.l.wrap.b32 %r11359, %r11358, %r11358, 24; + add.s32 %r11360, %r11359, %r11353; + xor.b32 %r11361, %r11360, %r11355; + shf.l.wrap.b32 %r11362, %r11361, %r11361, 25; + add.s32 %r11363, %r11315, %r10727; + add.s32 %r11364, %r11363, %r11334; + xor.b32 %r11365, %r11364, %r11303; + shf.l.wrap.b32 %r11366, %r11365, %r11365, 16; + add.s32 %r11367, %r11366, %r11290; + xor.b32 %r11368, %r11367, %r11334; + shf.l.wrap.b32 %r11369, %r11368, %r11368, 20; + add.s32 %r11370, %r11364, %r10734; + add.s32 %r11371, %r11370, %r11369; + xor.b32 %r11372, %r11371, %r11366; + shf.l.wrap.b32 %r11373, %r11372, %r11372, 24; + add.s32 %r11374, %r11373, %r11367; + xor.b32 %r11375, %r11374, %r11369; + shf.l.wrap.b32 %r11376, %r11375, %r11375, 25; + add.s32 %r11377, %r11329, %r10769; + add.s32 %r11378, %r11377, %r11292; + xor.b32 %r11379, %r11378, %r11317; + shf.l.wrap.b32 %r11380, %r11379, %r11379, 16; + add.s32 %r11381, %r11380, %r11304; + xor.b32 %r11382, %r11381, %r11292; + shf.l.wrap.b32 %r11383, %r11382, %r11382, 20; + add.s32 %r11384, %r11378, %r10755; + add.s32 %r11385, %r11384, %r11383; + xor.b32 %r11386, %r11385, %r11380; + shf.l.wrap.b32 %r11387, %r11386, %r11386, 24; + add.s32 %r11388, %r11387, %r11381; + xor.b32 %r11389, %r11388, %r11383; + shf.l.wrap.b32 %r11390, %r11389, %r11389, 25; + 
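// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). The same unrolled round pattern
// continues here; between rounds the message words are consumed through
// BLAKE3's fixed permutation (the MSG_SCHEDULE table declared at the top of
// this file), visible as the shifting order of the message registers
// (%r10727..%r10832) from one round to the next.
// ---------------------------------------------------------------------------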
add.s32 %r11391, %r11343, %r10790; + add.s32 %r11392, %r11391, %r11390; + xor.b32 %r11393, %r11392, %r11359; + shf.l.wrap.b32 %r11394, %r11393, %r11393, 16; + add.s32 %r11395, %r11394, %r11374; + xor.b32 %r11396, %r11395, %r11390; + shf.l.wrap.b32 %r11397, %r11396, %r11396, 20; + add.s32 %r11398, %r11392, %r10825; + add.s32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r11399, %r11394; + shf.l.wrap.b32 %r11401, %r11400, %r11400, 24; + add.s32 %r11402, %r11401, %r11395; + xor.b32 %r11403, %r11402, %r11397; + shf.l.wrap.b32 %r11404, %r11403, %r11403, 25; + add.s32 %r11405, %r11357, %r10804; + add.s32 %r11406, %r11405, %r11348; + xor.b32 %r11407, %r11406, %r11373; + shf.l.wrap.b32 %r11408, %r11407, %r11407, 16; + add.s32 %r11409, %r11408, %r11388; + xor.b32 %r11410, %r11409, %r11348; + shf.l.wrap.b32 %r11411, %r11410, %r11410, 20; + add.s32 %r11412, %r11406, %r10762; + add.s32 %r11413, %r11412, %r11411; + xor.b32 %r11414, %r11413, %r11408; + shf.l.wrap.b32 %r11415, %r11414, %r11414, 24; + add.s32 %r11416, %r11415, %r11409; + xor.b32 %r11417, %r11416, %r11411; + shf.l.wrap.b32 %r11418, %r11417, %r11417, 25; + add.s32 %r11419, %r11371, %r10783; + add.s32 %r11420, %r11419, %r11362; + xor.b32 %r11421, %r11387, %r11420; + shf.l.wrap.b32 %r11422, %r11421, %r11421, 16; + add.s32 %r11423, %r11422, %r11346; + xor.b32 %r11424, %r11423, %r11362; + shf.l.wrap.b32 %r11425, %r11424, %r11424, 20; + add.s32 %r11426, %r11420, %r10811; + add.s32 %r11427, %r11426, %r11425; + xor.b32 %r11428, %r11427, %r11422; + shf.l.wrap.b32 %r11429, %r11428, %r11428, 24; + add.s32 %r11430, %r11429, %r11423; + xor.b32 %r11431, %r11430, %r11425; + shf.l.wrap.b32 %r11432, %r11431, %r11431, 25; + add.s32 %r11433, %r11376, %r10832; + add.s32 %r11434, %r11433, %r11385; + xor.b32 %r11435, %r11434, %r11345; + shf.l.wrap.b32 %r11436, %r11435, %r11435, 16; + add.s32 %r11437, %r11436, %r11360; + xor.b32 %r11438, %r11437, %r11376; + shf.l.wrap.b32 %r11439, %r11438, %r11438, 20; + add.s32 %r11440, %r11434, %r10734; + add.s32 %r11441, %r11440, %r11439; + xor.b32 %r11442, %r11441, %r11436; + shf.l.wrap.b32 %r11443, %r11442, %r11442, 24; + add.s32 %r11444, %r11443, %r11437; + xor.b32 %r11445, %r11444, %r11439; + shf.l.wrap.b32 %r11446, %r11445, %r11445, 25; + add.s32 %r11447, %r11418, %r10818; + add.s32 %r11448, %r11447, %r11399; + xor.b32 %r11449, %r11448, %r11443; + shf.l.wrap.b32 %r11450, %r11449, %r11449, 16; + add.s32 %r11451, %r11450, %r11430; + xor.b32 %r11452, %r11451, %r11418; + shf.l.wrap.b32 %r11453, %r11452, %r11452, 20; + add.s32 %r11454, %r11448, %r10748; + add.s32 %r11455, %r11454, %r11453; + xor.b32 %r11456, %r11455, %r11450; + shf.l.wrap.b32 %r11457, %r11456, %r11456, 24; + add.s32 %r11458, %r11457, %r11451; + xor.b32 %r11459, %r11458, %r11453; + shf.l.wrap.b32 %r11460, %r11459, %r11459, 25; + add.s32 %r11461, %r11413, %r10727; + add.s32 %r11462, %r11461, %r11432; + xor.b32 %r11463, %r11401, %r11462; + shf.l.wrap.b32 %r11464, %r11463, %r11463, 16; + add.s32 %r11465, %r11464, %r11444; + xor.b32 %r11466, %r11465, %r11432; + shf.l.wrap.b32 %r11467, %r11466, %r11466, 20; + add.s32 %r11468, %r11462, %r10797; + add.s32 %r11469, %r11468, %r11467; + xor.b32 %r11470, %r11469, %r11464; + shf.l.wrap.b32 %r11471, %r11470, %r11470, 24; + add.s32 %r11472, %r11471, %r11465; + xor.b32 %r11473, %r11472, %r11467; + shf.l.wrap.b32 %r11474, %r11473, %r11473, 25; + add.s32 %r11475, %r11427, %r10741; + add.s32 %r11476, %r11475, %r11446; + xor.b32 %r11477, %r11476, %r11415; + shf.l.wrap.b32 %r11478, %r11477, %r11477, 16; + add.s32 %r11479, 
%r11478, %r11402; + xor.b32 %r11480, %r11479, %r11446; + shf.l.wrap.b32 %r11481, %r11480, %r11480, 20; + add.s32 %r11482, %r11476, %r10769; + add.s32 %r11483, %r11482, %r11481; + xor.b32 %r11484, %r11483, %r11478; + shf.l.wrap.b32 %r11485, %r11484, %r11484, 24; + add.s32 %r11486, %r11485, %r11479; + xor.b32 %r11487, %r11486, %r11481; + shf.l.wrap.b32 %r11488, %r11487, %r11487, 25; + add.s32 %r11489, %r11441, %r10755; + add.s32 %r11490, %r11489, %r11404; + xor.b32 %r11491, %r11490, %r11429; + shf.l.wrap.b32 %r11492, %r11491, %r11491, 16; + add.s32 %r11493, %r11492, %r11416; + xor.b32 %r11494, %r11493, %r11404; + shf.l.wrap.b32 %r11495, %r11494, %r11494, 20; + add.s32 %r11496, %r11490, %r10776; + add.s32 %r11497, %r11496, %r11495; + xor.b32 %r11498, %r11497, %r11492; + shf.l.wrap.b32 %r11499, %r11498, %r11498, 24; + add.s32 %r11500, %r11499, %r11493; + xor.b32 %r11501, %r11500, %r11495; + shf.l.wrap.b32 %r11502, %r11501, %r11501, 25; + add.s32 %r11503, %r11455, %r10804; + add.s32 %r11504, %r11503, %r11502; + xor.b32 %r11505, %r11504, %r11471; + shf.l.wrap.b32 %r11506, %r11505, %r11505, 16; + add.s32 %r11507, %r11506, %r11486; + xor.b32 %r11508, %r11507, %r11502; + shf.l.wrap.b32 %r11509, %r11508, %r11508, 20; + add.s32 %r11510, %r11504, %r10832; + add.s32 %r11511, %r11510, %r11509; + xor.b32 %r11512, %r11511, %r11506; + shf.l.wrap.b32 %r11513, %r11512, %r11512, 24; + add.s32 %r11514, %r11513, %r11507; + xor.b32 %r11515, %r11514, %r11509; + shf.l.wrap.b32 %r11516, %r11515, %r11515, 25; + add.s32 %r11517, %r11469, %r10762; + add.s32 %r11518, %r11517, %r11460; + xor.b32 %r11519, %r11518, %r11485; + shf.l.wrap.b32 %r11520, %r11519, %r11519, 16; + add.s32 %r11521, %r11520, %r11500; + xor.b32 %r11522, %r11521, %r11460; + shf.l.wrap.b32 %r11523, %r11522, %r11522, 20; + add.s32 %r11524, %r11518, %r10727; + add.s32 %r11525, %r11524, %r11523; + xor.b32 %r11526, %r11525, %r11520; + shf.l.wrap.b32 %r11527, %r11526, %r11526, 24; + add.s32 %r11528, %r11527, %r11521; + xor.b32 %r11529, %r11528, %r11523; + shf.l.wrap.b32 %r11530, %r11529, %r11529, 25; + add.s32 %r11531, %r11483, %r10734; + add.s32 %r11532, %r11531, %r11474; + xor.b32 %r11533, %r11499, %r11532; + shf.l.wrap.b32 %r11534, %r11533, %r11533, 16; + add.s32 %r11535, %r11534, %r11458; + xor.b32 %r11536, %r11535, %r11474; + shf.l.wrap.b32 %r11537, %r11536, %r11536, 20; + add.s32 %r11538, %r11532, %r10790; + add.s32 %r11539, %r11538, %r11537; + xor.b32 %r11540, %r11539, %r11534; + shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, %r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + 
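// ---------------------------------------------------------------------------
// Editor's annotation (not compiler output). What appears to be the last of
// the unrolled rounds ends just below; after it the two halves of the 16-word
// state are xored pairwise (xor.b32 %r11615..%r11622) to produce the eight
// output chaining-value words, which are stored back byte-by-byte into the
// cv-stack entry at [%rd220+145..176].
// ---------------------------------------------------------------------------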
xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + xor.b32 %r11620, %r11572, %r11597; + xor.b32 %r11621, %r11611, %r11586; + xor.b32 %r11622, %r11600, %r11569; + st.local.u8 [%rd220+145], %r11615; + shr.u32 %r11623, %r11615, 8; + st.local.u8 [%rd220+146], %r11623; + shr.u32 %r11624, %r11615, 16; + st.local.u8 [%rd220+147], %r11624; + shr.u32 %r11625, %r11615, 24; + st.local.u8 [%rd220+148], %r11625; + st.local.u8 [%rd220+149], %r11616; + shr.u32 %r11626, %r11616, 8; + st.local.u8 [%rd220+150], %r11626; + shr.u32 %r11627, %r11616, 16; + st.local.u8 [%rd220+151], %r11627; + shr.u32 %r11628, %r11616, 24; + st.local.u8 [%rd220+152], %r11628; + st.local.u8 [%rd220+153], %r11617; + shr.u32 %r11629, %r11617, 8; + st.local.u8 [%rd220+154], %r11629; + shr.u32 %r11630, %r11617, 16; + st.local.u8 [%rd220+155], %r11630; + shr.u32 %r11631, %r11617, 24; + st.local.u8 [%rd220+156], %r11631; + st.local.u8 [%rd220+157], %r11618; + shr.u32 %r11632, %r11618, 8; + st.local.u8 [%rd220+158], %r11632; + shr.u32 %r11633, %r11618, 16; + st.local.u8 [%rd220+159], %r11633; + shr.u32 %r11634, %r11618, 24; + st.local.u8 [%rd220+160], %r11634; + st.local.u8 [%rd220+161], %r11619; + shr.u32 %r11635, %r11619, 8; + st.local.u8 [%rd220+162], %r11635; + shr.u32 %r11636, %r11619, 16; + st.local.u8 [%rd220+163], %r11636; + shr.u32 %r11637, %r11619, 24; + st.local.u8 [%rd220+164], %r11637; + st.local.u8 [%rd220+165], %r11620; + shr.u32 %r11638, %r11620, 8; + st.local.u8 [%rd220+166], %r11638; + shr.u32 %r11639, %r11620, 16; + st.local.u8 
[%rd220+167], %r11639; + shr.u32 %r11640, %r11620, 24; + st.local.u8 [%rd220+168], %r11640; + st.local.u8 [%rd220+169], %r11621; + shr.u32 %r11641, %r11621, 8; + st.local.u8 [%rd220+170], %r11641; + shr.u32 %r11642, %r11621, 16; + st.local.u8 [%rd220+171], %r11642; + shr.u32 %r11643, %r11621, 24; + st.local.u8 [%rd220+172], %r11643; + st.local.u8 [%rd220+173], %r11622; + shr.u32 %r11644, %r11622, 8; + st.local.u8 [%rd220+174], %r11644; + shr.u32 %r11645, %r11622, 16; + st.local.u8 [%rd220+175], %r11645; + shr.u32 %r11646, %r11622, 24; + st.local.u8 [%rd220+176], %r11646; + add.s16 %rs392, %rs392, -1; + cvt.u64.u16 %rd221, %rs392; + and.b64 %rd222, %rd221, 255; + setp.lt.u64 %p53, %rd97, %rd222; + @%p53 bra $L__BB1_66; + + ld.param.u64 %rd233, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + cvta.to.local.u64 %rd232, %rd233; + add.s64 %rd231, %rd232, 136; + st.local.u8 [%rd231+8], %rs392; + +$L__BB1_68: + ret; + +} + // .globl heavy_hash +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5, + .param .u64 heavy_hash_param_6, + .param .u64 heavy_hash_param_7 +) +{ + .local .align 16 .b8 __local_depot2[2096]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<64>; + .reg .b16 %rs<866>; + .reg .b32 %r<30985>; + .reg .b64 %rd<1282>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs409, [heavy_hash_param_3]; + ld.param.u64 %rd349, [heavy_hash_param_0]; + ld.param.u64 %rd350, [heavy_hash_param_1]; + ld.param.u64 %rd351, [heavy_hash_param_2]; + ld.param.u64 %rd355, [heavy_hash_param_4]; + ld.param.u64 %rd352, [heavy_hash_param_5]; + ld.param.u64 %rd353, [heavy_hash_param_6]; + ld.param.u64 %rd354, [heavy_hash_param_7]; + cvta.to.global.u64 %rd1254, %rd353; + cvta.to.global.u64 %rd2, %rd355; + add.u64 %rd356, %SP, 0; + add.u64 %rd3, %SPL, 0; + add.u64 %rd4, %SPL, 2016; + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %tid.x; + or.b32 %r5023, %r2, %r1; + setp.ne.s32 %p6, %r5023, 0; + @%p6 bra $L__BB2_8; + + add.u64 %rd358, %SP, 2000; + add.u64 %rd359, %SPL, 2000; + mov.u32 %r29535, 0; + mov.u64 %rd360, 0; + st.local.v2.u32 [%rd359], {%r2, %r1}; + mov.u64 %rd361, $str$2; + cvta.global.u64 %rd362, %rd361; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd362; + .param .b64 param1; + st.param.b64 [param1+0], %rd358; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5025, [retval0+0]; + } // callseq 3 + mov.u64 %rd363, $str$3; + cvta.global.u64 %rd364, %rd363; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd364; + .param .b64 param1; + st.param.b64 [param1+0], %rd360; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5026, [retval0+0]; + } // callseq 4 + mov.u64 %rd1252, %rd1254; + +$L__BB2_2: + ld.global.u8 %r5027, [%rd1252+1280]; + st.local.u32 [%rd3], %r5027; + mov.u64 %rd368, $str; + cvta.global.u64 %rd369, %rd368; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5028, [retval0+0]; + } // callseq 5 + add.s64 %rd1252, %rd1252, 1; + add.s32 %r29535, %r29535, 1; + setp.lt.u32 %p7, 
%r29535, 128; + @%p7 bra $L__BB2_2; + + mov.u64 %rd371, $str$1; + cvta.global.u64 %rd372, %rd371; + mov.u32 %r29536, 0; + mov.u64 %rd373, 0; + { // callseq 6, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5030, [retval0+0]; + } // callseq 6 + mov.u64 %rd374, $str$4; + cvta.global.u64 %rd375, %rd374; + { // callseq 7, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd375; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5031, [retval0+0]; + } // callseq 7 + mov.u64 %rd1253, %rd1254; + +$L__BB2_4: + ld.global.u8 %r5032, [%rd1253+5376]; + st.local.u32 [%rd3], %r5032; + { // callseq 8, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5033, [retval0+0]; + } // callseq 8 + add.s64 %rd1253, %rd1253, 1; + add.s32 %r29536, %r29536, 1; + setp.lt.u32 %p8, %r29536, 128; + @%p8 bra $L__BB2_4; + + mov.u32 %r29537, 0; + mov.u64 %rd381, 0; + { // callseq 9, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5035, [retval0+0]; + } // callseq 9 + mov.u64 %rd382, $str$5; + cvta.global.u64 %rd383, %rd382; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd383; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5036, [retval0+0]; + } // callseq 10 + +$L__BB2_6: + ld.global.u8 %r5037, [%rd1254+1580160]; + st.local.u32 [%rd3], %r5037; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5038, [retval0+0]; + } // callseq 11 + add.s64 %rd1254, %rd1254, 1; + add.s32 %r29537, %r29537, 1; + setp.lt.u32 %p9, %r29537, 128; + @%p9 bra $L__BB2_6; + + mov.u64 %rd389, 0; + { // callseq 12, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd389; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5039, [retval0+0]; + } // callseq 12 + +$L__BB2_8: + mov.u32 %r5040, %ntid.x; + mad.lo.s32 %r5041, %r1, %r5040, %r2; + cvt.s64.s32 %rd14, %r5041; + setp.ge.u64 %p10, %rd14, %rd351; + @%p10 bra $L__BB2_113; + + cvt.u32.u64 %r5042, %rd14; + setp.ne.s32 %p11, %r5042, 0; + @%p11 bra $L__BB2_11; + + cvta.to.global.u64 %rd390, %rd352; + mov.u64 %rd391, 0; + st.global.u64 [%rd390], %rd391; + +$L__BB2_11: + setp.eq.s16 %p12, %rs409, 0; + @%p12 bra $L__BB2_13; + + shl.b64 %rd392, %rd14, 5; + add.s64 %rd393, %rd2, %rd392; + ld.global.v2.u64 {%rd394, %rd395}, [%rd393]; + mul.lo.s64 %rd398, %rd395, 5; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd398, 7; + shr.b64 %rhs, %rd398, 57; + add.u64 %rd399, %lhs, %rhs; + } + 
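+ // NOTE (added annotation, not compiler output): this block implements the xoshiro256** generator: output = rotl(s1 * 5, 7) * 9 (the * 9 follows below), then the s1 << 17 state update; the result seeds this thread's nonce candidate when random generation is enabled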
mul.lo.s64 %rd1255, %rd399, 9; + shl.b64 %rd400, %rd395, 17; + ld.global.v2.u64 {%rd401, %rd402}, [%rd393+16]; + xor.b64 %rd405, %rd401, %rd394; + xor.b64 %rd406, %rd402, %rd395; + xor.b64 %rd407, %rd395, %rd405; + xor.b64 %rd408, %rd394, %rd406; + st.global.v2.u64 [%rd393], {%rd408, %rd407}; + { + .reg .b32 %dummy; + mov.b64 {%r5043,%dummy}, %rd406; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r5044}, %rd406; + } + shf.r.wrap.b32 %r5045, %r5044, %r5043, 19; + shf.r.wrap.b32 %r5046, %r5043, %r5044, 19; + mov.b64 %rd409, {%r5046, %r5045}; + xor.b64 %rd410, %rd405, %rd400; + st.global.v2.u64 [%rd393+16], {%rd410, %rd409}; + bra.uni $L__BB2_14; + +$L__BB2_13: + ld.global.u64 %rd411, [%rd2]; + xor.b64 %rd1255, %rd411, %rd14; + +$L__BB2_14: + and.b64 %rd413, %rd1255, %rd349; + or.b64 %rd18, %rd413, %rd350; + mov.u64 %rd1256, 0; + mov.u64 %rd414, hash_header; + +$L__BB2_15: + add.s64 %rd415, %rd414, %rd1256; + ld.const.u8 %rs410, [%rd415]; + add.s64 %rd416, %rd4, %rd1256; + st.local.u8 [%rd416], %rs410; + add.s64 %rd1256, %rd1256, 1; + setp.lt.u64 %p13, %rd1256, 72; + @%p13 bra $L__BB2_15; + + mov.u64 %rd417, 0; + st.local.u64 [%rd4+72], %rd18; + mov.u32 %r5047, -1150833019; + mov.u32 %r5048, 1779033703; + st.local.v2.u32 [%rd3], {%r5048, %r5047}; + mov.u32 %r5049, -1521486534; + mov.u32 %r5050, 1013904242; + st.local.v2.u32 [%rd3+8], {%r5050, %r5049}; + mov.u32 %r5051, -1694144372; + mov.u32 %r5052, 1359893119; + st.local.v2.u32 [%rd3+16], {%r5052, %r5051}; + mov.u32 %r5053, 1541459225; + mov.u32 %r5054, 528734635; + st.local.v2.u32 [%rd3+24], {%r5054, %r5053}; + st.local.v2.u32 [%rd3+32], {%r5048, %r5047}; + st.local.v2.u32 [%rd3+40], {%r5050, %r5049}; + st.local.v2.u32 [%rd3+48], {%r5052, %r5051}; + st.local.v2.u32 [%rd3+56], {%r5054, %r5053}; + st.local.u64 [%rd3+64], %rd417; + mov.u32 %r5055, 0; + st.local.v2.u32 [%rd3+72], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+80], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+88], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+96], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+104], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+112], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+120], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+128], {%r5055, %r5055}; + mov.u16 %rs411, 0; + st.local.v2.u8 [%rd3+136], {%rs411, %rs411}; + st.local.u8 [%rd3+138], %rs411; + st.local.u8 [%rd3+144], %rs411; + ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd3+136]; + setp.eq.s16 %p14, %rs413, 0; + selp.u16 %rs419, 1, 0, %p14; + or.b16 %rs420, %rs414, %rs419; + ld.local.v4.u32 {%r5056, %r5057, %r5058, %r5059}, [%rd4]; + mov.b32 {%rs421, %rs422}, %r5056; + shr.u16 %rs423, %rs421, 8; + shr.u16 %rs424, %rs422, 8; + mov.b32 {%rs425, %rs426}, %r5057; + shr.u16 %rs427, %rs425, 8; + shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5058; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5059; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5064, %rs421; + and.b32 %r5065, %r5064, 255; + cvt.u32.u16 %r5066, %rs423; + prmt.b32 %r5067, %r5066, %r5065, 30212; + cvt.u32.u16 %r5068, %rs422; + prmt.b32 %r5069, %r5068, %r5067, 28756; + cvt.u32.u16 %r5070, %rs424; + prmt.b32 %r5071, %r5070, %r5069, 1620; + cvt.u32.u16 %r5072, %rs425; + and.b32 %r5073, %r5072, 255; + cvt.u32.u16 %r5074, %rs427; + prmt.b32 %r5075, %r5074, %r5073, 30212; + cvt.u32.u16 %r5076, %rs426; + prmt.b32 %r5077, %r5076, %r5075, 28756; + cvt.u32.u16 %r5078, %rs428; + prmt.b32 %r5079, %r5078, %r5077, 1620; + cvt.u32.u16 %r5080, %rs429; + and.b32 %r5081, %r5080, 255; + 
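+ // NOTE (added annotation, not compiler output): the prmt-based byte gather here packs the local block buffer (the 72-byte hash_header copied from constant memory, plus the 64-bit nonce stored at offset 72) into little-endian 32-bit message words for BLAKE3 compression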
cvt.u32.u16 %r5082, %rs431; + prmt.b32 %r5083, %r5082, %r5081, 30212; + cvt.u32.u16 %r5084, %rs430; + prmt.b32 %r5085, %r5084, %r5083, 28756; + cvt.u32.u16 %r5086, %rs432; + prmt.b32 %r5087, %r5086, %r5085, 1620; + cvt.u32.u16 %r5088, %rs433; + and.b32 %r5089, %r5088, 255; + cvt.u32.u16 %r5090, %rs435; + prmt.b32 %r5091, %r5090, %r5089, 30212; + cvt.u32.u16 %r5092, %rs434; + prmt.b32 %r5093, %r5092, %r5091, 28756; + cvt.u32.u16 %r5094, %rs436; + prmt.b32 %r5095, %r5094, %r5093, 1620; + ld.local.v4.u32 {%r5096, %r5097, %r5098, %r5099}, [%rd4+16]; + mov.b32 {%rs437, %rs438}, %r5096; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5097; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5098; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5099; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5104, %rs437; + and.b32 %r5105, %r5104, 255; + cvt.u32.u16 %r5106, %rs439; + prmt.b32 %r5107, %r5106, %r5105, 30212; + cvt.u32.u16 %r5108, %rs438; + prmt.b32 %r5109, %r5108, %r5107, 28756; + cvt.u32.u16 %r5110, %rs440; + prmt.b32 %r5111, %r5110, %r5109, 1620; + cvt.u32.u16 %r5112, %rs441; + and.b32 %r5113, %r5112, 255; + cvt.u32.u16 %r5114, %rs443; + prmt.b32 %r5115, %r5114, %r5113, 30212; + cvt.u32.u16 %r5116, %rs442; + prmt.b32 %r5117, %r5116, %r5115, 28756; + cvt.u32.u16 %r5118, %rs444; + prmt.b32 %r5119, %r5118, %r5117, 1620; + cvt.u32.u16 %r5120, %rs445; + and.b32 %r5121, %r5120, 255; + cvt.u32.u16 %r5122, %rs447; + prmt.b32 %r5123, %r5122, %r5121, 30212; + cvt.u32.u16 %r5124, %rs446; + prmt.b32 %r5125, %r5124, %r5123, 28756; + cvt.u32.u16 %r5126, %rs448; + prmt.b32 %r5127, %r5126, %r5125, 1620; + cvt.u32.u16 %r5128, %rs449; + and.b32 %r5129, %r5128, 255; + cvt.u32.u16 %r5130, %rs451; + prmt.b32 %r5131, %r5130, %r5129, 30212; + cvt.u32.u16 %r5132, %rs450; + prmt.b32 %r5133, %r5132, %r5131, 28756; + cvt.u32.u16 %r5134, %rs452; + prmt.b32 %r5135, %r5134, %r5133, 1620; + ld.local.v4.u32 {%r5136, %r5137, %r5138, %r5139}, [%rd4+32]; + mov.b32 {%rs453, %rs454}, %r5136; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5137; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5138; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5139; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5144, %rs453; + and.b32 %r5145, %r5144, 255; + cvt.u32.u16 %r5146, %rs455; + prmt.b32 %r5147, %r5146, %r5145, 30212; + cvt.u32.u16 %r5148, %rs454; + prmt.b32 %r5149, %r5148, %r5147, 28756; + cvt.u32.u16 %r5150, %rs456; + prmt.b32 %r5151, %r5150, %r5149, 1620; + cvt.u32.u16 %r5152, %rs457; + and.b32 %r5153, %r5152, 255; + cvt.u32.u16 %r5154, %rs459; + prmt.b32 %r5155, %r5154, %r5153, 30212; + cvt.u32.u16 %r5156, %rs458; + prmt.b32 %r5157, %r5156, %r5155, 28756; + cvt.u32.u16 %r5158, %rs460; + prmt.b32 %r5159, %r5158, %r5157, 1620; + cvt.u32.u16 %r5160, %rs461; + and.b32 %r5161, %r5160, 255; + cvt.u32.u16 %r5162, %rs463; + prmt.b32 %r5163, %r5162, %r5161, 30212; + cvt.u32.u16 %r5164, %rs462; + prmt.b32 %r5165, %r5164, %r5163, 28756; + cvt.u32.u16 %r5166, %rs464; + prmt.b32 %r5167, %r5166, %r5165, 1620; + cvt.u32.u16 %r5168, %rs465; + and.b32 %r5169, %r5168, 255; + cvt.u32.u16 %r5170, %rs467; + prmt.b32 %r5171, %r5170, %r5169, 30212; + cvt.u32.u16 %r5172, %rs466; + prmt.b32 %r5173, %r5172, %r5171, 28756; + cvt.u32.u16 %r5174, %rs468; + prmt.b32 %r5175, 
%r5174, %r5173, 1620; + ld.local.v4.u32 {%r5176, %r5177, %r5178, %r5179}, [%rd4+48]; + mov.b32 {%rs469, %rs470}, %r5176; + shr.u16 %rs471, %rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5177; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5178; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5179; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5184, %rs469; + and.b32 %r5185, %r5184, 255; + cvt.u32.u16 %r5186, %rs471; + prmt.b32 %r5187, %r5186, %r5185, 30212; + cvt.u32.u16 %r5188, %rs470; + prmt.b32 %r5189, %r5188, %r5187, 28756; + cvt.u32.u16 %r5190, %rs472; + prmt.b32 %r5191, %r5190, %r5189, 1620; + cvt.u32.u16 %r5192, %rs473; + and.b32 %r5193, %r5192, 255; + cvt.u32.u16 %r5194, %rs475; + prmt.b32 %r5195, %r5194, %r5193, 30212; + cvt.u32.u16 %r5196, %rs474; + prmt.b32 %r5197, %r5196, %r5195, 28756; + cvt.u32.u16 %r5198, %rs476; + prmt.b32 %r5199, %r5198, %r5197, 1620; + cvt.u32.u16 %r5200, %rs477; + and.b32 %r5201, %r5200, 255; + cvt.u32.u16 %r5202, %rs479; + prmt.b32 %r5203, %r5202, %r5201, 30212; + cvt.u32.u16 %r5204, %rs478; + prmt.b32 %r5205, %r5204, %r5203, 28756; + cvt.u32.u16 %r5206, %rs480; + prmt.b32 %r5207, %r5206, %r5205, 1620; + cvt.u32.u16 %r5208, %rs481; + and.b32 %r5209, %r5208, 255; + cvt.u32.u16 %r5210, %rs483; + prmt.b32 %r5211, %r5210, %r5209, 30212; + cvt.u32.u16 %r5212, %rs482; + prmt.b32 %r5213, %r5212, %r5211, 28756; + cvt.u32.u16 %r5214, %rs484; + prmt.b32 %r5215, %r5214, %r5213, 1620; + cvt.u32.u16 %r5216, %rs420; + and.b32 %r5217, %r5216, 255; + add.s32 %r5218, %r5071, -1156040474; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 16; + add.s32 %r5220, %r5219, 1779033703; + xor.b32 %r5221, %r5220, 1359893119; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 20; + add.s32 %r5223, %r5079, %r5218; + add.s32 %r5224, %r5223, %r5222; + xor.b32 %r5225, %r5224, %r5219; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 24; + add.s32 %r5227, %r5226, %r5220; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 25; + add.s32 %r5230, %r5087, 1449989905; + shf.l.wrap.b32 %r5231, %r5230, %r5230, 16; + add.s32 %r5232, %r5231, -1150833019; + xor.b32 %r5233, %r5232, -1694144372; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 20; + add.s32 %r5235, %r5095, %r5230; + add.s32 %r5236, %r5235, %r5234; + xor.b32 %r5237, %r5236, %r5231; + shf.l.wrap.b32 %r5238, %r5237, %r5237, 24; + add.s32 %r5239, %r5238, %r5232; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 25; + add.s32 %r5242, %r5111, 1542638877; + shr.u32 %r5243, %r5242, 16; + shl.b32 %r5244, %r5242, 16; + xor.b32 %r5245, %r5244, 4194304; + or.b32 %r5246, %r5245, %r5243; + add.s32 %r5247, %r5246, 1013904242; + xor.b32 %r5248, %r5247, 528734635; + shf.l.wrap.b32 %r5249, %r5248, %r5248, 20; + add.s32 %r5250, %r5119, %r5242; + add.s32 %r5251, %r5250, %r5249; + xor.b32 %r5252, %r5251, %r5246; + shf.l.wrap.b32 %r5253, %r5252, %r5252, 24; + add.s32 %r5254, %r5253, %r5247; + xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 25; + add.s32 %r5257, %r5127, 19972691; + xor.b32 %r5258, %r5257, %r5217; + shr.u32 %r5259, %r5257, 16; + shl.b32 %r5260, %r5258, 16; + or.b32 %r5261, %r5260, %r5259; + add.s32 %r5262, %r5261, -1521486534; + xor.b32 %r5263, %r5262, 1541459225; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 20; + add.s32 %r5265, %r5135, %r5257; + add.s32 %r5266, %r5265, %r5264; + xor.b32 %r5267, %r5266, %r5261; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 24; + add.s32 %r5269, 
%r5268, %r5262; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 25; + add.s32 %r5272, %r5241, %r5224; + add.s32 %r5273, %r5272, %r5151; + xor.b32 %r5274, %r5268, %r5273; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 16; + add.s32 %r5276, %r5275, %r5254; + xor.b32 %r5277, %r5276, %r5241; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 20; + add.s32 %r5279, %r5159, %r5273; + add.s32 %r5280, %r5279, %r5278; + xor.b32 %r5281, %r5280, %r5275; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 24; + add.s32 %r5283, %r5282, %r5276; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 25; + add.s32 %r5286, %r5256, %r5236; + add.s32 %r5287, %r5286, %r5167; + xor.b32 %r5288, %r5287, %r5226; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 16; + add.s32 %r5290, %r5289, %r5269; + xor.b32 %r5291, %r5290, %r5256; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 20; + add.s32 %r5293, %r5175, %r5287; + add.s32 %r5294, %r5293, %r5292; + xor.b32 %r5295, %r5294, %r5289; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 24; + add.s32 %r5297, %r5296, %r5290; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 25; + add.s32 %r5300, %r5271, %r5251; + add.s32 %r5301, %r5300, %r5191; + xor.b32 %r5302, %r5301, %r5238; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 16; + add.s32 %r5304, %r5303, %r5227; + xor.b32 %r5305, %r5304, %r5271; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 20; + add.s32 %r5307, %r5199, %r5301; + add.s32 %r5308, %r5307, %r5306; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 24; + add.s32 %r5311, %r5310, %r5304; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 25; + add.s32 %r5314, %r5266, %r5229; + add.s32 %r5315, %r5314, %r5207; + xor.b32 %r5316, %r5315, %r5253; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 16; + add.s32 %r5318, %r5317, %r5239; + xor.b32 %r5319, %r5318, %r5229; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 20; + add.s32 %r5321, %r5215, %r5315; + add.s32 %r5322, %r5321, %r5320; + xor.b32 %r5323, %r5322, %r5317; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 24; + add.s32 %r5325, %r5324, %r5318; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 25; + add.s32 %r5328, %r5280, %r5087; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5296; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 16; + add.s32 %r5332, %r5331, %r5311; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 20; + add.s32 %r5335, %r5329, %r5127; + add.s32 %r5336, %r5335, %r5334; + xor.b32 %r5337, %r5336, %r5331; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 24; + add.s32 %r5339, %r5338, %r5332; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 25; + add.s32 %r5342, %r5294, %r5095; + add.s32 %r5343, %r5342, %r5285; + xor.b32 %r5344, %r5310, %r5343; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 16; + add.s32 %r5346, %r5325, %r5345; + xor.b32 %r5347, %r5346, %r5285; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 20; + add.s32 %r5349, %r5343, %r5167; + add.s32 %r5350, %r5349, %r5348; + xor.b32 %r5351, %r5350, %r5345; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 24; + add.s32 %r5353, %r5352, %r5346; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 25; + add.s32 %r5356, %r5299, %r5135; + add.s32 %r5357, %r5356, %r5308; + xor.b32 %r5358, %r5324, %r5357; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 16; + add.s32 %r5360, %r5359, %r5283; + xor.b32 %r5361, %r5360, %r5299; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 20; + add.s32 %r5363, %r5357, %r5071; + add.s32 %r5364, %r5363, %r5362; + xor.b32 
%r5365, %r5364, %r5359; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 24; + add.s32 %r5367, %r5366, %r5360; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 25; + add.s32 %r5370, %r5313, %r5111; + add.s32 %r5371, %r5370, %r5322; + xor.b32 %r5372, %r5371, %r5282; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 16; + add.s32 %r5374, %r5373, %r5297; + xor.b32 %r5375, %r5374, %r5313; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 20; + add.s32 %r5377, %r5371, %r5199; + add.s32 %r5378, %r5377, %r5376; + xor.b32 %r5379, %r5378, %r5373; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 24; + add.s32 %r5381, %r5380, %r5374; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 25; + add.s32 %r5384, %r5336, %r5079; + add.s32 %r5385, %r5384, %r5355; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 16; + add.s32 %r5388, %r5387, %r5367; + xor.b32 %r5389, %r5388, %r5355; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 20; + add.s32 %r5391, %r5385, %r5175; + add.s32 %r5392, %r5391, %r5390; + xor.b32 %r5393, %r5392, %r5387; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 24; + add.s32 %r5395, %r5394, %r5388; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 25; + add.s32 %r5398, %r5350, %r5191; + add.s32 %r5399, %r5398, %r5369; + xor.b32 %r5400, %r5399, %r5338; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 16; + add.s32 %r5402, %r5401, %r5381; + xor.b32 %r5403, %r5402, %r5369; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 20; + add.s32 %r5405, %r5399, %r5119; + add.s32 %r5406, %r5405, %r5404; + xor.b32 %r5407, %r5406, %r5401; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 24; + add.s32 %r5409, %r5408, %r5402; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 25; + add.s32 %r5412, %r5364, %r5159; + add.s32 %r5413, %r5412, %r5383; + xor.b32 %r5414, %r5413, %r5352; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 16; + add.s32 %r5416, %r5415, %r5339; + xor.b32 %r5417, %r5416, %r5383; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 20; + add.s32 %r5419, %r5413, %r5207; + add.s32 %r5420, %r5419, %r5418; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 24; + add.s32 %r5423, %r5422, %r5416; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 25; + add.s32 %r5426, %r5378, %r5215; + add.s32 %r5427, %r5426, %r5341; + xor.b32 %r5428, %r5427, %r5366; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 16; + add.s32 %r5430, %r5429, %r5353; + xor.b32 %r5431, %r5430, %r5341; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 20; + add.s32 %r5433, %r5427, %r5151; + add.s32 %r5434, %r5433, %r5432; + xor.b32 %r5435, %r5434, %r5429; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 24; + add.s32 %r5437, %r5436, %r5430; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 25; + add.s32 %r5440, %r5392, %r5095; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5408; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 16; + add.s32 %r5444, %r5443, %r5423; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 20; + add.s32 %r5447, %r5441, %r5111; + add.s32 %r5448, %r5447, %r5446; + xor.b32 %r5449, %r5448, %r5443; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 24; + add.s32 %r5451, %r5450, %r5444; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 25; + add.s32 %r5454, %r5406, %r5167; + add.s32 %r5455, %r5454, %r5397; + xor.b32 %r5456, %r5455, %r5422; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 16; + add.s32 %r5458, %r5457, %r5437; + xor.b32 %r5459, %r5458, %r5397; + shf.l.wrap.b32 %r5460, %r5459, 
%r5459, 20; + add.s32 %r5461, %r5455, %r5191; + add.s32 %r5462, %r5461, %r5460; + xor.b32 %r5463, %r5462, %r5457; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 24; + add.s32 %r5465, %r5464, %r5458; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 25; + add.s32 %r5468, %r5420, %r5199; + add.s32 %r5469, %r5468, %r5411; + xor.b32 %r5470, %r5469, %r5436; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 16; + add.s32 %r5472, %r5471, %r5395; + xor.b32 %r5473, %r5472, %r5411; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 20; + add.s32 %r5475, %r5469, %r5087; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5471; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 24; + add.s32 %r5479, %r5478, %r5472; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 25; + add.s32 %r5482, %r5434, %r5135; + add.s32 %r5483, %r5482, %r5425; + xor.b32 %r5484, %r5483, %r5394; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 16; + add.s32 %r5486, %r5485, %r5409; + xor.b32 %r5487, %r5486, %r5425; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 20; + add.s32 %r5489, %r5483, %r5207; + add.s32 %r5490, %r5489, %r5488; + xor.b32 %r5491, %r5490, %r5485; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 24; + add.s32 %r5493, %r5492, %r5486; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 25; + add.s32 %r5496, %r5448, %r5127; + add.s32 %r5497, %r5496, %r5467; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 16; + add.s32 %r5500, %r5499, %r5479; + xor.b32 %r5501, %r5500, %r5467; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 20; + add.s32 %r5503, %r5497, %r5119; + add.s32 %r5504, %r5503, %r5502; + xor.b32 %r5505, %r5504, %r5499; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 24; + add.s32 %r5507, %r5506, %r5500; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 25; + add.s32 %r5510, %r5462, %r5159; + add.s32 %r5511, %r5510, %r5481; + xor.b32 %r5512, %r5511, %r5450; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 16; + add.s32 %r5514, %r5513, %r5493; + xor.b32 %r5515, %r5514, %r5481; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 20; + add.s32 %r5517, %r5511, %r5071; + add.s32 %r5518, %r5517, %r5516; + xor.b32 %r5519, %r5518, %r5513; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 24; + add.s32 %r5521, %r5520, %r5514; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 25; + add.s32 %r5524, %r5476, %r5175; + add.s32 %r5525, %r5524, %r5495; + xor.b32 %r5526, %r5525, %r5464; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 16; + add.s32 %r5528, %r5527, %r5451; + xor.b32 %r5529, %r5528, %r5495; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 20; + add.s32 %r5531, %r5525, %r5215; + add.s32 %r5532, %r5531, %r5530; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 24; + add.s32 %r5535, %r5534, %r5528; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 25; + add.s32 %r5538, %r5490, %r5151; + add.s32 %r5539, %r5538, %r5453; + xor.b32 %r5540, %r5539, %r5478; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 16; + add.s32 %r5542, %r5541, %r5465; + xor.b32 %r5543, %r5542, %r5453; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 20; + add.s32 %r5545, %r5539, %r5079; + add.s32 %r5546, %r5545, %r5544; + xor.b32 %r5547, %r5546, %r5541; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 24; + add.s32 %r5549, %r5548, %r5542; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 25; + add.s32 %r5552, %r5504, %r5167; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5520; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 16; + add.s32 
%r5556, %r5555, %r5535; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 20; + add.s32 %r5559, %r5553, %r5135; + add.s32 %r5560, %r5559, %r5558; + xor.b32 %r5561, %r5560, %r5555; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 24; + add.s32 %r5563, %r5562, %r5556; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 25; + add.s32 %r5566, %r5518, %r5191; + add.s32 %r5567, %r5566, %r5509; + xor.b32 %r5568, %r5567, %r5534; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 16; + add.s32 %r5570, %r5569, %r5549; + xor.b32 %r5571, %r5570, %r5509; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 20; + add.s32 %r5573, %r5567, %r5159; + add.s32 %r5574, %r5573, %r5572; + xor.b32 %r5575, %r5574, %r5569; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 24; + add.s32 %r5577, %r5576, %r5570; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 25; + add.s32 %r5580, %r5532, %r5207; + add.s32 %r5581, %r5580, %r5523; + xor.b32 %r5582, %r5581, %r5548; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 16; + add.s32 %r5584, %r5583, %r5507; + xor.b32 %r5585, %r5584, %r5523; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 20; + add.s32 %r5587, %r5581, %r5095; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5583; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 24; + add.s32 %r5591, %r5590, %r5584; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 25; + add.s32 %r5594, %r5546, %r5199; + add.s32 %r5595, %r5594, %r5537; + xor.b32 %r5596, %r5595, %r5506; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 16; + add.s32 %r5598, %r5597, %r5521; + xor.b32 %r5599, %r5598, %r5537; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 20; + add.s32 %r5601, %r5595, %r5215; + add.s32 %r5602, %r5601, %r5600; + xor.b32 %r5603, %r5602, %r5597; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 24; + add.s32 %r5605, %r5604, %r5598; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 25; + add.s32 %r5608, %r5560, %r5111; + add.s32 %r5609, %r5608, %r5579; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 16; + add.s32 %r5612, %r5611, %r5591; + xor.b32 %r5613, %r5612, %r5579; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 20; + add.s32 %r5615, %r5609, %r5071; + add.s32 %r5616, %r5615, %r5614; + xor.b32 %r5617, %r5616, %r5611; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 24; + add.s32 %r5619, %r5618, %r5612; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 25; + add.s32 %r5622, %r5574, %r5175; + add.s32 %r5623, %r5622, %r5593; + xor.b32 %r5624, %r5623, %r5562; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 16; + add.s32 %r5626, %r5625, %r5605; + xor.b32 %r5627, %r5626, %r5593; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 20; + add.s32 %r5629, %r5623, %r5087; + add.s32 %r5630, %r5629, %r5628; + xor.b32 %r5631, %r5630, %r5625; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 24; + add.s32 %r5633, %r5632, %r5626; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 25; + add.s32 %r5636, %r5588, %r5119; + add.s32 %r5637, %r5636, %r5607; + xor.b32 %r5638, %r5637, %r5576; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 16; + add.s32 %r5640, %r5639, %r5563; + xor.b32 %r5641, %r5640, %r5607; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 20; + add.s32 %r5643, %r5637, %r5151; + add.s32 %r5644, %r5643, %r5642; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 24; + add.s32 %r5647, %r5646, %r5640; + xor.b32 %r5648, %r5647, %r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 25; + add.s32 %r5650, %r5602, %r5079; + add.s32 %r5651, %r5650, %r5565; + 
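+ // NOTE (added annotation, not compiler output): the BLAKE3 G-function rotations (16/12/8/7) appear throughout as left rotates via shf.l.wrap by 16/20/24/25; the seven rounds of the compression function are fully unrolled by the compiler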
xor.b32 %r5652, %r5651, %r5590; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 16; + add.s32 %r5654, %r5653, %r5577; + xor.b32 %r5655, %r5654, %r5565; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 20; + add.s32 %r5657, %r5651, %r5127; + add.s32 %r5658, %r5657, %r5656; + xor.b32 %r5659, %r5658, %r5653; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 24; + add.s32 %r5661, %r5660, %r5654; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 25; + add.s32 %r5664, %r5616, %r5191; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5632; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 16; + add.s32 %r5668, %r5667, %r5647; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 20; + add.s32 %r5671, %r5665, %r5199; + add.s32 %r5672, %r5671, %r5670; + xor.b32 %r5673, %r5672, %r5667; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 24; + add.s32 %r5675, %r5674, %r5668; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 25; + add.s32 %r5678, %r5630, %r5159; + add.s32 %r5679, %r5678, %r5621; + xor.b32 %r5680, %r5679, %r5646; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 16; + add.s32 %r5682, %r5681, %r5661; + xor.b32 %r5683, %r5682, %r5621; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 20; + add.s32 %r5685, %r5679, %r5175; + add.s32 %r5686, %r5685, %r5684; + xor.b32 %r5687, %r5686, %r5681; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 24; + add.s32 %r5689, %r5688, %r5682; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 25; + add.s32 %r5692, %r5644, %r5215; + add.s32 %r5693, %r5692, %r5635; + xor.b32 %r5694, %r5693, %r5660; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 16; + add.s32 %r5696, %r5695, %r5619; + xor.b32 %r5697, %r5696, %r5635; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 20; + add.s32 %r5699, %r5693, %r5167; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5695; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 24; + add.s32 %r5703, %r5702, %r5696; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 25; + add.s32 %r5706, %r5658, %r5207; + add.s32 %r5707, %r5706, %r5649; + xor.b32 %r5708, %r5707, %r5618; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 16; + add.s32 %r5710, %r5709, %r5633; + xor.b32 %r5711, %r5710, %r5649; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 20; + add.s32 %r5713, %r5707, %r5151; + add.s32 %r5714, %r5713, %r5712; + xor.b32 %r5715, %r5714, %r5709; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 24; + add.s32 %r5717, %r5716, %r5710; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 25; + add.s32 %r5720, %r5672, %r5135; + add.s32 %r5721, %r5720, %r5691; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 16; + add.s32 %r5724, %r5723, %r5703; + xor.b32 %r5725, %r5724, %r5691; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 20; + add.s32 %r5727, %r5721, %r5087; + add.s32 %r5728, %r5727, %r5726; + xor.b32 %r5729, %r5728, %r5723; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 24; + add.s32 %r5731, %r5730, %r5724; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 25; + add.s32 %r5734, %r5686, %r5119; + add.s32 %r5735, %r5734, %r5705; + xor.b32 %r5736, %r5735, %r5674; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 16; + add.s32 %r5738, %r5737, %r5717; + xor.b32 %r5739, %r5738, %r5705; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 20; + add.s32 %r5741, %r5735, %r5095; + add.s32 %r5742, %r5741, %r5740; + xor.b32 %r5743, %r5742, %r5737; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 24; + add.s32 %r5745, %r5744, %r5738; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, 
%r5746, %r5746, 25; + add.s32 %r5748, %r5700, %r5071; + add.s32 %r5749, %r5748, %r5719; + xor.b32 %r5750, %r5749, %r5688; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 16; + add.s32 %r5752, %r5751, %r5675; + xor.b32 %r5753, %r5752, %r5719; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 20; + add.s32 %r5755, %r5749, %r5079; + add.s32 %r5756, %r5755, %r5754; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 24; + add.s32 %r5759, %r5758, %r5752; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 25; + add.s32 %r5762, %r5714, %r5127; + add.s32 %r5763, %r5762, %r5677; + xor.b32 %r5764, %r5763, %r5702; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 16; + add.s32 %r5766, %r5765, %r5689; + xor.b32 %r5767, %r5766, %r5677; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 20; + add.s32 %r5769, %r5763, %r5111; + add.s32 %r5770, %r5769, %r5768; + xor.b32 %r5771, %r5770, %r5765; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 24; + add.s32 %r5773, %r5772, %r5766; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 25; + add.s32 %r5776, %r5728, %r5159; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5744; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 16; + add.s32 %r5780, %r5779, %r5759; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 20; + add.s32 %r5783, %r5777, %r5207; + add.s32 %r5784, %r5783, %r5782; + xor.b32 %r5785, %r5784, %r5779; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 24; + add.s32 %r5787, %r5786, %r5780; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 25; + add.s32 %r5790, %r5742, %r5175; + add.s32 %r5791, %r5790, %r5733; + xor.b32 %r5792, %r5791, %r5758; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 16; + add.s32 %r5794, %r5793, %r5773; + xor.b32 %r5795, %r5794, %r5733; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 20; + add.s32 %r5797, %r5791, %r5119; + add.s32 %r5798, %r5797, %r5796; + xor.b32 %r5799, %r5798, %r5793; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 24; + add.s32 %r5801, %r5800, %r5794; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 25; + add.s32 %r5804, %r5756, %r5151; + add.s32 %r5805, %r5804, %r5747; + xor.b32 %r5806, %r5805, %r5772; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 16; + add.s32 %r5808, %r5807, %r5731; + xor.b32 %r5809, %r5808, %r5747; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 20; + add.s32 %r5811, %r5805, %r5191; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5807; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 24; + add.s32 %r5815, %r5814, %r5808; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 25; + add.s32 %r5818, %r5770, %r5215; + add.s32 %r5819, %r5818, %r5761; + xor.b32 %r5820, %r5819, %r5730; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 16; + add.s32 %r5822, %r5821, %r5745; + xor.b32 %r5823, %r5822, %r5761; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 20; + add.s32 %r5825, %r5819, %r5079; + add.s32 %r5826, %r5825, %r5824; + xor.b32 %r5827, %r5826, %r5821; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 24; + add.s32 %r5829, %r5828, %r5822; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 25; + add.s32 %r5832, %r5784, %r5199; + add.s32 %r5833, %r5832, %r5803; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 16; + add.s32 %r5836, %r5835, %r5815; + xor.b32 %r5837, %r5836, %r5803; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 20; + add.s32 %r5839, %r5833, %r5095; + add.s32 %r5840, %r5839, %r5838; + xor.b32 %r5841, %r5840, %r5835; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 24; + 
add.s32 %r5843, %r5842, %r5836; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 25; + add.s32 %r5846, %r5798, %r5071; + add.s32 %r5847, %r5846, %r5817; + xor.b32 %r5848, %r5847, %r5786; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 16; + add.s32 %r5850, %r5849, %r5829; + xor.b32 %r5851, %r5850, %r5817; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 20; + add.s32 %r5853, %r5847, %r5167; + add.s32 %r5854, %r5853, %r5852; + xor.b32 %r5855, %r5854, %r5849; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 24; + add.s32 %r5857, %r5856, %r5850; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 25; + add.s32 %r5860, %r5812, %r5087; + add.s32 %r5861, %r5860, %r5831; + xor.b32 %r5862, %r5861, %r5800; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 16; + add.s32 %r5864, %r5863, %r5787; + xor.b32 %r5865, %r5864, %r5831; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 20; + add.s32 %r5867, %r5861, %r5127; + add.s32 %r5868, %r5867, %r5866; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 24; + add.s32 %r5871, %r5870, %r5864; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 25; + add.s32 %r5874, %r5826, %r5111; + add.s32 %r5875, %r5874, %r5789; + xor.b32 %r5876, %r5875, %r5814; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 16; + add.s32 %r5878, %r5877, %r5801; + xor.b32 %r5879, %r5878, %r5789; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 20; + add.s32 %r5881, %r5875, %r5135; + add.s32 %r5882, %r5881, %r5880; + xor.b32 %r5883, %r5882, %r5877; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 24; + add.s32 %r5885, %r5884, %r5878; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 25; + add.s32 %r5888, %r5840, %r5175; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5856; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 16; + add.s32 %r5892, %r5891, %r5871; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 20; + add.s32 %r5895, %r5889, %r5215; + add.s32 %r5896, %r5895, %r5894; + xor.b32 %r5897, %r5896, %r5891; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 24; + add.s32 %r5899, %r5898, %r5892; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 25; + add.s32 %r5902, %r5854, %r5119; + add.s32 %r5903, %r5902, %r5845; + xor.b32 %r5904, %r5903, %r5870; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 16; + add.s32 %r5906, %r5905, %r5885; + xor.b32 %r5907, %r5906, %r5845; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 20; + add.s32 %r5909, %r5903, %r5071; + add.s32 %r5910, %r5909, %r5908; + xor.b32 %r5911, %r5910, %r5905; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 24; + add.s32 %r5913, %r5912, %r5906; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 25; + add.s32 %r5916, %r5868, %r5079; + add.s32 %r5917, %r5916, %r5859; + xor.b32 %r5918, %r5917, %r5884; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 16; + add.s32 %r5920, %r5919, %r5843; + xor.b32 %r5921, %r5920, %r5859; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 20; + add.s32 %r5923, %r5917, %r5159; + add.s32 %r5924, %r5923, %r5922; + xor.b32 %r5925, %r5924, %r5919; + shf.l.wrap.b32 %r5926, %r5925, %r5925, 24; + add.s32 %r5927, %r5926, %r5920; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 25; + add.s32 %r5930, %r5882, %r5151; + add.s32 %r5931, %r5930, %r5873; + xor.b32 %r5932, %r5931, %r5842; + shf.l.wrap.b32 %r5933, %r5932, %r5932, 16; + add.s32 %r5934, %r5933, %r5857; + xor.b32 %r5935, %r5934, %r5873; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 20; + add.s32 %r5937, %r5931, %r5127; + add.s32 %r5938, %r5937, 
%r5936; + xor.b32 %r5939, %r5938, %r5933; + shf.l.wrap.b32 %r5940, %r5939, %r5939, 24; + add.s32 %r5941, %r5940, %r5934; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 25; + add.s32 %r5944, %r5896, %r5207; + add.s32 %r5945, %r5944, %r5915; + xor.b32 %r5946, %r5945, %r5940; + shf.l.wrap.b32 %r5947, %r5946, %r5946, 16; + add.s32 %r5948, %r5947, %r5927; + xor.b32 %r5949, %r5948, %r5915; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 20; + add.s32 %r5951, %r5945, %r5167; + add.s32 %r5952, %r5951, %r5950; + xor.b32 %r5953, %r5952, %r5947; + shf.l.wrap.b32 %r5954, %r5953, %r5953, 24; + add.s32 %r5955, %r5954, %r5948; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 25; + add.s32 %r5958, %r5910, %r5087; + add.s32 %r5959, %r5958, %r5929; + xor.b32 %r5960, %r5959, %r5898; + shf.l.wrap.b32 %r5961, %r5960, %r5960, 16; + add.s32 %r5962, %r5961, %r5941; + xor.b32 %r5963, %r5962, %r5929; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 20; + add.s32 %r5965, %r5959, %r5191; + add.s32 %r5966, %r5965, %r5964; + xor.b32 %r5967, %r5966, %r5961; + shf.l.wrap.b32 %r5968, %r5967, %r5967, 24; + add.s32 %r5969, %r5968, %r5962; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 25; + add.s32 %r5972, %r5924, %r5095; + add.s32 %r5973, %r5972, %r5943; + xor.b32 %r5974, %r5973, %r5912; + shf.l.wrap.b32 %r5975, %r5974, %r5974, 16; + add.s32 %r5976, %r5975, %r5899; + xor.b32 %r5977, %r5976, %r5943; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 20; + add.s32 %r5979, %r5973, %r5111; + add.s32 %r5980, %r5979, %r5978; + xor.b32 %r5981, %r5980, %r5975; + shf.l.wrap.b32 %r5982, %r5981, %r5981, 24; + add.s32 %r5983, %r5982, %r5976; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 25; + add.s32 %r5986, %r5938, %r5135; + add.s32 %r5987, %r5986, %r5901; + xor.b32 %r5988, %r5987, %r5926; + shf.l.wrap.b32 %r5989, %r5988, %r5988, 16; + add.s32 %r5990, %r5989, %r5913; + xor.b32 %r5991, %r5990, %r5901; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 20; + add.s32 %r5993, %r5987, %r5199; + add.s32 %r5994, %r5993, %r5992; + xor.b32 %r5995, %r5994, %r5989; + shf.l.wrap.b32 %r5996, %r5995, %r5995, 24; + add.s32 %r5997, %r5996, %r5990; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 25; + xor.b32 %r9, %r5983, %r5952; + xor.b32 %r10, %r5997, %r5966; + st.local.v2.u32 [%rd3+32], {%r9, %r10}; + xor.b32 %r11, %r5955, %r5980; + xor.b32 %r12, %r5994, %r5969; + st.local.v2.u32 [%rd3+40], {%r11, %r12}; + xor.b32 %r13, %r5999, %r5968; + xor.b32 %r14, %r5957, %r5982; + st.local.v2.u32 [%rd3+48], {%r13, %r14}; + xor.b32 %r15, %r5996, %r5971; + xor.b32 %r16, %r5985, %r5954; + st.local.v2.u32 [%rd3+56], {%r15, %r16}; + ld.local.v4.u32 {%r6000, %r6001, %r6002, %r6003}, [%rd4+64]; + st.local.v2.u32 [%rd3+72], {%r6000, %r6001}; + st.local.v2.u32 [%rd3+80], {%r6002, %r6003}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 %rs486, %rs413, 1; + st.local.v2.u8 [%rd3+136], {%rs1, %rs486}; + cvt.u32.u16 %r6008, %rs486; + cvt.u32.u16 %r6009, %rs485; + prmt.b32 %r6010, %r6008, %r6009, 30212; + cvt.u16.u32 %rs487, %r6010; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6001; + mov.b32 {%rs3, %rs4}, %r6000; + mov.b32 {%rs9, %rs10}, %r6003; + mov.b32 {%rs7, %rs8}, %r6002; + setp.eq.s16 %p15, %rs2, 0; + selp.u16 %rs488, 1, 0, %p15; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, %rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, 
%rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6011, %rs3; + and.b32 %r6012, %r6011, 255; + cvt.u32.u16 %r6013, %rs489; + prmt.b32 %r6014, %r6013, %r6012, 30212; + cvt.u32.u16 %r6015, %rs4; + prmt.b32 %r6016, %r6015, %r6014, 28756; + cvt.u32.u16 %r6017, %rs490; + prmt.b32 %r6018, %r6017, %r6016, 1620; + cvt.u32.u16 %r6019, %rs5; + and.b32 %r6020, %r6019, 255; + cvt.u32.u16 %r6021, %rs491; + prmt.b32 %r6022, %r6021, %r6020, 30212; + cvt.u32.u16 %r6023, %rs6; + prmt.b32 %r6024, %r6023, %r6022, 28756; + cvt.u32.u16 %r6025, %rs492; + prmt.b32 %r6026, %r6025, %r6024, 1620; + cvt.u32.u16 %r6027, %rs7; + and.b32 %r6028, %r6027, 255; + cvt.u32.u16 %r6029, %rs493; + prmt.b32 %r6030, %r6029, %r6028, 30212; + cvt.u32.u16 %r6031, %rs8; + prmt.b32 %r6032, %r6031, %r6030, 28756; + cvt.u32.u16 %r6033, %rs494; + prmt.b32 %r6034, %r6033, %r6032, 1620; + cvt.u32.u16 %r6035, %rs9; + and.b32 %r6036, %r6035, 255; + cvt.u32.u16 %r6037, %rs495; + prmt.b32 %r6038, %r6037, %r6036, 30212; + cvt.u32.u16 %r6039, %rs10; + prmt.b32 %r6040, %r6039, %r6038, 28756; + cvt.u32.u16 %r6041, %rs496; + prmt.b32 %r6042, %r6041, %r6040, 1620; + cvt.u32.u16 %r6043, %rs497; + add.s32 %r6044, %r13, %r9; + add.s32 %r6045, %r6044, %r6018; + add.s32 %r6046, %r6026, %r6045; + add.s32 %r6047, %r14, %r10; + add.s32 %r6048, %r6047, %r6034; + add.s32 %r6049, %r6042, %r6048; + add.s32 %r6050, %r15, %r11; + cvt.u32.u16 %r6051, %rs1; + and.b32 %r6052, %r6051, 255; + xor.b32 %r6053, %r6050, %r6052; + shr.u32 %r6054, %r6050, 16; + shl.b32 %r6055, %r6053, 16; + or.b32 %r6056, %r6055, %r6054; + add.s32 %r6057, %r6056, 1013904242; + xor.b32 %r6058, %r6057, %r15; + shf.l.wrap.b32 %r6059, %r6058, %r6058, 20; + add.s32 %r6060, %r6050, %r6059; + xor.b32 %r6061, %r6060, %r6056; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 24; + add.s32 %r6063, %r6062, %r6057; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 25; + add.s32 %r6066, %r16, %r12; + xor.b32 %r6067, %r6066, %r6043; + shr.u32 %r6068, %r6066, 16; + shl.b32 %r6069, %r6067, 16; + or.b32 %r6070, %r6069, %r6068; + add.s32 %r6071, %r6070, -1521486534; + xor.b32 %r6072, %r6071, %r16; + shf.l.wrap.b32 %r6073, %r6072, %r6072, 20; + add.s32 %r6074, %r6066, %r6073; + xor.b32 %r6075, %r6074, %r6070; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 24; + add.s32 %r6077, %r6076, %r6071; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 25; + add.s32 %r6080, %r6079, %r6060; + shf.l.wrap.b32 %r6081, %r6045, %r6045, 16; + add.s32 %r6082, %r6081, 1779033703; + xor.b32 %r6083, %r6082, %r13; + shf.l.wrap.b32 %r6084, %r6083, %r6083, 20; + add.s32 %r6085, %r6046, %r6084; + xor.b32 %r6086, %r6085, %r6081; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 24; + add.s32 %r6088, %r6087, %r6082; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 25; + shf.l.wrap.b32 %r6091, %r6048, %r6048, 16; + add.s32 %r6092, %r6091, -1150833019; + xor.b32 %r6093, %r6092, %r14; + shf.l.wrap.b32 %r6094, %r6093, %r6093, 20; + add.s32 %r6095, %r6049, %r6094; + xor.b32 %r6096, %r6095, %r6091; + shf.l.wrap.b32 %r6097, %r6096, %r6096, 24; + add.s32 %r6098, %r6097, %r6092; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 25; + add.s32 %r6101, %r6085, %r6100; + xor.b32 %r6102, %r6101, %r6076; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 16; + add.s32 %r6104, %r6103, %r6063; + xor.b32 %r6105, %r6104, %r6100; + shf.l.wrap.b32 %r6106, %r6105, %r6105, 20; + add.s32 %r6107, %r6101, %r6106; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 24; 
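+ // NOTE (added annotation, not compiler output): the flags byte was OR'd with 10 above (CHUNK_END | ROOT in BLAKE3, plus CHUNK_START when no block has been compressed yet), so the compress that follows appears to produce the root output for this input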
+ add.s32 %r6110, %r6109, %r6104; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 25; + add.s32 %r6113, %r6065, %r6095; + xor.b32 %r6114, %r6087, %r6113; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 16; + add.s32 %r6116, %r6115, %r6077; + xor.b32 %r6117, %r6116, %r6065; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 20; + add.s32 %r6119, %r6113, %r6118; + xor.b32 %r6120, %r6119, %r6115; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 24; + add.s32 %r6122, %r6121, %r6116; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 25; + xor.b32 %r6125, %r6097, %r6080; + shf.l.wrap.b32 %r6126, %r6125, %r6125, 16; + add.s32 %r6127, %r6126, %r6088; + xor.b32 %r6128, %r6127, %r6079; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 20; + add.s32 %r6130, %r6080, %r6129; + xor.b32 %r6131, %r6130, %r6126; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 24; + add.s32 %r6133, %r6132, %r6127; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 25; + add.s32 %r6136, %r6074, %r6090; + xor.b32 %r6137, %r6136, %r6062; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 16; + add.s32 %r6139, %r6138, %r6098; + xor.b32 %r6140, %r6139, %r6090; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 20; + add.s32 %r6142, %r6136, %r6141; + xor.b32 %r6143, %r6142, %r6138; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 24; + add.s32 %r6145, %r6144, %r6139; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 25; + add.s32 %r6148, %r6107, %r6034; + add.s32 %r6149, %r6148, %r6147; + xor.b32 %r6150, %r6149, %r6121; + shf.l.wrap.b32 %r6151, %r6150, %r6150, 16; + add.s32 %r6152, %r6151, %r6133; + xor.b32 %r6153, %r6152, %r6147; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6149, %r6154; + xor.b32 %r6156, %r6155, %r6151; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 24; + add.s32 %r6158, %r6157, %r6152; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 25; + add.s32 %r6161, %r6119, %r6042; + add.s32 %r6162, %r6161, %r6112; + xor.b32 %r6163, %r6162, %r6132; + shf.l.wrap.b32 %r6164, %r6163, %r6163, 16; + add.s32 %r6165, %r6164, %r6145; + xor.b32 %r6166, %r6165, %r6112; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 20; + add.s32 %r6168, %r6162, %r6167; + xor.b32 %r6169, %r6168, %r6164; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 24; + add.s32 %r6171, %r6170, %r6165; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 25; + add.s32 %r6174, %r6130, %r6124; + xor.b32 %r6175, %r6144, %r6174; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 16; + add.s32 %r6177, %r6176, %r6110; + xor.b32 %r6178, %r6177, %r6124; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 20; + add.s32 %r6180, %r6174, %r6018; + add.s32 %r6181, %r6180, %r6179; + xor.b32 %r6182, %r6181, %r6176; + shf.l.wrap.b32 %r6183, %r6182, %r6182, 24; + add.s32 %r6184, %r6183, %r6177; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 25; + add.s32 %r6187, %r6142, %r6135; + xor.b32 %r6188, %r6109, %r6187; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 16; + add.s32 %r6190, %r6189, %r6122; + xor.b32 %r6191, %r6190, %r6135; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 20; + add.s32 %r6193, %r6187, %r6192; + xor.b32 %r6194, %r6193, %r6189; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 24; + add.s32 %r6196, %r6195, %r6190; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 25; + add.s32 %r6199, %r6155, %r6026; + add.s32 %r6200, %r6199, %r6173; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 16; + add.s32 %r6203, %r6202, %r6184; + xor.b32 %r6204, %r6203, 
%r6173; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 20; + add.s32 %r6206, %r6200, %r6205; + xor.b32 %r6207, %r6206, %r6202; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 24; + add.s32 %r6209, %r6208, %r6203; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 25; + add.s32 %r6212, %r6186, %r6168; + xor.b32 %r6213, %r6157, %r6212; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 16; + add.s32 %r6215, %r6214, %r6196; + xor.b32 %r6216, %r6215, %r6186; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 20; + add.s32 %r6218, %r6212, %r6217; + xor.b32 %r6219, %r6218, %r6214; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 24; + add.s32 %r6221, %r6220, %r6215; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 25; + add.s32 %r6224, %r6181, %r6198; + xor.b32 %r6225, %r6170, %r6224; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 16; + add.s32 %r6227, %r6226, %r6158; + xor.b32 %r6228, %r6227, %r6198; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 20; + add.s32 %r6230, %r6224, %r6229; + xor.b32 %r6231, %r6230, %r6226; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 24; + add.s32 %r6233, %r6232, %r6227; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 25; + add.s32 %r6236, %r6193, %r6160; + xor.b32 %r6237, %r6236, %r6183; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 16; + add.s32 %r6239, %r6238, %r6171; + xor.b32 %r6240, %r6239, %r6160; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 20; + add.s32 %r6242, %r6236, %r6241; + xor.b32 %r6243, %r6242, %r6238; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6239; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6206, %r6042; + add.s32 %r6249, %r6248, %r6247; + xor.b32 %r6250, %r6249, %r6220; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6251, %r6233; + xor.b32 %r6253, %r6252, %r6247; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6254; + xor.b32 %r6256, %r6255, %r6251; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 24; + add.s32 %r6258, %r6257, %r6252; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 25; + add.s32 %r6261, %r6218, %r6211; + xor.b32 %r6262, %r6261, %r6232; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 16; + add.s32 %r6264, %r6263, %r6245; + xor.b32 %r6265, %r6264, %r6211; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 20; + add.s32 %r6267, %r6261, %r6266; + xor.b32 %r6268, %r6267, %r6263; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 24; + add.s32 %r6270, %r6269, %r6264; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 25; + add.s32 %r6273, %r6230, %r6223; + xor.b32 %r6274, %r6244, %r6273; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 16; + add.s32 %r6276, %r6275, %r6209; + xor.b32 %r6277, %r6276, %r6223; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 20; + add.s32 %r6279, %r6273, %r6034; + add.s32 %r6280, %r6279, %r6278; + xor.b32 %r6281, %r6280, %r6275; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 24; + add.s32 %r6283, %r6282, %r6276; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 25; + add.s32 %r6286, %r6242, %r6235; + xor.b32 %r6287, %r6208, %r6286; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 16; + add.s32 %r6289, %r6288, %r6221; + xor.b32 %r6290, %r6289, %r6235; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 20; + add.s32 %r6292, %r6286, %r6291; + xor.b32 %r6293, %r6292, %r6288; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 24; + add.s32 %r6295, %r6294, %r6289; + xor.b32 %r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 25; + add.s32 %r6298, %r6255, %r6272; + xor.b32 
%r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 16; + add.s32 %r6301, %r6300, %r6283; + xor.b32 %r6302, %r6301, %r6272; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 20; + add.s32 %r6304, %r6298, %r6303; + xor.b32 %r6305, %r6304, %r6300; + shf.l.wrap.b32 %r6306, %r6305, %r6305, 24; + add.s32 %r6307, %r6306, %r6301; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 25; + add.s32 %r6310, %r6285, %r6267; + xor.b32 %r6311, %r6257, %r6310; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 16; + add.s32 %r6313, %r6312, %r6295; + xor.b32 %r6314, %r6313, %r6285; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 20; + add.s32 %r6316, %r6310, %r6018; + add.s32 %r6317, %r6316, %r6315; + xor.b32 %r6318, %r6317, %r6312; + shf.l.wrap.b32 %r6319, %r6318, %r6318, 24; + add.s32 %r6320, %r6319, %r6313; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 25; + add.s32 %r6323, %r6280, %r6297; + xor.b32 %r6324, %r6269, %r6323; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 16; + add.s32 %r6326, %r6325, %r6258; + xor.b32 %r6327, %r6326, %r6297; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 20; + add.s32 %r6329, %r6323, %r6328; + xor.b32 %r6330, %r6329, %r6325; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 24; + add.s32 %r6332, %r6331, %r6326; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 25; + add.s32 %r6335, %r6292, %r6260; + xor.b32 %r6336, %r6335, %r6282; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 16; + add.s32 %r6338, %r6337, %r6270; + xor.b32 %r6339, %r6338, %r6260; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 20; + add.s32 %r6341, %r6335, %r6026; + add.s32 %r6342, %r6341, %r6340; + xor.b32 %r6343, %r6342, %r6337; + shf.l.wrap.b32 %r6344, %r6343, %r6343, 24; + add.s32 %r6345, %r6344, %r6338; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 25; + add.s32 %r6348, %r6304, %r6347; + xor.b32 %r6349, %r6348, %r6319; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 16; + add.s32 %r6351, %r6350, %r6332; + xor.b32 %r6352, %r6351, %r6347; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 20; + add.s32 %r6354, %r6348, %r6353; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6351; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6317, %r6309; + xor.b32 %r6361, %r6360, %r6331; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 16; + add.s32 %r6363, %r6362, %r6345; + xor.b32 %r6364, %r6363, %r6309; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 20; + add.s32 %r6366, %r6360, %r6365; + xor.b32 %r6367, %r6366, %r6362; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 24; + add.s32 %r6369, %r6368, %r6363; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 25; + add.s32 %r6372, %r6329, %r6322; + xor.b32 %r6373, %r6344, %r6372; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 16; + add.s32 %r6375, %r6374, %r6307; + xor.b32 %r6376, %r6375, %r6322; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 20; + add.s32 %r6378, %r6372, %r6042; + add.s32 %r6379, %r6378, %r6377; + xor.b32 %r6380, %r6379, %r6374; + shf.l.wrap.b32 %r6381, %r6380, %r6380, 24; + add.s32 %r6382, %r6381, %r6375; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 25; + add.s32 %r6385, %r6342, %r6334; + xor.b32 %r6386, %r6306, %r6385; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 16; + add.s32 %r6388, %r6387, %r6320; + xor.b32 %r6389, %r6388, %r6334; + shf.l.wrap.b32 %r6390, %r6389, %r6389, 20; + add.s32 %r6391, %r6385, %r6390; + xor.b32 %r6392, %r6391, %r6387; + shf.l.wrap.b32 %r6393, %r6392, 
%r6392, 24; + add.s32 %r6394, %r6393, %r6388; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 25; + add.s32 %r6397, %r6354, %r6371; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 16; + add.s32 %r6400, %r6399, %r6382; + xor.b32 %r6401, %r6400, %r6371; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 20; + add.s32 %r6403, %r6397, %r6018; + add.s32 %r6404, %r6403, %r6402; + xor.b32 %r6405, %r6404, %r6399; + shf.l.wrap.b32 %r6406, %r6405, %r6405, 24; + add.s32 %r6407, %r6406, %r6400; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 25; + add.s32 %r6410, %r6384, %r6366; + xor.b32 %r6411, %r6356, %r6410; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 16; + add.s32 %r6413, %r6412, %r6394; + xor.b32 %r6414, %r6413, %r6384; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 20; + add.s32 %r6416, %r6410, %r6034; + add.s32 %r6417, %r6416, %r6415; + xor.b32 %r6418, %r6417, %r6412; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 24; + add.s32 %r6420, %r6419, %r6413; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 25; + add.s32 %r6423, %r6379, %r6396; + xor.b32 %r6424, %r6368, %r6423; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 16; + add.s32 %r6426, %r6425, %r6357; + xor.b32 %r6427, %r6426, %r6396; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 20; + add.s32 %r6429, %r6423, %r6428; + xor.b32 %r6430, %r6429, %r6425; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 24; + add.s32 %r6432, %r6431, %r6426; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 25; + add.s32 %r6435, %r6391, %r6026; + add.s32 %r6436, %r6435, %r6359; + xor.b32 %r6437, %r6436, %r6381; + shf.l.wrap.b32 %r6438, %r6437, %r6437, 16; + add.s32 %r6439, %r6438, %r6369; + xor.b32 %r6440, %r6439, %r6359; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 20; + add.s32 %r6442, %r6436, %r6441; + xor.b32 %r6443, %r6442, %r6438; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 24; + add.s32 %r6445, %r6444, %r6439; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 25; + add.s32 %r6448, %r6404, %r6447; + xor.b32 %r6449, %r6448, %r6419; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 16; + add.s32 %r6451, %r6450, %r6432; + xor.b32 %r6452, %r6451, %r6447; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 20; + add.s32 %r6454, %r6448, %r6453; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 24; + add.s32 %r6457, %r6456, %r6451; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 25; + add.s32 %r6460, %r6417, %r6409; + xor.b32 %r6461, %r6460, %r6431; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 16; + add.s32 %r6463, %r6462, %r6445; + xor.b32 %r6464, %r6463, %r6409; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 20; + add.s32 %r6466, %r6460, %r6465; + xor.b32 %r6467, %r6466, %r6462; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6463; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6429, %r6422; + xor.b32 %r6473, %r6444, %r6472; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 16; + add.s32 %r6475, %r6474, %r6407; + xor.b32 %r6476, %r6475, %r6422; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 20; + add.s32 %r6478, %r6472, %r6477; + xor.b32 %r6479, %r6478, %r6474; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 24; + add.s32 %r6481, %r6480, %r6475; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 25; + add.s32 %r6484, %r6442, %r6434; + xor.b32 %r6485, %r6406, %r6484; + shf.l.wrap.b32 %r6486, %r6485, %r6485, 16; + add.s32 %r6487, %r6486, %r6420; + xor.b32 
%r6488, %r6487, %r6434; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 20; + add.s32 %r6490, %r6484, %r6489; + xor.b32 %r6491, %r6490, %r6486; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 24; + add.s32 %r6493, %r6492, %r6487; + xor.b32 %r6494, %r6493, %r6489; + shf.l.wrap.b32 %r6495, %r6494, %r6494, 25; + add.s32 %r6496, %r6454, %r6471; + xor.b32 %r6497, %r6496, %r6492; + shf.l.wrap.b32 %r6498, %r6497, %r6497, 16; + add.s32 %r6499, %r6498, %r6481; + xor.b32 %r6500, %r6499, %r6471; + shf.l.wrap.b32 %r6501, %r6500, %r6500, 20; + add.s32 %r6502, %r6496, %r6034; + add.s32 %r6503, %r6502, %r6501; + xor.b32 %r6504, %r6503, %r6498; + shf.l.wrap.b32 %r6505, %r6504, %r6504, 24; + add.s32 %r6506, %r6505, %r6499; + xor.b32 %r6507, %r6506, %r6501; + shf.l.wrap.b32 %r6508, %r6507, %r6507, 25; + add.s32 %r6509, %r6483, %r6466; + xor.b32 %r6510, %r6456, %r6509; + shf.l.wrap.b32 %r6511, %r6510, %r6510, 16; + add.s32 %r6512, %r6511, %r6493; + xor.b32 %r6513, %r6512, %r6483; + shf.l.wrap.b32 %r6514, %r6513, %r6513, 20; + add.s32 %r6515, %r6509, %r6042; + add.s32 %r6516, %r6515, %r6514; + xor.b32 %r6517, %r6516, %r6511; + shf.l.wrap.b32 %r6518, %r6517, %r6517, 24; + add.s32 %r6519, %r6518, %r6512; + xor.b32 %r6520, %r6519, %r6514; + shf.l.wrap.b32 %r6521, %r6520, %r6520, 25; + add.s32 %r6522, %r6478, %r6018; + add.s32 %r6523, %r6522, %r6495; + xor.b32 %r6524, %r6468, %r6523; + shf.l.wrap.b32 %r6525, %r6524, %r6524, 16; + add.s32 %r6526, %r6525, %r6457; + xor.b32 %r6527, %r6526, %r6495; + shf.l.wrap.b32 %r6528, %r6527, %r6527, 20; + add.s32 %r6529, %r6523, %r6026; + add.s32 %r6530, %r6529, %r6528; + xor.b32 %r6531, %r6530, %r6525; + shf.l.wrap.b32 %r6532, %r6531, %r6531, 24; + add.s32 %r6533, %r6532, %r6526; + xor.b32 %r6534, %r6533, %r6528; + shf.l.wrap.b32 %r6535, %r6534, %r6534, 25; + add.s32 %r6536, %r6490, %r6459; + xor.b32 %r6537, %r6536, %r6480; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 16; + add.s32 %r6539, %r6538, %r6469; + xor.b32 %r6540, %r6539, %r6459; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 20; + add.s32 %r6542, %r6536, %r6541; + xor.b32 %r6543, %r6542, %r6538; + shf.l.wrap.b32 %r6544, %r6543, %r6543, 24; + add.s32 %r6545, %r6544, %r6539; + xor.b32 %r6546, %r6545, %r6541; + shf.l.wrap.b32 %r6547, %r6546, %r6546, 25; + add.s32 %r6548, %r6503, %r6547; + xor.b32 %r6549, %r6548, %r6518; + shf.l.wrap.b32 %r6550, %r6549, %r6549, 16; + add.s32 %r6551, %r6550, %r6533; + xor.b32 %r6552, %r6551, %r6547; + shf.l.wrap.b32 %r6553, %r6552, %r6552, 20; + add.s32 %r6554, %r6548, %r6553; + xor.b32 %r6555, %r6554, %r6550; + shf.l.wrap.b32 %r6556, %r6555, %r6555, 24; + add.s32 %r6557, %r6556, %r6551; + xor.b32 %r6558, %r6557, %r6553; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 25; + add.s32 %r6560, %r6516, %r6508; + xor.b32 %r6561, %r6560, %r6532; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 16; + add.s32 %r6563, %r6562, %r6545; + xor.b32 %r6564, %r6563, %r6508; + shf.l.wrap.b32 %r6565, %r6564, %r6564, 20; + add.s32 %r6566, %r6560, %r6565; + xor.b32 %r6567, %r6566, %r6562; + shf.l.wrap.b32 %r6568, %r6567, %r6567, 24; + add.s32 %r6569, %r6568, %r6563; + xor.b32 %r6570, %r6569, %r6565; + shf.l.wrap.b32 %r6571, %r6570, %r6570, 25; + add.s32 %r6572, %r6530, %r6521; + xor.b32 %r6573, %r6544, %r6572; + shf.l.wrap.b32 %r6574, %r6573, %r6573, 16; + add.s32 %r6575, %r6574, %r6506; + xor.b32 %r6576, %r6575, %r6521; + shf.l.wrap.b32 %r6577, %r6576, %r6576, 20; + add.s32 %r6578, %r6572, %r6577; + xor.b32 %r6579, %r6578, %r6574; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6575; + xor.b32 %r6582, %r6581, %r6577; + 
shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6542, %r6535; + xor.b32 %r6585, %r6505, %r6584; + shf.l.wrap.b32 %r6586, %r6585, %r6585, 16; + add.s32 %r6587, %r6586, %r6519; + xor.b32 %r6588, %r6587, %r6535; + shf.l.wrap.b32 %r6589, %r6588, %r6588, 20; + add.s32 %r6590, %r6584, %r6026; + add.s32 %r6591, %r6590, %r6589; + xor.b32 %r6592, %r6591, %r6586; + shf.l.wrap.b32 %r6593, %r6592, %r6592, 24; + add.s32 %r6594, %r6593, %r6587; + xor.b32 %r6595, %r6594, %r6589; + shf.l.wrap.b32 %r6596, %r6595, %r6595, 25; + add.s32 %r6597, %r6554, %r6571; + xor.b32 %r6598, %r6597, %r6593; + shf.l.wrap.b32 %r6599, %r6598, %r6598, 16; + add.s32 %r6600, %r6599, %r6581; + xor.b32 %r6601, %r6600, %r6571; + shf.l.wrap.b32 %r6602, %r6601, %r6601, 20; + add.s32 %r6603, %r6597, %r6042; + add.s32 %r6604, %r6603, %r6602; + xor.b32 %r6605, %r6604, %r6599; + shf.l.wrap.b32 %r6606, %r6605, %r6605, 24; + add.s32 %r6607, %r6606, %r6600; + xor.b32 %r6608, %r6607, %r6602; + shf.l.wrap.b32 %r6609, %r6608, %r6608, 25; + add.s32 %r6610, %r6583, %r6018; + add.s32 %r6611, %r6610, %r6566; + xor.b32 %r6612, %r6556, %r6611; + shf.l.wrap.b32 %r6613, %r6612, %r6612, 16; + add.s32 %r6614, %r6613, %r6594; + xor.b32 %r6615, %r6614, %r6583; + shf.l.wrap.b32 %r6616, %r6615, %r6615, 20; + add.s32 %r6617, %r6611, %r6616; + xor.b32 %r6618, %r6617, %r6613; + shf.l.wrap.b32 %r6619, %r6618, %r6618, 24; + add.s32 %r6620, %r6619, %r6614; + xor.b32 %r6621, %r6620, %r6616; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 25; + add.s32 %r6623, %r6578, %r6034; + add.s32 %r6624, %r6623, %r6596; + xor.b32 %r6625, %r6568, %r6624; + shf.l.wrap.b32 %r6626, %r6625, %r6625, 16; + add.s32 %r6627, %r6626, %r6557; + xor.b32 %r6628, %r6627, %r6596; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 20; + add.s32 %r6630, %r6624, %r6629; + xor.b32 %r6631, %r6630, %r6626; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 24; + add.s32 %r6633, %r6632, %r6627; + xor.b32 %r6634, %r6633, %r6629; + shf.l.wrap.b32 %r6635, %r6634, %r6634, 25; + add.s32 %r6636, %r6591, %r6559; + xor.b32 %r6637, %r6636, %r6580; + shf.l.wrap.b32 %r6638, %r6637, %r6637, 16; + add.s32 %r6639, %r6638, %r6569; + xor.b32 %r6640, %r6639, %r6559; + shf.l.wrap.b32 %r6641, %r6640, %r6640, 20; + add.s32 %r6642, %r6636, %r6641; + xor.b32 %r6643, %r6642, %r6638; + shf.l.wrap.b32 %r6644, %r6643, %r6643, 24; + add.s32 %r6645, %r6644, %r6639; + xor.b32 %r6646, %r6645, %r6641; + shf.l.wrap.b32 %r6647, %r6646, %r6646, 25; + add.s32 %r6648, %r6604, %r6647; + xor.b32 %r6649, %r6648, %r6619; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 16; + add.s32 %r6651, %r6650, %r6633; + xor.b32 %r6652, %r6651, %r6647; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 20; + add.s32 %r6654, %r6648, %r6653; + xor.b32 %r6655, %r6654, %r6650; + shf.l.wrap.b32 %r6656, %r6655, %r6655, 24; + add.s32 %r6657, %r6656, %r6651; + xor.b32 %r6658, %r6657, %r6653; + shf.l.wrap.b32 %r6659, %r6658, %r6658, 25; + add.s32 %r6660, %r6617, %r6609; + xor.b32 %r6661, %r6660, %r6632; + shf.l.wrap.b32 %r6662, %r6661, %r6661, 16; + add.s32 %r6663, %r6662, %r6645; + xor.b32 %r6664, %r6663, %r6609; + shf.l.wrap.b32 %r6665, %r6664, %r6664, 20; + add.s32 %r6666, %r6660, %r6018; + add.s32 %r6667, %r6666, %r6665; + xor.b32 %r6668, %r6667, %r6662; + shf.l.wrap.b32 %r6669, %r6668, %r6668, 24; + add.s32 %r6670, %r6669, %r6663; + xor.b32 %r6671, %r6670, %r6665; + shf.l.wrap.b32 %r6672, %r6671, %r6671, 25; + add.s32 %r6673, %r6630, %r6026; + add.s32 %r6674, %r6673, %r6622; + xor.b32 %r6675, %r6644, %r6674; + shf.l.wrap.b32 %r6676, %r6675, %r6675, 16; + add.s32 %r6677, %r6676, 
%r6607; + xor.b32 %r6678, %r6677, %r6622; + shf.l.wrap.b32 %r6679, %r6678, %r6678, 20; + add.s32 %r6680, %r6674, %r6679; + xor.b32 %r6681, %r6680, %r6676; + shf.l.wrap.b32 %r6682, %r6681, %r6681, 24; + add.s32 %r6683, %r6682, %r6677; + xor.b32 %r6684, %r6683, %r6679; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 25; + add.s32 %r6686, %r6642, %r6635; + xor.b32 %r6687, %r6606, %r6686; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 16; + add.s32 %r6689, %r6688, %r6620; + xor.b32 %r6690, %r6689, %r6635; + shf.l.wrap.b32 %r6691, %r6690, %r6690, 20; + add.s32 %r6692, %r6686, %r6691; + xor.b32 %r6693, %r6692, %r6688; + shf.l.wrap.b32 %r6694, %r6693, %r6693, 24; + add.s32 %r6695, %r6694, %r6689; + xor.b32 %r6696, %r6695, %r6691; + shf.l.wrap.b32 %r6697, %r6696, %r6696, 25; + add.s32 %r6698, %r6654, %r6672; + xor.b32 %r6699, %r6698, %r6694; + shf.l.wrap.b32 %r6700, %r6699, %r6699, 16; + add.s32 %r6701, %r6700, %r6683; + xor.b32 %r6702, %r6701, %r6672; + shf.l.wrap.b32 %r6703, %r6702, %r6702, 20; + add.s32 %r6704, %r6698, %r6703; + xor.b32 %r6705, %r6704, %r6700; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6701; + xor.b32 %r6708, %r6707, %r6703; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6685, %r6034; + add.s32 %r6711, %r6710, %r6667; + xor.b32 %r6712, %r6656, %r6711; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6695; + xor.b32 %r6715, %r6714, %r6685; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6716; + xor.b32 %r6718, %r6717, %r6713; + shf.l.wrap.b32 %r6719, %r6718, %r6718, 24; + add.s32 %r6720, %r6719, %r6714; + xor.b32 %r6721, %r6720, %r6716; + shf.l.wrap.b32 %r6722, %r6721, %r6721, 25; + add.s32 %r6723, %r6680, %r6042; + add.s32 %r6724, %r6723, %r6697; + xor.b32 %r6725, %r6669, %r6724; + shf.l.wrap.b32 %r6726, %r6725, %r6725, 16; + add.s32 %r6727, %r6726, %r6657; + xor.b32 %r6728, %r6727, %r6697; + shf.l.wrap.b32 %r6729, %r6728, %r6728, 20; + add.s32 %r6730, %r6724, %r6729; + xor.b32 %r6731, %r6730, %r6726; + shf.l.wrap.b32 %r6732, %r6731, %r6731, 24; + add.s32 %r6733, %r6732, %r6727; + xor.b32 %r6734, %r6733, %r6729; + shf.l.wrap.b32 %r6735, %r6734, %r6734, 25; + add.s32 %r6736, %r6692, %r6659; + xor.b32 %r6737, %r6736, %r6682; + shf.l.wrap.b32 %r6738, %r6737, %r6737, 16; + add.s32 %r6739, %r6738, %r6670; + xor.b32 %r6740, %r6739, %r6659; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 20; + add.s32 %r6742, %r6736, %r6741; + xor.b32 %r6743, %r6742, %r6738; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 24; + add.s32 %r6745, %r6744, %r6739; + xor.b32 %r6746, %r6745, %r6741; + shf.l.wrap.b32 %r6747, %r6746, %r6746, 25; + xor.b32 %r6748, %r6704, %r6733; + cvt.u64.u32 %rd418, %r6748; + xor.b32 %r6749, %r6745, %r6717; + and.b32 %r6750, %r6749, 255; + cvt.u64.u32 %rd419, %r6750; + bfi.b64 %rd420, %rd419, %rd418, 32, 32; + cvt.u64.u32 %rd421, %r6749; + shl.b64 %rd422, %rd421, 32; + and.b64 %rd423, %rd422, 280375465082880; + or.b64 %rd424, %rd420, %rd423; + and.b64 %rd425, %rd422, 71776119061217280; + shr.u32 %r6751, %r6749, 24; + cvt.u64.u32 %rd426, %r6751; + shl.b64 %rd427, %rd426, 56; + or.b64 %rd428, %rd424, %rd425; + or.b64 %rd22, %rd428, %rd427; + xor.b32 %r6752, %r6707, %r6730; + cvt.u64.u32 %rd429, %r6752; + xor.b32 %r6753, %r6742, %r6720; + and.b32 %r6754, %r6753, 255; + cvt.u64.u32 %rd430, %r6754; + bfi.b64 %rd431, %rd430, %rd429, 32, 32; + cvt.u64.u32 %rd432, %r6753; + shl.b64 %rd433, %rd432, 32; + and.b64 %rd434, %rd433, 280375465082880; + or.b64 %rd435, %rd431, %rd434; + and.b64 %rd436, %rd433, 
71776119061217280; + shr.u32 %r6755, %r6753, 24; + cvt.u64.u32 %rd437, %r6755; + shl.b64 %rd438, %rd437, 56; + or.b64 %rd439, %rd435, %rd436; + or.b64 %rd23, %rd439, %rd438; + xor.b32 %r6756, %r6747, %r6719; + cvt.u64.u32 %rd440, %r6756; + xor.b32 %r6757, %r6709, %r6732; + and.b32 %r6758, %r6757, 255; + cvt.u64.u32 %rd441, %r6758; + bfi.b64 %rd442, %rd441, %rd440, 32, 32; + cvt.u64.u32 %rd443, %r6757; + shl.b64 %rd444, %rd443, 32; + and.b64 %rd445, %rd444, 280375465082880; + or.b64 %rd446, %rd442, %rd445; + and.b64 %rd447, %rd444, 71776119061217280; + shr.u32 %r6759, %r6757, 24; + cvt.u64.u32 %rd448, %r6759; + shl.b64 %rd449, %rd448, 56; + or.b64 %rd450, %rd446, %rd447; + or.b64 %rd24, %rd450, %rd449; + xor.b32 %r6760, %r6744, %r6722; + cvt.u64.u32 %rd451, %r6760; + xor.b32 %r6761, %r6706, %r6735; + and.b32 %r6762, %r6761, 255; + cvt.u64.u32 %rd452, %r6762; + bfi.b64 %rd453, %rd452, %rd451, 32, 32; + cvt.u64.u32 %rd454, %r6761; + shl.b64 %rd455, %rd454, 32; + and.b64 %rd456, %rd455, 280375465082880; + or.b64 %rd457, %rd453, %rd456; + and.b64 %rd458, %rd455, 71776119061217280; + shr.u32 %r6763, %r6761, 24; + cvt.u64.u32 %rd459, %r6763; + shl.b64 %rd460, %rd459, 56; + or.b64 %rd461, %rd457, %rd458; + or.b64 %rd25, %rd461, %rd460; + add.u64 %rd1248, %SPL, 2016; + mov.u32 %r29538, 0; + st.local.v4.u32 [%rd1248+32], {%r29538, %r29538, %r29538, %r29538}; + st.local.v4.u32 [%rd1248+48], {%r29538, %r29538, %r29538, %r29538}; + st.local.v4.u32 [%rd1248+64], {%r29538, %r29538, %r29538, %r29538}; + and.b64 %rd26, %rd22, -256; + st.local.v2.u64 [%rd1248], {%rd22, %rd23}; + st.local.v2.u64 [%rd1248+16], {%rd24, %rd25}; + and.b64 %rd1260, %rd22, 255; + +$L__BB2_19: + mov.b64 {%r20, %r19}, %rd1260; + mul.wide.u32 %rd463, %r20, 1908875315; + shr.u64 %rd464, %rd463, 56; + cvt.u32.u64 %r6765, %rd464; + mul.lo.s32 %r6766, %r6765, 37748717; + sub.s32 %r21, %r20, %r6766; + mov.b64 {%r24, %r23}, %rd1258; + mul.wide.u32 %rd465, %r24, 1908875315; + shr.u64 %rd466, %rd465, 56; + cvt.u32.u64 %r6767, %rd466; + mul.lo.s32 %r6768, %r6767, 37748717; + sub.s32 %r25, %r24, %r6768; + mov.b64 {%r28, %r27}, %rd1257; + mul.wide.u32 %rd467, %r28, 1908875315; + shr.u64 %rd468, %rd467, 56; + cvt.u32.u64 %r6769, %rd468; + mul.lo.s32 %r6770, %r6769, 37748717; + sub.s32 %r29, %r28, %r6770; + shl.b32 %r30, %r21, 1; + mul.wide.u32 %rd469, %r30, -954391867; + shr.u64 %rd470, %rd469, 32; + cvt.u32.u64 %r6771, %rd470; + sub.s32 %r6772, %r30, %r6771; + shr.u32 %r6773, %r6772, 1; + add.s32 %r6774, %r6773, %r6771; + shr.u32 %r6775, %r6774, 20; + mul.lo.s32 %r6776, %r6775, 1179641; + sub.s32 %r6777, %r30, %r6776; + cvta.to.global.u64 %rd471, %rd354; + mul.wide.u32 %rd472, %r6777, 64; + add.s64 %rd34, %rd471, %rd472; + or.b32 %r31, %r30, 1; + mul.wide.u32 %rd473, %r31, -954391867; + shr.u64 %rd474, %rd473, 32; + cvt.u32.u64 %r6778, %rd474; + sub.s32 %r6779, %r31, %r6778; + shr.u32 %r6780, %r6779, 1; + add.s32 %r6781, %r6780, %r6778; + shr.u32 %r6782, %r6781, 20; + mul.lo.s32 %r6783, %r6782, 1179641; + sub.s32 %r6784, %r31, %r6783; + mul.wide.u32 %rd475, %r6784, 64; + add.s64 %rd35, %rd471, %rd475; + setp.eq.s64 %p16, %rd353, 0; + @%p16 bra $L__BB2_33; + + cvta.to.global.u64 %rd476, %rd353; + mul.wide.u32 %rd477, %r21, 128; + add.s64 %rd36, %rd476, %rd477; + ld.global.u64 %rd1261, [%rd36]; + setp.eq.s64 %p17, %rd1261, 0; + @%p17 bra $L__BB2_22; + + ld.global.u64 %rd1264, [%rd36+32]; + ld.global.u64 %rd1263, [%rd36+16]; + ld.global.u64 %rd1262, [%rd36+8]; + bra.uni $L__BB2_44; + +$L__BB2_33: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd579, 
1179641; + st.local.u64 [%rd3+8], %rd579; + st.local.u32 [%rd3+16], %r30; + ld.global.u64 %rd580, [%rd34]; + ld.global.u64 %rd581, [%rd34+8]; + ld.global.u64 %rd582, [%rd34+16]; + ld.global.u64 %rd583, [%rd34+24]; + ld.global.u64 %rd584, [%rd34+32]; + ld.global.u64 %rd585, [%rd34+40]; + ld.global.u64 %rd586, [%rd34+48]; + ld.global.u64 %rd587, [%rd34+56]; + st.local.u64 [%rd3+24], %rd580; + st.local.u64 [%rd3+32], %rd581; + st.local.u64 [%rd3+40], %rd582; + st.local.u64 [%rd3+48], %rd583; + st.local.u64 [%rd3+56], %rd584; + st.local.u64 [%rd3+64], %rd585; + st.local.u64 [%rd3+72], %rd586; + st.local.u64 [%rd3+80], %rd587; + cvt.u32.u64 %r10110, %rd580; + xor.b32 %r10111, %r30, %r10110; + st.local.u32 [%rd3+24], %r10111; + mov.u32 %r29776, 0; + st.local.v2.u32 [%rd3+96], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+104], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+112], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+120], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+128], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+136], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+144], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+152], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+160], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+168], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+176], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+184], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+192], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+200], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+208], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+216], {%r29776, %r29776}; + mov.u32 %r29791, -2147483648; + mov.u32 %r10083, 1; + st.local.v2.u32 [%rd3+88], {%r10083, %r29791}; + ld.local.v2.u32 {%r29812, %r29813}, [%rd3+24]; + mov.b64 {%r29810, %r29811}, %rd585; + shr.u64 %rd588, %rd581, 32; + cvt.u32.u64 %r29824, %rd581; + cvt.u32.u64 %r29825, %rd588; + shr.u64 %rd589, %rd586, 32; + cvt.u32.u64 %r29822, %rd586; + cvt.u32.u64 %r29823, %rd589; + shr.u64 %rd590, %rd582, 32; + cvt.u32.u64 %r29820, %rd582; + cvt.u32.u64 %r29821, %rd590; + shr.u64 %rd591, %rd587, 32; + cvt.u32.u64 %r29818, %rd587; + cvt.u32.u64 %r29819, %rd591; + shr.u64 %rd592, %rd583, 32; + cvt.u32.u64 %r29816, %rd583; + cvt.u32.u64 %r29817, %rd592; + shr.u64 %rd593, %rd584, 32; + cvt.u32.u64 %r29814, %rd584; + cvt.u32.u64 %r29815, %rd593; + mov.u32 %r29777, %r29776; + mov.u32 %r29778, %r29776; + mov.u32 %r29779, %r29776; + mov.u32 %r29780, %r29776; + mov.u32 %r29781, %r29776; + mov.u32 %r29782, %r29776; + mov.u32 %r29783, %r29776; + mov.u32 %r29784, %r29776; + mov.u32 %r29785, %r29776; + mov.u32 %r29786, %r29776; + mov.u32 %r29787, %r29776; + mov.u32 %r29788, %r29776; + mov.u32 %r29789, %r29776; + mov.u32 %r29790, %r10083; + mov.u32 %r29792, %r29776; + mov.u32 %r29793, %r29776; + mov.u32 %r29794, %r29776; + mov.u32 %r29795, %r29776; + mov.u32 %r29796, %r29776; + mov.u32 %r29797, %r29776; + mov.u32 %r29798, %r29776; + mov.u32 %r29799, %r29776; + mov.u32 %r29800, %r29776; + mov.u32 %r29801, %r29776; + mov.u32 %r29802, %r29776; + mov.u32 %r29803, %r29776; + mov.u32 %r29804, %r29776; + mov.u32 %r29805, %r29776; + mov.u32 %r29806, %r29776; + mov.u32 %r29807, %r29776; + mov.u32 %r29808, %r29776; + mov.u32 %r29809, %r29776; + mov.u32 %r29826, %r29776; + +$L__BB2_34: + // begin inline asm + // xor5 + lop3.b32 %r10114, %r29812, %r29810, %r29808, 0x96; + lop3.b32 %r10114, %r10114, %r29806, %r29804, 0x96; + lop3.b32 %r10115, %r29813, %r29811, %r29809, 0x96; + lop3.b32 %r10115, %r10115, %r29807, %r29805, 0x96; + // end inline asm // begin inline asm - dp4a.u32.u32 %r1712, %r1713, %r5746, 
%r6244; + // xor5 + lop3.b32 %r10126, %r29824, %r29822, %r29802, 0x96; + lop3.b32 %r10126, %r10126, %r29800, %r29798, 0x96; + lop3.b32 %r10127, %r29825, %r29823, %r29803, 0x96; + lop3.b32 %r10127, %r10127, %r29801, %r29799, 0x96; // end inline asm - ld.const.u32 %r1717, [matrix+4]; // begin inline asm - dp4a.u32.u32 %r1716, %r1717, %r5750, %r1712; + // xor5 + lop3.b32 %r10138, %r29820, %r29818, %r29796, 0x96; + lop3.b32 %r10138, %r10138, %r29794, %r29792, 0x96; + lop3.b32 %r10139, %r29821, %r29819, %r29797, 0x96; + lop3.b32 %r10139, %r10139, %r29795, %r29793, 0x96; // end inline asm - ld.const.u32 %r1721, [matrix+8]; // begin inline asm - dp4a.u32.u32 %r1720, %r1721, %r5754, %r1716; + // xor5 + lop3.b32 %r10150, %r29816, %r29790, %r29788, 0x96; + lop3.b32 %r10150, %r10150, %r29786, %r29784, 0x96; + lop3.b32 %r10151, %r29817, %r29791, %r29789, 0x96; + lop3.b32 %r10151, %r10151, %r29787, %r29785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10162, %r29814, %r29782, %r29780, 0x96; + lop3.b32 %r10162, %r10162, %r29778, %r29776, 0x96; + lop3.b32 %r10163, %r29815, %r29783, %r29781, 0x96; + lop3.b32 %r10163, %r10163, %r29779, %r29777, 0x96; // end inline asm - ld.const.u32 %r1725, [matrix+12]; // begin inline asm - dp4a.u32.u32 %r1724, %r1725, %r5758, %r1720; + shf.l.wrap.b32 %r10174, %r10127, %r10126, %r10083; // end inline asm - ld.const.u32 %r1729, [matrix+16]; // begin inline asm - dp4a.u32.u32 %r1728, %r1729, %r5762, %r1724; + shf.l.wrap.b32 %r10178, %r10126, %r10127, %r10083; // end inline asm - ld.const.u32 %r1733, [matrix+20]; + xor.b32 %r10608, %r10174, %r10162; + xor.b32 %r10609, %r10178, %r10163; + xor.b32 %r10441, %r29812, %r10608; + xor.b32 %r10444, %r29813, %r10609; + xor.b32 %r10348, %r29810, %r10608; + xor.b32 %r10347, %r29811, %r10609; + xor.b32 %r10395, %r29808, %r10608; + xor.b32 %r10396, %r29809, %r10609; + xor.b32 %r10300, %r29806, %r10608; + xor.b32 %r10299, %r29807, %r10609; + xor.b32 %r10251, %r29804, %r10608; + xor.b32 %r10252, %r29805, %r10609; // begin inline asm - dp4a.u32.u32 %r1732, %r1733, %r5766, %r1728; + shf.l.wrap.b32 %r10182, %r10139, %r10138, %r10083; // end inline asm - ld.const.u32 %r1737, [matrix+24]; // begin inline asm - dp4a.u32.u32 %r1736, %r1737, %r5770, %r1732; + shf.l.wrap.b32 %r10186, %r10138, %r10139, %r10083; // end inline asm - ld.const.u32 %r1741, [matrix+28]; + xor.b32 %r10610, %r10182, %r10114; + xor.b32 %r10611, %r10186, %r10115; + xor.b32 %r10403, %r29824, %r10610; + xor.b32 %r10404, %r29825, %r10611; + xor.b32 %r10220, %r29822, %r10610; + xor.b32 %r10219, %r29823, %r10611; + xor.b32 %r10379, %r29802, %r10610; + xor.b32 %r10380, %r29803, %r10611; + xor.b32 %r10340, %r29800, %r10610; + xor.b32 %r10339, %r29801, %r10611; + xor.b32 %r10323, %r29798, %r10610; + xor.b32 %r10324, %r29799, %r10611; // begin inline asm - dp4a.u32.u32 %r1740, %r1741, %r5774, %r1736; + shf.l.wrap.b32 %r10190, %r10151, %r10150, %r10083; // end inline asm - ld.const.u32 %r1745, [matrix+32]; // begin inline asm - dp4a.u32.u32 %r1744, %r1745, %r5778, %r1740; + shf.l.wrap.b32 %r10194, %r10150, %r10151, %r10083; // end inline asm - ld.const.u32 %r1749, [matrix+36]; + xor.b32 %r10612, %r10190, %r10126; + xor.b32 %r10613, %r10194, %r10127; + xor.b32 %r10260, %r29820, %r10612; + xor.b32 %r10259, %r29821, %r10613; + xor.b32 %r10387, %r29818, %r10612; + xor.b32 %r10388, %r29819, %r10613; + xor.b32 %r10268, %r29796, %r10612; + xor.b32 %r10267, %r29797, %r10613; + xor.b32 %r10371, %r29794, %r10612; + xor.b32 %r10372, %r29795, %r10613; + xor.b32 %r10236, 
%r29792, %r10612; + xor.b32 %r10235, %r29793, %r10613; // begin inline asm - dp4a.u32.u32 %r1748, %r1749, %r5782, %r1744; + shf.l.wrap.b32 %r10198, %r10163, %r10162, %r10083; // end inline asm - ld.const.u32 %r1753, [matrix+40]; // begin inline asm - dp4a.u32.u32 %r1752, %r1753, %r5786, %r1748; + shf.l.wrap.b32 %r10202, %r10162, %r10163, %r10083; // end inline asm - ld.const.u32 %r1757, [matrix+44]; + xor.b32 %r10614, %r10198, %r10138; + xor.b32 %r10615, %r10202, %r10139; + xor.b32 %r10355, %r29816, %r10614; + xor.b32 %r10356, %r29817, %r10615; + xor.b32 %r10332, %r29790, %r10614; + xor.b32 %r10331, %r29791, %r10615; + xor.b32 %r10275, %r29788, %r10614; + xor.b32 %r10276, %r29789, %r10615; + xor.b32 %r10363, %r29786, %r10614; + xor.b32 %r10364, %r29787, %r10615; + xor.b32 %r10292, %r29784, %r10614; + xor.b32 %r10291, %r29785, %r10615; // begin inline asm - dp4a.u32.u32 %r1756, %r1757, %r5790, %r1752; + shf.l.wrap.b32 %r10206, %r10115, %r10114, %r10083; // end inline asm - ld.const.u32 %r1761, [matrix+48]; // begin inline asm - dp4a.u32.u32 %r1760, %r1761, %r5794, %r1756; + shf.l.wrap.b32 %r10210, %r10114, %r10115, %r10083; // end inline asm - ld.const.u32 %r1765, [matrix+52]; + xor.b32 %r10616, %r10206, %r10150; + xor.b32 %r10617, %r10210, %r10151; + xor.b32 %r10307, %r29814, %r10616; + xor.b32 %r10308, %r29815, %r10617; + xor.b32 %r10227, %r29782, %r10616; + xor.b32 %r10228, %r29783, %r10617; + xor.b32 %r10244, %r29780, %r10616; + xor.b32 %r10243, %r29781, %r10617; + xor.b32 %r10283, %r29778, %r10616; + xor.b32 %r10284, %r29779, %r10617; + xor.b32 %r10315, %r29776, %r10616; + xor.b32 %r10316, %r29777, %r10617; + mov.u32 %r10221, 44; // begin inline asm - dp4a.u32.u32 %r1764, %r1765, %r5798, %r1760; + shf.l.wrap.b32 %r10214, %r10220, %r10219, %r10221; // end inline asm - ld.const.u32 %r1769, [matrix+56]; // begin inline asm - dp4a.u32.u32 %r1768, %r1769, %r5802, %r1764; + shf.l.wrap.b32 %r10218, %r10219, %r10220, %r10221; // end inline asm - ld.const.u32 %r1773, [matrix+60]; + mov.u32 %r10229, 20; // begin inline asm - dp4a.u32.u32 %r1772, %r1773, %r5806, %r1768; + shf.l.wrap.b32 %r10222, %r10228, %r10227, %r10229; // end inline asm - ld.const.u32 %r1777, [matrix+64]; // begin inline asm - dp4a.u32.u32 %r1776, %r1777, %r5746, %r6244; + shf.l.wrap.b32 %r10226, %r10227, %r10228, %r10229; // end inline asm - ld.const.u32 %r1781, [matrix+68]; + mov.u32 %r10237, 61; // begin inline asm - dp4a.u32.u32 %r1780, %r1781, %r5750, %r1776; + shf.l.wrap.b32 %r10230, %r10236, %r10235, %r10237; // end inline asm - ld.const.u32 %r1785, [matrix+72]; // begin inline asm - dp4a.u32.u32 %r1784, %r1785, %r5754, %r1780; + shf.l.wrap.b32 %r10234, %r10235, %r10236, %r10237; // end inline asm - ld.const.u32 %r1789, [matrix+76]; + mov.u32 %r10245, 39; // begin inline asm - dp4a.u32.u32 %r1788, %r1789, %r5758, %r1784; + shf.l.wrap.b32 %r10238, %r10244, %r10243, %r10245; // end inline asm - ld.const.u32 %r1793, [matrix+80]; // begin inline asm - dp4a.u32.u32 %r1792, %r1793, %r5762, %r1788; + shf.l.wrap.b32 %r10242, %r10243, %r10244, %r10245; // end inline asm - ld.const.u32 %r1797, [matrix+84]; + mov.u32 %r10253, 18; // begin inline asm - dp4a.u32.u32 %r1796, %r1797, %r5766, %r1792; + shf.l.wrap.b32 %r10246, %r10252, %r10251, %r10253; // end inline asm - ld.const.u32 %r1801, [matrix+88]; // begin inline asm - dp4a.u32.u32 %r1800, %r1801, %r5770, %r1796; + shf.l.wrap.b32 %r10250, %r10251, %r10252, %r10253; // end inline asm - ld.const.u32 %r1805, [matrix+92]; + mov.u32 %r10261, 62; // begin inline asm - dp4a.u32.u32 
%r1804, %r1805, %r5774, %r1800; + shf.l.wrap.b32 %r10254, %r10260, %r10259, %r10261; // end inline asm - ld.const.u32 %r1809, [matrix+96]; // begin inline asm - dp4a.u32.u32 %r1808, %r1809, %r5778, %r1804; + shf.l.wrap.b32 %r10258, %r10259, %r10260, %r10261; // end inline asm - ld.const.u32 %r1813, [matrix+100]; + mov.u32 %r10269, 43; // begin inline asm - dp4a.u32.u32 %r1812, %r1813, %r5782, %r1808; + shf.l.wrap.b32 %r10262, %r10268, %r10267, %r10269; // end inline asm - ld.const.u32 %r1817, [matrix+104]; // begin inline asm - dp4a.u32.u32 %r1816, %r1817, %r5786, %r1812; + shf.l.wrap.b32 %r10266, %r10267, %r10268, %r10269; // end inline asm - ld.const.u32 %r1821, [matrix+108]; + mov.u32 %r10277, 25; // begin inline asm - dp4a.u32.u32 %r1820, %r1821, %r5790, %r1816; + shf.l.wrap.b32 %r10270, %r10276, %r10275, %r10277; // end inline asm - ld.const.u32 %r1825, [matrix+112]; // begin inline asm - dp4a.u32.u32 %r1824, %r1825, %r5794, %r1820; + shf.l.wrap.b32 %r10274, %r10275, %r10276, %r10277; // end inline asm - ld.const.u32 %r1829, [matrix+116]; + mov.u32 %r10285, 8; // begin inline asm - dp4a.u32.u32 %r1828, %r1829, %r5798, %r1824; + shf.l.wrap.b32 %r10278, %r10284, %r10283, %r10285; // end inline asm - ld.const.u32 %r1833, [matrix+120]; // begin inline asm - dp4a.u32.u32 %r1832, %r1833, %r5802, %r1828; + shf.l.wrap.b32 %r10282, %r10283, %r10284, %r10285; // end inline asm - ld.const.u32 %r1837, [matrix+124]; + mov.u32 %r10293, 56; // begin inline asm - dp4a.u32.u32 %r1836, %r1837, %r5806, %r1832; + shf.l.wrap.b32 %r10286, %r10292, %r10291, %r10293; // end inline asm - shr.u32 %r5966, %r1772, 6; - and.b32 %r5967, %r5966, 240; - shr.u32 %r5968, %r1836, 10; - or.b32 %r5969, %r5968, %r5967; - xor.b32 %r5970, %r9, %r5969; - ld.const.u32 %r1841, [matrix+128]; // begin inline asm - dp4a.u32.u32 %r1840, %r1841, %r5746, %r6244; + shf.l.wrap.b32 %r10290, %r10291, %r10292, %r10293; // end inline asm - ld.const.u32 %r1845, [matrix+132]; + mov.u32 %r10301, 41; // begin inline asm - dp4a.u32.u32 %r1844, %r1845, %r5750, %r1840; + shf.l.wrap.b32 %r10294, %r10300, %r10299, %r10301; // end inline asm - ld.const.u32 %r1849, [matrix+136]; // begin inline asm - dp4a.u32.u32 %r1848, %r1849, %r5754, %r1844; + shf.l.wrap.b32 %r10298, %r10299, %r10300, %r10301; // end inline asm - ld.const.u32 %r1853, [matrix+140]; + mov.u32 %r10309, 27; // begin inline asm - dp4a.u32.u32 %r1852, %r1853, %r5758, %r1848; + shf.l.wrap.b32 %r10302, %r10308, %r10307, %r10309; // end inline asm - ld.const.u32 %r1857, [matrix+144]; // begin inline asm - dp4a.u32.u32 %r1856, %r1857, %r5762, %r1852; + shf.l.wrap.b32 %r10306, %r10307, %r10308, %r10309; // end inline asm - ld.const.u32 %r1861, [matrix+148]; + mov.u32 %r10317, 14; // begin inline asm - dp4a.u32.u32 %r1860, %r1861, %r5766, %r1856; + shf.l.wrap.b32 %r10310, %r10316, %r10315, %r10317; // end inline asm - ld.const.u32 %r1865, [matrix+152]; // begin inline asm - dp4a.u32.u32 %r1864, %r1865, %r5770, %r1860; + shf.l.wrap.b32 %r10314, %r10315, %r10316, %r10317; // end inline asm - ld.const.u32 %r1869, [matrix+156]; + mov.u32 %r10325, 2; // begin inline asm - dp4a.u32.u32 %r1868, %r1869, %r5774, %r1864; + shf.l.wrap.b32 %r10318, %r10324, %r10323, %r10325; // end inline asm - ld.const.u32 %r1873, [matrix+160]; // begin inline asm - dp4a.u32.u32 %r1872, %r1873, %r5778, %r1868; + shf.l.wrap.b32 %r10322, %r10323, %r10324, %r10325; // end inline asm - ld.const.u32 %r1877, [matrix+164]; + mov.u32 %r10333, 55; // begin inline asm - dp4a.u32.u32 %r1876, %r1877, %r5782, %r1872; + 
shf.l.wrap.b32 %r10326, %r10332, %r10331, %r10333; // end inline asm - ld.const.u32 %r1881, [matrix+168]; // begin inline asm - dp4a.u32.u32 %r1880, %r1881, %r5786, %r1876; + shf.l.wrap.b32 %r10330, %r10331, %r10332, %r10333; // end inline asm - ld.const.u32 %r1885, [matrix+172]; + mov.u32 %r10341, 45; // begin inline asm - dp4a.u32.u32 %r1884, %r1885, %r5790, %r1880; + shf.l.wrap.b32 %r10334, %r10340, %r10339, %r10341; // end inline asm - ld.const.u32 %r1889, [matrix+176]; // begin inline asm - dp4a.u32.u32 %r1888, %r1889, %r5794, %r1884; + shf.l.wrap.b32 %r10338, %r10339, %r10340, %r10341; // end inline asm - ld.const.u32 %r1893, [matrix+180]; + mov.u32 %r10349, 36; // begin inline asm - dp4a.u32.u32 %r1892, %r1893, %r5798, %r1888; + shf.l.wrap.b32 %r10342, %r10348, %r10347, %r10349; // end inline asm - ld.const.u32 %r1897, [matrix+184]; // begin inline asm - dp4a.u32.u32 %r1896, %r1897, %r5802, %r1892; + shf.l.wrap.b32 %r10346, %r10347, %r10348, %r10349; // end inline asm - ld.const.u32 %r1901, [matrix+188]; + mov.u32 %r10357, 28; // begin inline asm - dp4a.u32.u32 %r1900, %r1901, %r5806, %r1896; + shf.l.wrap.b32 %r10350, %r10356, %r10355, %r10357; // end inline asm - ld.const.u32 %r1905, [matrix+192]; // begin inline asm - dp4a.u32.u32 %r1904, %r1905, %r5746, %r6244; + shf.l.wrap.b32 %r10354, %r10355, %r10356, %r10357; // end inline asm - ld.const.u32 %r1909, [matrix+196]; + mov.u32 %r10365, 21; // begin inline asm - dp4a.u32.u32 %r1908, %r1909, %r5750, %r1904; + shf.l.wrap.b32 %r10358, %r10364, %r10363, %r10365; // end inline asm - ld.const.u32 %r1913, [matrix+200]; // begin inline asm - dp4a.u32.u32 %r1912, %r1913, %r5754, %r1908; + shf.l.wrap.b32 %r10362, %r10363, %r10364, %r10365; // end inline asm - ld.const.u32 %r1917, [matrix+204]; + mov.u32 %r10373, 15; // begin inline asm - dp4a.u32.u32 %r1916, %r1917, %r5758, %r1912; + shf.l.wrap.b32 %r10366, %r10372, %r10371, %r10373; // end inline asm - ld.const.u32 %r1921, [matrix+208]; // begin inline asm - dp4a.u32.u32 %r1920, %r1921, %r5762, %r1916; + shf.l.wrap.b32 %r10370, %r10371, %r10372, %r10373; // end inline asm - ld.const.u32 %r1925, [matrix+212]; + mov.u32 %r10381, 10; // begin inline asm - dp4a.u32.u32 %r1924, %r1925, %r5766, %r1920; + shf.l.wrap.b32 %r10374, %r10380, %r10379, %r10381; // end inline asm - ld.const.u32 %r1929, [matrix+216]; // begin inline asm - dp4a.u32.u32 %r1928, %r1929, %r5770, %r1924; + shf.l.wrap.b32 %r10378, %r10379, %r10380, %r10381; // end inline asm - ld.const.u32 %r1933, [matrix+220]; + mov.u32 %r10389, 6; // begin inline asm - dp4a.u32.u32 %r1932, %r1933, %r5774, %r1928; + shf.l.wrap.b32 %r10382, %r10388, %r10387, %r10389; // end inline asm - ld.const.u32 %r1937, [matrix+224]; // begin inline asm - dp4a.u32.u32 %r1936, %r1937, %r5778, %r1932; + shf.l.wrap.b32 %r10386, %r10387, %r10388, %r10389; // end inline asm - ld.const.u32 %r1941, [matrix+228]; + mov.u32 %r10397, 3; // begin inline asm - dp4a.u32.u32 %r1940, %r1941, %r5782, %r1936; + shf.l.wrap.b32 %r10390, %r10396, %r10395, %r10397; // end inline asm - ld.const.u32 %r1945, [matrix+232]; // begin inline asm - dp4a.u32.u32 %r1944, %r1945, %r5786, %r1940; + shf.l.wrap.b32 %r10394, %r10395, %r10396, %r10397; // end inline asm - ld.const.u32 %r1949, [matrix+236]; // begin inline asm - dp4a.u32.u32 %r1948, %r1949, %r5790, %r1944; + shf.l.wrap.b32 %r10398, %r10404, %r10403, %r10083; // end inline asm - ld.const.u32 %r1953, [matrix+240]; // begin inline asm - dp4a.u32.u32 %r1952, %r1953, %r5794, %r1948; + shf.l.wrap.b32 %r10402, %r10403, %r10404, 
%r10083; // end inline asm - ld.const.u32 %r1957, [matrix+244]; // begin inline asm - dp4a.u32.u32 %r1956, %r1957, %r5798, %r1952; + // chi + lop3.b32 %r10406, %r10441, %r10214, %r10262, 0xD2; + lop3.b32 %r10407, %r10444, %r10218, %r10266, 0xD2; // end inline asm - ld.const.u32 %r1961, [matrix+248]; // begin inline asm - dp4a.u32.u32 %r1960, %r1961, %r5802, %r1956; + // chi + lop3.b32 %r29824, %r10214, %r10262, %r10358, 0xD2; + lop3.b32 %r29825, %r10218, %r10266, %r10362, 0xD2; // end inline asm - ld.const.u32 %r1965, [matrix+252]; // begin inline asm - dp4a.u32.u32 %r1964, %r1965, %r5806, %r1960; + // chi + lop3.b32 %r29820, %r10262, %r10358, %r10310, 0xD2; + lop3.b32 %r29821, %r10266, %r10362, %r10314, 0xD2; // end inline asm - shr.u32 %r5971, %r1900, 6; - and.b32 %r5972, %r5971, 240; - shr.u32 %r5973, %r1964, 10; - or.b32 %r5974, %r5973, %r5972; - xor.b32 %r5975, %r5810, %r5974; - ld.const.u32 %r1969, [matrix+256]; // begin inline asm - dp4a.u32.u32 %r1968, %r1969, %r5746, %r6244; + // chi + lop3.b32 %r29816, %r10358, %r10310, %r10441, 0xD2; + lop3.b32 %r29817, %r10362, %r10314, %r10444, 0xD2; // end inline asm - ld.const.u32 %r1973, [matrix+260]; // begin inline asm - dp4a.u32.u32 %r1972, %r1973, %r5750, %r1968; + // chi + lop3.b32 %r29814, %r10310, %r10441, %r10214, 0xD2; + lop3.b32 %r29815, %r10314, %r10444, %r10218, 0xD2; // end inline asm - ld.const.u32 %r1977, [matrix+264]; // begin inline asm - dp4a.u32.u32 %r1976, %r1977, %r5754, %r1972; + // chi + lop3.b32 %r29810, %r10350, %r10222, %r10390, 0xD2; + lop3.b32 %r29811, %r10354, %r10226, %r10394, 0xD2; // end inline asm - ld.const.u32 %r1981, [matrix+268]; // begin inline asm - dp4a.u32.u32 %r1980, %r1981, %r5758, %r1976; + // chi + lop3.b32 %r29822, %r10222, %r10390, %r10334, 0xD2; + lop3.b32 %r29823, %r10226, %r10394, %r10338, 0xD2; // end inline asm - ld.const.u32 %r1985, [matrix+272]; // begin inline asm - dp4a.u32.u32 %r1984, %r1985, %r5762, %r1980; + // chi + lop3.b32 %r29818, %r10390, %r10334, %r10230, 0xD2; + lop3.b32 %r29819, %r10394, %r10338, %r10234, 0xD2; // end inline asm - ld.const.u32 %r1989, [matrix+276]; // begin inline asm - dp4a.u32.u32 %r1988, %r1989, %r5766, %r1984; + // chi + lop3.b32 %r29790, %r10334, %r10230, %r10350, 0xD2; + lop3.b32 %r29791, %r10338, %r10234, %r10354, 0xD2; // end inline asm - ld.const.u32 %r1993, [matrix+280]; + st.local.v2.u32 [%rd3+88], {%r29790, %r29791}; // begin inline asm - dp4a.u32.u32 %r1992, %r1993, %r5770, %r1988; + // chi + lop3.b32 %r29782, %r10230, %r10350, %r10222, 0xD2; + lop3.b32 %r29783, %r10234, %r10354, %r10226, 0xD2; // end inline asm - ld.const.u32 %r1997, [matrix+284]; + st.local.v2.u32 [%rd3+96], {%r29782, %r29783}; // begin inline asm - dp4a.u32.u32 %r1996, %r1997, %r5774, %r1992; + // chi + lop3.b32 %r29808, %r10398, %r10382, %r10270, 0xD2; + lop3.b32 %r29809, %r10402, %r10386, %r10274, 0xD2; // end inline asm - ld.const.u32 %r2001, [matrix+288]; + st.local.v2.u32 [%rd3+104], {%r29808, %r29809}; // begin inline asm - dp4a.u32.u32 %r2000, %r2001, %r5778, %r1996; + // chi + lop3.b32 %r29802, %r10382, %r10270, %r10278, 0xD2; + lop3.b32 %r29803, %r10386, %r10274, %r10282, 0xD2; // end inline asm - ld.const.u32 %r2005, [matrix+292]; + st.local.v2.u32 [%rd3+112], {%r29802, %r29803}; // begin inline asm - dp4a.u32.u32 %r2004, %r2005, %r5782, %r2000; + // chi + lop3.b32 %r29796, %r10270, %r10278, %r10246, 0xD2; + lop3.b32 %r29797, %r10274, %r10282, %r10250, 0xD2; // end inline asm - ld.const.u32 %r2009, [matrix+296]; + st.local.v2.u32 [%rd3+120], {%r29796, %r29797}; // 
begin inline asm - dp4a.u32.u32 %r2008, %r2009, %r5786, %r2004; + // chi + lop3.b32 %r29788, %r10278, %r10246, %r10398, 0xD2; + lop3.b32 %r29789, %r10282, %r10250, %r10402, 0xD2; // end inline asm - ld.const.u32 %r2013, [matrix+300]; + st.local.v2.u32 [%rd3+128], {%r29788, %r29789}; // begin inline asm - dp4a.u32.u32 %r2012, %r2013, %r5790, %r2008; + // chi + lop3.b32 %r29780, %r10246, %r10398, %r10382, 0xD2; + lop3.b32 %r29781, %r10250, %r10402, %r10386, 0xD2; // end inline asm - ld.const.u32 %r2017, [matrix+304]; + st.local.v2.u32 [%rd3+136], {%r29780, %r29781}; // begin inline asm - dp4a.u32.u32 %r2016, %r2017, %r5794, %r2012; + // chi + lop3.b32 %r29806, %r10302, %r10342, %r10374, 0xD2; + lop3.b32 %r29807, %r10306, %r10346, %r10378, 0xD2; // end inline asm - ld.const.u32 %r2021, [matrix+308]; + st.local.v2.u32 [%rd3+144], {%r29806, %r29807}; // begin inline asm - dp4a.u32.u32 %r2020, %r2021, %r5798, %r2016; + // chi + lop3.b32 %r29800, %r10342, %r10374, %r10366, 0xD2; + lop3.b32 %r29801, %r10346, %r10378, %r10370, 0xD2; // end inline asm - ld.const.u32 %r2025, [matrix+312]; + st.local.v2.u32 [%rd3+152], {%r29800, %r29801}; // begin inline asm - dp4a.u32.u32 %r2024, %r2025, %r5802, %r2020; + // chi + lop3.b32 %r29794, %r10374, %r10366, %r10286, 0xD2; + lop3.b32 %r29795, %r10378, %r10370, %r10290, 0xD2; // end inline asm - ld.const.u32 %r2029, [matrix+316]; + st.local.v2.u32 [%rd3+160], {%r29794, %r29795}; // begin inline asm - dp4a.u32.u32 %r2028, %r2029, %r5806, %r2024; + // chi + lop3.b32 %r29786, %r10366, %r10286, %r10302, 0xD2; + lop3.b32 %r29787, %r10370, %r10290, %r10306, 0xD2; // end inline asm - ld.const.u32 %r2033, [matrix+320]; + st.local.v2.u32 [%rd3+168], {%r29786, %r29787}; // begin inline asm - dp4a.u32.u32 %r2032, %r2033, %r5746, %r6244; + // chi + lop3.b32 %r29778, %r10286, %r10302, %r10342, 0xD2; + lop3.b32 %r29779, %r10290, %r10306, %r10346, 0xD2; // end inline asm - ld.const.u32 %r2037, [matrix+324]; + st.local.v2.u32 [%rd3+176], {%r29778, %r29779}; // begin inline asm - dp4a.u32.u32 %r2036, %r2037, %r5750, %r2032; + // chi + lop3.b32 %r29804, %r10254, %r10326, %r10238, 0xD2; + lop3.b32 %r29805, %r10258, %r10330, %r10242, 0xD2; // end inline asm - ld.const.u32 %r2041, [matrix+328]; + st.local.v2.u32 [%rd3+184], {%r29804, %r29805}; // begin inline asm - dp4a.u32.u32 %r2040, %r2041, %r5754, %r2036; + // chi + lop3.b32 %r29798, %r10326, %r10238, %r10294, 0xD2; + lop3.b32 %r29799, %r10330, %r10242, %r10298, 0xD2; // end inline asm - ld.const.u32 %r2045, [matrix+332]; + st.local.v2.u32 [%rd3+192], {%r29798, %r29799}; // begin inline asm - dp4a.u32.u32 %r2044, %r2045, %r5758, %r2040; + // chi + lop3.b32 %r29792, %r10238, %r10294, %r10318, 0xD2; + lop3.b32 %r29793, %r10242, %r10298, %r10322, 0xD2; // end inline asm - ld.const.u32 %r2049, [matrix+336]; + st.local.v2.u32 [%rd3+200], {%r29792, %r29793}; // begin inline asm - dp4a.u32.u32 %r2048, %r2049, %r5762, %r2044; + // chi + lop3.b32 %r29784, %r10294, %r10318, %r10254, 0xD2; + lop3.b32 %r29785, %r10298, %r10322, %r10258, 0xD2; // end inline asm - ld.const.u32 %r2053, [matrix+340]; + st.local.v2.u32 [%rd3+208], {%r29784, %r29785}; // begin inline asm - dp4a.u32.u32 %r2052, %r2053, %r5766, %r2048; + // chi + lop3.b32 %r29776, %r10318, %r10254, %r10326, 0xD2; + lop3.b32 %r29777, %r10322, %r10258, %r10330, 0xD2; // end inline asm - ld.const.u32 %r2057, [matrix+344]; + st.local.v2.u32 [%rd3+216], {%r29776, %r29777}; + mul.wide.s32 %rd595, %r29826, 8; + mov.u64 %rd596, keccak_round_constants; + cvta.const.u64 %rd597, %rd596; + 
add.s64 %rd594, %rd597, %rd595; // begin inline asm - dp4a.u32.u32 %r2056, %r2057, %r5770, %r2052; + ld.global.nc.v2.u32 {%r10606,%r10607}, [%rd594]; // end inline asm - ld.const.u32 %r2061, [matrix+348]; + xor.b32 %r29812, %r10406, %r10606; + xor.b32 %r29813, %r10407, %r10607; + add.s32 %r29826, %r29826, 1; + setp.lt.u32 %p23, %r29826, 23; + @%p23 bra $L__BB2_34; + + add.u64 %rd84, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r29824, %r29825}; + st.local.v2.u32 [%rd3+72], {%r29822, %r29823}; + st.local.v2.u32 [%rd3+40], {%r29820, %r29821}; + st.local.v2.u32 [%rd3+80], {%r29818, %r29819}; + st.local.v2.u32 [%rd3+48], {%r29816, %r29817}; + st.local.v2.u32 [%rd3+56], {%r29814, %r29815}; + st.local.v2.u32 [%rd3+24], {%r29812, %r29813}; // begin inline asm - dp4a.u32.u32 %r2060, %r2061, %r5774, %r2056; + // xor5 + lop3.b32 %r10618, %r29812, %r29810, %r29808, 0x96; + lop3.b32 %r10618, %r10618, %r29806, %r29804, 0x96; + lop3.b32 %r10619, %r29813, %r29811, %r29809, 0x96; + lop3.b32 %r10619, %r10619, %r29807, %r29805, 0x96; // end inline asm - ld.const.u32 %r2065, [matrix+352]; // begin inline asm - dp4a.u32.u32 %r2064, %r2065, %r5778, %r2060; + // xor5 + lop3.b32 %r10630, %r29824, %r29822, %r29802, 0x96; + lop3.b32 %r10630, %r10630, %r29800, %r29798, 0x96; + lop3.b32 %r10631, %r29825, %r29823, %r29803, 0x96; + lop3.b32 %r10631, %r10631, %r29801, %r29799, 0x96; // end inline asm - ld.const.u32 %r2069, [matrix+356]; // begin inline asm - dp4a.u32.u32 %r2068, %r2069, %r5782, %r2064; + // xor5 + lop3.b32 %r10642, %r29820, %r29818, %r29796, 0x96; + lop3.b32 %r10642, %r10642, %r29794, %r29792, 0x96; + lop3.b32 %r10643, %r29821, %r29819, %r29797, 0x96; + lop3.b32 %r10643, %r10643, %r29795, %r29793, 0x96; // end inline asm - ld.const.u32 %r2073, [matrix+360]; // begin inline asm - dp4a.u32.u32 %r2072, %r2073, %r5786, %r2068; + // xor5 + lop3.b32 %r10654, %r29816, %r29790, %r29788, 0x96; + lop3.b32 %r10654, %r10654, %r29786, %r29784, 0x96; + lop3.b32 %r10655, %r29817, %r29791, %r29789, 0x96; + lop3.b32 %r10655, %r10655, %r29787, %r29785, 0x96; // end inline asm - ld.const.u32 %r2077, [matrix+364]; // begin inline asm - dp4a.u32.u32 %r2076, %r2077, %r5790, %r2072; + // xor5 + lop3.b32 %r10666, %r29814, %r29782, %r29780, 0x96; + lop3.b32 %r10666, %r10666, %r29778, %r29776, 0x96; + lop3.b32 %r10667, %r29815, %r29783, %r29781, 0x96; + lop3.b32 %r10667, %r10667, %r29779, %r29777, 0x96; // end inline asm - ld.const.u32 %r2081, [matrix+368]; + mov.u32 %r10870, 1; // begin inline asm - dp4a.u32.u32 %r2080, %r2081, %r5794, %r2076; + shf.l.wrap.b32 %r10678, %r10631, %r10630, %r10870; // end inline asm - ld.const.u32 %r2085, [matrix+372]; // begin inline asm - dp4a.u32.u32 %r2084, %r2085, %r5798, %r2080; + shf.l.wrap.b32 %r10682, %r10630, %r10631, %r10870; // end inline asm - ld.const.u32 %r2089, [matrix+376]; + xor.b32 %r10897, %r10678, %r10666; + xor.b32 %r10898, %r10682, %r10667; + xor.b32 %r10825, %r29812, %r10897; + xor.b32 %r10828, %r29813, %r10898; + xor.b32 %r10788, %r29809, %r10898; + xor.b32 %r10787, %r29808, %r10897; + st.local.v2.u32 [%rd3+104], {%r10787, %r10788}; // begin inline asm - dp4a.u32.u32 %r2088, %r2089, %r5802, %r2084; + shf.l.wrap.b32 %r10686, %r10643, %r10642, %r10870; // end inline asm - ld.const.u32 %r2093, [matrix+380]; // begin inline asm - dp4a.u32.u32 %r2092, %r2093, %r5806, %r2088; + shf.l.wrap.b32 %r10690, %r10642, %r10643, %r10870; // end inline asm - shr.u32 %r5976, %r2028, 6; - and.b32 %r5977, %r5976, 240; - shr.u32 %r5978, %r2092, 10; - or.b32 %r5979, %r5978, %r5977; - xor.b32 
%r5980, %r5822, %r5979; - ld.const.u32 %r2097, [matrix+384]; + xor.b32 %r10899, %r10686, %r10618; + xor.b32 %r10900, %r10690, %r10619; + xor.b32 %r10724, %r29822, %r10899; + xor.b32 %r10723, %r29823, %r10900; + xor.b32 %r10763, %r29801, %r10900; + xor.b32 %r10764, %r29800, %r10899; + st.local.v2.u32 [%rd3+152], {%r10764, %r10763}; // begin inline asm - dp4a.u32.u32 %r2096, %r2097, %r5746, %r6244; + shf.l.wrap.b32 %r10694, %r10655, %r10654, %r10870; // end inline asm - ld.const.u32 %r2101, [matrix+388]; // begin inline asm - dp4a.u32.u32 %r2100, %r2101, %r5750, %r2096; + shf.l.wrap.b32 %r10698, %r10654, %r10655, %r10870; // end inline asm - ld.const.u32 %r2105, [matrix+392]; + xor.b32 %r10901, %r10694, %r10630; + xor.b32 %r10902, %r10698, %r10631; + xor.b32 %r10747, %r29797, %r10902; + xor.b32 %r10748, %r29796, %r10901; + st.local.v2.u32 [%rd3+120], {%r10748, %r10747}; + xor.b32 %r10739, %r29793, %r10902; + xor.b32 %r10740, %r29792, %r10901; + st.local.v2.u32 [%rd3+200], {%r10740, %r10739}; // begin inline asm - dp4a.u32.u32 %r2104, %r2105, %r5754, %r2100; + shf.l.wrap.b32 %r10702, %r10667, %r10666, %r10870; // end inline asm - ld.const.u32 %r2109, [matrix+396]; // begin inline asm - dp4a.u32.u32 %r2108, %r2109, %r5758, %r2104; + shf.l.wrap.b32 %r10706, %r10666, %r10667, %r10870; // end inline asm - ld.const.u32 %r2113, [matrix+400]; + xor.b32 %r10903, %r10702, %r10642; + xor.b32 %r10904, %r10706, %r10643; + xor.b32 %r10771, %r29816, %r10903; + xor.b32 %r10772, %r29817, %r10904; + xor.b32 %r10780, %r29787, %r10904; + xor.b32 %r10779, %r29786, %r10903; + st.local.v2.u32 [%rd3+168], {%r10779, %r10780}; // begin inline asm - dp4a.u32.u32 %r2112, %r2113, %r5762, %r2108; + shf.l.wrap.b32 %r10710, %r10619, %r10618, %r10870; // end inline asm - ld.const.u32 %r2117, [matrix+404]; // begin inline asm - dp4a.u32.u32 %r2116, %r2117, %r5766, %r2112; + shf.l.wrap.b32 %r10714, %r10618, %r10619, %r10870; // end inline asm - ld.const.u32 %r2121, [matrix+408]; + xor.b32 %r10905, %r10710, %r10654; + xor.b32 %r10906, %r10714, %r10655; + xor.b32 %r10731, %r29782, %r10905; + xor.b32 %r10732, %r29783, %r10906; + xor.b32 %r10756, %r29777, %r10906; + xor.b32 %r10755, %r29776, %r10905; + st.local.v2.u32 [%rd3+216], {%r10755, %r10756}; // begin inline asm - dp4a.u32.u32 %r2120, %r2121, %r5770, %r2116; + shf.l.wrap.b32 %r10718, %r10724, %r10723, %r10221; // end inline asm - ld.const.u32 %r2125, [matrix+412]; // begin inline asm - dp4a.u32.u32 %r2124, %r2125, %r5774, %r2120; + shf.l.wrap.b32 %r10722, %r10723, %r10724, %r10221; // end inline asm - ld.const.u32 %r2129, [matrix+416]; // begin inline asm - dp4a.u32.u32 %r2128, %r2129, %r5778, %r2124; + shf.l.wrap.b32 %r10726, %r10732, %r10731, %r10229; // end inline asm - ld.const.u32 %r2133, [matrix+420]; // begin inline asm - dp4a.u32.u32 %r2132, %r2133, %r5782, %r2128; + shf.l.wrap.b32 %r10730, %r10731, %r10732, %r10229; // end inline asm - ld.const.u32 %r2137, [matrix+424]; // begin inline asm - dp4a.u32.u32 %r2136, %r2137, %r5786, %r2132; + shf.l.wrap.b32 %r10738, %r10739, %r10740, %r10237; // end inline asm - ld.const.u32 %r2141, [matrix+428]; // begin inline asm - dp4a.u32.u32 %r2140, %r2141, %r5790, %r2136; + shf.l.wrap.b32 %r10734, %r10740, %r10739, %r10237; // end inline asm - ld.const.u32 %r2145, [matrix+432]; + st.local.v2.u32 [%rd3+96], {%r10734, %r10738}; // begin inline asm - dp4a.u32.u32 %r2144, %r2145, %r5794, %r2140; + shf.l.wrap.b32 %r10742, %r10748, %r10747, %r10269; // end inline asm - ld.const.u32 %r2149, [matrix+436]; // begin inline asm - 
dp4a.u32.u32 %r2148, %r2149, %r5798, %r2144; + shf.l.wrap.b32 %r10746, %r10747, %r10748, %r10269; // end inline asm - ld.const.u32 %r2153, [matrix+440]; // begin inline asm - dp4a.u32.u32 %r2152, %r2153, %r5802, %r2148; + shf.l.wrap.b32 %r10750, %r10756, %r10755, %r10317; // end inline asm - ld.const.u32 %r2157, [matrix+444]; // begin inline asm - dp4a.u32.u32 %r2156, %r2157, %r5806, %r2152; + shf.l.wrap.b32 %r10754, %r10755, %r10756, %r10317; // end inline asm - ld.const.u32 %r2161, [matrix+448]; // begin inline asm - dp4a.u32.u32 %r2160, %r2161, %r5746, %r6244; + shf.l.wrap.b32 %r10762, %r10763, %r10764, %r10341; // end inline asm - ld.const.u32 %r2165, [matrix+452]; // begin inline asm - dp4a.u32.u32 %r2164, %r2165, %r5750, %r2160; + shf.l.wrap.b32 %r10758, %r10764, %r10763, %r10341; // end inline asm - ld.const.u32 %r2169, [matrix+456]; + st.local.v2.u32 [%rd3+88], {%r10758, %r10762}; // begin inline asm - dp4a.u32.u32 %r2168, %r2169, %r5754, %r2164; + shf.l.wrap.b32 %r10766, %r10772, %r10771, %r10357; // end inline asm - ld.const.u32 %r2173, [matrix+460]; // begin inline asm - dp4a.u32.u32 %r2172, %r2173, %r5758, %r2168; + shf.l.wrap.b32 %r10770, %r10771, %r10772, %r10357; // end inline asm - ld.const.u32 %r2177, [matrix+464]; // begin inline asm - dp4a.u32.u32 %r2176, %r2177, %r5762, %r2172; + shf.l.wrap.b32 %r10774, %r10780, %r10779, %r10365; // end inline asm - ld.const.u32 %r2181, [matrix+468]; // begin inline asm - dp4a.u32.u32 %r2180, %r2181, %r5766, %r2176; + shf.l.wrap.b32 %r10778, %r10779, %r10780, %r10365; // end inline asm - ld.const.u32 %r2185, [matrix+472]; // begin inline asm - dp4a.u32.u32 %r2184, %r2185, %r5770, %r2180; + shf.l.wrap.b32 %r10782, %r10788, %r10787, %r10397; // end inline asm - ld.const.u32 %r2189, [matrix+476]; // begin inline asm - dp4a.u32.u32 %r2188, %r2189, %r5774, %r2184; + shf.l.wrap.b32 %r10786, %r10787, %r10788, %r10397; // end inline asm - ld.const.u32 %r2193, [matrix+480]; // begin inline asm - dp4a.u32.u32 %r2192, %r2193, %r5778, %r2188; + // chi + lop3.b32 %r10790, %r10825, %r10718, %r10742, 0xD2; + lop3.b32 %r10791, %r10828, %r10722, %r10746, 0xD2; // end inline asm - ld.const.u32 %r2197, [matrix+484]; // begin inline asm - dp4a.u32.u32 %r2196, %r2197, %r5782, %r2192; + // chi + lop3.b32 %r29959, %r10718, %r10742, %r10774, 0xD2; + lop3.b32 %r29960, %r10722, %r10746, %r10778, 0xD2; // end inline asm - ld.const.u32 %r2201, [matrix+488]; + st.local.v2.u32 [%rd3+32], {%r29959, %r29960}; // begin inline asm - dp4a.u32.u32 %r2200, %r2201, %r5786, %r2196; + // chi + lop3.b32 %r29955, %r10742, %r10774, %r10750, 0xD2; + lop3.b32 %r29956, %r10746, %r10778, %r10754, 0xD2; // end inline asm - ld.const.u32 %r2205, [matrix+492]; + st.local.v2.u32 [%rd3+40], {%r29955, %r29956}; // begin inline asm - dp4a.u32.u32 %r2204, %r2205, %r5790, %r2200; + // chi + lop3.b32 %r29951, %r10774, %r10750, %r10825, 0xD2; + lop3.b32 %r29952, %r10778, %r10754, %r10828, 0xD2; // end inline asm - ld.const.u32 %r2209, [matrix+496]; + st.local.v2.u32 [%rd3+48], {%r29951, %r29952}; // begin inline asm - dp4a.u32.u32 %r2208, %r2209, %r5794, %r2204; + // chi + lop3.b32 %r29949, %r10750, %r10825, %r10718, 0xD2; + lop3.b32 %r29950, %r10754, %r10828, %r10722, 0xD2; // end inline asm - ld.const.u32 %r2213, [matrix+500]; + st.local.v2.u32 [%rd3+56], {%r29949, %r29950}; // begin inline asm - dp4a.u32.u32 %r2212, %r2213, %r5798, %r2208; + // chi + lop3.b32 %r29945, %r10766, %r10726, %r10782, 0xD2; + lop3.b32 %r29946, %r10770, %r10730, %r10786, 0xD2; // end inline asm - ld.const.u32 
%r2217, [matrix+504]; + st.local.v2.u32 [%rd3+64], {%r29945, %r29946}; // begin inline asm - dp4a.u32.u32 %r2216, %r2217, %r5802, %r2212; + // chi + lop3.b32 %r29957, %r10726, %r10782, %r10758, 0xD2; + lop3.b32 %r29958, %r10730, %r10786, %r10762, 0xD2; // end inline asm - ld.const.u32 %r2221, [matrix+508]; + st.local.v2.u32 [%rd3+72], {%r29957, %r29958}; // begin inline asm - dp4a.u32.u32 %r2220, %r2221, %r5806, %r2216; + // chi + lop3.b32 %r29953, %r10782, %r10758, %r10734, 0xD2; + lop3.b32 %r29954, %r10786, %r10762, %r10738, 0xD2; // end inline asm - shr.u32 %r5981, %r2156, 6; - and.b32 %r5982, %r5981, 240; - shr.u32 %r5983, %r2220, 10; - or.b32 %r5984, %r5983, %r5982; - xor.b32 %r5985, %r5824, %r5984; - ld.const.u32 %r2225, [matrix+512]; + st.local.v2.u32 [%rd3+80], {%r29953, %r29954}; + add.s64 %rd598, %rd597, 184; // begin inline asm - dp4a.u32.u32 %r2224, %r2225, %r5746, %r6244; + ld.global.nc.v2.u32 {%r10854,%r10855}, [%rd598]; + // end inline asm + xor.b32 %r29947, %r10790, %r10854; + xor.b32 %r29948, %r10791, %r10855; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; + st.local.u64 [%rd84], %rd354; + mov.u64 %rd602, 1179641; + st.local.u64 [%rd84+8], %rd602; + st.local.u32 [%rd84+16], %r31; + ld.global.u64 %rd603, [%rd35]; + ld.global.u64 %rd604, [%rd35+8]; + ld.global.u64 %rd605, [%rd35+16]; + ld.global.u64 %rd606, [%rd35+24]; + ld.global.u64 %rd607, [%rd35+32]; + ld.global.u64 %rd608, [%rd35+40]; + ld.global.u64 %rd609, [%rd35+48]; + ld.global.u64 %rd610, [%rd35+56]; + st.local.u64 [%rd84+32], %rd604; + st.local.u64 [%rd84+40], %rd605; + st.local.u64 [%rd84+48], %rd606; + st.local.u64 [%rd84+56], %rd607; + st.local.u64 [%rd84+64], %rd608; + st.local.u64 [%rd84+72], %rd609; + st.local.u64 [%rd84+80], %rd610; + cvt.u32.u64 %r10907, %rd603; + xor.b32 %r10908, %r31, %r10907; + st.local.u64 [%rd84+24], %rd603; + st.local.u32 [%rd84+24], %r10908; + mov.u32 %r29827, 0; + st.local.v2.u32 [%rd84+96], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+104], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+112], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+120], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+128], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+136], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+144], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+152], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+160], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+168], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+176], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+184], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+192], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+200], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+208], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+216], {%r29827, %r29827}; + mov.u32 %r29842, -2147483648; + st.local.v2.u32 [%rd84+88], {%r10870, %r29842}; + ld.local.v2.u32 {%r29863, %r29864}, [%rd84+24]; + mov.b64 {%r29861, %r29862}, %rd608; + shr.u64 %rd611, %rd604, 32; + cvt.u32.u64 %r29875, %rd604; + cvt.u32.u64 %r29876, %rd611; + shr.u64 %rd612, %rd609, 32; + cvt.u32.u64 %r29873, %rd609; + cvt.u32.u64 %r29874, %rd612; + shr.u64 %rd613, %rd605, 32; + cvt.u32.u64 %r29871, %rd605; + cvt.u32.u64 %r29872, %rd613; + shr.u64 %rd614, %rd610, 32; + cvt.u32.u64 %r29869, %rd610; + cvt.u32.u64 %r29870, %rd614; + shr.u64 %rd615, %rd606, 32; + cvt.u32.u64 %r29867, %rd606; + cvt.u32.u64 %r29868, %rd615; + shr.u64 %rd616, %rd607, 32; + cvt.u32.u64 %r29865, %rd607; + cvt.u32.u64 %r29866, %rd616; + mov.u32 %r29828, %r29827; + mov.u32 %r29829, %r29827; + mov.u32 %r29830, %r29827; + mov.u32 %r29831, %r29827; 
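+ // (reviewer annotation, not compiler output) %r29827 holds zero here; this mov chain zero-fills the 32-bit register pairs backing the second Keccak-f[1600] state (the one seeded with %r31 at [%rd84+16]), mirroring the %r29776-based clearing of the first state earlier in this kernel.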
+ mov.u32 %r29832, %r29827; + mov.u32 %r29833, %r29827; + mov.u32 %r29834, %r29827; + mov.u32 %r29835, %r29827; + mov.u32 %r29836, %r29827; + mov.u32 %r29837, %r29827; + mov.u32 %r29838, %r29827; + mov.u32 %r29839, %r29827; + mov.u32 %r29840, %r29827; + mov.u32 %r29841, %r10870; + mov.u32 %r29843, %r29827; + mov.u32 %r29844, %r29827; + mov.u32 %r29845, %r29827; + mov.u32 %r29846, %r29827; + mov.u32 %r29847, %r29827; + mov.u32 %r29848, %r29827; + mov.u32 %r29849, %r29827; + mov.u32 %r29850, %r29827; + mov.u32 %r29851, %r29827; + mov.u32 %r29852, %r29827; + mov.u32 %r29853, %r29827; + mov.u32 %r29854, %r29827; + mov.u32 %r29855, %r29827; + mov.u32 %r29856, %r29827; + mov.u32 %r29857, %r29827; + mov.u32 %r29858, %r29827; + mov.u32 %r29859, %r29827; + mov.u32 %r29860, %r29827; + mov.u32 %r29877, %r29827; + +$L__BB2_36: + // begin inline asm + // xor5 + lop3.b32 %r10911, %r29863, %r29861, %r29859, 0x96; + lop3.b32 %r10911, %r10911, %r29857, %r29855, 0x96; + lop3.b32 %r10912, %r29864, %r29862, %r29860, 0x96; + lop3.b32 %r10912, %r10912, %r29858, %r29856, 0x96; // end inline asm - ld.const.u32 %r2229, [matrix+516]; // begin inline asm - dp4a.u32.u32 %r2228, %r2229, %r5750, %r2224; + // xor5 + lop3.b32 %r10923, %r29875, %r29873, %r29853, 0x96; + lop3.b32 %r10923, %r10923, %r29851, %r29849, 0x96; + lop3.b32 %r10924, %r29876, %r29874, %r29854, 0x96; + lop3.b32 %r10924, %r10924, %r29852, %r29850, 0x96; // end inline asm - ld.const.u32 %r2233, [matrix+520]; // begin inline asm - dp4a.u32.u32 %r2232, %r2233, %r5754, %r2228; + // xor5 + lop3.b32 %r10935, %r29871, %r29869, %r29847, 0x96; + lop3.b32 %r10935, %r10935, %r29845, %r29843, 0x96; + lop3.b32 %r10936, %r29872, %r29870, %r29848, 0x96; + lop3.b32 %r10936, %r10936, %r29846, %r29844, 0x96; // end inline asm - ld.const.u32 %r2237, [matrix+524]; // begin inline asm - dp4a.u32.u32 %r2236, %r2237, %r5758, %r2232; + // xor5 + lop3.b32 %r10947, %r29867, %r29841, %r29839, 0x96; + lop3.b32 %r10947, %r10947, %r29837, %r29835, 0x96; + lop3.b32 %r10948, %r29868, %r29842, %r29840, 0x96; + lop3.b32 %r10948, %r10948, %r29838, %r29836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10959, %r29865, %r29833, %r29831, 0x96; + lop3.b32 %r10959, %r10959, %r29829, %r29827, 0x96; + lop3.b32 %r10960, %r29866, %r29834, %r29832, 0x96; + lop3.b32 %r10960, %r10960, %r29830, %r29828, 0x96; // end inline asm - ld.const.u32 %r2241, [matrix+528]; // begin inline asm - dp4a.u32.u32 %r2240, %r2241, %r5762, %r2236; + shf.l.wrap.b32 %r10971, %r10924, %r10923, %r10870; // end inline asm - ld.const.u32 %r2245, [matrix+532]; // begin inline asm - dp4a.u32.u32 %r2244, %r2245, %r5766, %r2240; + shf.l.wrap.b32 %r10975, %r10923, %r10924, %r10870; // end inline asm - ld.const.u32 %r2249, [matrix+536]; + xor.b32 %r11405, %r10971, %r10959; + xor.b32 %r11406, %r10975, %r10960; + xor.b32 %r11238, %r29863, %r11405; + xor.b32 %r11241, %r29864, %r11406; + xor.b32 %r11145, %r29861, %r11405; + xor.b32 %r11144, %r29862, %r11406; + xor.b32 %r11192, %r29859, %r11405; + xor.b32 %r11193, %r29860, %r11406; + xor.b32 %r11097, %r29857, %r11405; + xor.b32 %r11096, %r29858, %r11406; + xor.b32 %r11048, %r29855, %r11405; + xor.b32 %r11049, %r29856, %r11406; // begin inline asm - dp4a.u32.u32 %r2248, %r2249, %r5770, %r2244; + shf.l.wrap.b32 %r10979, %r10936, %r10935, %r10870; // end inline asm - ld.const.u32 %r2253, [matrix+540]; // begin inline asm - dp4a.u32.u32 %r2252, %r2253, %r5774, %r2248; + shf.l.wrap.b32 %r10983, %r10935, %r10936, %r10870; // end inline asm - ld.const.u32 %r2257, 
[matrix+544]; + xor.b32 %r11407, %r10979, %r10911; + xor.b32 %r11408, %r10983, %r10912; + xor.b32 %r11200, %r29875, %r11407; + xor.b32 %r11201, %r29876, %r11408; + xor.b32 %r11017, %r29873, %r11407; + xor.b32 %r11016, %r29874, %r11408; + xor.b32 %r11176, %r29853, %r11407; + xor.b32 %r11177, %r29854, %r11408; + xor.b32 %r11137, %r29851, %r11407; + xor.b32 %r11136, %r29852, %r11408; + xor.b32 %r11120, %r29849, %r11407; + xor.b32 %r11121, %r29850, %r11408; // begin inline asm - dp4a.u32.u32 %r2256, %r2257, %r5778, %r2252; + shf.l.wrap.b32 %r10987, %r10948, %r10947, %r10870; // end inline asm - ld.const.u32 %r2261, [matrix+548]; // begin inline asm - dp4a.u32.u32 %r2260, %r2261, %r5782, %r2256; + shf.l.wrap.b32 %r10991, %r10947, %r10948, %r10870; // end inline asm - ld.const.u32 %r2265, [matrix+552]; + xor.b32 %r11409, %r10987, %r10923; + xor.b32 %r11410, %r10991, %r10924; + xor.b32 %r11057, %r29871, %r11409; + xor.b32 %r11056, %r29872, %r11410; + xor.b32 %r11184, %r29869, %r11409; + xor.b32 %r11185, %r29870, %r11410; + xor.b32 %r11065, %r29847, %r11409; + xor.b32 %r11064, %r29848, %r11410; + xor.b32 %r11168, %r29845, %r11409; + xor.b32 %r11169, %r29846, %r11410; + xor.b32 %r11033, %r29843, %r11409; + xor.b32 %r11032, %r29844, %r11410; // begin inline asm - dp4a.u32.u32 %r2264, %r2265, %r5786, %r2260; + shf.l.wrap.b32 %r10995, %r10960, %r10959, %r10870; // end inline asm - ld.const.u32 %r2269, [matrix+556]; // begin inline asm - dp4a.u32.u32 %r2268, %r2269, %r5790, %r2264; + shf.l.wrap.b32 %r10999, %r10959, %r10960, %r10870; // end inline asm - ld.const.u32 %r2273, [matrix+560]; + xor.b32 %r11411, %r10995, %r10935; + xor.b32 %r11412, %r10999, %r10936; + xor.b32 %r11152, %r29867, %r11411; + xor.b32 %r11153, %r29868, %r11412; + xor.b32 %r11129, %r29841, %r11411; + xor.b32 %r11128, %r29842, %r11412; + xor.b32 %r11072, %r29839, %r11411; + xor.b32 %r11073, %r29840, %r11412; + xor.b32 %r11160, %r29837, %r11411; + xor.b32 %r11161, %r29838, %r11412; + xor.b32 %r11089, %r29835, %r11411; + xor.b32 %r11088, %r29836, %r11412; // begin inline asm - dp4a.u32.u32 %r2272, %r2273, %r5794, %r2268; + shf.l.wrap.b32 %r11003, %r10912, %r10911, %r10870; // end inline asm - ld.const.u32 %r2277, [matrix+564]; // begin inline asm - dp4a.u32.u32 %r2276, %r2277, %r5798, %r2272; + shf.l.wrap.b32 %r11007, %r10911, %r10912, %r10870; // end inline asm - ld.const.u32 %r2281, [matrix+568]; + xor.b32 %r11413, %r11003, %r10947; + xor.b32 %r11414, %r11007, %r10948; + xor.b32 %r11104, %r29865, %r11413; + xor.b32 %r11105, %r29866, %r11414; + xor.b32 %r11024, %r29833, %r11413; + xor.b32 %r11025, %r29834, %r11414; + xor.b32 %r11041, %r29831, %r11413; + xor.b32 %r11040, %r29832, %r11414; + xor.b32 %r11080, %r29829, %r11413; + xor.b32 %r11081, %r29830, %r11414; + xor.b32 %r11112, %r29827, %r11413; + xor.b32 %r11113, %r29828, %r11414; + mov.u32 %r11018, 44; // begin inline asm - dp4a.u32.u32 %r2280, %r2281, %r5802, %r2276; + shf.l.wrap.b32 %r11011, %r11017, %r11016, %r11018; // end inline asm - ld.const.u32 %r2285, [matrix+572]; // begin inline asm - dp4a.u32.u32 %r2284, %r2285, %r5806, %r2280; + shf.l.wrap.b32 %r11015, %r11016, %r11017, %r11018; // end inline asm - ld.const.u32 %r2289, [matrix+576]; + mov.u32 %r11026, 20; // begin inline asm - dp4a.u32.u32 %r2288, %r2289, %r5746, %r6244; + shf.l.wrap.b32 %r11019, %r11025, %r11024, %r11026; // end inline asm - ld.const.u32 %r2293, [matrix+580]; // begin inline asm - dp4a.u32.u32 %r2292, %r2293, %r5750, %r2288; + shf.l.wrap.b32 %r11023, %r11024, %r11025, %r11026; // end inline asm - 
ld.const.u32 %r2297, [matrix+584]; + mov.u32 %r11034, 61; // begin inline asm - dp4a.u32.u32 %r2296, %r2297, %r5754, %r2292; + shf.l.wrap.b32 %r11027, %r11033, %r11032, %r11034; // end inline asm - ld.const.u32 %r2301, [matrix+588]; // begin inline asm - dp4a.u32.u32 %r2300, %r2301, %r5758, %r2296; + shf.l.wrap.b32 %r11031, %r11032, %r11033, %r11034; // end inline asm - ld.const.u32 %r2305, [matrix+592]; + mov.u32 %r11042, 39; // begin inline asm - dp4a.u32.u32 %r2304, %r2305, %r5762, %r2300; + shf.l.wrap.b32 %r11035, %r11041, %r11040, %r11042; // end inline asm - ld.const.u32 %r2309, [matrix+596]; // begin inline asm - dp4a.u32.u32 %r2308, %r2309, %r5766, %r2304; + shf.l.wrap.b32 %r11039, %r11040, %r11041, %r11042; // end inline asm - ld.const.u32 %r2313, [matrix+600]; + mov.u32 %r11050, 18; // begin inline asm - dp4a.u32.u32 %r2312, %r2313, %r5770, %r2308; + shf.l.wrap.b32 %r11043, %r11049, %r11048, %r11050; // end inline asm - ld.const.u32 %r2317, [matrix+604]; // begin inline asm - dp4a.u32.u32 %r2316, %r2317, %r5774, %r2312; + shf.l.wrap.b32 %r11047, %r11048, %r11049, %r11050; // end inline asm - ld.const.u32 %r2321, [matrix+608]; + mov.u32 %r11058, 62; // begin inline asm - dp4a.u32.u32 %r2320, %r2321, %r5778, %r2316; + shf.l.wrap.b32 %r11051, %r11057, %r11056, %r11058; // end inline asm - ld.const.u32 %r2325, [matrix+612]; // begin inline asm - dp4a.u32.u32 %r2324, %r2325, %r5782, %r2320; + shf.l.wrap.b32 %r11055, %r11056, %r11057, %r11058; // end inline asm - ld.const.u32 %r2329, [matrix+616]; + mov.u32 %r11066, 43; // begin inline asm - dp4a.u32.u32 %r2328, %r2329, %r5786, %r2324; + shf.l.wrap.b32 %r11059, %r11065, %r11064, %r11066; // end inline asm - ld.const.u32 %r2333, [matrix+620]; // begin inline asm - dp4a.u32.u32 %r2332, %r2333, %r5790, %r2328; + shf.l.wrap.b32 %r11063, %r11064, %r11065, %r11066; // end inline asm - ld.const.u32 %r2337, [matrix+624]; + mov.u32 %r11074, 25; // begin inline asm - dp4a.u32.u32 %r2336, %r2337, %r5794, %r2332; + shf.l.wrap.b32 %r11067, %r11073, %r11072, %r11074; // end inline asm - ld.const.u32 %r2341, [matrix+628]; // begin inline asm - dp4a.u32.u32 %r2340, %r2341, %r5798, %r2336; + shf.l.wrap.b32 %r11071, %r11072, %r11073, %r11074; // end inline asm - ld.const.u32 %r2345, [matrix+632]; + mov.u32 %r11082, 8; // begin inline asm - dp4a.u32.u32 %r2344, %r2345, %r5802, %r2340; + shf.l.wrap.b32 %r11075, %r11081, %r11080, %r11082; // end inline asm - ld.const.u32 %r2349, [matrix+636]; // begin inline asm - dp4a.u32.u32 %r2348, %r2349, %r5806, %r2344; + shf.l.wrap.b32 %r11079, %r11080, %r11081, %r11082; // end inline asm - shr.u32 %r5986, %r2284, 6; - and.b32 %r5987, %r5986, 240; - shr.u32 %r5988, %r2348, 10; - or.b32 %r5989, %r5988, %r5987; - cvt.u64.u32 %rd211, %r5989; - xor.b64 %rd212, %rd10, %rd211; - ld.const.u32 %r2353, [matrix+640]; + mov.u32 %r11090, 56; // begin inline asm - dp4a.u32.u32 %r2352, %r2353, %r5746, %r6244; + shf.l.wrap.b32 %r11083, %r11089, %r11088, %r11090; // end inline asm - ld.const.u32 %r2357, [matrix+644]; // begin inline asm - dp4a.u32.u32 %r2356, %r2357, %r5750, %r2352; + shf.l.wrap.b32 %r11087, %r11088, %r11089, %r11090; // end inline asm - ld.const.u32 %r2361, [matrix+648]; + mov.u32 %r11098, 41; // begin inline asm - dp4a.u32.u32 %r2360, %r2361, %r5754, %r2356; + shf.l.wrap.b32 %r11091, %r11097, %r11096, %r11098; // end inline asm - ld.const.u32 %r2365, [matrix+652]; // begin inline asm - dp4a.u32.u32 %r2364, %r2365, %r5758, %r2360; + shf.l.wrap.b32 %r11095, %r11096, %r11097, %r11098; // end inline asm - ld.const.u32 
%r2369, [matrix+656]; + mov.u32 %r11106, 27; // begin inline asm - dp4a.u32.u32 %r2368, %r2369, %r5762, %r2364; + shf.l.wrap.b32 %r11099, %r11105, %r11104, %r11106; // end inline asm - ld.const.u32 %r2373, [matrix+660]; // begin inline asm - dp4a.u32.u32 %r2372, %r2373, %r5766, %r2368; + shf.l.wrap.b32 %r11103, %r11104, %r11105, %r11106; // end inline asm - ld.const.u32 %r2377, [matrix+664]; + mov.u32 %r11114, 14; // begin inline asm - dp4a.u32.u32 %r2376, %r2377, %r5770, %r2372; + shf.l.wrap.b32 %r11107, %r11113, %r11112, %r11114; // end inline asm - ld.const.u32 %r2381, [matrix+668]; // begin inline asm - dp4a.u32.u32 %r2380, %r2381, %r5774, %r2376; + shf.l.wrap.b32 %r11111, %r11112, %r11113, %r11114; // end inline asm - ld.const.u32 %r2385, [matrix+672]; + mov.u32 %r11122, 2; // begin inline asm - dp4a.u32.u32 %r2384, %r2385, %r5778, %r2380; + shf.l.wrap.b32 %r11115, %r11121, %r11120, %r11122; // end inline asm - ld.const.u32 %r2389, [matrix+676]; // begin inline asm - dp4a.u32.u32 %r2388, %r2389, %r5782, %r2384; + shf.l.wrap.b32 %r11119, %r11120, %r11121, %r11122; // end inline asm - ld.const.u32 %r2393, [matrix+680]; + mov.u32 %r11130, 55; // begin inline asm - dp4a.u32.u32 %r2392, %r2393, %r5786, %r2388; + shf.l.wrap.b32 %r11123, %r11129, %r11128, %r11130; // end inline asm - ld.const.u32 %r2397, [matrix+684]; // begin inline asm - dp4a.u32.u32 %r2396, %r2397, %r5790, %r2392; + shf.l.wrap.b32 %r11127, %r11128, %r11129, %r11130; // end inline asm - ld.const.u32 %r2401, [matrix+688]; + mov.u32 %r11138, 45; // begin inline asm - dp4a.u32.u32 %r2400, %r2401, %r5794, %r2396; + shf.l.wrap.b32 %r11131, %r11137, %r11136, %r11138; // end inline asm - ld.const.u32 %r2405, [matrix+692]; // begin inline asm - dp4a.u32.u32 %r2404, %r2405, %r5798, %r2400; + shf.l.wrap.b32 %r11135, %r11136, %r11137, %r11138; // end inline asm - ld.const.u32 %r2409, [matrix+696]; + mov.u32 %r11146, 36; // begin inline asm - dp4a.u32.u32 %r2408, %r2409, %r5802, %r2404; + shf.l.wrap.b32 %r11139, %r11145, %r11144, %r11146; // end inline asm - ld.const.u32 %r2413, [matrix+700]; // begin inline asm - dp4a.u32.u32 %r2412, %r2413, %r5806, %r2408; + shf.l.wrap.b32 %r11143, %r11144, %r11145, %r11146; // end inline asm - ld.const.u32 %r2417, [matrix+704]; + mov.u32 %r11154, 28; // begin inline asm - dp4a.u32.u32 %r2416, %r2417, %r5746, %r6244; + shf.l.wrap.b32 %r11147, %r11153, %r11152, %r11154; // end inline asm - ld.const.u32 %r2421, [matrix+708]; // begin inline asm - dp4a.u32.u32 %r2420, %r2421, %r5750, %r2416; + shf.l.wrap.b32 %r11151, %r11152, %r11153, %r11154; // end inline asm - ld.const.u32 %r2425, [matrix+712]; + mov.u32 %r11162, 21; // begin inline asm - dp4a.u32.u32 %r2424, %r2425, %r5754, %r2420; + shf.l.wrap.b32 %r11155, %r11161, %r11160, %r11162; // end inline asm - ld.const.u32 %r2429, [matrix+716]; // begin inline asm - dp4a.u32.u32 %r2428, %r2429, %r5758, %r2424; + shf.l.wrap.b32 %r11159, %r11160, %r11161, %r11162; // end inline asm - ld.const.u32 %r2433, [matrix+720]; + mov.u32 %r11170, 15; // begin inline asm - dp4a.u32.u32 %r2432, %r2433, %r5762, %r2428; + shf.l.wrap.b32 %r11163, %r11169, %r11168, %r11170; // end inline asm - ld.const.u32 %r2437, [matrix+724]; // begin inline asm - dp4a.u32.u32 %r2436, %r2437, %r5766, %r2432; + shf.l.wrap.b32 %r11167, %r11168, %r11169, %r11170; // end inline asm - ld.const.u32 %r2441, [matrix+728]; + mov.u32 %r11178, 10; // begin inline asm - dp4a.u32.u32 %r2440, %r2441, %r5770, %r2436; + shf.l.wrap.b32 %r11171, %r11177, %r11176, %r11178; // end inline asm - ld.const.u32 
%r2445, [matrix+732]; // begin inline asm - dp4a.u32.u32 %r2444, %r2445, %r5774, %r2440; + shf.l.wrap.b32 %r11175, %r11176, %r11177, %r11178; // end inline asm - ld.const.u32 %r2449, [matrix+736]; + mov.u32 %r11186, 6; // begin inline asm - dp4a.u32.u32 %r2448, %r2449, %r5778, %r2444; + shf.l.wrap.b32 %r11179, %r11185, %r11184, %r11186; // end inline asm - ld.const.u32 %r2453, [matrix+740]; // begin inline asm - dp4a.u32.u32 %r2452, %r2453, %r5782, %r2448; + shf.l.wrap.b32 %r11183, %r11184, %r11185, %r11186; // end inline asm - ld.const.u32 %r2457, [matrix+744]; + mov.u32 %r11194, 3; // begin inline asm - dp4a.u32.u32 %r2456, %r2457, %r5786, %r2452; + shf.l.wrap.b32 %r11187, %r11193, %r11192, %r11194; // end inline asm - ld.const.u32 %r2461, [matrix+748]; // begin inline asm - dp4a.u32.u32 %r2460, %r2461, %r5790, %r2456; + shf.l.wrap.b32 %r11191, %r11192, %r11193, %r11194; // end inline asm - ld.const.u32 %r2465, [matrix+752]; // begin inline asm - dp4a.u32.u32 %r2464, %r2465, %r5794, %r2460; + shf.l.wrap.b32 %r11195, %r11201, %r11200, %r10870; // end inline asm - ld.const.u32 %r2469, [matrix+756]; // begin inline asm - dp4a.u32.u32 %r2468, %r2469, %r5798, %r2464; + shf.l.wrap.b32 %r11199, %r11200, %r11201, %r10870; // end inline asm - ld.const.u32 %r2473, [matrix+760]; // begin inline asm - dp4a.u32.u32 %r2472, %r2473, %r5802, %r2468; + // chi + lop3.b32 %r11203, %r11238, %r11011, %r11059, 0xD2; + lop3.b32 %r11204, %r11241, %r11015, %r11063, 0xD2; // end inline asm - ld.const.u32 %r2477, [matrix+764]; // begin inline asm - dp4a.u32.u32 %r2476, %r2477, %r5806, %r2472; + // chi + lop3.b32 %r29875, %r11011, %r11059, %r11155, 0xD2; + lop3.b32 %r29876, %r11015, %r11063, %r11159, 0xD2; // end inline asm - shr.u32 %r5990, %r2412, 6; - and.b32 %r5991, %r5990, 240; - shr.u32 %r5992, %r2476, 10; - or.b32 %r5993, %r5992, %r5991; - cvt.u64.u32 %rd213, %r5993; - xor.b64 %rd214, %rd11, %rd213; - ld.const.u32 %r2481, [matrix+768]; // begin inline asm - dp4a.u32.u32 %r2480, %r2481, %r5746, %r6244; + // chi + lop3.b32 %r29871, %r11059, %r11155, %r11107, 0xD2; + lop3.b32 %r29872, %r11063, %r11159, %r11111, 0xD2; // end inline asm - ld.const.u32 %r2485, [matrix+772]; // begin inline asm - dp4a.u32.u32 %r2484, %r2485, %r5750, %r2480; + // chi + lop3.b32 %r29867, %r11155, %r11107, %r11238, 0xD2; + lop3.b32 %r29868, %r11159, %r11111, %r11241, 0xD2; // end inline asm - ld.const.u32 %r2489, [matrix+776]; // begin inline asm - dp4a.u32.u32 %r2488, %r2489, %r5754, %r2484; + // chi + lop3.b32 %r29865, %r11107, %r11238, %r11011, 0xD2; + lop3.b32 %r29866, %r11111, %r11241, %r11015, 0xD2; // end inline asm - ld.const.u32 %r2493, [matrix+780]; // begin inline asm - dp4a.u32.u32 %r2492, %r2493, %r5758, %r2488; + // chi + lop3.b32 %r29861, %r11147, %r11019, %r11187, 0xD2; + lop3.b32 %r29862, %r11151, %r11023, %r11191, 0xD2; // end inline asm - ld.const.u32 %r2497, [matrix+784]; // begin inline asm - dp4a.u32.u32 %r2496, %r2497, %r5762, %r2492; + // chi + lop3.b32 %r29873, %r11019, %r11187, %r11131, 0xD2; + lop3.b32 %r29874, %r11023, %r11191, %r11135, 0xD2; // end inline asm - ld.const.u32 %r2501, [matrix+788]; // begin inline asm - dp4a.u32.u32 %r2500, %r2501, %r5766, %r2496; + // chi + lop3.b32 %r29869, %r11187, %r11131, %r11027, 0xD2; + lop3.b32 %r29870, %r11191, %r11135, %r11031, 0xD2; // end inline asm - ld.const.u32 %r2505, [matrix+792]; // begin inline asm - dp4a.u32.u32 %r2504, %r2505, %r5770, %r2500; + // chi + lop3.b32 %r29841, %r11131, %r11027, %r11147, 0xD2; + lop3.b32 %r29842, %r11135, %r11031, %r11151, 0xD2; 
// end inline asm - ld.const.u32 %r2509, [matrix+796]; + st.local.v2.u32 [%rd84+88], {%r29841, %r29842}; // begin inline asm - dp4a.u32.u32 %r2508, %r2509, %r5774, %r2504; + // chi + lop3.b32 %r29833, %r11027, %r11147, %r11019, 0xD2; + lop3.b32 %r29834, %r11031, %r11151, %r11023, 0xD2; // end inline asm - ld.const.u32 %r2513, [matrix+800]; + st.local.v2.u32 [%rd84+96], {%r29833, %r29834}; // begin inline asm - dp4a.u32.u32 %r2512, %r2513, %r5778, %r2508; + // chi + lop3.b32 %r29859, %r11195, %r11179, %r11067, 0xD2; + lop3.b32 %r29860, %r11199, %r11183, %r11071, 0xD2; // end inline asm - ld.const.u32 %r2517, [matrix+804]; + st.local.v2.u32 [%rd84+104], {%r29859, %r29860}; // begin inline asm - dp4a.u32.u32 %r2516, %r2517, %r5782, %r2512; + // chi + lop3.b32 %r29853, %r11179, %r11067, %r11075, 0xD2; + lop3.b32 %r29854, %r11183, %r11071, %r11079, 0xD2; // end inline asm - ld.const.u32 %r2521, [matrix+808]; + st.local.v2.u32 [%rd84+112], {%r29853, %r29854}; // begin inline asm - dp4a.u32.u32 %r2520, %r2521, %r5786, %r2516; + // chi + lop3.b32 %r29847, %r11067, %r11075, %r11043, 0xD2; + lop3.b32 %r29848, %r11071, %r11079, %r11047, 0xD2; // end inline asm - ld.const.u32 %r2525, [matrix+812]; + st.local.v2.u32 [%rd84+120], {%r29847, %r29848}; // begin inline asm - dp4a.u32.u32 %r2524, %r2525, %r5790, %r2520; + // chi + lop3.b32 %r29839, %r11075, %r11043, %r11195, 0xD2; + lop3.b32 %r29840, %r11079, %r11047, %r11199, 0xD2; // end inline asm - ld.const.u32 %r2529, [matrix+816]; + st.local.v2.u32 [%rd84+128], {%r29839, %r29840}; // begin inline asm - dp4a.u32.u32 %r2528, %r2529, %r5794, %r2524; + // chi + lop3.b32 %r29831, %r11043, %r11195, %r11179, 0xD2; + lop3.b32 %r29832, %r11047, %r11199, %r11183, 0xD2; // end inline asm - ld.const.u32 %r2533, [matrix+820]; + st.local.v2.u32 [%rd84+136], {%r29831, %r29832}; // begin inline asm - dp4a.u32.u32 %r2532, %r2533, %r5798, %r2528; + // chi + lop3.b32 %r29857, %r11099, %r11139, %r11171, 0xD2; + lop3.b32 %r29858, %r11103, %r11143, %r11175, 0xD2; // end inline asm - ld.const.u32 %r2537, [matrix+824]; + st.local.v2.u32 [%rd84+144], {%r29857, %r29858}; // begin inline asm - dp4a.u32.u32 %r2536, %r2537, %r5802, %r2532; + // chi + lop3.b32 %r29851, %r11139, %r11171, %r11163, 0xD2; + lop3.b32 %r29852, %r11143, %r11175, %r11167, 0xD2; // end inline asm - ld.const.u32 %r2541, [matrix+828]; + st.local.v2.u32 [%rd84+152], {%r29851, %r29852}; // begin inline asm - dp4a.u32.u32 %r2540, %r2541, %r5806, %r2536; + // chi + lop3.b32 %r29845, %r11171, %r11163, %r11083, 0xD2; + lop3.b32 %r29846, %r11175, %r11167, %r11087, 0xD2; // end inline asm - ld.const.u32 %r2545, [matrix+832]; + st.local.v2.u32 [%rd84+160], {%r29845, %r29846}; // begin inline asm - dp4a.u32.u32 %r2544, %r2545, %r5746, %r6244; + // chi + lop3.b32 %r29837, %r11163, %r11083, %r11099, 0xD2; + lop3.b32 %r29838, %r11167, %r11087, %r11103, 0xD2; // end inline asm - ld.const.u32 %r2549, [matrix+836]; + st.local.v2.u32 [%rd84+168], {%r29837, %r29838}; // begin inline asm - dp4a.u32.u32 %r2548, %r2549, %r5750, %r2544; + // chi + lop3.b32 %r29829, %r11083, %r11099, %r11139, 0xD2; + lop3.b32 %r29830, %r11087, %r11103, %r11143, 0xD2; // end inline asm - ld.const.u32 %r2553, [matrix+840]; + st.local.v2.u32 [%rd84+176], {%r29829, %r29830}; // begin inline asm - dp4a.u32.u32 %r2552, %r2553, %r5754, %r2548; + // chi + lop3.b32 %r29855, %r11051, %r11123, %r11035, 0xD2; + lop3.b32 %r29856, %r11055, %r11127, %r11039, 0xD2; // end inline asm - ld.const.u32 %r2557, [matrix+844]; + st.local.v2.u32 [%rd84+184], {%r29855, 
%r29856}; // begin inline asm - dp4a.u32.u32 %r2556, %r2557, %r5758, %r2552; + // chi + lop3.b32 %r29849, %r11123, %r11035, %r11091, 0xD2; + lop3.b32 %r29850, %r11127, %r11039, %r11095, 0xD2; // end inline asm - ld.const.u32 %r2561, [matrix+848]; + st.local.v2.u32 [%rd84+192], {%r29849, %r29850}; // begin inline asm - dp4a.u32.u32 %r2560, %r2561, %r5762, %r2556; + // chi + lop3.b32 %r29843, %r11035, %r11091, %r11115, 0xD2; + lop3.b32 %r29844, %r11039, %r11095, %r11119, 0xD2; // end inline asm - ld.const.u32 %r2565, [matrix+852]; + st.local.v2.u32 [%rd84+200], {%r29843, %r29844}; // begin inline asm - dp4a.u32.u32 %r2564, %r2565, %r5766, %r2560; + // chi + lop3.b32 %r29835, %r11091, %r11115, %r11051, 0xD2; + lop3.b32 %r29836, %r11095, %r11119, %r11055, 0xD2; // end inline asm - ld.const.u32 %r2569, [matrix+856]; + st.local.v2.u32 [%rd84+208], {%r29835, %r29836}; // begin inline asm - dp4a.u32.u32 %r2568, %r2569, %r5770, %r2564; + // chi + lop3.b32 %r29827, %r11115, %r11051, %r11123, 0xD2; + lop3.b32 %r29828, %r11119, %r11055, %r11127, 0xD2; // end inline asm - ld.const.u32 %r2573, [matrix+860]; + st.local.v2.u32 [%rd84+216], {%r29827, %r29828}; + mul.wide.s32 %rd618, %r29877, 8; + add.s64 %rd617, %rd597, %rd618; // begin inline asm - dp4a.u32.u32 %r2572, %r2573, %r5774, %r2568; + ld.global.nc.v2.u32 {%r11403,%r11404}, [%rd617]; // end inline asm - ld.const.u32 %r2577, [matrix+864]; + xor.b32 %r29863, %r11203, %r11403; + xor.b32 %r29864, %r11204, %r11404; + add.s32 %r29877, %r29877, 1; + setp.lt.u32 %p24, %r29877, 23; + @%p24 bra $L__BB2_36; + + mov.u32 %r29910, 0; + mov.u32 %r11514, 1; + st.local.v2.u32 [%rd84+32], {%r29875, %r29876}; + st.local.v2.u32 [%rd84+72], {%r29873, %r29874}; + st.local.v2.u32 [%rd84+40], {%r29871, %r29872}; + st.local.v2.u32 [%rd84+80], {%r29869, %r29870}; + st.local.v2.u32 [%rd84+48], {%r29867, %r29868}; + st.local.v2.u32 [%rd84+56], {%r29865, %r29866}; + st.local.v2.u32 [%rd84+24], {%r29863, %r29864}; // begin inline asm - dp4a.u32.u32 %r2576, %r2577, %r5778, %r2572; + // xor5 + lop3.b32 %r11415, %r29863, %r29861, %r29859, 0x96; + lop3.b32 %r11415, %r11415, %r29857, %r29855, 0x96; + lop3.b32 %r11416, %r29864, %r29862, %r29860, 0x96; + lop3.b32 %r11416, %r11416, %r29858, %r29856, 0x96; // end inline asm - ld.const.u32 %r2581, [matrix+868]; // begin inline asm - dp4a.u32.u32 %r2580, %r2581, %r5782, %r2576; + // xor5 + lop3.b32 %r11427, %r29875, %r29873, %r29853, 0x96; + lop3.b32 %r11427, %r11427, %r29851, %r29849, 0x96; + lop3.b32 %r11428, %r29876, %r29874, %r29854, 0x96; + lop3.b32 %r11428, %r11428, %r29852, %r29850, 0x96; // end inline asm - ld.const.u32 %r2585, [matrix+872]; // begin inline asm - dp4a.u32.u32 %r2584, %r2585, %r5786, %r2580; + // xor5 + lop3.b32 %r11439, %r29871, %r29869, %r29847, 0x96; + lop3.b32 %r11439, %r11439, %r29845, %r29843, 0x96; + lop3.b32 %r11440, %r29872, %r29870, %r29848, 0x96; + lop3.b32 %r11440, %r11440, %r29846, %r29844, 0x96; // end inline asm - ld.const.u32 %r2589, [matrix+876]; // begin inline asm - dp4a.u32.u32 %r2588, %r2589, %r5790, %r2584; + // xor5 + lop3.b32 %r11451, %r29867, %r29841, %r29839, 0x96; + lop3.b32 %r11451, %r11451, %r29837, %r29835, 0x96; + lop3.b32 %r11452, %r29868, %r29842, %r29840, 0x96; + lop3.b32 %r11452, %r11452, %r29838, %r29836, 0x96; // end inline asm - ld.const.u32 %r2593, [matrix+880]; // begin inline asm - dp4a.u32.u32 %r2592, %r2593, %r5794, %r2588; + // xor5 + lop3.b32 %r11463, %r29865, %r29833, %r29831, 0x96; + lop3.b32 %r11463, %r11463, %r29829, %r29827, 0x96; + lop3.b32 %r11464, %r29866, 
%r29834, %r29832, 0x96; + lop3.b32 %r11464, %r11464, %r29830, %r29828, 0x96; // end inline asm - ld.const.u32 %r2597, [matrix+884]; // begin inline asm - dp4a.u32.u32 %r2596, %r2597, %r5798, %r2592; + shf.l.wrap.b32 %r11475, %r11428, %r11427, %r11514; // end inline asm - ld.const.u32 %r2601, [matrix+888]; // begin inline asm - dp4a.u32.u32 %r2600, %r2601, %r5802, %r2596; + shf.l.wrap.b32 %r11479, %r11427, %r11428, %r11514; // end inline asm - ld.const.u32 %r2605, [matrix+892]; + xor.b32 %r11654, %r11475, %r11463; + xor.b32 %r11655, %r11479, %r11464; + xor.b32 %r11622, %r29863, %r11654; + xor.b32 %r11625, %r29864, %r11655; + xor.b32 %r11585, %r29860, %r11655; + xor.b32 %r11584, %r29859, %r11654; + st.local.v2.u32 [%rd84+104], {%r11584, %r11585}; // begin inline asm - dp4a.u32.u32 %r2604, %r2605, %r5806, %r2600; + shf.l.wrap.b32 %r11483, %r11440, %r11439, %r11514; // end inline asm - shr.u32 %r5994, %r2540, 6; - and.b32 %r5995, %r5994, 240; - shr.u32 %r5996, %r2604, 10; - or.b32 %r5997, %r5996, %r5995; - cvt.u64.u32 %rd215, %r5997; - xor.b64 %rd216, %rd12, %rd215; - ld.const.u32 %r2609, [matrix+896]; // begin inline asm - dp4a.u32.u32 %r2608, %r2609, %r5746, %r6244; + shf.l.wrap.b32 %r11487, %r11439, %r11440, %r11514; // end inline asm - ld.const.u32 %r2613, [matrix+900]; + xor.b32 %r11656, %r11483, %r11415; + xor.b32 %r11657, %r11487, %r11416; + xor.b32 %r11521, %r29873, %r11656; + xor.b32 %r11520, %r29874, %r11657; + xor.b32 %r11560, %r29852, %r11657; + xor.b32 %r11561, %r29851, %r11656; + st.local.v2.u32 [%rd84+152], {%r11561, %r11560}; // begin inline asm - dp4a.u32.u32 %r2612, %r2613, %r5750, %r2608; + shf.l.wrap.b32 %r11491, %r11452, %r11451, %r11514; // end inline asm - ld.const.u32 %r2617, [matrix+904]; // begin inline asm - dp4a.u32.u32 %r2616, %r2617, %r5754, %r2612; + shf.l.wrap.b32 %r11495, %r11451, %r11452, %r11514; // end inline asm - ld.const.u32 %r2621, [matrix+908]; + xor.b32 %r11658, %r11491, %r11427; + xor.b32 %r11659, %r11495, %r11428; + xor.b32 %r11544, %r29848, %r11659; + xor.b32 %r11545, %r29847, %r11658; + st.local.v2.u32 [%rd84+120], {%r11545, %r11544}; + xor.b32 %r11536, %r29844, %r11659; + xor.b32 %r11537, %r29843, %r11658; + st.local.v2.u32 [%rd84+200], {%r11537, %r11536}; // begin inline asm - dp4a.u32.u32 %r2620, %r2621, %r5758, %r2616; + shf.l.wrap.b32 %r11499, %r11464, %r11463, %r11514; // end inline asm - ld.const.u32 %r2625, [matrix+912]; // begin inline asm - dp4a.u32.u32 %r2624, %r2625, %r5762, %r2620; + shf.l.wrap.b32 %r11503, %r11463, %r11464, %r11514; // end inline asm - ld.const.u32 %r2629, [matrix+916]; + xor.b32 %r11660, %r11499, %r11439; + xor.b32 %r11661, %r11503, %r11440; + xor.b32 %r11568, %r29867, %r11660; + xor.b32 %r11569, %r29868, %r11661; + xor.b32 %r11577, %r29838, %r11661; + xor.b32 %r11576, %r29837, %r11660; + st.local.v2.u32 [%rd84+168], {%r11576, %r11577}; // begin inline asm - dp4a.u32.u32 %r2628, %r2629, %r5766, %r2624; + shf.l.wrap.b32 %r11507, %r11416, %r11415, %r11514; // end inline asm - ld.const.u32 %r2633, [matrix+920]; // begin inline asm - dp4a.u32.u32 %r2632, %r2633, %r5770, %r2628; + shf.l.wrap.b32 %r11511, %r11415, %r11416, %r11514; // end inline asm - ld.const.u32 %r2637, [matrix+924]; + xor.b32 %r11662, %r11507, %r11451; + xor.b32 %r11663, %r11511, %r11452; + xor.b32 %r11528, %r29833, %r11662; + xor.b32 %r11529, %r29834, %r11663; + xor.b32 %r11553, %r29828, %r11663; + xor.b32 %r11552, %r29827, %r11662; + st.local.v2.u32 [%rd84+216], {%r11552, %r11553}; // begin inline asm - dp4a.u32.u32 %r2636, %r2637, %r5774, %r2632; + 
shf.l.wrap.b32 %r11515, %r11521, %r11520, %r11018; // end inline asm - ld.const.u32 %r2641, [matrix+928]; // begin inline asm - dp4a.u32.u32 %r2640, %r2641, %r5778, %r2636; + shf.l.wrap.b32 %r11519, %r11520, %r11521, %r11018; // end inline asm - ld.const.u32 %r2645, [matrix+932]; // begin inline asm - dp4a.u32.u32 %r2644, %r2645, %r5782, %r2640; + shf.l.wrap.b32 %r11523, %r11529, %r11528, %r11026; // end inline asm - ld.const.u32 %r2649, [matrix+936]; // begin inline asm - dp4a.u32.u32 %r2648, %r2649, %r5786, %r2644; + shf.l.wrap.b32 %r11527, %r11528, %r11529, %r11026; // end inline asm - ld.const.u32 %r2653, [matrix+940]; // begin inline asm - dp4a.u32.u32 %r2652, %r2653, %r5790, %r2648; + shf.l.wrap.b32 %r11535, %r11536, %r11537, %r11034; // end inline asm - ld.const.u32 %r2657, [matrix+944]; // begin inline asm - dp4a.u32.u32 %r2656, %r2657, %r5794, %r2652; + shf.l.wrap.b32 %r11531, %r11537, %r11536, %r11034; // end inline asm - ld.const.u32 %r2661, [matrix+948]; + st.local.v2.u32 [%rd84+96], {%r11531, %r11535}; // begin inline asm - dp4a.u32.u32 %r2660, %r2661, %r5798, %r2656; + shf.l.wrap.b32 %r11539, %r11545, %r11544, %r11066; // end inline asm - ld.const.u32 %r2665, [matrix+952]; // begin inline asm - dp4a.u32.u32 %r2664, %r2665, %r5802, %r2660; + shf.l.wrap.b32 %r11543, %r11544, %r11545, %r11066; // end inline asm - ld.const.u32 %r2669, [matrix+956]; // begin inline asm - dp4a.u32.u32 %r2668, %r2669, %r5806, %r2664; + shf.l.wrap.b32 %r11547, %r11553, %r11552, %r11114; // end inline asm - ld.const.u32 %r2673, [matrix+960]; // begin inline asm - dp4a.u32.u32 %r2672, %r2673, %r5746, %r6244; + shf.l.wrap.b32 %r11551, %r11552, %r11553, %r11114; // end inline asm - ld.const.u32 %r2677, [matrix+964]; // begin inline asm - dp4a.u32.u32 %r2676, %r2677, %r5750, %r2672; + shf.l.wrap.b32 %r11559, %r11560, %r11561, %r11138; // end inline asm - ld.const.u32 %r2681, [matrix+968]; // begin inline asm - dp4a.u32.u32 %r2680, %r2681, %r5754, %r2676; + shf.l.wrap.b32 %r11555, %r11561, %r11560, %r11138; // end inline asm - ld.const.u32 %r2685, [matrix+972]; + st.local.v2.u32 [%rd84+88], {%r11555, %r11559}; // begin inline asm - dp4a.u32.u32 %r2684, %r2685, %r5758, %r2680; + shf.l.wrap.b32 %r11563, %r11569, %r11568, %r11154; // end inline asm - ld.const.u32 %r2689, [matrix+976]; // begin inline asm - dp4a.u32.u32 %r2688, %r2689, %r5762, %r2684; + shf.l.wrap.b32 %r11567, %r11568, %r11569, %r11154; // end inline asm - ld.const.u32 %r2693, [matrix+980]; // begin inline asm - dp4a.u32.u32 %r2692, %r2693, %r5766, %r2688; + shf.l.wrap.b32 %r11571, %r11577, %r11576, %r11162; // end inline asm - ld.const.u32 %r2697, [matrix+984]; // begin inline asm - dp4a.u32.u32 %r2696, %r2697, %r5770, %r2692; + shf.l.wrap.b32 %r11575, %r11576, %r11577, %r11162; // end inline asm - ld.const.u32 %r2701, [matrix+988]; // begin inline asm - dp4a.u32.u32 %r2700, %r2701, %r5774, %r2696; + shf.l.wrap.b32 %r11579, %r11585, %r11584, %r11194; // end inline asm - ld.const.u32 %r2705, [matrix+992]; // begin inline asm - dp4a.u32.u32 %r2704, %r2705, %r5778, %r2700; + shf.l.wrap.b32 %r11583, %r11584, %r11585, %r11194; // end inline asm - ld.const.u32 %r2709, [matrix+996]; // begin inline asm - dp4a.u32.u32 %r2708, %r2709, %r5782, %r2704; + // chi + lop3.b32 %r11587, %r11622, %r11515, %r11539, 0xD2; + lop3.b32 %r11588, %r11625, %r11519, %r11543, 0xD2; // end inline asm - ld.const.u32 %r2713, [matrix+1000]; // begin inline asm - dp4a.u32.u32 %r2712, %r2713, %r5786, %r2708; + // chi + lop3.b32 %r30010, %r11515, %r11539, %r11571, 0xD2; + 
lop3.b32 %r30011, %r11519, %r11543, %r11575, 0xD2; // end inline asm - ld.const.u32 %r2717, [matrix+1004]; + st.local.v2.u32 [%rd84+32], {%r30010, %r30011}; // begin inline asm - dp4a.u32.u32 %r2716, %r2717, %r5790, %r2712; + // chi + lop3.b32 %r30006, %r11539, %r11571, %r11547, 0xD2; + lop3.b32 %r30007, %r11543, %r11575, %r11551, 0xD2; // end inline asm - ld.const.u32 %r2721, [matrix+1008]; + st.local.v2.u32 [%rd84+40], {%r30006, %r30007}; // begin inline asm - dp4a.u32.u32 %r2720, %r2721, %r5794, %r2716; + // chi + lop3.b32 %r30002, %r11571, %r11547, %r11622, 0xD2; + lop3.b32 %r30003, %r11575, %r11551, %r11625, 0xD2; // end inline asm - ld.const.u32 %r2725, [matrix+1012]; + st.local.v2.u32 [%rd84+48], {%r30002, %r30003}; // begin inline asm - dp4a.u32.u32 %r2724, %r2725, %r5798, %r2720; + // chi + lop3.b32 %r30000, %r11547, %r11622, %r11515, 0xD2; + lop3.b32 %r30001, %r11551, %r11625, %r11519, 0xD2; // end inline asm - ld.const.u32 %r2729, [matrix+1016]; + st.local.v2.u32 [%rd84+56], {%r30000, %r30001}; // begin inline asm - dp4a.u32.u32 %r2728, %r2729, %r5802, %r2724; + // chi + lop3.b32 %r29996, %r11563, %r11523, %r11579, 0xD2; + lop3.b32 %r29997, %r11567, %r11527, %r11583, 0xD2; // end inline asm - ld.const.u32 %r2733, [matrix+1020]; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; // begin inline asm - dp4a.u32.u32 %r2732, %r2733, %r5806, %r2728; + // chi + lop3.b32 %r30008, %r11523, %r11579, %r11555, 0xD2; + lop3.b32 %r30009, %r11527, %r11583, %r11559, 0xD2; // end inline asm - shr.u32 %r5998, %r2668, 6; - and.b32 %r5999, %r5998, 240; - ld.const.u32 %r2737, [matrix+1024]; + st.local.v2.u32 [%rd84+72], {%r30008, %r30009}; // begin inline asm - dp4a.u32.u32 %r2736, %r2737, %r5746, %r6244; + // chi + lop3.b32 %r30004, %r11579, %r11555, %r11531, 0xD2; + lop3.b32 %r30005, %r11583, %r11559, %r11535, 0xD2; // end inline asm - ld.const.u32 %r2741, [matrix+1028]; + st.local.v2.u32 [%rd84+80], {%r30004, %r30005}; // begin inline asm - dp4a.u32.u32 %r2740, %r2741, %r5750, %r2736; + ld.global.nc.v2.u32 {%r11651,%r11652}, [%rd598]; + // end inline asm + xor.b32 %r29998, %r11587, %r11651; + xor.b32 %r29999, %r11588, %r11652; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + add.s64 %rd86, %rd84, 24; + add.s64 %rd87, %rd3, 24; + +$L__BB2_38: + shl.b32 %r11664, %r29910, 2; + cvt.u64.u32 %rd628, %r11664; + and.b64 %rd629, %rd628, 60; + add.s64 %rd630, %rd87, %rd629; + xor.b32 %r11665, %r30, %r29910; + mul.lo.s32 %r11666, %r11665, 16777619; + ld.local.u32 %r11667, [%rd630]; + xor.b32 %r11668, %r11666, %r11667; + mul.wide.u32 %rd631, %r11668, -954391867; + shr.u64 %rd632, %rd631, 32; + cvt.u32.u64 %r11669, %rd632; + sub.s32 %r11670, %r11668, %r11669; + shr.u32 %r11671, %r11670, 1; + add.s32 %r11672, %r11671, %r11669; + shr.u32 %r11673, %r11672, 20; + mul.lo.s32 %r11674, %r11673, 1179641; + sub.s32 %r11675, %r11668, %r11674; + mul.wide.u32 %rd633, %r11675, 64; + add.s64 %rd634, %rd471, %rd633; + mul.lo.s32 %r11676, %r29947, 16777619; + ld.global.u32 %r11677, [%rd634]; + xor.b32 %r29947, %r11676, %r11677; + mul.lo.s32 %r11678, %r29948, 16777619; + ld.global.u32 %r11679, [%rd634+4]; + xor.b32 %r29948, %r11678, %r11679; + mul.lo.s32 %r11680, %r29959, 16777619; + ld.global.u32 %r11681, [%rd634+8]; + mul.lo.s32 %r11682, %r29960, 16777619; + ld.global.u32 %r11683, [%rd634+12]; + xor.b32 %r11684, %r11682, %r11683; + xor.b32 %r29959, %r11680, %r11681; + mov.b64 %rd635, {%r29959, %r11684}; + mul.lo.s32 %r11685, %r29955, 16777619; + ld.global.u32 %r11686, [%rd634+16]; + mul.lo.s32 %r11687, %r29956, 16777619; 
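+ // $L__BB2_38 mixes two lanes per pass for 512 passes, consistent with the FishHash/ethash fnv1 dataset walk. The item index is fnv1(seed ^ i, mix[i & 15]) reduced mod 1179641, the item count of the 64-byte-entry lookup table; the mul.wide.u32/shr/sub sequence above replaces that divide with a multiply-by-magic reciprocal.
+ // Each u32 of the selected item is then folded in as mix = (mix * 16777619) ^ word, 16777619 being the 32-bit FNV prime.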
+ ld.global.u32 %r11688, [%rd634+20]; + xor.b32 %r11689, %r11687, %r11688; + xor.b32 %r29955, %r11685, %r11686; + mov.b64 %rd636, {%r29955, %r11689}; + mul.lo.s32 %r11690, %r29951, 16777619; + ld.global.u32 %r11691, [%rd634+24]; + mul.lo.s32 %r11692, %r29952, 16777619; + ld.global.u32 %r11693, [%rd634+28]; + xor.b32 %r11694, %r11692, %r11693; + xor.b32 %r29951, %r11690, %r11691; + mov.b64 %rd637, {%r29951, %r11694}; + mul.lo.s32 %r11695, %r29949, 16777619; + ld.global.u32 %r11696, [%rd634+32]; + mul.lo.s32 %r11697, %r29950, 16777619; + ld.global.u32 %r11698, [%rd634+36]; + xor.b32 %r11699, %r11697, %r11698; + xor.b32 %r29949, %r11695, %r11696; + mov.b64 %rd638, {%r29949, %r11699}; + mul.lo.s32 %r11700, %r29945, 16777619; + ld.global.u32 %r11701, [%rd634+40]; + xor.b32 %r29945, %r11700, %r11701; + mul.lo.s32 %r11702, %r29946, 16777619; + ld.global.u32 %r11703, [%rd634+44]; + xor.b32 %r29946, %r11702, %r11703; + mul.lo.s32 %r11704, %r29957, 16777619; + ld.global.u32 %r11705, [%rd634+48]; + mul.lo.s32 %r11706, %r29958, 16777619; + ld.global.u32 %r11707, [%rd634+52]; + xor.b32 %r11708, %r11706, %r11707; + xor.b32 %r29957, %r11704, %r11705; + mov.b64 %rd639, {%r29957, %r11708}; + mul.lo.s32 %r11709, %r29953, 16777619; + ld.global.u32 %r11710, [%rd634+56]; + mul.lo.s32 %r11711, %r29954, 16777619; + ld.global.u32 %r11712, [%rd634+60]; + xor.b32 %r11713, %r11711, %r11712; + xor.b32 %r29953, %r11709, %r11710; + mov.b64 %rd640, {%r29953, %r11713}; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; + st.local.v2.u32 [%rd3+32], {%r29959, %r11684}; + st.local.v2.u32 [%rd3+40], {%r29955, %r11689}; + st.local.v2.u32 [%rd3+48], {%r29951, %r11694}; + st.local.v2.u32 [%rd3+56], {%r29949, %r11699}; + st.local.v2.u32 [%rd3+64], {%r29945, %r29946}; + st.local.v2.u32 [%rd3+72], {%r29957, %r11708}; + st.local.v2.u32 [%rd3+80], {%r29953, %r11713}; + add.s64 %rd641, %rd86, %rd629; + xor.b32 %r11714, %r31, %r29910; + mul.lo.s32 %r11715, %r11714, 16777619; + ld.local.u32 %r11716, [%rd641]; + xor.b32 %r11717, %r11715, %r11716; + mul.wide.u32 %rd642, %r11717, -954391867; + shr.u64 %rd643, %rd642, 32; + cvt.u32.u64 %r11718, %rd643; + sub.s32 %r11719, %r11717, %r11718; + shr.u32 %r11720, %r11719, 1; + add.s32 %r11721, %r11720, %r11718; + shr.u32 %r11722, %r11721, 20; + mul.lo.s32 %r11723, %r11722, 1179641; + sub.s32 %r11724, %r11717, %r11723; + mul.wide.u32 %rd644, %r11724, 64; + add.s64 %rd645, %rd471, %rd644; + mul.lo.s32 %r11725, %r29998, 16777619; + ld.global.u32 %r11726, [%rd645]; + xor.b32 %r29998, %r11725, %r11726; + mul.lo.s32 %r11727, %r29999, 16777619; + ld.global.u32 %r11728, [%rd645+4]; + xor.b32 %r29999, %r11727, %r11728; + mul.lo.s32 %r11729, %r30010, 16777619; + ld.global.u32 %r11730, [%rd645+8]; + mul.lo.s32 %r11731, %r30011, 16777619; + ld.global.u32 %r11732, [%rd645+12]; + xor.b32 %r11733, %r11731, %r11732; + xor.b32 %r30010, %r11729, %r11730; + mov.b64 %rd646, {%r30010, %r11733}; + mul.lo.s32 %r11734, %r30006, 16777619; + ld.global.u32 %r11735, [%rd645+16]; + mul.lo.s32 %r11736, %r30007, 16777619; + ld.global.u32 %r11737, [%rd645+20]; + xor.b32 %r11738, %r11736, %r11737; + xor.b32 %r30006, %r11734, %r11735; + mov.b64 %rd647, {%r30006, %r11738}; + mul.lo.s32 %r11739, %r30002, 16777619; + ld.global.u32 %r11740, [%rd645+24]; + mul.lo.s32 %r11741, %r30003, 16777619; + ld.global.u32 %r11742, [%rd645+28]; + xor.b32 %r11743, %r11741, %r11742; + xor.b32 %r30002, %r11739, %r11740; + mov.b64 %rd648, {%r30002, %r11743}; + mul.lo.s32 %r11744, %r30000, 16777619; + ld.global.u32 %r11745, [%rd645+32]; + mul.lo.s32 
%r11746, %r30001, 16777619; + ld.global.u32 %r11747, [%rd645+36]; + xor.b32 %r11748, %r11746, %r11747; + xor.b32 %r30000, %r11744, %r11745; + mov.b64 %rd649, {%r30000, %r11748}; + mul.lo.s32 %r11749, %r29996, 16777619; + ld.global.u32 %r11750, [%rd645+40]; + xor.b32 %r29996, %r11749, %r11750; + mul.lo.s32 %r11751, %r29997, 16777619; + ld.global.u32 %r11752, [%rd645+44]; + xor.b32 %r29997, %r11751, %r11752; + mul.lo.s32 %r11753, %r30008, 16777619; + ld.global.u32 %r11754, [%rd645+48]; + mul.lo.s32 %r11755, %r30009, 16777619; + ld.global.u32 %r11756, [%rd645+52]; + xor.b32 %r11757, %r11755, %r11756; + xor.b32 %r30008, %r11753, %r11754; + mov.b64 %rd650, {%r30008, %r11757}; + mul.lo.s32 %r11758, %r30004, 16777619; + ld.global.u32 %r11759, [%rd645+56]; + mul.lo.s32 %r11760, %r30005, 16777619; + ld.global.u32 %r11761, [%rd645+60]; + xor.b32 %r11762, %r11760, %r11761; + xor.b32 %r30004, %r11758, %r11759; + mov.b64 %rd651, {%r30004, %r11762}; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + st.local.v2.u32 [%rd84+32], {%r30010, %r11733}; + st.local.v2.u32 [%rd84+40], {%r30006, %r11738}; + st.local.v2.u32 [%rd84+48], {%r30002, %r11743}; + st.local.v2.u32 [%rd84+56], {%r30000, %r11748}; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; + st.local.v2.u32 [%rd84+72], {%r30008, %r11757}; + st.local.v2.u32 [%rd84+80], {%r30004, %r11762}; + add.s32 %r29910, %r29910, 1; + setp.lt.u32 %p25, %r29910, 512; + shr.u64 %rd652, %rd635, 32; + cvt.u32.u64 %r29960, %rd652; + shr.u64 %rd653, %rd636, 32; + cvt.u32.u64 %r29956, %rd653; + shr.u64 %rd654, %rd637, 32; + cvt.u32.u64 %r29952, %rd654; + shr.u64 %rd655, %rd638, 32; + cvt.u32.u64 %r29950, %rd655; + shr.u64 %rd656, %rd639, 32; + cvt.u32.u64 %r29958, %rd656; + shr.u64 %rd657, %rd640, 32; + cvt.u32.u64 %r29954, %rd657; + shr.u64 %rd658, %rd646, 32; + cvt.u32.u64 %r30011, %rd658; + shr.u64 %rd659, %rd647, 32; + cvt.u32.u64 %r30007, %rd659; + shr.u64 %rd660, %rd648, 32; + cvt.u32.u64 %r30003, %rd660; + shr.u64 %rd661, %rd649, 32; + cvt.u32.u64 %r30001, %rd661; + shr.u64 %rd662, %rd650, 32; + cvt.u32.u64 %r30009, %rd662; + shr.u64 %rd663, %rd651, 32; + cvt.u32.u64 %r30005, %rd663; + @%p25 bra $L__BB2_38; + + mov.u32 %r29911, 0; + st.local.v2.u32 [%rd3+96], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+104], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+112], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+120], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+128], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+136], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+144], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+152], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+160], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+168], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+176], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+184], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+192], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+200], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+208], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+216], {%r29911, %r29911}; + mov.u32 %r29926, -2147483648; + mov.u32 %r11777, 1; + st.local.v2.u32 [%rd3+88], {%r11777, %r29926}; + mov.u32 %r29912, %r29911; + mov.u32 %r29913, %r29911; + mov.u32 %r29914, %r29911; + mov.u32 %r29915, %r29911; + mov.u32 %r29916, %r29911; + mov.u32 %r29917, %r29911; + mov.u32 %r29918, %r29911; + mov.u32 %r29919, %r29911; + mov.u32 %r29920, %r29911; + mov.u32 %r29921, %r29911; + mov.u32 %r29922, %r29911; + mov.u32 %r29923, %r29911; + mov.u32 %r29924, %r29911; + mov.u32 %r29925, %r11777; + mov.u32 %r29927, %r29911; + mov.u32 %r29928, %r29911; + mov.u32 
%r29929, %r29911; + mov.u32 %r29930, %r29911; + mov.u32 %r29931, %r29911; + mov.u32 %r29932, %r29911; + mov.u32 %r29933, %r29911; + mov.u32 %r29934, %r29911; + mov.u32 %r29935, %r29911; + mov.u32 %r29936, %r29911; + mov.u32 %r29937, %r29911; + mov.u32 %r29938, %r29911; + mov.u32 %r29939, %r29911; + mov.u32 %r29940, %r29911; + mov.u32 %r29941, %r29911; + mov.u32 %r29942, %r29911; + mov.u32 %r29943, %r29911; + mov.u32 %r29944, %r29911; + mov.u32 %r29961, %r29911; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r11804, %r29947, %r29945, %r29943, 0x96; + lop3.b32 %r11804, %r11804, %r29941, %r29939, 0x96; + lop3.b32 %r11805, %r29948, %r29946, %r29944, 0x96; + lop3.b32 %r11805, %r11805, %r29942, %r29940, 0x96; // end inline asm - ld.const.u32 %r2745, [matrix+1032]; // begin inline asm - dp4a.u32.u32 %r2744, %r2745, %r5754, %r2740; + // xor5 + lop3.b32 %r11816, %r29959, %r29957, %r29937, 0x96; + lop3.b32 %r11816, %r11816, %r29935, %r29933, 0x96; + lop3.b32 %r11817, %r29960, %r29958, %r29938, 0x96; + lop3.b32 %r11817, %r11817, %r29936, %r29934, 0x96; // end inline asm - ld.const.u32 %r2749, [matrix+1036]; // begin inline asm - dp4a.u32.u32 %r2748, %r2749, %r5758, %r2744; + // xor5 + lop3.b32 %r11828, %r29955, %r29953, %r29931, 0x96; + lop3.b32 %r11828, %r11828, %r29929, %r29927, 0x96; + lop3.b32 %r11829, %r29956, %r29954, %r29932, 0x96; + lop3.b32 %r11829, %r11829, %r29930, %r29928, 0x96; // end inline asm - ld.const.u32 %r2753, [matrix+1040]; // begin inline asm - dp4a.u32.u32 %r2752, %r2753, %r5762, %r2748; + // xor5 + lop3.b32 %r11840, %r29951, %r29925, %r29923, 0x96; + lop3.b32 %r11840, %r11840, %r29921, %r29919, 0x96; + lop3.b32 %r11841, %r29952, %r29926, %r29924, 0x96; + lop3.b32 %r11841, %r11841, %r29922, %r29920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11852, %r29949, %r29917, %r29915, 0x96; + lop3.b32 %r11852, %r11852, %r29913, %r29911, 0x96; + lop3.b32 %r11853, %r29950, %r29918, %r29916, 0x96; + lop3.b32 %r11853, %r11853, %r29914, %r29912, 0x96; // end inline asm - ld.const.u32 %r2757, [matrix+1044]; // begin inline asm - dp4a.u32.u32 %r2756, %r2757, %r5766, %r2752; + shf.l.wrap.b32 %r11864, %r11817, %r11816, %r11777; // end inline asm - ld.const.u32 %r2761, [matrix+1048]; // begin inline asm - dp4a.u32.u32 %r2760, %r2761, %r5770, %r2756; + shf.l.wrap.b32 %r11868, %r11816, %r11817, %r11777; // end inline asm - ld.const.u32 %r2765, [matrix+1052]; + xor.b32 %r12298, %r11864, %r11852; + xor.b32 %r12299, %r11868, %r11853; + xor.b32 %r12131, %r29947, %r12298; + xor.b32 %r12134, %r29948, %r12299; + xor.b32 %r12038, %r29945, %r12298; + xor.b32 %r12037, %r29946, %r12299; + xor.b32 %r12085, %r29943, %r12298; + xor.b32 %r12086, %r29944, %r12299; + xor.b32 %r11990, %r29941, %r12298; + xor.b32 %r11989, %r29942, %r12299; + xor.b32 %r11941, %r29939, %r12298; + xor.b32 %r11942, %r29940, %r12299; // begin inline asm - dp4a.u32.u32 %r2764, %r2765, %r5774, %r2760; + shf.l.wrap.b32 %r11872, %r11829, %r11828, %r11777; // end inline asm - ld.const.u32 %r2769, [matrix+1056]; // begin inline asm - dp4a.u32.u32 %r2768, %r2769, %r5778, %r2764; + shf.l.wrap.b32 %r11876, %r11828, %r11829, %r11777; // end inline asm - ld.const.u32 %r2773, [matrix+1060]; + xor.b32 %r12300, %r11872, %r11804; + xor.b32 %r12301, %r11876, %r11805; + xor.b32 %r12093, %r29959, %r12300; + xor.b32 %r12094, %r29960, %r12301; + xor.b32 %r11910, %r29957, %r12300; + xor.b32 %r11909, %r29958, %r12301; + xor.b32 %r12069, %r29937, %r12300; + xor.b32 %r12070, %r29938, %r12301; + xor.b32 %r12030, %r29935, 
%r12300; + xor.b32 %r12029, %r29936, %r12301; + xor.b32 %r12013, %r29933, %r12300; + xor.b32 %r12014, %r29934, %r12301; // begin inline asm - dp4a.u32.u32 %r2772, %r2773, %r5782, %r2768; + shf.l.wrap.b32 %r11880, %r11841, %r11840, %r11777; // end inline asm - ld.const.u32 %r2777, [matrix+1064]; // begin inline asm - dp4a.u32.u32 %r2776, %r2777, %r5786, %r2772; + shf.l.wrap.b32 %r11884, %r11840, %r11841, %r11777; // end inline asm - ld.const.u32 %r2781, [matrix+1068]; + xor.b32 %r12302, %r11880, %r11816; + xor.b32 %r12303, %r11884, %r11817; + xor.b32 %r11950, %r29955, %r12302; + xor.b32 %r11949, %r29956, %r12303; + xor.b32 %r12077, %r29953, %r12302; + xor.b32 %r12078, %r29954, %r12303; + xor.b32 %r11958, %r29931, %r12302; + xor.b32 %r11957, %r29932, %r12303; + xor.b32 %r12061, %r29929, %r12302; + xor.b32 %r12062, %r29930, %r12303; + xor.b32 %r11926, %r29927, %r12302; + xor.b32 %r11925, %r29928, %r12303; // begin inline asm - dp4a.u32.u32 %r2780, %r2781, %r5790, %r2776; + shf.l.wrap.b32 %r11888, %r11853, %r11852, %r11777; // end inline asm - ld.const.u32 %r2785, [matrix+1072]; // begin inline asm - dp4a.u32.u32 %r2784, %r2785, %r5794, %r2780; + shf.l.wrap.b32 %r11892, %r11852, %r11853, %r11777; // end inline asm - ld.const.u32 %r2789, [matrix+1076]; + xor.b32 %r12304, %r11888, %r11828; + xor.b32 %r12305, %r11892, %r11829; + xor.b32 %r12045, %r29951, %r12304; + xor.b32 %r12046, %r29952, %r12305; + xor.b32 %r12022, %r29925, %r12304; + xor.b32 %r12021, %r29926, %r12305; + xor.b32 %r11965, %r29923, %r12304; + xor.b32 %r11966, %r29924, %r12305; + xor.b32 %r12053, %r29921, %r12304; + xor.b32 %r12054, %r29922, %r12305; + xor.b32 %r11982, %r29919, %r12304; + xor.b32 %r11981, %r29920, %r12305; // begin inline asm - dp4a.u32.u32 %r2788, %r2789, %r5798, %r2784; + shf.l.wrap.b32 %r11896, %r11805, %r11804, %r11777; // end inline asm - ld.const.u32 %r2793, [matrix+1080]; // begin inline asm - dp4a.u32.u32 %r2792, %r2793, %r5802, %r2788; + shf.l.wrap.b32 %r11900, %r11804, %r11805, %r11777; // end inline asm - ld.const.u32 %r2797, [matrix+1084]; + xor.b32 %r12306, %r11896, %r11840; + xor.b32 %r12307, %r11900, %r11841; + xor.b32 %r11997, %r29949, %r12306; + xor.b32 %r11998, %r29950, %r12307; + xor.b32 %r11917, %r29917, %r12306; + xor.b32 %r11918, %r29918, %r12307; + xor.b32 %r11934, %r29915, %r12306; + xor.b32 %r11933, %r29916, %r12307; + xor.b32 %r11973, %r29913, %r12306; + xor.b32 %r11974, %r29914, %r12307; + xor.b32 %r12005, %r29911, %r12306; + xor.b32 %r12006, %r29912, %r12307; + mov.u32 %r11911, 44; // begin inline asm - dp4a.u32.u32 %r2796, %r2797, %r5806, %r2792; + shf.l.wrap.b32 %r11904, %r11910, %r11909, %r11911; // end inline asm - ld.const.u32 %r2801, [matrix+1088]; // begin inline asm - dp4a.u32.u32 %r2800, %r2801, %r5746, %r6244; + shf.l.wrap.b32 %r11908, %r11909, %r11910, %r11911; // end inline asm - ld.const.u32 %r2805, [matrix+1092]; + mov.u32 %r11919, 20; // begin inline asm - dp4a.u32.u32 %r2804, %r2805, %r5750, %r2800; + shf.l.wrap.b32 %r11912, %r11918, %r11917, %r11919; // end inline asm - ld.const.u32 %r2809, [matrix+1096]; // begin inline asm - dp4a.u32.u32 %r2808, %r2809, %r5754, %r2804; + shf.l.wrap.b32 %r11916, %r11917, %r11918, %r11919; // end inline asm - ld.const.u32 %r2813, [matrix+1100]; + mov.u32 %r11927, 61; // begin inline asm - dp4a.u32.u32 %r2812, %r2813, %r5758, %r2808; + shf.l.wrap.b32 %r11920, %r11926, %r11925, %r11927; // end inline asm - ld.const.u32 %r2817, [matrix+1104]; // begin inline asm - dp4a.u32.u32 %r2816, %r2817, %r5762, %r2812; + shf.l.wrap.b32 %r11924, 
%r11925, %r11926, %r11927; // end inline asm - ld.const.u32 %r2821, [matrix+1108]; + mov.u32 %r11935, 39; // begin inline asm - dp4a.u32.u32 %r2820, %r2821, %r5766, %r2816; + shf.l.wrap.b32 %r11928, %r11934, %r11933, %r11935; // end inline asm - ld.const.u32 %r2825, [matrix+1112]; // begin inline asm - dp4a.u32.u32 %r2824, %r2825, %r5770, %r2820; + shf.l.wrap.b32 %r11932, %r11933, %r11934, %r11935; // end inline asm - ld.const.u32 %r2829, [matrix+1116]; + mov.u32 %r11943, 18; // begin inline asm - dp4a.u32.u32 %r2828, %r2829, %r5774, %r2824; + shf.l.wrap.b32 %r11936, %r11942, %r11941, %r11943; // end inline asm - ld.const.u32 %r2833, [matrix+1120]; // begin inline asm - dp4a.u32.u32 %r2832, %r2833, %r5778, %r2828; + shf.l.wrap.b32 %r11940, %r11941, %r11942, %r11943; // end inline asm - ld.const.u32 %r2837, [matrix+1124]; + mov.u32 %r11951, 62; // begin inline asm - dp4a.u32.u32 %r2836, %r2837, %r5782, %r2832; + shf.l.wrap.b32 %r11944, %r11950, %r11949, %r11951; // end inline asm - ld.const.u32 %r2841, [matrix+1128]; // begin inline asm - dp4a.u32.u32 %r2840, %r2841, %r5786, %r2836; + shf.l.wrap.b32 %r11948, %r11949, %r11950, %r11951; // end inline asm - ld.const.u32 %r2845, [matrix+1132]; + mov.u32 %r11959, 43; // begin inline asm - dp4a.u32.u32 %r2844, %r2845, %r5790, %r2840; + shf.l.wrap.b32 %r11952, %r11958, %r11957, %r11959; // end inline asm - ld.const.u32 %r2849, [matrix+1136]; // begin inline asm - dp4a.u32.u32 %r2848, %r2849, %r5794, %r2844; + shf.l.wrap.b32 %r11956, %r11957, %r11958, %r11959; // end inline asm - ld.const.u32 %r2853, [matrix+1140]; + mov.u32 %r11967, 25; // begin inline asm - dp4a.u32.u32 %r2852, %r2853, %r5798, %r2848; + shf.l.wrap.b32 %r11960, %r11966, %r11965, %r11967; // end inline asm - ld.const.u32 %r2857, [matrix+1144]; // begin inline asm - dp4a.u32.u32 %r2856, %r2857, %r5802, %r2852; + shf.l.wrap.b32 %r11964, %r11965, %r11966, %r11967; // end inline asm - ld.const.u32 %r2861, [matrix+1148]; + mov.u32 %r11975, 8; // begin inline asm - dp4a.u32.u32 %r2860, %r2861, %r5806, %r2856; + shf.l.wrap.b32 %r11968, %r11974, %r11973, %r11975; // end inline asm - shr.u32 %r6000, %r2796, 6; - and.b32 %r6001, %r6000, 240; - shr.u32 %r6002, %r2860, 10; - or.b32 %r6003, %r6002, %r6001; - xor.b32 %r6004, %r11, %r6003; - ld.const.u32 %r2865, [matrix+1152]; // begin inline asm - dp4a.u32.u32 %r2864, %r2865, %r5746, %r6244; + shf.l.wrap.b32 %r11972, %r11973, %r11974, %r11975; // end inline asm - ld.const.u32 %r2869, [matrix+1156]; + mov.u32 %r11983, 56; // begin inline asm - dp4a.u32.u32 %r2868, %r2869, %r5750, %r2864; + shf.l.wrap.b32 %r11976, %r11982, %r11981, %r11983; // end inline asm - ld.const.u32 %r2873, [matrix+1160]; // begin inline asm - dp4a.u32.u32 %r2872, %r2873, %r5754, %r2868; + shf.l.wrap.b32 %r11980, %r11981, %r11982, %r11983; // end inline asm - ld.const.u32 %r2877, [matrix+1164]; + mov.u32 %r11991, 41; // begin inline asm - dp4a.u32.u32 %r2876, %r2877, %r5758, %r2872; + shf.l.wrap.b32 %r11984, %r11990, %r11989, %r11991; // end inline asm - ld.const.u32 %r2881, [matrix+1168]; // begin inline asm - dp4a.u32.u32 %r2880, %r2881, %r5762, %r2876; + shf.l.wrap.b32 %r11988, %r11989, %r11990, %r11991; // end inline asm - ld.const.u32 %r2885, [matrix+1172]; + mov.u32 %r11999, 27; // begin inline asm - dp4a.u32.u32 %r2884, %r2885, %r5766, %r2880; + shf.l.wrap.b32 %r11992, %r11998, %r11997, %r11999; // end inline asm - ld.const.u32 %r2889, [matrix+1176]; // begin inline asm - dp4a.u32.u32 %r2888, %r2889, %r5770, %r2884; + shf.l.wrap.b32 %r11996, %r11997, %r11998, 
%r11999; // end inline asm - ld.const.u32 %r2893, [matrix+1180]; + mov.u32 %r12007, 14; // begin inline asm - dp4a.u32.u32 %r2892, %r2893, %r5774, %r2888; + shf.l.wrap.b32 %r12000, %r12006, %r12005, %r12007; // end inline asm - ld.const.u32 %r2897, [matrix+1184]; // begin inline asm - dp4a.u32.u32 %r2896, %r2897, %r5778, %r2892; + shf.l.wrap.b32 %r12004, %r12005, %r12006, %r12007; // end inline asm - ld.const.u32 %r2901, [matrix+1188]; + mov.u32 %r12015, 2; // begin inline asm - dp4a.u32.u32 %r2900, %r2901, %r5782, %r2896; + shf.l.wrap.b32 %r12008, %r12014, %r12013, %r12015; // end inline asm - ld.const.u32 %r2905, [matrix+1192]; // begin inline asm - dp4a.u32.u32 %r2904, %r2905, %r5786, %r2900; + shf.l.wrap.b32 %r12012, %r12013, %r12014, %r12015; // end inline asm - ld.const.u32 %r2909, [matrix+1196]; + mov.u32 %r12023, 55; // begin inline asm - dp4a.u32.u32 %r2908, %r2909, %r5790, %r2904; + shf.l.wrap.b32 %r12016, %r12022, %r12021, %r12023; // end inline asm - ld.const.u32 %r2913, [matrix+1200]; // begin inline asm - dp4a.u32.u32 %r2912, %r2913, %r5794, %r2908; + shf.l.wrap.b32 %r12020, %r12021, %r12022, %r12023; // end inline asm - ld.const.u32 %r2917, [matrix+1204]; + mov.u32 %r12031, 45; // begin inline asm - dp4a.u32.u32 %r2916, %r2917, %r5798, %r2912; + shf.l.wrap.b32 %r12024, %r12030, %r12029, %r12031; // end inline asm - ld.const.u32 %r2921, [matrix+1208]; // begin inline asm - dp4a.u32.u32 %r2920, %r2921, %r5802, %r2916; + shf.l.wrap.b32 %r12028, %r12029, %r12030, %r12031; // end inline asm - ld.const.u32 %r2925, [matrix+1212]; + mov.u32 %r12039, 36; // begin inline asm - dp4a.u32.u32 %r2924, %r2925, %r5806, %r2920; + shf.l.wrap.b32 %r12032, %r12038, %r12037, %r12039; // end inline asm - ld.const.u32 %r2929, [matrix+1216]; // begin inline asm - dp4a.u32.u32 %r2928, %r2929, %r5746, %r6244; + shf.l.wrap.b32 %r12036, %r12037, %r12038, %r12039; // end inline asm - ld.const.u32 %r2933, [matrix+1220]; + mov.u32 %r12047, 28; // begin inline asm - dp4a.u32.u32 %r2932, %r2933, %r5750, %r2928; + shf.l.wrap.b32 %r12040, %r12046, %r12045, %r12047; // end inline asm - ld.const.u32 %r2937, [matrix+1224]; // begin inline asm - dp4a.u32.u32 %r2936, %r2937, %r5754, %r2932; + shf.l.wrap.b32 %r12044, %r12045, %r12046, %r12047; // end inline asm - ld.const.u32 %r2941, [matrix+1228]; + mov.u32 %r12055, 21; // begin inline asm - dp4a.u32.u32 %r2940, %r2941, %r5758, %r2936; + shf.l.wrap.b32 %r12048, %r12054, %r12053, %r12055; // end inline asm - ld.const.u32 %r2945, [matrix+1232]; // begin inline asm - dp4a.u32.u32 %r2944, %r2945, %r5762, %r2940; + shf.l.wrap.b32 %r12052, %r12053, %r12054, %r12055; // end inline asm - ld.const.u32 %r2949, [matrix+1236]; + mov.u32 %r12063, 15; // begin inline asm - dp4a.u32.u32 %r2948, %r2949, %r5766, %r2944; + shf.l.wrap.b32 %r12056, %r12062, %r12061, %r12063; // end inline asm - ld.const.u32 %r2953, [matrix+1240]; // begin inline asm - dp4a.u32.u32 %r2952, %r2953, %r5770, %r2948; + shf.l.wrap.b32 %r12060, %r12061, %r12062, %r12063; // end inline asm - ld.const.u32 %r2957, [matrix+1244]; + mov.u32 %r12071, 10; // begin inline asm - dp4a.u32.u32 %r2956, %r2957, %r5774, %r2952; + shf.l.wrap.b32 %r12064, %r12070, %r12069, %r12071; // end inline asm - ld.const.u32 %r2961, [matrix+1248]; // begin inline asm - dp4a.u32.u32 %r2960, %r2961, %r5778, %r2956; + shf.l.wrap.b32 %r12068, %r12069, %r12070, %r12071; // end inline asm - ld.const.u32 %r2965, [matrix+1252]; + mov.u32 %r12079, 6; // begin inline asm - dp4a.u32.u32 %r2964, %r2965, %r5782, %r2960; + shf.l.wrap.b32 %r12072, 
%r12078, %r12077, %r12079; // end inline asm - ld.const.u32 %r2969, [matrix+1256]; // begin inline asm - dp4a.u32.u32 %r2968, %r2969, %r5786, %r2964; + shf.l.wrap.b32 %r12076, %r12077, %r12078, %r12079; // end inline asm - ld.const.u32 %r2973, [matrix+1260]; + mov.u32 %r12087, 3; // begin inline asm - dp4a.u32.u32 %r2972, %r2973, %r5790, %r2968; + shf.l.wrap.b32 %r12080, %r12086, %r12085, %r12087; // end inline asm - ld.const.u32 %r2977, [matrix+1264]; // begin inline asm - dp4a.u32.u32 %r2976, %r2977, %r5794, %r2972; + shf.l.wrap.b32 %r12084, %r12085, %r12086, %r12087; // end inline asm - ld.const.u32 %r2981, [matrix+1268]; // begin inline asm - dp4a.u32.u32 %r2980, %r2981, %r5798, %r2976; + shf.l.wrap.b32 %r12088, %r12094, %r12093, %r11777; // end inline asm - ld.const.u32 %r2985, [matrix+1272]; // begin inline asm - dp4a.u32.u32 %r2984, %r2985, %r5802, %r2980; + shf.l.wrap.b32 %r12092, %r12093, %r12094, %r11777; // end inline asm - ld.const.u32 %r2989, [matrix+1276]; // begin inline asm - dp4a.u32.u32 %r2988, %r2989, %r5806, %r2984; + // chi + lop3.b32 %r12096, %r12131, %r11904, %r11952, 0xD2; + lop3.b32 %r12097, %r12134, %r11908, %r11956, 0xD2; // end inline asm - shr.u32 %r6005, %r2924, 6; - and.b32 %r6006, %r6005, 240; - shr.u32 %r6007, %r2988, 10; - or.b32 %r6008, %r6007, %r6006; - xor.b32 %r6009, %r5848, %r6008; - ld.const.u32 %r2993, [matrix+1280]; // begin inline asm - dp4a.u32.u32 %r2992, %r2993, %r5746, %r6244; + // chi + lop3.b32 %r29959, %r11904, %r11952, %r12048, 0xD2; + lop3.b32 %r29960, %r11908, %r11956, %r12052, 0xD2; // end inline asm - ld.const.u32 %r2997, [matrix+1284]; // begin inline asm - dp4a.u32.u32 %r2996, %r2997, %r5750, %r2992; + // chi + lop3.b32 %r29955, %r11952, %r12048, %r12000, 0xD2; + lop3.b32 %r29956, %r11956, %r12052, %r12004, 0xD2; // end inline asm - ld.const.u32 %r3001, [matrix+1288]; // begin inline asm - dp4a.u32.u32 %r3000, %r3001, %r5754, %r2996; + // chi + lop3.b32 %r29951, %r12048, %r12000, %r12131, 0xD2; + lop3.b32 %r29952, %r12052, %r12004, %r12134, 0xD2; // end inline asm - ld.const.u32 %r3005, [matrix+1292]; // begin inline asm - dp4a.u32.u32 %r3004, %r3005, %r5758, %r3000; + // chi + lop3.b32 %r29949, %r12000, %r12131, %r11904, 0xD2; + lop3.b32 %r29950, %r12004, %r12134, %r11908, 0xD2; // end inline asm - ld.const.u32 %r3009, [matrix+1296]; // begin inline asm - dp4a.u32.u32 %r3008, %r3009, %r5762, %r3004; + // chi + lop3.b32 %r29945, %r12040, %r11912, %r12080, 0xD2; + lop3.b32 %r29946, %r12044, %r11916, %r12084, 0xD2; // end inline asm - ld.const.u32 %r3013, [matrix+1300]; // begin inline asm - dp4a.u32.u32 %r3012, %r3013, %r5766, %r3008; + // chi + lop3.b32 %r29957, %r11912, %r12080, %r12024, 0xD2; + lop3.b32 %r29958, %r11916, %r12084, %r12028, 0xD2; // end inline asm - ld.const.u32 %r3017, [matrix+1304]; // begin inline asm - dp4a.u32.u32 %r3016, %r3017, %r5770, %r3012; + // chi + lop3.b32 %r29953, %r12080, %r12024, %r11920, 0xD2; + lop3.b32 %r29954, %r12084, %r12028, %r11924, 0xD2; // end inline asm - ld.const.u32 %r3021, [matrix+1308]; // begin inline asm - dp4a.u32.u32 %r3020, %r3021, %r5774, %r3016; + // chi + lop3.b32 %r29925, %r12024, %r11920, %r12040, 0xD2; + lop3.b32 %r29926, %r12028, %r11924, %r12044, 0xD2; // end inline asm - ld.const.u32 %r3025, [matrix+1312]; + st.local.v2.u32 [%rd3+88], {%r29925, %r29926}; // begin inline asm - dp4a.u32.u32 %r3024, %r3025, %r5778, %r3020; + // chi + lop3.b32 %r29917, %r11920, %r12040, %r11912, 0xD2; + lop3.b32 %r29918, %r11924, %r12044, %r11916, 0xD2; // end inline asm - ld.const.u32 
%r3029, [matrix+1316]; + st.local.v2.u32 [%rd3+96], {%r29917, %r29918}; // begin inline asm - dp4a.u32.u32 %r3028, %r3029, %r5782, %r3024; + // chi + lop3.b32 %r29943, %r12088, %r12072, %r11960, 0xD2; + lop3.b32 %r29944, %r12092, %r12076, %r11964, 0xD2; // end inline asm - ld.const.u32 %r3033, [matrix+1320]; + st.local.v2.u32 [%rd3+104], {%r29943, %r29944}; // begin inline asm - dp4a.u32.u32 %r3032, %r3033, %r5786, %r3028; + // chi + lop3.b32 %r29937, %r12072, %r11960, %r11968, 0xD2; + lop3.b32 %r29938, %r12076, %r11964, %r11972, 0xD2; // end inline asm - ld.const.u32 %r3037, [matrix+1324]; + st.local.v2.u32 [%rd3+112], {%r29937, %r29938}; // begin inline asm - dp4a.u32.u32 %r3036, %r3037, %r5790, %r3032; + // chi + lop3.b32 %r29931, %r11960, %r11968, %r11936, 0xD2; + lop3.b32 %r29932, %r11964, %r11972, %r11940, 0xD2; // end inline asm - ld.const.u32 %r3041, [matrix+1328]; + st.local.v2.u32 [%rd3+120], {%r29931, %r29932}; // begin inline asm - dp4a.u32.u32 %r3040, %r3041, %r5794, %r3036; + // chi + lop3.b32 %r29923, %r11968, %r11936, %r12088, 0xD2; + lop3.b32 %r29924, %r11972, %r11940, %r12092, 0xD2; // end inline asm - ld.const.u32 %r3045, [matrix+1332]; + st.local.v2.u32 [%rd3+128], {%r29923, %r29924}; // begin inline asm - dp4a.u32.u32 %r3044, %r3045, %r5798, %r3040; + // chi + lop3.b32 %r29915, %r11936, %r12088, %r12072, 0xD2; + lop3.b32 %r29916, %r11940, %r12092, %r12076, 0xD2; // end inline asm - ld.const.u32 %r3049, [matrix+1336]; + st.local.v2.u32 [%rd3+136], {%r29915, %r29916}; // begin inline asm - dp4a.u32.u32 %r3048, %r3049, %r5802, %r3044; + // chi + lop3.b32 %r29941, %r11992, %r12032, %r12064, 0xD2; + lop3.b32 %r29942, %r11996, %r12036, %r12068, 0xD2; // end inline asm - ld.const.u32 %r3053, [matrix+1340]; + st.local.v2.u32 [%rd3+144], {%r29941, %r29942}; // begin inline asm - dp4a.u32.u32 %r3052, %r3053, %r5806, %r3048; + // chi + lop3.b32 %r29935, %r12032, %r12064, %r12056, 0xD2; + lop3.b32 %r29936, %r12036, %r12068, %r12060, 0xD2; // end inline asm - ld.const.u32 %r3057, [matrix+1344]; + st.local.v2.u32 [%rd3+152], {%r29935, %r29936}; // begin inline asm - dp4a.u32.u32 %r3056, %r3057, %r5746, %r6244; + // chi + lop3.b32 %r29929, %r12064, %r12056, %r11976, 0xD2; + lop3.b32 %r29930, %r12068, %r12060, %r11980, 0xD2; // end inline asm - ld.const.u32 %r3061, [matrix+1348]; + st.local.v2.u32 [%rd3+160], {%r29929, %r29930}; // begin inline asm - dp4a.u32.u32 %r3060, %r3061, %r5750, %r3056; + // chi + lop3.b32 %r29921, %r12056, %r11976, %r11992, 0xD2; + lop3.b32 %r29922, %r12060, %r11980, %r11996, 0xD2; // end inline asm - ld.const.u32 %r3065, [matrix+1352]; + st.local.v2.u32 [%rd3+168], {%r29921, %r29922}; // begin inline asm - dp4a.u32.u32 %r3064, %r3065, %r5754, %r3060; + // chi + lop3.b32 %r29913, %r11976, %r11992, %r12032, 0xD2; + lop3.b32 %r29914, %r11980, %r11996, %r12036, 0xD2; // end inline asm - ld.const.u32 %r3069, [matrix+1356]; + st.local.v2.u32 [%rd3+176], {%r29913, %r29914}; // begin inline asm - dp4a.u32.u32 %r3068, %r3069, %r5758, %r3064; + // chi + lop3.b32 %r29939, %r11944, %r12016, %r11928, 0xD2; + lop3.b32 %r29940, %r11948, %r12020, %r11932, 0xD2; // end inline asm - ld.const.u32 %r3073, [matrix+1360]; + st.local.v2.u32 [%rd3+184], {%r29939, %r29940}; // begin inline asm - dp4a.u32.u32 %r3072, %r3073, %r5762, %r3068; + // chi + lop3.b32 %r29933, %r12016, %r11928, %r11984, 0xD2; + lop3.b32 %r29934, %r12020, %r11932, %r11988, 0xD2; // end inline asm - ld.const.u32 %r3077, [matrix+1364]; + st.local.v2.u32 [%rd3+192], {%r29933, %r29934}; // begin inline asm - 
dp4a.u32.u32 %r3076, %r3077, %r5766, %r3072; + // chi + lop3.b32 %r29927, %r11928, %r11984, %r12008, 0xD2; + lop3.b32 %r29928, %r11932, %r11988, %r12012, 0xD2; // end inline asm - ld.const.u32 %r3081, [matrix+1368]; + st.local.v2.u32 [%rd3+200], {%r29927, %r29928}; // begin inline asm - dp4a.u32.u32 %r3080, %r3081, %r5770, %r3076; + // chi + lop3.b32 %r29919, %r11984, %r12008, %r11944, 0xD2; + lop3.b32 %r29920, %r11988, %r12012, %r11948, 0xD2; // end inline asm - ld.const.u32 %r3085, [matrix+1372]; + st.local.v2.u32 [%rd3+208], {%r29919, %r29920}; // begin inline asm - dp4a.u32.u32 %r3084, %r3085, %r5774, %r3080; + // chi + lop3.b32 %r29911, %r12008, %r11944, %r12016, 0xD2; + lop3.b32 %r29912, %r12012, %r11948, %r12020, 0xD2; // end inline asm - ld.const.u32 %r3089, [matrix+1376]; + st.local.v2.u32 [%rd3+216], {%r29911, %r29912}; + mul.wide.s32 %rd665, %r29961, 8; + add.s64 %rd664, %rd597, %rd665; // begin inline asm - dp4a.u32.u32 %r3088, %r3089, %r5778, %r3084; + ld.global.nc.v2.u32 {%r12296,%r12297}, [%rd664]; // end inline asm - ld.const.u32 %r3093, [matrix+1380]; + xor.b32 %r29947, %r12096, %r12296; + xor.b32 %r29948, %r12097, %r12297; + add.s32 %r29961, %r29961, 1; + setp.lt.u32 %p26, %r29961, 23; + @%p26 bra $L__BB2_40; + + st.local.v2.u32 [%rd3+32], {%r29959, %r29960}; + st.local.v2.u32 [%rd3+72], {%r29957, %r29958}; + st.local.v2.u32 [%rd3+40], {%r29955, %r29956}; + st.local.v2.u32 [%rd3+80], {%r29953, %r29954}; + st.local.v2.u32 [%rd3+48], {%r29951, %r29952}; + st.local.v2.u32 [%rd3+56], {%r29949, %r29950}; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; // begin inline asm - dp4a.u32.u32 %r3092, %r3093, %r5782, %r3088; + // xor5 + lop3.b32 %r12308, %r29947, %r29945, %r29943, 0x96; + lop3.b32 %r12308, %r12308, %r29941, %r29939, 0x96; + lop3.b32 %r12309, %r29948, %r29946, %r29944, 0x96; + lop3.b32 %r12309, %r12309, %r29942, %r29940, 0x96; // end inline asm - ld.const.u32 %r3097, [matrix+1384]; // begin inline asm - dp4a.u32.u32 %r3096, %r3097, %r5786, %r3092; + // xor5 + lop3.b32 %r12320, %r29959, %r29957, %r29937, 0x96; + lop3.b32 %r12320, %r12320, %r29935, %r29933, 0x96; + lop3.b32 %r12321, %r29960, %r29958, %r29938, 0x96; + lop3.b32 %r12321, %r12321, %r29936, %r29934, 0x96; // end inline asm - ld.const.u32 %r3101, [matrix+1388]; // begin inline asm - dp4a.u32.u32 %r3100, %r3101, %r5790, %r3096; + // xor5 + lop3.b32 %r12332, %r29955, %r29953, %r29931, 0x96; + lop3.b32 %r12332, %r12332, %r29929, %r29927, 0x96; + lop3.b32 %r12333, %r29956, %r29954, %r29932, 0x96; + lop3.b32 %r12333, %r12333, %r29930, %r29928, 0x96; // end inline asm - ld.const.u32 %r3105, [matrix+1392]; // begin inline asm - dp4a.u32.u32 %r3104, %r3105, %r5794, %r3100; + // xor5 + lop3.b32 %r12344, %r29951, %r29925, %r29923, 0x96; + lop3.b32 %r12344, %r12344, %r29921, %r29919, 0x96; + lop3.b32 %r12345, %r29952, %r29926, %r29924, 0x96; + lop3.b32 %r12345, %r12345, %r29922, %r29920, 0x96; // end inline asm - ld.const.u32 %r3109, [matrix+1396]; // begin inline asm - dp4a.u32.u32 %r3108, %r3109, %r5798, %r3104; + // xor5 + lop3.b32 %r12356, %r29949, %r29917, %r29915, 0x96; + lop3.b32 %r12356, %r12356, %r29913, %r29911, 0x96; + lop3.b32 %r12357, %r29950, %r29918, %r29916, 0x96; + lop3.b32 %r12357, %r12357, %r29914, %r29912, 0x96; // end inline asm - ld.const.u32 %r3113, [matrix+1400]; + mov.u32 %r12560, 1; // begin inline asm - dp4a.u32.u32 %r3112, %r3113, %r5802, %r3108; + shf.l.wrap.b32 %r12368, %r12321, %r12320, %r12560; // end inline asm - ld.const.u32 %r3117, [matrix+1404]; // begin inline asm - dp4a.u32.u32 
%r3116, %r3117, %r5806, %r3112; + shf.l.wrap.b32 %r12372, %r12320, %r12321, %r12560; // end inline asm - shr.u32 %r6010, %r3052, 6; - and.b32 %r6011, %r6010, 240; - shr.u32 %r6012, %r3116, 10; - or.b32 %r6013, %r6012, %r6011; - xor.b32 %r6014, %r5860, %r6013; - ld.const.u32 %r3121, [matrix+1408]; + xor.b32 %r12587, %r12368, %r12356; + xor.b32 %r12588, %r12372, %r12357; + xor.b32 %r12515, %r29947, %r12587; + xor.b32 %r12518, %r29948, %r12588; + xor.b32 %r12478, %r29944, %r12588; + xor.b32 %r12477, %r29943, %r12587; + st.local.v2.u32 [%rd3+104], {%r12477, %r12478}; // begin inline asm - dp4a.u32.u32 %r3120, %r3121, %r5746, %r6244; + shf.l.wrap.b32 %r12376, %r12333, %r12332, %r12560; // end inline asm - ld.const.u32 %r3125, [matrix+1412]; // begin inline asm - dp4a.u32.u32 %r3124, %r3125, %r5750, %r3120; + shf.l.wrap.b32 %r12380, %r12332, %r12333, %r12560; // end inline asm - ld.const.u32 %r3129, [matrix+1416]; + xor.b32 %r12589, %r12376, %r12308; + xor.b32 %r12590, %r12380, %r12309; + xor.b32 %r12414, %r29957, %r12589; + xor.b32 %r12413, %r29958, %r12590; + xor.b32 %r12453, %r29936, %r12590; + xor.b32 %r12454, %r29935, %r12589; + st.local.v2.u32 [%rd3+152], {%r12454, %r12453}; // begin inline asm - dp4a.u32.u32 %r3128, %r3129, %r5754, %r3124; + shf.l.wrap.b32 %r12384, %r12345, %r12344, %r12560; // end inline asm - ld.const.u32 %r3133, [matrix+1420]; // begin inline asm - dp4a.u32.u32 %r3132, %r3133, %r5758, %r3128; + shf.l.wrap.b32 %r12388, %r12344, %r12345, %r12560; // end inline asm - ld.const.u32 %r3137, [matrix+1424]; + xor.b32 %r12591, %r12384, %r12320; + xor.b32 %r12592, %r12388, %r12321; + xor.b32 %r12437, %r29932, %r12592; + xor.b32 %r12438, %r29931, %r12591; + st.local.v2.u32 [%rd3+120], {%r12438, %r12437}; + xor.b32 %r12429, %r29928, %r12592; + xor.b32 %r12430, %r29927, %r12591; + st.local.v2.u32 [%rd3+200], {%r12430, %r12429}; // begin inline asm - dp4a.u32.u32 %r3136, %r3137, %r5762, %r3132; + shf.l.wrap.b32 %r12392, %r12357, %r12356, %r12560; // end inline asm - ld.const.u32 %r3141, [matrix+1428]; // begin inline asm - dp4a.u32.u32 %r3140, %r3141, %r5766, %r3136; + shf.l.wrap.b32 %r12396, %r12356, %r12357, %r12560; // end inline asm - ld.const.u32 %r3145, [matrix+1432]; + xor.b32 %r12593, %r12392, %r12332; + xor.b32 %r12594, %r12396, %r12333; + xor.b32 %r12461, %r29951, %r12593; + xor.b32 %r12462, %r29952, %r12594; + xor.b32 %r12470, %r29922, %r12594; + xor.b32 %r12469, %r29921, %r12593; + st.local.v2.u32 [%rd3+168], {%r12469, %r12470}; // begin inline asm - dp4a.u32.u32 %r3144, %r3145, %r5770, %r3140; + shf.l.wrap.b32 %r12400, %r12309, %r12308, %r12560; // end inline asm - ld.const.u32 %r3149, [matrix+1436]; // begin inline asm - dp4a.u32.u32 %r3148, %r3149, %r5774, %r3144; + shf.l.wrap.b32 %r12404, %r12308, %r12309, %r12560; // end inline asm - ld.const.u32 %r3153, [matrix+1440]; + xor.b32 %r12595, %r12400, %r12344; + xor.b32 %r12596, %r12404, %r12345; + xor.b32 %r12421, %r29917, %r12595; + xor.b32 %r12422, %r29918, %r12596; + xor.b32 %r12446, %r29912, %r12596; + xor.b32 %r12445, %r29911, %r12595; + st.local.v2.u32 [%rd3+216], {%r12445, %r12446}; // begin inline asm - dp4a.u32.u32 %r3152, %r3153, %r5778, %r3148; + shf.l.wrap.b32 %r12408, %r12414, %r12413, %r11911; // end inline asm - ld.const.u32 %r3157, [matrix+1444]; // begin inline asm - dp4a.u32.u32 %r3156, %r3157, %r5782, %r3152; + shf.l.wrap.b32 %r12412, %r12413, %r12414, %r11911; // end inline asm - ld.const.u32 %r3161, [matrix+1448]; // begin inline asm - dp4a.u32.u32 %r3160, %r3161, %r5786, %r3156; + shf.l.wrap.b32 
%r12416, %r12422, %r12421, %r11919; // end inline asm - ld.const.u32 %r3165, [matrix+1452]; // begin inline asm - dp4a.u32.u32 %r3164, %r3165, %r5790, %r3160; + shf.l.wrap.b32 %r12420, %r12421, %r12422, %r11919; // end inline asm - ld.const.u32 %r3169, [matrix+1456]; // begin inline asm - dp4a.u32.u32 %r3168, %r3169, %r5794, %r3164; + shf.l.wrap.b32 %r12428, %r12429, %r12430, %r11927; // end inline asm - ld.const.u32 %r3173, [matrix+1460]; // begin inline asm - dp4a.u32.u32 %r3172, %r3173, %r5798, %r3168; + shf.l.wrap.b32 %r12424, %r12430, %r12429, %r11927; // end inline asm - ld.const.u32 %r3177, [matrix+1464]; + st.local.v2.u32 [%rd3+96], {%r12424, %r12428}; // begin inline asm - dp4a.u32.u32 %r3176, %r3177, %r5802, %r3172; + shf.l.wrap.b32 %r12432, %r12438, %r12437, %r11959; // end inline asm - ld.const.u32 %r3181, [matrix+1468]; // begin inline asm - dp4a.u32.u32 %r3180, %r3181, %r5806, %r3176; + shf.l.wrap.b32 %r12436, %r12437, %r12438, %r11959; // end inline asm - ld.const.u32 %r3185, [matrix+1472]; // begin inline asm - dp4a.u32.u32 %r3184, %r3185, %r5746, %r6244; + shf.l.wrap.b32 %r12440, %r12446, %r12445, %r12007; // end inline asm - ld.const.u32 %r3189, [matrix+1476]; // begin inline asm - dp4a.u32.u32 %r3188, %r3189, %r5750, %r3184; + shf.l.wrap.b32 %r12444, %r12445, %r12446, %r12007; // end inline asm - ld.const.u32 %r3193, [matrix+1480]; // begin inline asm - dp4a.u32.u32 %r3192, %r3193, %r5754, %r3188; + shf.l.wrap.b32 %r12452, %r12453, %r12454, %r12031; // end inline asm - ld.const.u32 %r3197, [matrix+1484]; // begin inline asm - dp4a.u32.u32 %r3196, %r3197, %r5758, %r3192; + shf.l.wrap.b32 %r12448, %r12454, %r12453, %r12031; // end inline asm - ld.const.u32 %r3201, [matrix+1488]; + st.local.v2.u32 [%rd3+88], {%r12448, %r12452}; // begin inline asm - dp4a.u32.u32 %r3200, %r3201, %r5762, %r3196; + shf.l.wrap.b32 %r12456, %r12462, %r12461, %r12047; // end inline asm - ld.const.u32 %r3205, [matrix+1492]; // begin inline asm - dp4a.u32.u32 %r3204, %r3205, %r5766, %r3200; + shf.l.wrap.b32 %r12460, %r12461, %r12462, %r12047; // end inline asm - ld.const.u32 %r3209, [matrix+1496]; // begin inline asm - dp4a.u32.u32 %r3208, %r3209, %r5770, %r3204; + shf.l.wrap.b32 %r12464, %r12470, %r12469, %r12055; // end inline asm - ld.const.u32 %r3213, [matrix+1500]; // begin inline asm - dp4a.u32.u32 %r3212, %r3213, %r5774, %r3208; + shf.l.wrap.b32 %r12468, %r12469, %r12470, %r12055; // end inline asm - ld.const.u32 %r3217, [matrix+1504]; // begin inline asm - dp4a.u32.u32 %r3216, %r3217, %r5778, %r3212; + shf.l.wrap.b32 %r12472, %r12478, %r12477, %r12087; // end inline asm - ld.const.u32 %r3221, [matrix+1508]; // begin inline asm - dp4a.u32.u32 %r3220, %r3221, %r5782, %r3216; + shf.l.wrap.b32 %r12476, %r12477, %r12478, %r12087; // end inline asm - ld.const.u32 %r3225, [matrix+1512]; // begin inline asm - dp4a.u32.u32 %r3224, %r3225, %r5786, %r3220; + // chi + lop3.b32 %r12480, %r12515, %r12408, %r12432, 0xD2; + lop3.b32 %r12481, %r12518, %r12412, %r12436, 0xD2; // end inline asm - ld.const.u32 %r3229, [matrix+1516]; // begin inline asm - dp4a.u32.u32 %r3228, %r3229, %r5790, %r3224; + // chi + lop3.b32 %r12488, %r12408, %r12432, %r12464, 0xD2; + lop3.b32 %r12489, %r12412, %r12436, %r12468, 0xD2; // end inline asm - ld.const.u32 %r3233, [matrix+1520]; + st.local.v2.u32 [%rd3+32], {%r12488, %r12489}; // begin inline asm - dp4a.u32.u32 %r3232, %r3233, %r5794, %r3228; + // chi + lop3.b32 %r12496, %r12432, %r12464, %r12440, 0xD2; + lop3.b32 %r12497, %r12436, %r12468, %r12444, 0xD2; // end inline asm 
- ld.const.u32 %r3237, [matrix+1524]; + st.local.v2.u32 [%rd3+40], {%r12496, %r12497}; // begin inline asm - dp4a.u32.u32 %r3236, %r3237, %r5798, %r3232; + // chi + lop3.b32 %r12504, %r12464, %r12440, %r12515, 0xD2; + lop3.b32 %r12505, %r12468, %r12444, %r12518, 0xD2; // end inline asm - ld.const.u32 %r3241, [matrix+1528]; + st.local.v2.u32 [%rd3+48], {%r12504, %r12505}; // begin inline asm - dp4a.u32.u32 %r3240, %r3241, %r5802, %r3236; + // chi + lop3.b32 %r12512, %r12440, %r12515, %r12408, 0xD2; + lop3.b32 %r12513, %r12444, %r12518, %r12412, 0xD2; // end inline asm - ld.const.u32 %r3245, [matrix+1532]; + st.local.v2.u32 [%rd3+56], {%r12512, %r12513}; // begin inline asm - dp4a.u32.u32 %r3244, %r3245, %r5806, %r3240; + // chi + lop3.b32 %r12520, %r12456, %r12416, %r12472, 0xD2; + lop3.b32 %r12521, %r12460, %r12420, %r12476, 0xD2; // end inline asm - shr.u32 %r6015, %r3180, 6; - and.b32 %r6016, %r6015, 240; - shr.u32 %r6017, %r3244, 10; - or.b32 %r6018, %r6017, %r6016; - xor.b32 %r6019, %r5862, %r6018; - ld.const.u32 %r3249, [matrix+1536]; + st.local.v2.u32 [%rd3+64], {%r12520, %r12521}; // begin inline asm - dp4a.u32.u32 %r3248, %r3249, %r5746, %r6244; + // chi + lop3.b32 %r12528, %r12416, %r12472, %r12448, 0xD2; + lop3.b32 %r12529, %r12420, %r12476, %r12452, 0xD2; // end inline asm - ld.const.u32 %r3253, [matrix+1540]; + st.local.v2.u32 [%rd3+72], {%r12528, %r12529}; // begin inline asm - dp4a.u32.u32 %r3252, %r3253, %r5750, %r3248; + // chi + lop3.b32 %r12536, %r12472, %r12448, %r12424, 0xD2; + lop3.b32 %r12537, %r12476, %r12452, %r12428, 0xD2; // end inline asm - ld.const.u32 %r3257, [matrix+1544]; + st.local.v2.u32 [%rd3+80], {%r12536, %r12537}; // begin inline asm - dp4a.u32.u32 %r3256, %r3257, %r5754, %r3252; + ld.global.nc.v2.u32 {%r12544,%r12545}, [%rd598]; + // end inline asm + xor.b32 %r12597, %r12481, %r12545; + xor.b32 %r12598, %r12480, %r12544; + mov.b64 %rd1261, {%r12598, %r12597}; + mov.b64 %rd1262, {%r12488, %r12489}; + mov.b64 %rd1263, {%r12496, %r12497}; + mov.b64 %rd1264, {%r12512, %r12513}; + mov.u32 %r29962, 0; + st.local.v2.u32 [%rd3+24], {%r12598, %r12597}; + st.local.v2.u32 [%rd84+96], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+104], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+112], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+120], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+128], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+136], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+144], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+152], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+160], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+168], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+176], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+184], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+192], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+200], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+208], {%r29962, %r29962}; + st.local.v2.u32 [%rd84+216], {%r29962, %r29962}; + mov.u32 %r29977, -2147483648; + st.local.v2.u32 [%rd84+88], {%r12560, %r29977}; + mov.u32 %r29963, %r29962; + mov.u32 %r29964, %r29962; + mov.u32 %r29965, %r29962; + mov.u32 %r29966, %r29962; + mov.u32 %r29967, %r29962; + mov.u32 %r29968, %r29962; + mov.u32 %r29969, %r29962; + mov.u32 %r29970, %r29962; + mov.u32 %r29971, %r29962; + mov.u32 %r29972, %r29962; + mov.u32 %r29973, %r29962; + mov.u32 %r29974, %r29962; + mov.u32 %r29975, %r29962; + mov.u32 %r29976, %r12560; + mov.u32 %r29978, %r29962; + mov.u32 %r29979, %r29962; + mov.u32 %r29980, %r29962; + mov.u32 %r29981, %r29962; + mov.u32 %r29982, %r29962; + 
mov.u32 %r29983, %r29962; + mov.u32 %r29984, %r29962; + mov.u32 %r29985, %r29962; + mov.u32 %r29986, %r29962; + mov.u32 %r29987, %r29962; + mov.u32 %r29988, %r29962; + mov.u32 %r29989, %r29962; + mov.u32 %r29990, %r29962; + mov.u32 %r29991, %r29962; + mov.u32 %r29992, %r29962; + mov.u32 %r29993, %r29962; + mov.u32 %r29994, %r29962; + mov.u32 %r29995, %r29962; + mov.u32 %r30012, %r29962; + +$L__BB2_42: + // begin inline asm + // xor5 + lop3.b32 %r12599, %r29998, %r29996, %r29994, 0x96; + lop3.b32 %r12599, %r12599, %r29992, %r29990, 0x96; + lop3.b32 %r12600, %r29999, %r29997, %r29995, 0x96; + lop3.b32 %r12600, %r12600, %r29993, %r29991, 0x96; // end inline asm - ld.const.u32 %r3261, [matrix+1548]; // begin inline asm - dp4a.u32.u32 %r3260, %r3261, %r5758, %r3256; + // xor5 + lop3.b32 %r12611, %r30010, %r30008, %r29988, 0x96; + lop3.b32 %r12611, %r12611, %r29986, %r29984, 0x96; + lop3.b32 %r12612, %r30011, %r30009, %r29989, 0x96; + lop3.b32 %r12612, %r12612, %r29987, %r29985, 0x96; // end inline asm - ld.const.u32 %r3265, [matrix+1552]; // begin inline asm - dp4a.u32.u32 %r3264, %r3265, %r5762, %r3260; + // xor5 + lop3.b32 %r12623, %r30006, %r30004, %r29982, 0x96; + lop3.b32 %r12623, %r12623, %r29980, %r29978, 0x96; + lop3.b32 %r12624, %r30007, %r30005, %r29983, 0x96; + lop3.b32 %r12624, %r12624, %r29981, %r29979, 0x96; // end inline asm - ld.const.u32 %r3269, [matrix+1556]; // begin inline asm - dp4a.u32.u32 %r3268, %r3269, %r5766, %r3264; + // xor5 + lop3.b32 %r12635, %r30002, %r29976, %r29974, 0x96; + lop3.b32 %r12635, %r12635, %r29972, %r29970, 0x96; + lop3.b32 %r12636, %r30003, %r29977, %r29975, 0x96; + lop3.b32 %r12636, %r12636, %r29973, %r29971, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r12647, %r30000, %r29968, %r29966, 0x96; + lop3.b32 %r12647, %r12647, %r29964, %r29962, 0x96; + lop3.b32 %r12648, %r30001, %r29969, %r29967, 0x96; + lop3.b32 %r12648, %r12648, %r29965, %r29963, 0x96; // end inline asm - ld.const.u32 %r3273, [matrix+1560]; // begin inline asm - dp4a.u32.u32 %r3272, %r3273, %r5770, %r3268; + shf.l.wrap.b32 %r12659, %r12612, %r12611, %r12560; // end inline asm - ld.const.u32 %r3277, [matrix+1564]; // begin inline asm - dp4a.u32.u32 %r3276, %r3277, %r5774, %r3272; + shf.l.wrap.b32 %r12663, %r12611, %r12612, %r12560; // end inline asm - ld.const.u32 %r3281, [matrix+1568]; + xor.b32 %r13093, %r12659, %r12647; + xor.b32 %r13094, %r12663, %r12648; + xor.b32 %r12926, %r29998, %r13093; + xor.b32 %r12929, %r29999, %r13094; + xor.b32 %r12833, %r29996, %r13093; + xor.b32 %r12832, %r29997, %r13094; + xor.b32 %r12880, %r29994, %r13093; + xor.b32 %r12881, %r29995, %r13094; + xor.b32 %r12785, %r29992, %r13093; + xor.b32 %r12784, %r29993, %r13094; + xor.b32 %r12736, %r29990, %r13093; + xor.b32 %r12737, %r29991, %r13094; // begin inline asm - dp4a.u32.u32 %r3280, %r3281, %r5778, %r3276; + shf.l.wrap.b32 %r12667, %r12624, %r12623, %r12560; // end inline asm - ld.const.u32 %r3285, [matrix+1572]; // begin inline asm - dp4a.u32.u32 %r3284, %r3285, %r5782, %r3280; + shf.l.wrap.b32 %r12671, %r12623, %r12624, %r12560; // end inline asm - ld.const.u32 %r3289, [matrix+1576]; + xor.b32 %r13095, %r12667, %r12599; + xor.b32 %r13096, %r12671, %r12600; + xor.b32 %r12888, %r30010, %r13095; + xor.b32 %r12889, %r30011, %r13096; + xor.b32 %r12705, %r30008, %r13095; + xor.b32 %r12704, %r30009, %r13096; + xor.b32 %r12864, %r29988, %r13095; + xor.b32 %r12865, %r29989, %r13096; + xor.b32 %r12825, %r29986, %r13095; + xor.b32 %r12824, %r29987, %r13096; + xor.b32 %r12808, %r29984, 
%r13095; + xor.b32 %r12809, %r29985, %r13096; // begin inline asm - dp4a.u32.u32 %r3288, %r3289, %r5786, %r3284; + shf.l.wrap.b32 %r12675, %r12636, %r12635, %r12560; // end inline asm - ld.const.u32 %r3293, [matrix+1580]; // begin inline asm - dp4a.u32.u32 %r3292, %r3293, %r5790, %r3288; + shf.l.wrap.b32 %r12679, %r12635, %r12636, %r12560; // end inline asm - ld.const.u32 %r3297, [matrix+1584]; + xor.b32 %r13097, %r12675, %r12611; + xor.b32 %r13098, %r12679, %r12612; + xor.b32 %r12745, %r30006, %r13097; + xor.b32 %r12744, %r30007, %r13098; + xor.b32 %r12872, %r30004, %r13097; + xor.b32 %r12873, %r30005, %r13098; + xor.b32 %r12753, %r29982, %r13097; + xor.b32 %r12752, %r29983, %r13098; + xor.b32 %r12856, %r29980, %r13097; + xor.b32 %r12857, %r29981, %r13098; + xor.b32 %r12721, %r29978, %r13097; + xor.b32 %r12720, %r29979, %r13098; // begin inline asm - dp4a.u32.u32 %r3296, %r3297, %r5794, %r3292; + shf.l.wrap.b32 %r12683, %r12648, %r12647, %r12560; // end inline asm - ld.const.u32 %r3301, [matrix+1588]; // begin inline asm - dp4a.u32.u32 %r3300, %r3301, %r5798, %r3296; + shf.l.wrap.b32 %r12687, %r12647, %r12648, %r12560; // end inline asm - ld.const.u32 %r3305, [matrix+1592]; + xor.b32 %r13099, %r12683, %r12623; + xor.b32 %r13100, %r12687, %r12624; + xor.b32 %r12840, %r30002, %r13099; + xor.b32 %r12841, %r30003, %r13100; + xor.b32 %r12817, %r29976, %r13099; + xor.b32 %r12816, %r29977, %r13100; + xor.b32 %r12760, %r29974, %r13099; + xor.b32 %r12761, %r29975, %r13100; + xor.b32 %r12848, %r29972, %r13099; + xor.b32 %r12849, %r29973, %r13100; + xor.b32 %r12777, %r29970, %r13099; + xor.b32 %r12776, %r29971, %r13100; // begin inline asm - dp4a.u32.u32 %r3304, %r3305, %r5802, %r3300; + shf.l.wrap.b32 %r12691, %r12600, %r12599, %r12560; // end inline asm - ld.const.u32 %r3309, [matrix+1596]; // begin inline asm - dp4a.u32.u32 %r3308, %r3309, %r5806, %r3304; + shf.l.wrap.b32 %r12695, %r12599, %r12600, %r12560; // end inline asm - ld.const.u32 %r3313, [matrix+1600]; + xor.b32 %r13101, %r12691, %r12635; + xor.b32 %r13102, %r12695, %r12636; + xor.b32 %r12792, %r30000, %r13101; + xor.b32 %r12793, %r30001, %r13102; + xor.b32 %r12712, %r29968, %r13101; + xor.b32 %r12713, %r29969, %r13102; + xor.b32 %r12729, %r29966, %r13101; + xor.b32 %r12728, %r29967, %r13102; + xor.b32 %r12768, %r29964, %r13101; + xor.b32 %r12769, %r29965, %r13102; + xor.b32 %r12800, %r29962, %r13101; + xor.b32 %r12801, %r29963, %r13102; + mov.u32 %r12706, 44; // begin inline asm - dp4a.u32.u32 %r3312, %r3313, %r5746, %r6244; + shf.l.wrap.b32 %r12699, %r12705, %r12704, %r12706; // end inline asm - ld.const.u32 %r3317, [matrix+1604]; // begin inline asm - dp4a.u32.u32 %r3316, %r3317, %r5750, %r3312; + shf.l.wrap.b32 %r12703, %r12704, %r12705, %r12706; // end inline asm - ld.const.u32 %r3321, [matrix+1608]; + mov.u32 %r12714, 20; // begin inline asm - dp4a.u32.u32 %r3320, %r3321, %r5754, %r3316; + shf.l.wrap.b32 %r12707, %r12713, %r12712, %r12714; // end inline asm - ld.const.u32 %r3325, [matrix+1612]; // begin inline asm - dp4a.u32.u32 %r3324, %r3325, %r5758, %r3320; + shf.l.wrap.b32 %r12711, %r12712, %r12713, %r12714; // end inline asm - ld.const.u32 %r3329, [matrix+1616]; + mov.u32 %r12722, 61; // begin inline asm - dp4a.u32.u32 %r3328, %r3329, %r5762, %r3324; + shf.l.wrap.b32 %r12715, %r12721, %r12720, %r12722; // end inline asm - ld.const.u32 %r3333, [matrix+1620]; // begin inline asm - dp4a.u32.u32 %r3332, %r3333, %r5766, %r3328; + shf.l.wrap.b32 %r12719, %r12720, %r12721, %r12722; // end inline asm - ld.const.u32 %r3337, 
[matrix+1624]; + mov.u32 %r12730, 39; // begin inline asm - dp4a.u32.u32 %r3336, %r3337, %r5770, %r3332; + shf.l.wrap.b32 %r12723, %r12729, %r12728, %r12730; // end inline asm - ld.const.u32 %r3341, [matrix+1628]; // begin inline asm - dp4a.u32.u32 %r3340, %r3341, %r5774, %r3336; + shf.l.wrap.b32 %r12727, %r12728, %r12729, %r12730; // end inline asm - ld.const.u32 %r3345, [matrix+1632]; + mov.u32 %r12738, 18; // begin inline asm - dp4a.u32.u32 %r3344, %r3345, %r5778, %r3340; + shf.l.wrap.b32 %r12731, %r12737, %r12736, %r12738; // end inline asm - ld.const.u32 %r3349, [matrix+1636]; // begin inline asm - dp4a.u32.u32 %r3348, %r3349, %r5782, %r3344; + shf.l.wrap.b32 %r12735, %r12736, %r12737, %r12738; // end inline asm - ld.const.u32 %r3353, [matrix+1640]; + mov.u32 %r12746, 62; // begin inline asm - dp4a.u32.u32 %r3352, %r3353, %r5786, %r3348; + shf.l.wrap.b32 %r12739, %r12745, %r12744, %r12746; // end inline asm - ld.const.u32 %r3357, [matrix+1644]; // begin inline asm - dp4a.u32.u32 %r3356, %r3357, %r5790, %r3352; + shf.l.wrap.b32 %r12743, %r12744, %r12745, %r12746; // end inline asm - ld.const.u32 %r3361, [matrix+1648]; + mov.u32 %r12754, 43; // begin inline asm - dp4a.u32.u32 %r3360, %r3361, %r5794, %r3356; + shf.l.wrap.b32 %r12747, %r12753, %r12752, %r12754; // end inline asm - ld.const.u32 %r3365, [matrix+1652]; // begin inline asm - dp4a.u32.u32 %r3364, %r3365, %r5798, %r3360; + shf.l.wrap.b32 %r12751, %r12752, %r12753, %r12754; // end inline asm - ld.const.u32 %r3369, [matrix+1656]; + mov.u32 %r12762, 25; // begin inline asm - dp4a.u32.u32 %r3368, %r3369, %r5802, %r3364; + shf.l.wrap.b32 %r12755, %r12761, %r12760, %r12762; // end inline asm - ld.const.u32 %r3373, [matrix+1660]; // begin inline asm - dp4a.u32.u32 %r3372, %r3373, %r5806, %r3368; + shf.l.wrap.b32 %r12759, %r12760, %r12761, %r12762; // end inline asm - shr.u32 %r6020, %r3308, 6; - and.b32 %r6021, %r6020, 240; - shr.u32 %r6022, %r3372, 10; - or.b32 %r6023, %r6022, %r6021; - cvt.u64.u32 %rd217, %r6023; - xor.b64 %rd218, %rd13, %rd217; - ld.const.u32 %r3377, [matrix+1664]; + mov.u32 %r12770, 8; // begin inline asm - dp4a.u32.u32 %r3376, %r3377, %r5746, %r6244; + shf.l.wrap.b32 %r12763, %r12769, %r12768, %r12770; // end inline asm - ld.const.u32 %r3381, [matrix+1668]; // begin inline asm - dp4a.u32.u32 %r3380, %r3381, %r5750, %r3376; + shf.l.wrap.b32 %r12767, %r12768, %r12769, %r12770; // end inline asm - ld.const.u32 %r3385, [matrix+1672]; + mov.u32 %r12778, 56; // begin inline asm - dp4a.u32.u32 %r3384, %r3385, %r5754, %r3380; + shf.l.wrap.b32 %r12771, %r12777, %r12776, %r12778; // end inline asm - ld.const.u32 %r3389, [matrix+1676]; // begin inline asm - dp4a.u32.u32 %r3388, %r3389, %r5758, %r3384; + shf.l.wrap.b32 %r12775, %r12776, %r12777, %r12778; // end inline asm - ld.const.u32 %r3393, [matrix+1680]; + mov.u32 %r12786, 41; // begin inline asm - dp4a.u32.u32 %r3392, %r3393, %r5762, %r3388; + shf.l.wrap.b32 %r12779, %r12785, %r12784, %r12786; // end inline asm - ld.const.u32 %r3397, [matrix+1684]; // begin inline asm - dp4a.u32.u32 %r3396, %r3397, %r5766, %r3392; + shf.l.wrap.b32 %r12783, %r12784, %r12785, %r12786; // end inline asm - ld.const.u32 %r3401, [matrix+1688]; + mov.u32 %r12794, 27; // begin inline asm - dp4a.u32.u32 %r3400, %r3401, %r5770, %r3396; + shf.l.wrap.b32 %r12787, %r12793, %r12792, %r12794; // end inline asm - ld.const.u32 %r3405, [matrix+1692]; // begin inline asm - dp4a.u32.u32 %r3404, %r3405, %r5774, %r3400; + shf.l.wrap.b32 %r12791, %r12792, %r12793, %r12794; // end inline asm - ld.const.u32 
%r3409, [matrix+1696]; + mov.u32 %r12802, 14; // begin inline asm - dp4a.u32.u32 %r3408, %r3409, %r5778, %r3404; + shf.l.wrap.b32 %r12795, %r12801, %r12800, %r12802; // end inline asm - ld.const.u32 %r3413, [matrix+1700]; // begin inline asm - dp4a.u32.u32 %r3412, %r3413, %r5782, %r3408; + shf.l.wrap.b32 %r12799, %r12800, %r12801, %r12802; // end inline asm - ld.const.u32 %r3417, [matrix+1704]; + mov.u32 %r12810, 2; // begin inline asm - dp4a.u32.u32 %r3416, %r3417, %r5786, %r3412; + shf.l.wrap.b32 %r12803, %r12809, %r12808, %r12810; // end inline asm - ld.const.u32 %r3421, [matrix+1708]; // begin inline asm - dp4a.u32.u32 %r3420, %r3421, %r5790, %r3416; + shf.l.wrap.b32 %r12807, %r12808, %r12809, %r12810; // end inline asm - ld.const.u32 %r3425, [matrix+1712]; + mov.u32 %r12818, 55; // begin inline asm - dp4a.u32.u32 %r3424, %r3425, %r5794, %r3420; + shf.l.wrap.b32 %r12811, %r12817, %r12816, %r12818; // end inline asm - ld.const.u32 %r3429, [matrix+1716]; // begin inline asm - dp4a.u32.u32 %r3428, %r3429, %r5798, %r3424; + shf.l.wrap.b32 %r12815, %r12816, %r12817, %r12818; // end inline asm - ld.const.u32 %r3433, [matrix+1720]; + mov.u32 %r12826, 45; // begin inline asm - dp4a.u32.u32 %r3432, %r3433, %r5802, %r3428; + shf.l.wrap.b32 %r12819, %r12825, %r12824, %r12826; // end inline asm - ld.const.u32 %r3437, [matrix+1724]; // begin inline asm - dp4a.u32.u32 %r3436, %r3437, %r5806, %r3432; + shf.l.wrap.b32 %r12823, %r12824, %r12825, %r12826; // end inline asm - ld.const.u32 %r3441, [matrix+1728]; + mov.u32 %r12834, 36; // begin inline asm - dp4a.u32.u32 %r3440, %r3441, %r5746, %r6244; + shf.l.wrap.b32 %r12827, %r12833, %r12832, %r12834; // end inline asm - ld.const.u32 %r3445, [matrix+1732]; // begin inline asm - dp4a.u32.u32 %r3444, %r3445, %r5750, %r3440; + shf.l.wrap.b32 %r12831, %r12832, %r12833, %r12834; // end inline asm - ld.const.u32 %r3449, [matrix+1736]; + mov.u32 %r12842, 28; // begin inline asm - dp4a.u32.u32 %r3448, %r3449, %r5754, %r3444; + shf.l.wrap.b32 %r12835, %r12841, %r12840, %r12842; // end inline asm - ld.const.u32 %r3453, [matrix+1740]; // begin inline asm - dp4a.u32.u32 %r3452, %r3453, %r5758, %r3448; + shf.l.wrap.b32 %r12839, %r12840, %r12841, %r12842; // end inline asm - ld.const.u32 %r3457, [matrix+1744]; + mov.u32 %r12850, 21; // begin inline asm - dp4a.u32.u32 %r3456, %r3457, %r5762, %r3452; + shf.l.wrap.b32 %r12843, %r12849, %r12848, %r12850; // end inline asm - ld.const.u32 %r3461, [matrix+1748]; // begin inline asm - dp4a.u32.u32 %r3460, %r3461, %r5766, %r3456; + shf.l.wrap.b32 %r12847, %r12848, %r12849, %r12850; // end inline asm - ld.const.u32 %r3465, [matrix+1752]; + mov.u32 %r12858, 15; // begin inline asm - dp4a.u32.u32 %r3464, %r3465, %r5770, %r3460; + shf.l.wrap.b32 %r12851, %r12857, %r12856, %r12858; // end inline asm - ld.const.u32 %r3469, [matrix+1756]; // begin inline asm - dp4a.u32.u32 %r3468, %r3469, %r5774, %r3464; + shf.l.wrap.b32 %r12855, %r12856, %r12857, %r12858; // end inline asm - ld.const.u32 %r3473, [matrix+1760]; + mov.u32 %r12866, 10; // begin inline asm - dp4a.u32.u32 %r3472, %r3473, %r5778, %r3468; + shf.l.wrap.b32 %r12859, %r12865, %r12864, %r12866; // end inline asm - ld.const.u32 %r3477, [matrix+1764]; // begin inline asm - dp4a.u32.u32 %r3476, %r3477, %r5782, %r3472; + shf.l.wrap.b32 %r12863, %r12864, %r12865, %r12866; // end inline asm - ld.const.u32 %r3481, [matrix+1768]; + mov.u32 %r12874, 6; // begin inline asm - dp4a.u32.u32 %r3480, %r3481, %r5786, %r3476; + shf.l.wrap.b32 %r12867, %r12873, %r12872, %r12874; // end inline 
asm - ld.const.u32 %r3485, [matrix+1772]; // begin inline asm - dp4a.u32.u32 %r3484, %r3485, %r5790, %r3480; + shf.l.wrap.b32 %r12871, %r12872, %r12873, %r12874; // end inline asm - ld.const.u32 %r3489, [matrix+1776]; + mov.u32 %r12882, 3; // begin inline asm - dp4a.u32.u32 %r3488, %r3489, %r5794, %r3484; + shf.l.wrap.b32 %r12875, %r12881, %r12880, %r12882; // end inline asm - ld.const.u32 %r3493, [matrix+1780]; // begin inline asm - dp4a.u32.u32 %r3492, %r3493, %r5798, %r3488; + shf.l.wrap.b32 %r12879, %r12880, %r12881, %r12882; // end inline asm - ld.const.u32 %r3497, [matrix+1784]; // begin inline asm - dp4a.u32.u32 %r3496, %r3497, %r5802, %r3492; + shf.l.wrap.b32 %r12883, %r12889, %r12888, %r12560; // end inline asm - ld.const.u32 %r3501, [matrix+1788]; // begin inline asm - dp4a.u32.u32 %r3500, %r3501, %r5806, %r3496; + shf.l.wrap.b32 %r12887, %r12888, %r12889, %r12560; // end inline asm - shr.u32 %r6024, %r3436, 6; - and.b32 %r6025, %r6024, 240; - shr.u32 %r6026, %r3500, 10; - or.b32 %r6027, %r6026, %r6025; - cvt.u64.u32 %rd219, %r6027; - xor.b64 %rd220, %rd14, %rd219; - ld.const.u32 %r3505, [matrix+1792]; // begin inline asm - dp4a.u32.u32 %r3504, %r3505, %r5746, %r6244; + // chi + lop3.b32 %r12891, %r12926, %r12699, %r12747, 0xD2; + lop3.b32 %r12892, %r12929, %r12703, %r12751, 0xD2; // end inline asm - ld.const.u32 %r3509, [matrix+1796]; // begin inline asm - dp4a.u32.u32 %r3508, %r3509, %r5750, %r3504; + // chi + lop3.b32 %r30010, %r12699, %r12747, %r12843, 0xD2; + lop3.b32 %r30011, %r12703, %r12751, %r12847, 0xD2; // end inline asm - ld.const.u32 %r3513, [matrix+1800]; // begin inline asm - dp4a.u32.u32 %r3512, %r3513, %r5754, %r3508; + // chi + lop3.b32 %r30006, %r12747, %r12843, %r12795, 0xD2; + lop3.b32 %r30007, %r12751, %r12847, %r12799, 0xD2; // end inline asm - ld.const.u32 %r3517, [matrix+1804]; // begin inline asm - dp4a.u32.u32 %r3516, %r3517, %r5758, %r3512; + // chi + lop3.b32 %r30002, %r12843, %r12795, %r12926, 0xD2; + lop3.b32 %r30003, %r12847, %r12799, %r12929, 0xD2; // end inline asm - ld.const.u32 %r3521, [matrix+1808]; // begin inline asm - dp4a.u32.u32 %r3520, %r3521, %r5762, %r3516; + // chi + lop3.b32 %r30000, %r12795, %r12926, %r12699, 0xD2; + lop3.b32 %r30001, %r12799, %r12929, %r12703, 0xD2; // end inline asm - ld.const.u32 %r3525, [matrix+1812]; // begin inline asm - dp4a.u32.u32 %r3524, %r3525, %r5766, %r3520; + // chi + lop3.b32 %r29996, %r12835, %r12707, %r12875, 0xD2; + lop3.b32 %r29997, %r12839, %r12711, %r12879, 0xD2; // end inline asm - ld.const.u32 %r3529, [matrix+1816]; // begin inline asm - dp4a.u32.u32 %r3528, %r3529, %r5770, %r3524; + // chi + lop3.b32 %r30008, %r12707, %r12875, %r12819, 0xD2; + lop3.b32 %r30009, %r12711, %r12879, %r12823, 0xD2; // end inline asm - ld.const.u32 %r3533, [matrix+1820]; // begin inline asm - dp4a.u32.u32 %r3532, %r3533, %r5774, %r3528; + // chi + lop3.b32 %r30004, %r12875, %r12819, %r12715, 0xD2; + lop3.b32 %r30005, %r12879, %r12823, %r12719, 0xD2; // end inline asm - ld.const.u32 %r3537, [matrix+1824]; // begin inline asm - dp4a.u32.u32 %r3536, %r3537, %r5778, %r3532; + // chi + lop3.b32 %r29976, %r12819, %r12715, %r12835, 0xD2; + lop3.b32 %r29977, %r12823, %r12719, %r12839, 0xD2; // end inline asm - ld.const.u32 %r3541, [matrix+1828]; + st.local.v2.u32 [%rd84+88], {%r29976, %r29977}; // begin inline asm - dp4a.u32.u32 %r3540, %r3541, %r5782, %r3536; + // chi + lop3.b32 %r29968, %r12715, %r12835, %r12707, 0xD2; + lop3.b32 %r29969, %r12719, %r12839, %r12711, 0xD2; // end inline asm - ld.const.u32 %r3545, 
[matrix+1832]; + st.local.v2.u32 [%rd84+96], {%r29968, %r29969}; // begin inline asm - dp4a.u32.u32 %r3544, %r3545, %r5786, %r3540; + // chi + lop3.b32 %r29994, %r12883, %r12867, %r12755, 0xD2; + lop3.b32 %r29995, %r12887, %r12871, %r12759, 0xD2; // end inline asm - ld.const.u32 %r3549, [matrix+1836]; + st.local.v2.u32 [%rd84+104], {%r29994, %r29995}; // begin inline asm - dp4a.u32.u32 %r3548, %r3549, %r5790, %r3544; + // chi + lop3.b32 %r29988, %r12867, %r12755, %r12763, 0xD2; + lop3.b32 %r29989, %r12871, %r12759, %r12767, 0xD2; // end inline asm - ld.const.u32 %r3553, [matrix+1840]; + st.local.v2.u32 [%rd84+112], {%r29988, %r29989}; // begin inline asm - dp4a.u32.u32 %r3552, %r3553, %r5794, %r3548; + // chi + lop3.b32 %r29982, %r12755, %r12763, %r12731, 0xD2; + lop3.b32 %r29983, %r12759, %r12767, %r12735, 0xD2; // end inline asm - ld.const.u32 %r3557, [matrix+1844]; + st.local.v2.u32 [%rd84+120], {%r29982, %r29983}; // begin inline asm - dp4a.u32.u32 %r3556, %r3557, %r5798, %r3552; + // chi + lop3.b32 %r29974, %r12763, %r12731, %r12883, 0xD2; + lop3.b32 %r29975, %r12767, %r12735, %r12887, 0xD2; // end inline asm - ld.const.u32 %r3561, [matrix+1848]; + st.local.v2.u32 [%rd84+128], {%r29974, %r29975}; // begin inline asm - dp4a.u32.u32 %r3560, %r3561, %r5802, %r3556; + // chi + lop3.b32 %r29966, %r12731, %r12883, %r12867, 0xD2; + lop3.b32 %r29967, %r12735, %r12887, %r12871, 0xD2; // end inline asm - ld.const.u32 %r3565, [matrix+1852]; + st.local.v2.u32 [%rd84+136], {%r29966, %r29967}; // begin inline asm - dp4a.u32.u32 %r3564, %r3565, %r5806, %r3560; + // chi + lop3.b32 %r29992, %r12787, %r12827, %r12859, 0xD2; + lop3.b32 %r29993, %r12791, %r12831, %r12863, 0xD2; // end inline asm - ld.const.u32 %r3569, [matrix+1856]; + st.local.v2.u32 [%rd84+144], {%r29992, %r29993}; // begin inline asm - dp4a.u32.u32 %r3568, %r3569, %r5746, %r6244; + // chi + lop3.b32 %r29986, %r12827, %r12859, %r12851, 0xD2; + lop3.b32 %r29987, %r12831, %r12863, %r12855, 0xD2; // end inline asm - ld.const.u32 %r3573, [matrix+1860]; + st.local.v2.u32 [%rd84+152], {%r29986, %r29987}; // begin inline asm - dp4a.u32.u32 %r3572, %r3573, %r5750, %r3568; + // chi + lop3.b32 %r29980, %r12859, %r12851, %r12771, 0xD2; + lop3.b32 %r29981, %r12863, %r12855, %r12775, 0xD2; // end inline asm - ld.const.u32 %r3577, [matrix+1864]; + st.local.v2.u32 [%rd84+160], {%r29980, %r29981}; // begin inline asm - dp4a.u32.u32 %r3576, %r3577, %r5754, %r3572; + // chi + lop3.b32 %r29972, %r12851, %r12771, %r12787, 0xD2; + lop3.b32 %r29973, %r12855, %r12775, %r12791, 0xD2; // end inline asm - ld.const.u32 %r3581, [matrix+1868]; + st.local.v2.u32 [%rd84+168], {%r29972, %r29973}; // begin inline asm - dp4a.u32.u32 %r3580, %r3581, %r5758, %r3576; + // chi + lop3.b32 %r29964, %r12771, %r12787, %r12827, 0xD2; + lop3.b32 %r29965, %r12775, %r12791, %r12831, 0xD2; // end inline asm - ld.const.u32 %r3585, [matrix+1872]; + st.local.v2.u32 [%rd84+176], {%r29964, %r29965}; // begin inline asm - dp4a.u32.u32 %r3584, %r3585, %r5762, %r3580; + // chi + lop3.b32 %r29990, %r12739, %r12811, %r12723, 0xD2; + lop3.b32 %r29991, %r12743, %r12815, %r12727, 0xD2; // end inline asm - ld.const.u32 %r3589, [matrix+1876]; + st.local.v2.u32 [%rd84+184], {%r29990, %r29991}; // begin inline asm - dp4a.u32.u32 %r3588, %r3589, %r5766, %r3584; + // chi + lop3.b32 %r29984, %r12811, %r12723, %r12779, 0xD2; + lop3.b32 %r29985, %r12815, %r12727, %r12783, 0xD2; // end inline asm - ld.const.u32 %r3593, [matrix+1880]; + st.local.v2.u32 [%rd84+192], {%r29984, %r29985}; // begin inline asm - 
dp4a.u32.u32 %r3592, %r3593, %r5770, %r3588; + // chi + lop3.b32 %r29978, %r12723, %r12779, %r12803, 0xD2; + lop3.b32 %r29979, %r12727, %r12783, %r12807, 0xD2; // end inline asm - ld.const.u32 %r3597, [matrix+1884]; + st.local.v2.u32 [%rd84+200], {%r29978, %r29979}; // begin inline asm - dp4a.u32.u32 %r3596, %r3597, %r5774, %r3592; + // chi + lop3.b32 %r29970, %r12779, %r12803, %r12739, 0xD2; + lop3.b32 %r29971, %r12783, %r12807, %r12743, 0xD2; // end inline asm - ld.const.u32 %r3601, [matrix+1888]; + st.local.v2.u32 [%rd84+208], {%r29970, %r29971}; // begin inline asm - dp4a.u32.u32 %r3600, %r3601, %r5778, %r3596; + // chi + lop3.b32 %r29962, %r12803, %r12739, %r12811, 0xD2; + lop3.b32 %r29963, %r12807, %r12743, %r12815, 0xD2; // end inline asm - ld.const.u32 %r3605, [matrix+1892]; + st.local.v2.u32 [%rd84+216], {%r29962, %r29963}; + mul.wide.s32 %rd672, %r30012, 8; + add.s64 %rd671, %rd597, %rd672; // begin inline asm - dp4a.u32.u32 %r3604, %r3605, %r5782, %r3600; + ld.global.nc.v2.u32 {%r13091,%r13092}, [%rd671]; // end inline asm - ld.const.u32 %r3609, [matrix+1896]; + xor.b32 %r29998, %r12891, %r13091; + xor.b32 %r29999, %r12892, %r13092; + add.s32 %r30012, %r30012, 1; + setp.lt.u32 %p27, %r30012, 23; + @%p27 bra $L__BB2_42; + + mov.u32 %r13202, 1; + st.local.v2.u32 [%rd84+32], {%r30010, %r30011}; + st.local.v2.u32 [%rd84+72], {%r30008, %r30009}; + st.local.v2.u32 [%rd84+40], {%r30006, %r30007}; + st.local.v2.u32 [%rd84+80], {%r30004, %r30005}; + st.local.v2.u32 [%rd84+48], {%r30002, %r30003}; + st.local.v2.u32 [%rd84+56], {%r30000, %r30001}; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; // begin inline asm - dp4a.u32.u32 %r3608, %r3609, %r5786, %r3604; + // xor5 + lop3.b32 %r13103, %r29998, %r29996, %r29994, 0x96; + lop3.b32 %r13103, %r13103, %r29992, %r29990, 0x96; + lop3.b32 %r13104, %r29999, %r29997, %r29995, 0x96; + lop3.b32 %r13104, %r13104, %r29993, %r29991, 0x96; // end inline asm - ld.const.u32 %r3613, [matrix+1900]; // begin inline asm - dp4a.u32.u32 %r3612, %r3613, %r5790, %r3608; + // xor5 + lop3.b32 %r13115, %r30010, %r30008, %r29988, 0x96; + lop3.b32 %r13115, %r13115, %r29986, %r29984, 0x96; + lop3.b32 %r13116, %r30011, %r30009, %r29989, 0x96; + lop3.b32 %r13116, %r13116, %r29987, %r29985, 0x96; // end inline asm - ld.const.u32 %r3617, [matrix+1904]; // begin inline asm - dp4a.u32.u32 %r3616, %r3617, %r5794, %r3612; + // xor5 + lop3.b32 %r13127, %r30006, %r30004, %r29982, 0x96; + lop3.b32 %r13127, %r13127, %r29980, %r29978, 0x96; + lop3.b32 %r13128, %r30007, %r30005, %r29983, 0x96; + lop3.b32 %r13128, %r13128, %r29981, %r29979, 0x96; // end inline asm - ld.const.u32 %r3621, [matrix+1908]; // begin inline asm - dp4a.u32.u32 %r3620, %r3621, %r5798, %r3616; + // xor5 + lop3.b32 %r13139, %r30002, %r29976, %r29974, 0x96; + lop3.b32 %r13139, %r13139, %r29972, %r29970, 0x96; + lop3.b32 %r13140, %r30003, %r29977, %r29975, 0x96; + lop3.b32 %r13140, %r13140, %r29973, %r29971, 0x96; // end inline asm - ld.const.u32 %r3625, [matrix+1912]; // begin inline asm - dp4a.u32.u32 %r3624, %r3625, %r5802, %r3620; + // xor5 + lop3.b32 %r13151, %r30000, %r29968, %r29966, 0x96; + lop3.b32 %r13151, %r13151, %r29964, %r29962, 0x96; + lop3.b32 %r13152, %r30001, %r29969, %r29967, 0x96; + lop3.b32 %r13152, %r13152, %r29965, %r29963, 0x96; // end inline asm - ld.const.u32 %r3629, [matrix+1916]; // begin inline asm - dp4a.u32.u32 %r3628, %r3629, %r5806, %r3624; + shf.l.wrap.b32 %r13163, %r13116, %r13115, %r13202; // end inline asm - shr.u32 %r6028, %r3564, 6; - and.b32 %r6029, %r6028, 240; - 
shr.u32 %r6030, %r3628, 10; - or.b32 %r6031, %r6030, %r6029; - cvt.u64.u32 %rd221, %r6031; - xor.b64 %rd222, %rd15, %rd221; - ld.const.u32 %r3633, [matrix+1920]; // begin inline asm - dp4a.u32.u32 %r3632, %r3633, %r5746, %r6244; + shf.l.wrap.b32 %r13167, %r13115, %r13116, %r13202; // end inline asm - ld.const.u32 %r3637, [matrix+1924]; + xor.b32 %r13341, %r13163, %r13151; + xor.b32 %r13342, %r13167, %r13152; + xor.b32 %r13310, %r29998, %r13341; + xor.b32 %r13313, %r29999, %r13342; + xor.b32 %r13273, %r29995, %r13342; + xor.b32 %r13272, %r29994, %r13341; + st.local.v2.u32 [%rd84+104], {%r13272, %r13273}; // begin inline asm - dp4a.u32.u32 %r3636, %r3637, %r5750, %r3632; + shf.l.wrap.b32 %r13171, %r13128, %r13127, %r13202; // end inline asm - ld.const.u32 %r3641, [matrix+1928]; // begin inline asm - dp4a.u32.u32 %r3640, %r3641, %r5754, %r3636; + shf.l.wrap.b32 %r13175, %r13127, %r13128, %r13202; // end inline asm - ld.const.u32 %r3645, [matrix+1932]; + xor.b32 %r13343, %r13171, %r13103; + xor.b32 %r13344, %r13175, %r13104; + xor.b32 %r13209, %r30008, %r13343; + xor.b32 %r13208, %r30009, %r13344; + xor.b32 %r13248, %r29987, %r13344; + xor.b32 %r13249, %r29986, %r13343; + st.local.v2.u32 [%rd84+152], {%r13249, %r13248}; // begin inline asm - dp4a.u32.u32 %r3644, %r3645, %r5758, %r3640; + shf.l.wrap.b32 %r13179, %r13140, %r13139, %r13202; // end inline asm - ld.const.u32 %r3649, [matrix+1936]; // begin inline asm - dp4a.u32.u32 %r3648, %r3649, %r5762, %r3644; + shf.l.wrap.b32 %r13183, %r13139, %r13140, %r13202; // end inline asm - ld.const.u32 %r3653, [matrix+1940]; + xor.b32 %r13345, %r13179, %r13115; + xor.b32 %r13346, %r13183, %r13116; + xor.b32 %r13232, %r29983, %r13346; + xor.b32 %r13233, %r29982, %r13345; + st.local.v2.u32 [%rd84+120], {%r13233, %r13232}; + xor.b32 %r13224, %r29979, %r13346; + xor.b32 %r13225, %r29978, %r13345; + st.local.v2.u32 [%rd84+200], {%r13225, %r13224}; // begin inline asm - dp4a.u32.u32 %r3652, %r3653, %r5766, %r3648; + shf.l.wrap.b32 %r13187, %r13152, %r13151, %r13202; // end inline asm - ld.const.u32 %r3657, [matrix+1944]; // begin inline asm - dp4a.u32.u32 %r3656, %r3657, %r5770, %r3652; + shf.l.wrap.b32 %r13191, %r13151, %r13152, %r13202; // end inline asm - ld.const.u32 %r3661, [matrix+1948]; + xor.b32 %r13347, %r13187, %r13127; + xor.b32 %r13348, %r13191, %r13128; + xor.b32 %r13256, %r30002, %r13347; + xor.b32 %r13257, %r30003, %r13348; + xor.b32 %r13265, %r29973, %r13348; + xor.b32 %r13264, %r29972, %r13347; + st.local.v2.u32 [%rd84+168], {%r13264, %r13265}; // begin inline asm - dp4a.u32.u32 %r3660, %r3661, %r5774, %r3656; + shf.l.wrap.b32 %r13195, %r13104, %r13103, %r13202; // end inline asm - ld.const.u32 %r3665, [matrix+1952]; // begin inline asm - dp4a.u32.u32 %r3664, %r3665, %r5778, %r3660; + shf.l.wrap.b32 %r13199, %r13103, %r13104, %r13202; // end inline asm - ld.const.u32 %r3669, [matrix+1956]; + xor.b32 %r13349, %r13195, %r13139; + xor.b32 %r13350, %r13199, %r13140; + xor.b32 %r13216, %r29968, %r13349; + xor.b32 %r13217, %r29969, %r13350; + xor.b32 %r13241, %r29963, %r13350; + xor.b32 %r13240, %r29962, %r13349; + st.local.v2.u32 [%rd84+216], {%r13240, %r13241}; // begin inline asm - dp4a.u32.u32 %r3668, %r3669, %r5782, %r3664; + shf.l.wrap.b32 %r13203, %r13209, %r13208, %r12706; // end inline asm - ld.const.u32 %r3673, [matrix+1960]; // begin inline asm - dp4a.u32.u32 %r3672, %r3673, %r5786, %r3668; + shf.l.wrap.b32 %r13207, %r13208, %r13209, %r12706; // end inline asm - ld.const.u32 %r3677, [matrix+1964]; // begin inline asm - dp4a.u32.u32 
%r3676, %r3677, %r5790, %r3672; + shf.l.wrap.b32 %r13211, %r13217, %r13216, %r12714; // end inline asm - ld.const.u32 %r3681, [matrix+1968]; // begin inline asm - dp4a.u32.u32 %r3680, %r3681, %r5794, %r3676; + shf.l.wrap.b32 %r13215, %r13216, %r13217, %r12714; // end inline asm - ld.const.u32 %r3685, [matrix+1972]; // begin inline asm - dp4a.u32.u32 %r3684, %r3685, %r5798, %r3680; + shf.l.wrap.b32 %r13223, %r13224, %r13225, %r12722; // end inline asm - ld.const.u32 %r3689, [matrix+1976]; // begin inline asm - dp4a.u32.u32 %r3688, %r3689, %r5802, %r3684; + shf.l.wrap.b32 %r13219, %r13225, %r13224, %r12722; // end inline asm - ld.const.u32 %r3693, [matrix+1980]; + st.local.v2.u32 [%rd84+96], {%r13219, %r13223}; // begin inline asm - dp4a.u32.u32 %r3692, %r3693, %r5806, %r3688; + shf.l.wrap.b32 %r13227, %r13233, %r13232, %r12754; // end inline asm - ld.const.u32 %r3697, [matrix+1984]; // begin inline asm - dp4a.u32.u32 %r3696, %r3697, %r5746, %r6244; + shf.l.wrap.b32 %r13231, %r13232, %r13233, %r12754; // end inline asm - ld.const.u32 %r3701, [matrix+1988]; // begin inline asm - dp4a.u32.u32 %r3700, %r3701, %r5750, %r3696; + shf.l.wrap.b32 %r13235, %r13241, %r13240, %r12802; // end inline asm - ld.const.u32 %r3705, [matrix+1992]; // begin inline asm - dp4a.u32.u32 %r3704, %r3705, %r5754, %r3700; + shf.l.wrap.b32 %r13239, %r13240, %r13241, %r12802; // end inline asm - ld.const.u32 %r3709, [matrix+1996]; // begin inline asm - dp4a.u32.u32 %r3708, %r3709, %r5758, %r3704; + shf.l.wrap.b32 %r13247, %r13248, %r13249, %r12826; // end inline asm - ld.const.u32 %r3713, [matrix+2000]; // begin inline asm - dp4a.u32.u32 %r3712, %r3713, %r5762, %r3708; + shf.l.wrap.b32 %r13243, %r13249, %r13248, %r12826; // end inline asm - ld.const.u32 %r3717, [matrix+2004]; + st.local.v2.u32 [%rd84+88], {%r13243, %r13247}; // begin inline asm - dp4a.u32.u32 %r3716, %r3717, %r5766, %r3712; + shf.l.wrap.b32 %r13251, %r13257, %r13256, %r12842; // end inline asm - ld.const.u32 %r3721, [matrix+2008]; // begin inline asm - dp4a.u32.u32 %r3720, %r3721, %r5770, %r3716; + shf.l.wrap.b32 %r13255, %r13256, %r13257, %r12842; // end inline asm - ld.const.u32 %r3725, [matrix+2012]; // begin inline asm - dp4a.u32.u32 %r3724, %r3725, %r5774, %r3720; + shf.l.wrap.b32 %r13259, %r13265, %r13264, %r12850; // end inline asm - ld.const.u32 %r3729, [matrix+2016]; // begin inline asm - dp4a.u32.u32 %r3728, %r3729, %r5778, %r3724; + shf.l.wrap.b32 %r13263, %r13264, %r13265, %r12850; // end inline asm - ld.const.u32 %r3733, [matrix+2020]; // begin inline asm - dp4a.u32.u32 %r3732, %r3733, %r5782, %r3728; + shf.l.wrap.b32 %r13267, %r13273, %r13272, %r12882; // end inline asm - ld.const.u32 %r3737, [matrix+2024]; // begin inline asm - dp4a.u32.u32 %r3736, %r3737, %r5786, %r3732; + shf.l.wrap.b32 %r13271, %r13272, %r13273, %r12882; // end inline asm - ld.const.u32 %r3741, [matrix+2028]; // begin inline asm - dp4a.u32.u32 %r3740, %r3741, %r5790, %r3736; + // chi + lop3.b32 %r13275, %r13310, %r13203, %r13227, 0xD2; + lop3.b32 %r13276, %r13313, %r13207, %r13231, 0xD2; // end inline asm - ld.const.u32 %r3745, [matrix+2032]; // begin inline asm - dp4a.u32.u32 %r3744, %r3745, %r5794, %r3740; + // chi + lop3.b32 %r13283, %r13203, %r13227, %r13259, 0xD2; + lop3.b32 %r13284, %r13207, %r13231, %r13263, 0xD2; // end inline asm - ld.const.u32 %r3749, [matrix+2036]; + st.local.v2.u32 [%rd84+32], {%r13283, %r13284}; // begin inline asm - dp4a.u32.u32 %r3748, %r3749, %r5798, %r3744; + // chi + lop3.b32 %r13291, %r13227, %r13259, %r13235, 0xD2; + lop3.b32 
%r13292, %r13231, %r13263, %r13239, 0xD2; // end inline asm - ld.const.u32 %r3753, [matrix+2040]; + st.local.v2.u32 [%rd84+40], {%r13291, %r13292}; // begin inline asm - dp4a.u32.u32 %r3752, %r3753, %r5802, %r3748; + // chi + lop3.b32 %r13299, %r13259, %r13235, %r13310, 0xD2; + lop3.b32 %r13300, %r13263, %r13239, %r13313, 0xD2; // end inline asm - ld.const.u32 %r3757, [matrix+2044]; + st.local.v2.u32 [%rd84+48], {%r13299, %r13300}; // begin inline asm - dp4a.u32.u32 %r3756, %r3757, %r5806, %r3752; + // chi + lop3.b32 %r13307, %r13235, %r13310, %r13203, 0xD2; + lop3.b32 %r13308, %r13239, %r13313, %r13207, 0xD2; // end inline asm - shr.u32 %r6032, %r3692, 6; - and.b32 %r6033, %r6032, 240; - ld.const.u32 %r3761, [matrix+2048]; + st.local.v2.u32 [%rd84+56], {%r13307, %r13308}; // begin inline asm - dp4a.u32.u32 %r3760, %r3761, %r5746, %r6244; + // chi + lop3.b32 %r13315, %r13251, %r13211, %r13267, 0xD2; + lop3.b32 %r13316, %r13255, %r13215, %r13271, 0xD2; // end inline asm - ld.const.u32 %r3765, [matrix+2052]; + st.local.v2.u32 [%rd84+64], {%r13315, %r13316}; // begin inline asm - dp4a.u32.u32 %r3764, %r3765, %r5750, %r3760; + // chi + lop3.b32 %r13323, %r13211, %r13267, %r13243, 0xD2; + lop3.b32 %r13324, %r13215, %r13271, %r13247, 0xD2; // end inline asm - ld.const.u32 %r3769, [matrix+2056]; + st.local.v2.u32 [%rd84+72], {%r13323, %r13324}; // begin inline asm - dp4a.u32.u32 %r3768, %r3769, %r5754, %r3764; + // chi + lop3.b32 %r13331, %r13267, %r13243, %r13219, 0xD2; + lop3.b32 %r13332, %r13271, %r13247, %r13223, 0xD2; // end inline asm - ld.const.u32 %r3773, [matrix+2060]; + st.local.v2.u32 [%rd84+80], {%r13331, %r13332}; // begin inline asm - dp4a.u32.u32 %r3772, %r3773, %r5758, %r3768; + ld.global.nc.v2.u32 {%r13339,%r13340}, [%rd598]; // end inline asm - ld.const.u32 %r3777, [matrix+2064]; + xor.b32 %r13351, %r13276, %r13340; + xor.b32 %r13352, %r13275, %r13339; + st.local.v2.u32 [%rd84+24], {%r13352, %r13351}; + bra.uni $L__BB2_44; + +$L__BB2_22: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd479, 1179641; + st.local.u64 [%rd3+8], %rd479; + st.local.u32 [%rd3+16], %r30; + ld.global.u64 %rd480, [%rd34]; + ld.global.u64 %rd481, [%rd34+8]; + ld.global.u64 %rd482, [%rd34+16]; + ld.global.u64 %rd483, [%rd34+24]; + ld.global.u64 %rd484, [%rd34+32]; + ld.global.u64 %rd485, [%rd34+40]; + ld.global.u64 %rd486, [%rd34+48]; + ld.global.u64 %rd487, [%rd34+56]; + st.local.u64 [%rd3+24], %rd480; + st.local.u64 [%rd3+32], %rd481; + st.local.u64 [%rd3+40], %rd482; + st.local.u64 [%rd3+48], %rd483; + st.local.u64 [%rd3+56], %rd484; + st.local.u64 [%rd3+64], %rd485; + st.local.u64 [%rd3+72], %rd486; + st.local.u64 [%rd3+80], %rd487; + cvt.u32.u64 %r6826, %rd480; + xor.b32 %r6827, %r30, %r6826; + st.local.u32 [%rd3+24], %r6827; + mov.u32 %r29539, 0; + st.local.v2.u32 [%rd3+96], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+104], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+112], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+120], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+128], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+136], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+144], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+152], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+160], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+168], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+176], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+184], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+192], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+200], {%r29539, %r29539}; + st.local.v2.u32 [%rd3+208], {%r29539, %r29539}; + st.local.v2.u32 
[%rd3+216], {%r29539, %r29539}; + mov.u32 %r29554, -2147483648; + mov.u32 %r6799, 1; + st.local.v2.u32 [%rd3+88], {%r6799, %r29554}; + ld.local.v2.u32 {%r29575, %r29576}, [%rd3+24]; + mov.b64 {%r29573, %r29574}, %rd485; + shr.u64 %rd488, %rd481, 32; + cvt.u32.u64 %r29587, %rd481; + cvt.u32.u64 %r29588, %rd488; + shr.u64 %rd489, %rd486, 32; + cvt.u32.u64 %r29585, %rd486; + cvt.u32.u64 %r29586, %rd489; + shr.u64 %rd490, %rd482, 32; + cvt.u32.u64 %r29583, %rd482; + cvt.u32.u64 %r29584, %rd490; + shr.u64 %rd491, %rd487, 32; + cvt.u32.u64 %r29581, %rd487; + cvt.u32.u64 %r29582, %rd491; + shr.u64 %rd492, %rd483, 32; + cvt.u32.u64 %r29579, %rd483; + cvt.u32.u64 %r29580, %rd492; + shr.u64 %rd493, %rd484, 32; + cvt.u32.u64 %r29577, %rd484; + cvt.u32.u64 %r29578, %rd493; + mov.u32 %r29540, %r29539; + mov.u32 %r29541, %r29539; + mov.u32 %r29542, %r29539; + mov.u32 %r29543, %r29539; + mov.u32 %r29544, %r29539; + mov.u32 %r29545, %r29539; + mov.u32 %r29546, %r29539; + mov.u32 %r29547, %r29539; + mov.u32 %r29548, %r29539; + mov.u32 %r29549, %r29539; + mov.u32 %r29550, %r29539; + mov.u32 %r29551, %r29539; + mov.u32 %r29552, %r29539; + mov.u32 %r29553, %r6799; + mov.u32 %r29555, %r29539; + mov.u32 %r29556, %r29539; + mov.u32 %r29557, %r29539; + mov.u32 %r29558, %r29539; + mov.u32 %r29559, %r29539; + mov.u32 %r29560, %r29539; + mov.u32 %r29561, %r29539; + mov.u32 %r29562, %r29539; + mov.u32 %r29563, %r29539; + mov.u32 %r29564, %r29539; + mov.u32 %r29565, %r29539; + mov.u32 %r29566, %r29539; + mov.u32 %r29567, %r29539; + mov.u32 %r29568, %r29539; + mov.u32 %r29569, %r29539; + mov.u32 %r29570, %r29539; + mov.u32 %r29571, %r29539; + mov.u32 %r29572, %r29539; + mov.u32 %r29589, %r29539; + +$L__BB2_23: // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5762, %r3772; + // xor5 + lop3.b32 %r6830, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r6830, %r6830, %r29569, %r29567, 0x96; + lop3.b32 %r6831, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r6831, %r6831, %r29570, %r29568, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6842, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r6842, %r6842, %r29563, %r29561, 0x96; + lop3.b32 %r6843, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r6843, %r6843, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r3781, [matrix+2068]; // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5766, %r3776; + // xor5 + lop3.b32 %r6854, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r6854, %r6854, %r29557, %r29555, 0x96; + lop3.b32 %r6855, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r6855, %r6855, %r29558, %r29556, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6866, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r6866, %r6866, %r29549, %r29547, 0x96; + lop3.b32 %r6867, %r29580, %r29554, %r29552, 0x96; + lop3.b32 %r6867, %r6867, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r3785, [matrix+2072]; // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5770, %r3780; + // xor5 + lop3.b32 %r6878, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r6878, %r6878, %r29541, %r29539, 0x96; + lop3.b32 %r6879, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r6879, %r6879, %r29542, %r29540, 0x96; // end inline asm - ld.const.u32 %r3789, [matrix+2076]; // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5774, %r3784; + shf.l.wrap.b32 %r6890, %r6843, %r6842, %r6799; // end inline asm - ld.const.u32 %r3793, [matrix+2080]; // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5778, %r3788; + shf.l.wrap.b32 %r6894, %r6842, %r6843, %r6799; // end inline asm - 
ld.const.u32 %r3797, [matrix+2084]; + xor.b32 %r7324, %r6890, %r6878; + xor.b32 %r7325, %r6894, %r6879; + xor.b32 %r7157, %r29575, %r7324; + xor.b32 %r7160, %r29576, %r7325; + xor.b32 %r7064, %r29573, %r7324; + xor.b32 %r7063, %r29574, %r7325; + xor.b32 %r7111, %r29571, %r7324; + xor.b32 %r7112, %r29572, %r7325; + xor.b32 %r7016, %r29569, %r7324; + xor.b32 %r7015, %r29570, %r7325; + xor.b32 %r6967, %r29567, %r7324; + xor.b32 %r6968, %r29568, %r7325; // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5782, %r3792; + shf.l.wrap.b32 %r6898, %r6855, %r6854, %r6799; // end inline asm - ld.const.u32 %r3801, [matrix+2088]; // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5786, %r3796; + shf.l.wrap.b32 %r6902, %r6854, %r6855, %r6799; // end inline asm - ld.const.u32 %r3805, [matrix+2092]; + xor.b32 %r7326, %r6898, %r6830; + xor.b32 %r7327, %r6902, %r6831; + xor.b32 %r7119, %r29587, %r7326; + xor.b32 %r7120, %r29588, %r7327; + xor.b32 %r6936, %r29585, %r7326; + xor.b32 %r6935, %r29586, %r7327; + xor.b32 %r7095, %r29565, %r7326; + xor.b32 %r7096, %r29566, %r7327; + xor.b32 %r7056, %r29563, %r7326; + xor.b32 %r7055, %r29564, %r7327; + xor.b32 %r7039, %r29561, %r7326; + xor.b32 %r7040, %r29562, %r7327; // begin inline asm - dp4a.u32.u32 %r3804, %r3805, %r5790, %r3800; + shf.l.wrap.b32 %r6906, %r6867, %r6866, %r6799; // end inline asm - ld.const.u32 %r3809, [matrix+2096]; // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5794, %r3804; + shf.l.wrap.b32 %r6910, %r6866, %r6867, %r6799; // end inline asm - ld.const.u32 %r3813, [matrix+2100]; + xor.b32 %r7328, %r6906, %r6842; + xor.b32 %r7329, %r6910, %r6843; + xor.b32 %r6976, %r29583, %r7328; + xor.b32 %r6975, %r29584, %r7329; + xor.b32 %r7103, %r29581, %r7328; + xor.b32 %r7104, %r29582, %r7329; + xor.b32 %r6984, %r29559, %r7328; + xor.b32 %r6983, %r29560, %r7329; + xor.b32 %r7087, %r29557, %r7328; + xor.b32 %r7088, %r29558, %r7329; + xor.b32 %r6952, %r29555, %r7328; + xor.b32 %r6951, %r29556, %r7329; // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5798, %r3808; + shf.l.wrap.b32 %r6914, %r6879, %r6878, %r6799; // end inline asm - ld.const.u32 %r3817, [matrix+2104]; // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5802, %r3812; + shf.l.wrap.b32 %r6918, %r6878, %r6879, %r6799; // end inline asm - ld.const.u32 %r3821, [matrix+2108]; + xor.b32 %r7330, %r6914, %r6854; + xor.b32 %r7331, %r6918, %r6855; + xor.b32 %r7071, %r29579, %r7330; + xor.b32 %r7072, %r29580, %r7331; + xor.b32 %r7048, %r29553, %r7330; + xor.b32 %r7047, %r29554, %r7331; + xor.b32 %r6991, %r29551, %r7330; + xor.b32 %r6992, %r29552, %r7331; + xor.b32 %r7079, %r29549, %r7330; + xor.b32 %r7080, %r29550, %r7331; + xor.b32 %r7008, %r29547, %r7330; + xor.b32 %r7007, %r29548, %r7331; // begin inline asm - dp4a.u32.u32 %r3820, %r3821, %r5806, %r3816; + shf.l.wrap.b32 %r6922, %r6831, %r6830, %r6799; // end inline asm - ld.const.u32 %r3825, [matrix+2112]; // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5746, %r6244; + shf.l.wrap.b32 %r6926, %r6830, %r6831, %r6799; // end inline asm - ld.const.u32 %r3829, [matrix+2116]; + xor.b32 %r7332, %r6922, %r6866; + xor.b32 %r7333, %r6926, %r6867; + xor.b32 %r7023, %r29577, %r7332; + xor.b32 %r7024, %r29578, %r7333; + xor.b32 %r6943, %r29545, %r7332; + xor.b32 %r6944, %r29546, %r7333; + xor.b32 %r6960, %r29543, %r7332; + xor.b32 %r6959, %r29544, %r7333; + xor.b32 %r6999, %r29541, %r7332; + xor.b32 %r7000, %r29542, %r7333; + xor.b32 %r7031, %r29539, %r7332; + xor.b32 %r7032, %r29540, %r7333; + mov.u32 %r6937, 44; // begin inline asm - 
dp4a.u32.u32 %r3828, %r3829, %r5750, %r3824; + shf.l.wrap.b32 %r6930, %r6936, %r6935, %r6937; // end inline asm - ld.const.u32 %r3833, [matrix+2120]; // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5754, %r3828; + shf.l.wrap.b32 %r6934, %r6935, %r6936, %r6937; // end inline asm - ld.const.u32 %r3837, [matrix+2124]; + mov.u32 %r6945, 20; // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5758, %r3832; + shf.l.wrap.b32 %r6938, %r6944, %r6943, %r6945; // end inline asm - ld.const.u32 %r3841, [matrix+2128]; // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5762, %r3836; + shf.l.wrap.b32 %r6942, %r6943, %r6944, %r6945; // end inline asm - ld.const.u32 %r3845, [matrix+2132]; + mov.u32 %r6953, 61; // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5766, %r3840; + shf.l.wrap.b32 %r6946, %r6952, %r6951, %r6953; // end inline asm - ld.const.u32 %r3849, [matrix+2136]; // begin inline asm - dp4a.u32.u32 %r3848, %r3849, %r5770, %r3844; + shf.l.wrap.b32 %r6950, %r6951, %r6952, %r6953; // end inline asm - ld.const.u32 %r3853, [matrix+2140]; + mov.u32 %r6961, 39; // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5774, %r3848; + shf.l.wrap.b32 %r6954, %r6960, %r6959, %r6961; // end inline asm - ld.const.u32 %r3857, [matrix+2144]; // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5778, %r3852; + shf.l.wrap.b32 %r6958, %r6959, %r6960, %r6961; // end inline asm - ld.const.u32 %r3861, [matrix+2148]; + mov.u32 %r6969, 18; // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5782, %r3856; + shf.l.wrap.b32 %r6962, %r6968, %r6967, %r6969; // end inline asm - ld.const.u32 %r3865, [matrix+2152]; // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5786, %r3860; + shf.l.wrap.b32 %r6966, %r6967, %r6968, %r6969; // end inline asm - ld.const.u32 %r3869, [matrix+2156]; + mov.u32 %r6977, 62; // begin inline asm - dp4a.u32.u32 %r3868, %r3869, %r5790, %r3864; + shf.l.wrap.b32 %r6970, %r6976, %r6975, %r6977; // end inline asm - ld.const.u32 %r3873, [matrix+2160]; // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5794, %r3868; + shf.l.wrap.b32 %r6974, %r6975, %r6976, %r6977; // end inline asm - ld.const.u32 %r3877, [matrix+2164]; + mov.u32 %r6985, 43; // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5798, %r3872; + shf.l.wrap.b32 %r6978, %r6984, %r6983, %r6985; // end inline asm - ld.const.u32 %r3881, [matrix+2168]; // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5802, %r3876; + shf.l.wrap.b32 %r6982, %r6983, %r6984, %r6985; // end inline asm - ld.const.u32 %r3885, [matrix+2172]; + mov.u32 %r6993, 25; // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5806, %r3880; + shf.l.wrap.b32 %r6986, %r6992, %r6991, %r6993; // end inline asm - shr.u32 %r6034, %r3820, 6; - and.b32 %r6035, %r6034, 240; - shr.u32 %r6036, %r3884, 10; - or.b32 %r6037, %r6036, %r6035; - xor.b32 %r6038, %r13, %r6037; - ld.const.u32 %r3889, [matrix+2176]; // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5746, %r6244; + shf.l.wrap.b32 %r6990, %r6991, %r6992, %r6993; // end inline asm - ld.const.u32 %r3893, [matrix+2180]; + mov.u32 %r7001, 8; // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5750, %r3888; + shf.l.wrap.b32 %r6994, %r7000, %r6999, %r7001; // end inline asm - ld.const.u32 %r3897, [matrix+2184]; // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5754, %r3892; + shf.l.wrap.b32 %r6998, %r6999, %r7000, %r7001; // end inline asm - ld.const.u32 %r3901, [matrix+2188]; + mov.u32 %r7009, 56; // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5758, %r3896; + shf.l.wrap.b32 %r7002, %r7008, %r7007, %r7009; // end 
inline asm - ld.const.u32 %r3905, [matrix+2192]; // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5762, %r3900; + shf.l.wrap.b32 %r7006, %r7007, %r7008, %r7009; // end inline asm - ld.const.u32 %r3909, [matrix+2196]; + mov.u32 %r7017, 41; // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5766, %r3904; + shf.l.wrap.b32 %r7010, %r7016, %r7015, %r7017; // end inline asm - ld.const.u32 %r3913, [matrix+2200]; // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5770, %r3908; + shf.l.wrap.b32 %r7014, %r7015, %r7016, %r7017; // end inline asm - ld.const.u32 %r3917, [matrix+2204]; + mov.u32 %r7025, 27; // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5774, %r3912; + shf.l.wrap.b32 %r7018, %r7024, %r7023, %r7025; // end inline asm - ld.const.u32 %r3921, [matrix+2208]; // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5778, %r3916; + shf.l.wrap.b32 %r7022, %r7023, %r7024, %r7025; // end inline asm - ld.const.u32 %r3925, [matrix+2212]; + mov.u32 %r7033, 14; // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5782, %r3920; + shf.l.wrap.b32 %r7026, %r7032, %r7031, %r7033; // end inline asm - ld.const.u32 %r3929, [matrix+2216]; // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5786, %r3924; + shf.l.wrap.b32 %r7030, %r7031, %r7032, %r7033; // end inline asm - ld.const.u32 %r3933, [matrix+2220]; + mov.u32 %r7041, 2; // begin inline asm - dp4a.u32.u32 %r3932, %r3933, %r5790, %r3928; + shf.l.wrap.b32 %r7034, %r7040, %r7039, %r7041; // end inline asm - ld.const.u32 %r3937, [matrix+2224]; // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5794, %r3932; + shf.l.wrap.b32 %r7038, %r7039, %r7040, %r7041; // end inline asm - ld.const.u32 %r3941, [matrix+2228]; + mov.u32 %r7049, 55; // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5798, %r3936; + shf.l.wrap.b32 %r7042, %r7048, %r7047, %r7049; // end inline asm - ld.const.u32 %r3945, [matrix+2232]; // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5802, %r3940; + shf.l.wrap.b32 %r7046, %r7047, %r7048, %r7049; // end inline asm - ld.const.u32 %r3949, [matrix+2236]; + mov.u32 %r7057, 45; // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5806, %r3944; + shf.l.wrap.b32 %r7050, %r7056, %r7055, %r7057; // end inline asm - ld.const.u32 %r3953, [matrix+2240]; // begin inline asm - dp4a.u32.u32 %r3952, %r3953, %r5746, %r6244; + shf.l.wrap.b32 %r7054, %r7055, %r7056, %r7057; // end inline asm - ld.const.u32 %r3957, [matrix+2244]; + mov.u32 %r7065, 36; // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5750, %r3952; + shf.l.wrap.b32 %r7058, %r7064, %r7063, %r7065; // end inline asm - ld.const.u32 %r3961, [matrix+2248]; // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5754, %r3956; + shf.l.wrap.b32 %r7062, %r7063, %r7064, %r7065; // end inline asm - ld.const.u32 %r3965, [matrix+2252]; + mov.u32 %r7073, 28; // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5758, %r3960; + shf.l.wrap.b32 %r7066, %r7072, %r7071, %r7073; // end inline asm - ld.const.u32 %r3969, [matrix+2256]; // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5762, %r3964; + shf.l.wrap.b32 %r7070, %r7071, %r7072, %r7073; // end inline asm - ld.const.u32 %r3973, [matrix+2260]; + mov.u32 %r7081, 21; // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5766, %r3968; + shf.l.wrap.b32 %r7074, %r7080, %r7079, %r7081; // end inline asm - ld.const.u32 %r3977, [matrix+2264]; // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5770, %r3972; + shf.l.wrap.b32 %r7078, %r7079, %r7080, %r7081; // end inline asm - ld.const.u32 %r3981, [matrix+2268]; + mov.u32 %r7089, 15; // begin inline 
asm - dp4a.u32.u32 %r3980, %r3981, %r5774, %r3976; + shf.l.wrap.b32 %r7082, %r7088, %r7087, %r7089; // end inline asm - ld.const.u32 %r3985, [matrix+2272]; // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5778, %r3980; + shf.l.wrap.b32 %r7086, %r7087, %r7088, %r7089; // end inline asm - ld.const.u32 %r3989, [matrix+2276]; + mov.u32 %r7097, 10; // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5782, %r3984; + shf.l.wrap.b32 %r7090, %r7096, %r7095, %r7097; // end inline asm - ld.const.u32 %r3993, [matrix+2280]; // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5786, %r3988; + shf.l.wrap.b32 %r7094, %r7095, %r7096, %r7097; // end inline asm - ld.const.u32 %r3997, [matrix+2284]; + mov.u32 %r7105, 6; // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5790, %r3992; + shf.l.wrap.b32 %r7098, %r7104, %r7103, %r7105; // end inline asm - ld.const.u32 %r4001, [matrix+2288]; // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5794, %r3996; + shf.l.wrap.b32 %r7102, %r7103, %r7104, %r7105; // end inline asm - ld.const.u32 %r4005, [matrix+2292]; + mov.u32 %r7113, 3; // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5798, %r4000; + shf.l.wrap.b32 %r7106, %r7112, %r7111, %r7113; // end inline asm - ld.const.u32 %r4009, [matrix+2296]; // begin inline asm - dp4a.u32.u32 %r4008, %r4009, %r5802, %r4004; + shf.l.wrap.b32 %r7110, %r7111, %r7112, %r7113; // end inline asm - ld.const.u32 %r4013, [matrix+2300]; // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5806, %r4008; + shf.l.wrap.b32 %r7114, %r7120, %r7119, %r6799; // end inline asm - shr.u32 %r6039, %r3948, 6; - and.b32 %r6040, %r6039, 240; - shr.u32 %r6041, %r4012, 10; - or.b32 %r6042, %r6041, %r6040; - xor.b32 %r6043, %r5886, %r6042; - ld.const.u32 %r4017, [matrix+2304]; // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5746, %r6244; + shf.l.wrap.b32 %r7118, %r7119, %r7120, %r6799; // end inline asm - ld.const.u32 %r4021, [matrix+2308]; // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5750, %r4016; + // chi + lop3.b32 %r7122, %r7157, %r6930, %r6978, 0xD2; + lop3.b32 %r7123, %r7160, %r6934, %r6982, 0xD2; // end inline asm - ld.const.u32 %r4025, [matrix+2312]; // begin inline asm - dp4a.u32.u32 %r4024, %r4025, %r5754, %r4020; + // chi + lop3.b32 %r29587, %r6930, %r6978, %r7074, 0xD2; + lop3.b32 %r29588, %r6934, %r6982, %r7078, 0xD2; // end inline asm - ld.const.u32 %r4029, [matrix+2316]; // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5758, %r4024; + // chi + lop3.b32 %r29583, %r6978, %r7074, %r7026, 0xD2; + lop3.b32 %r29584, %r6982, %r7078, %r7030, 0xD2; // end inline asm - ld.const.u32 %r4033, [matrix+2320]; // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5762, %r4028; + // chi + lop3.b32 %r29579, %r7074, %r7026, %r7157, 0xD2; + lop3.b32 %r29580, %r7078, %r7030, %r7160, 0xD2; // end inline asm - ld.const.u32 %r4037, [matrix+2324]; // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5766, %r4032; + // chi + lop3.b32 %r29577, %r7026, %r7157, %r6930, 0xD2; + lop3.b32 %r29578, %r7030, %r7160, %r6934, 0xD2; // end inline asm - ld.const.u32 %r4041, [matrix+2328]; // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5770, %r4036; + // chi + lop3.b32 %r29573, %r7066, %r6938, %r7106, 0xD2; + lop3.b32 %r29574, %r7070, %r6942, %r7110, 0xD2; // end inline asm - ld.const.u32 %r4045, [matrix+2332]; // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5774, %r4040; + // chi + lop3.b32 %r29585, %r6938, %r7106, %r7050, 0xD2; + lop3.b32 %r29586, %r6942, %r7110, %r7054, 0xD2; // end inline asm - ld.const.u32 %r4049, [matrix+2336]; // 
begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5778, %r4044; + // chi + lop3.b32 %r29581, %r7106, %r7050, %r6946, 0xD2; + lop3.b32 %r29582, %r7110, %r7054, %r6950, 0xD2; // end inline asm - ld.const.u32 %r4053, [matrix+2340]; // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5782, %r4048; + // chi + lop3.b32 %r29553, %r7050, %r6946, %r7066, 0xD2; + lop3.b32 %r29554, %r7054, %r6950, %r7070, 0xD2; // end inline asm - ld.const.u32 %r4057, [matrix+2344]; + st.local.v2.u32 [%rd3+88], {%r29553, %r29554}; // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5786, %r4052; + // chi + lop3.b32 %r29545, %r6946, %r7066, %r6938, 0xD2; + lop3.b32 %r29546, %r6950, %r7070, %r6942, 0xD2; // end inline asm - ld.const.u32 %r4061, [matrix+2348]; + st.local.v2.u32 [%rd3+96], {%r29545, %r29546}; // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5790, %r4056; + // chi + lop3.b32 %r29571, %r7114, %r7098, %r6986, 0xD2; + lop3.b32 %r29572, %r7118, %r7102, %r6990, 0xD2; // end inline asm - ld.const.u32 %r4065, [matrix+2352]; + st.local.v2.u32 [%rd3+104], {%r29571, %r29572}; // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5794, %r4060; + // chi + lop3.b32 %r29565, %r7098, %r6986, %r6994, 0xD2; + lop3.b32 %r29566, %r7102, %r6990, %r6998, 0xD2; // end inline asm - ld.const.u32 %r4069, [matrix+2356]; + st.local.v2.u32 [%rd3+112], {%r29565, %r29566}; // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5798, %r4064; + // chi + lop3.b32 %r29559, %r6986, %r6994, %r6962, 0xD2; + lop3.b32 %r29560, %r6990, %r6998, %r6966, 0xD2; // end inline asm - ld.const.u32 %r4073, [matrix+2360]; + st.local.v2.u32 [%rd3+120], {%r29559, %r29560}; // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5802, %r4068; + // chi + lop3.b32 %r29551, %r6994, %r6962, %r7114, 0xD2; + lop3.b32 %r29552, %r6998, %r6966, %r7118, 0xD2; // end inline asm - ld.const.u32 %r4077, [matrix+2364]; + st.local.v2.u32 [%rd3+128], {%r29551, %r29552}; // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5806, %r4072; + // chi + lop3.b32 %r29543, %r6962, %r7114, %r7098, 0xD2; + lop3.b32 %r29544, %r6966, %r7118, %r7102, 0xD2; // end inline asm - ld.const.u32 %r4081, [matrix+2368]; + st.local.v2.u32 [%rd3+136], {%r29543, %r29544}; // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5746, %r6244; + // chi + lop3.b32 %r29569, %r7018, %r7058, %r7090, 0xD2; + lop3.b32 %r29570, %r7022, %r7062, %r7094, 0xD2; // end inline asm - ld.const.u32 %r4085, [matrix+2372]; + st.local.v2.u32 [%rd3+144], {%r29569, %r29570}; // begin inline asm - dp4a.u32.u32 %r4084, %r4085, %r5750, %r4080; + // chi + lop3.b32 %r29563, %r7058, %r7090, %r7082, 0xD2; + lop3.b32 %r29564, %r7062, %r7094, %r7086, 0xD2; // end inline asm - ld.const.u32 %r4089, [matrix+2376]; + st.local.v2.u32 [%rd3+152], {%r29563, %r29564}; // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5754, %r4084; + // chi + lop3.b32 %r29557, %r7090, %r7082, %r7002, 0xD2; + lop3.b32 %r29558, %r7094, %r7086, %r7006, 0xD2; // end inline asm - ld.const.u32 %r4093, [matrix+2380]; + st.local.v2.u32 [%rd3+160], {%r29557, %r29558}; // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5758, %r4088; + // chi + lop3.b32 %r29549, %r7082, %r7002, %r7018, 0xD2; + lop3.b32 %r29550, %r7086, %r7006, %r7022, 0xD2; // end inline asm - ld.const.u32 %r4097, [matrix+2384]; + st.local.v2.u32 [%rd3+168], {%r29549, %r29550}; // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5762, %r4092; + // chi + lop3.b32 %r29541, %r7002, %r7018, %r7058, 0xD2; + lop3.b32 %r29542, %r7006, %r7022, %r7062, 0xD2; // end inline asm - ld.const.u32 %r4101, 
[matrix+2388]; + st.local.v2.u32 [%rd3+176], {%r29541, %r29542}; // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5766, %r4096; + // chi + lop3.b32 %r29567, %r6970, %r7042, %r6954, 0xD2; + lop3.b32 %r29568, %r6974, %r7046, %r6958, 0xD2; // end inline asm - ld.const.u32 %r4105, [matrix+2392]; + st.local.v2.u32 [%rd3+184], {%r29567, %r29568}; // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5770, %r4100; + // chi + lop3.b32 %r29561, %r7042, %r6954, %r7010, 0xD2; + lop3.b32 %r29562, %r7046, %r6958, %r7014, 0xD2; // end inline asm - ld.const.u32 %r4109, [matrix+2396]; + st.local.v2.u32 [%rd3+192], {%r29561, %r29562}; // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5774, %r4104; + // chi + lop3.b32 %r29555, %r6954, %r7010, %r7034, 0xD2; + lop3.b32 %r29556, %r6958, %r7014, %r7038, 0xD2; // end inline asm - ld.const.u32 %r4113, [matrix+2400]; + st.local.v2.u32 [%rd3+200], {%r29555, %r29556}; // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5778, %r4108; + // chi + lop3.b32 %r29547, %r7010, %r7034, %r6970, 0xD2; + lop3.b32 %r29548, %r7014, %r7038, %r6974, 0xD2; // end inline asm - ld.const.u32 %r4117, [matrix+2404]; + st.local.v2.u32 [%rd3+208], {%r29547, %r29548}; // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5782, %r4112; + // chi + lop3.b32 %r29539, %r7034, %r6970, %r7042, 0xD2; + lop3.b32 %r29540, %r7038, %r6974, %r7046, 0xD2; // end inline asm - ld.const.u32 %r4121, [matrix+2408]; + st.local.v2.u32 [%rd3+216], {%r29539, %r29540}; + mul.wide.s32 %rd495, %r29589, 8; + mov.u64 %rd496, keccak_round_constants; + cvta.const.u64 %rd497, %rd496; + add.s64 %rd494, %rd497, %rd495; // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5786, %r4116; + ld.global.nc.v2.u32 {%r7322,%r7323}, [%rd494]; // end inline asm - ld.const.u32 %r4125, [matrix+2412]; + xor.b32 %r29575, %r7122, %r7322; + xor.b32 %r29576, %r7123, %r7323; + add.s32 %r29589, %r29589, 1; + setp.lt.u32 %p18, %r29589, 23; + @%p18 bra $L__BB2_23; + + add.u64 %rd55, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r29587, %r29588}; + st.local.v2.u32 [%rd3+72], {%r29585, %r29586}; + st.local.v2.u32 [%rd3+40], {%r29583, %r29584}; + st.local.v2.u32 [%rd3+80], {%r29581, %r29582}; + st.local.v2.u32 [%rd3+48], {%r29579, %r29580}; + st.local.v2.u32 [%rd3+56], {%r29577, %r29578}; + st.local.v2.u32 [%rd3+24], {%r29575, %r29576}; // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5790, %r4120; + // xor5 + lop3.b32 %r7334, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r7334, %r7334, %r29569, %r29567, 0x96; + lop3.b32 %r7335, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r7335, %r7335, %r29570, %r29568, 0x96; // end inline asm - ld.const.u32 %r4129, [matrix+2416]; // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5794, %r4124; + // xor5 + lop3.b32 %r7346, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r7346, %r7346, %r29563, %r29561, 0x96; + lop3.b32 %r7347, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r7347, %r7347, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r4133, [matrix+2420]; // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5798, %r4128; + // xor5 + lop3.b32 %r7358, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r7358, %r7358, %r29557, %r29555, 0x96; + lop3.b32 %r7359, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r7359, %r7359, %r29558, %r29556, 0x96; // end inline asm - ld.const.u32 %r4137, [matrix+2424]; // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5802, %r4132; + // xor5 + lop3.b32 %r7370, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r7370, %r7370, %r29549, %r29547, 0x96; + lop3.b32 %r7371, %r29580, 
%r29554, %r29552, 0x96; + lop3.b32 %r7371, %r7371, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r4141, [matrix+2428]; // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5806, %r4136; + // xor5 + lop3.b32 %r7382, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r7382, %r7382, %r29541, %r29539, 0x96; + lop3.b32 %r7383, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r7383, %r7383, %r29542, %r29540, 0x96; // end inline asm - shr.u32 %r6044, %r4076, 6; - and.b32 %r6045, %r6044, 240; - shr.u32 %r6046, %r4140, 10; - or.b32 %r6047, %r6046, %r6045; - xor.b32 %r6048, %r5898, %r6047; - ld.const.u32 %r4145, [matrix+2432]; + mov.u32 %r7586, 1; // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5746, %r6244; + shf.l.wrap.b32 %r7394, %r7347, %r7346, %r7586; // end inline asm - ld.const.u32 %r4149, [matrix+2436]; // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5750, %r4144; + shf.l.wrap.b32 %r7398, %r7346, %r7347, %r7586; // end inline asm - ld.const.u32 %r4153, [matrix+2440]; + xor.b32 %r7613, %r7394, %r7382; + xor.b32 %r7614, %r7398, %r7383; + xor.b32 %r7541, %r29575, %r7613; + xor.b32 %r7544, %r29576, %r7614; + xor.b32 %r7504, %r29572, %r7614; + xor.b32 %r7503, %r29571, %r7613; + st.local.v2.u32 [%rd3+104], {%r7503, %r7504}; // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5754, %r4148; + shf.l.wrap.b32 %r7402, %r7359, %r7358, %r7586; // end inline asm - ld.const.u32 %r4157, [matrix+2444]; // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5758, %r4152; + shf.l.wrap.b32 %r7406, %r7358, %r7359, %r7586; // end inline asm - ld.const.u32 %r4161, [matrix+2448]; + xor.b32 %r7615, %r7402, %r7334; + xor.b32 %r7616, %r7406, %r7335; + xor.b32 %r7440, %r29585, %r7615; + xor.b32 %r7439, %r29586, %r7616; + xor.b32 %r7479, %r29564, %r7616; + xor.b32 %r7480, %r29563, %r7615; + st.local.v2.u32 [%rd3+152], {%r7480, %r7479}; // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5762, %r4156; + shf.l.wrap.b32 %r7410, %r7371, %r7370, %r7586; // end inline asm - ld.const.u32 %r4165, [matrix+2452]; // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5766, %r4160; + shf.l.wrap.b32 %r7414, %r7370, %r7371, %r7586; // end inline asm - ld.const.u32 %r4169, [matrix+2456]; + xor.b32 %r7617, %r7410, %r7346; + xor.b32 %r7618, %r7414, %r7347; + xor.b32 %r7463, %r29560, %r7618; + xor.b32 %r7464, %r29559, %r7617; + st.local.v2.u32 [%rd3+120], {%r7464, %r7463}; + xor.b32 %r7455, %r29556, %r7618; + xor.b32 %r7456, %r29555, %r7617; + st.local.v2.u32 [%rd3+200], {%r7456, %r7455}; // begin inline asm - dp4a.u32.u32 %r4168, %r4169, %r5770, %r4164; + shf.l.wrap.b32 %r7418, %r7383, %r7382, %r7586; // end inline asm - ld.const.u32 %r4173, [matrix+2460]; // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5774, %r4168; + shf.l.wrap.b32 %r7422, %r7382, %r7383, %r7586; // end inline asm - ld.const.u32 %r4177, [matrix+2464]; + xor.b32 %r7619, %r7418, %r7358; + xor.b32 %r7620, %r7422, %r7359; + xor.b32 %r7487, %r29579, %r7619; + xor.b32 %r7488, %r29580, %r7620; + xor.b32 %r7496, %r29550, %r7620; + xor.b32 %r7495, %r29549, %r7619; + st.local.v2.u32 [%rd3+168], {%r7495, %r7496}; // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5778, %r4172; + shf.l.wrap.b32 %r7426, %r7335, %r7334, %r7586; // end inline asm - ld.const.u32 %r4181, [matrix+2468]; // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5782, %r4176; + shf.l.wrap.b32 %r7430, %r7334, %r7335, %r7586; // end inline asm - ld.const.u32 %r4185, [matrix+2472]; + xor.b32 %r7621, %r7426, %r7370; + xor.b32 %r7622, %r7430, %r7371; + xor.b32 %r7447, %r29545, %r7621; + xor.b32 
%r7448, %r29546, %r7622; + xor.b32 %r7472, %r29540, %r7622; + xor.b32 %r7471, %r29539, %r7621; + st.local.v2.u32 [%rd3+216], {%r7471, %r7472}; // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5786, %r4180; + shf.l.wrap.b32 %r7434, %r7440, %r7439, %r6937; // end inline asm - ld.const.u32 %r4189, [matrix+2476]; // begin inline asm - dp4a.u32.u32 %r4188, %r4189, %r5790, %r4184; + shf.l.wrap.b32 %r7438, %r7439, %r7440, %r6937; // end inline asm - ld.const.u32 %r4193, [matrix+2480]; // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5794, %r4188; + shf.l.wrap.b32 %r7442, %r7448, %r7447, %r6945; // end inline asm - ld.const.u32 %r4197, [matrix+2484]; // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5798, %r4192; + shf.l.wrap.b32 %r7446, %r7447, %r7448, %r6945; // end inline asm - ld.const.u32 %r4201, [matrix+2488]; // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5802, %r4196; + shf.l.wrap.b32 %r7454, %r7455, %r7456, %r6953; // end inline asm - ld.const.u32 %r4205, [matrix+2492]; // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5806, %r4200; + shf.l.wrap.b32 %r7450, %r7456, %r7455, %r6953; // end inline asm - ld.const.u32 %r4209, [matrix+2496]; + st.local.v2.u32 [%rd3+96], {%r7450, %r7454}; // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5746, %r6244; + shf.l.wrap.b32 %r7458, %r7464, %r7463, %r6985; // end inline asm - ld.const.u32 %r4213, [matrix+2500]; // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5750, %r4208; + shf.l.wrap.b32 %r7462, %r7463, %r7464, %r6985; // end inline asm - ld.const.u32 %r4217, [matrix+2504]; // begin inline asm - dp4a.u32.u32 %r4216, %r4217, %r5754, %r4212; + shf.l.wrap.b32 %r7466, %r7472, %r7471, %r7033; // end inline asm - ld.const.u32 %r4221, [matrix+2508]; // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5758, %r4216; + shf.l.wrap.b32 %r7470, %r7471, %r7472, %r7033; // end inline asm - ld.const.u32 %r4225, [matrix+2512]; // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5762, %r4220; + shf.l.wrap.b32 %r7478, %r7479, %r7480, %r7057; // end inline asm - ld.const.u32 %r4229, [matrix+2516]; // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5766, %r4224; + shf.l.wrap.b32 %r7474, %r7480, %r7479, %r7057; // end inline asm - ld.const.u32 %r4233, [matrix+2520]; + st.local.v2.u32 [%rd3+88], {%r7474, %r7478}; // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5770, %r4228; + shf.l.wrap.b32 %r7482, %r7488, %r7487, %r7073; // end inline asm - ld.const.u32 %r4237, [matrix+2524]; // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5774, %r4232; + shf.l.wrap.b32 %r7486, %r7487, %r7488, %r7073; // end inline asm - ld.const.u32 %r4241, [matrix+2528]; // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5778, %r4236; + shf.l.wrap.b32 %r7490, %r7496, %r7495, %r7081; // end inline asm - ld.const.u32 %r4245, [matrix+2532]; // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5782, %r4240; + shf.l.wrap.b32 %r7494, %r7495, %r7496, %r7081; // end inline asm - ld.const.u32 %r4249, [matrix+2536]; // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5786, %r4244; + shf.l.wrap.b32 %r7498, %r7504, %r7503, %r7113; // end inline asm - ld.const.u32 %r4253, [matrix+2540]; // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5790, %r4248; + shf.l.wrap.b32 %r7502, %r7503, %r7504, %r7113; // end inline asm - ld.const.u32 %r4257, [matrix+2544]; // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5794, %r4252; + // chi + lop3.b32 %r7506, %r7541, %r7434, %r7458, 0xD2; + lop3.b32 %r7507, %r7544, %r7438, %r7462, 0xD2; // end inline asm - ld.const.u32 %r4261, 
[matrix+2548]; // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5798, %r4256; + // chi + lop3.b32 %r29722, %r7434, %r7458, %r7490, 0xD2; + lop3.b32 %r29723, %r7438, %r7462, %r7494, 0xD2; // end inline asm - ld.const.u32 %r4265, [matrix+2552]; + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5802, %r4260; + // chi + lop3.b32 %r29718, %r7458, %r7490, %r7466, 0xD2; + lop3.b32 %r29719, %r7462, %r7494, %r7470, 0xD2; // end inline asm - ld.const.u32 %r4269, [matrix+2556]; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5806, %r4264; + // chi + lop3.b32 %r29714, %r7490, %r7466, %r7541, 0xD2; + lop3.b32 %r29715, %r7494, %r7470, %r7544, 0xD2; // end inline asm - shr.u32 %r6049, %r4204, 6; - and.b32 %r6050, %r6049, 240; - shr.u32 %r6051, %r4268, 10; - or.b32 %r6052, %r6051, %r6050; - xor.b32 %r6053, %r5900, %r6052; - ld.const.u32 %r4273, [matrix+2560]; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5746, %r6244; + // chi + lop3.b32 %r29712, %r7466, %r7541, %r7434, 0xD2; + lop3.b32 %r29713, %r7470, %r7544, %r7438, 0xD2; // end inline asm - ld.const.u32 %r4277, [matrix+2564]; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; // begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5750, %r4272; + // chi + lop3.b32 %r29708, %r7482, %r7442, %r7498, 0xD2; + lop3.b32 %r29709, %r7486, %r7446, %r7502, 0xD2; // end inline asm - ld.const.u32 %r4281, [matrix+2568]; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; // begin inline asm - dp4a.u32.u32 %r4280, %r4281, %r5754, %r4276; + // chi + lop3.b32 %r29720, %r7442, %r7498, %r7474, 0xD2; + lop3.b32 %r29721, %r7446, %r7502, %r7478, 0xD2; // end inline asm - ld.const.u32 %r4285, [matrix+2572]; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5758, %r4280; + // chi + lop3.b32 %r29716, %r7498, %r7474, %r7450, 0xD2; + lop3.b32 %r29717, %r7502, %r7478, %r7454, 0xD2; // end inline asm - ld.const.u32 %r4289, [matrix+2576]; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + add.s64 %rd498, %rd497, 184; // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5762, %r4284; + ld.global.nc.v2.u32 {%r7570,%r7571}, [%rd498]; // end inline asm - ld.const.u32 %r4293, [matrix+2580]; + xor.b32 %r29710, %r7506, %r7570; + xor.b32 %r29711, %r7507, %r7571; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.u64 [%rd55], %rd354; + mov.u64 %rd502, 1179641; + st.local.u64 [%rd55+8], %rd502; + add.s32 %r226, %r30, 1; + st.local.u32 [%rd55+16], %r226; + ld.global.u64 %rd503, [%rd35]; + ld.global.u64 %rd504, [%rd35+8]; + ld.global.u64 %rd505, [%rd35+16]; + ld.global.u64 %rd506, [%rd35+24]; + ld.global.u64 %rd507, [%rd35+32]; + ld.global.u64 %rd508, [%rd35+40]; + ld.global.u64 %rd509, [%rd35+48]; + ld.global.u64 %rd510, [%rd35+56]; + st.local.u64 [%rd55+32], %rd504; + st.local.u64 [%rd55+40], %rd505; + st.local.u64 [%rd55+48], %rd506; + st.local.u64 [%rd55+56], %rd507; + st.local.u64 [%rd55+64], %rd508; + st.local.u64 [%rd55+72], %rd509; + st.local.u64 [%rd55+80], %rd510; + cvt.u32.u64 %r7623, %rd503; + xor.b32 %r7624, %r226, %r7623; + st.local.u64 [%rd55+24], %rd503; + st.local.u32 [%rd55+24], %r7624; + mov.u32 %r29590, 0; + st.local.v2.u32 [%rd55+96], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+104], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+112], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+120], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+128], {%r29590, %r29590}; + 
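For readers tracing this hunk: the removed ld.const [matrix+...] / dp4a.u32.u32 chains were the old matrix-product path; the added code is a Keccak-f[1600] round. The xor5 blocks (lop3 immediate 0x96 = a ^ b ^ c) are theta, the shf.l.wrap.b32 pairs rotate 64-bit lanes split across 32-bit halves using the rho offsets (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1), lop3 immediate 0xD2 computes a ^ (~b & c) for chi, and the xor against keccak_round_constants is iota. A minimal CUDA sketch of one round, with hypothetical helper names (rotl64, keccak_round) that are not taken from this patch:

    #include <stdint.h>

    __device__ __forceinline__ uint64_t rotl64(uint64_t x, unsigned n) {
        return (x << n) | (x >> (64 - n));
    }

    // One Keccak-f[1600] round over the flattened 5x5 lane state a[x + 5*y].
    __device__ void keccak_round(uint64_t a[25], uint64_t rc) {
        uint64_t c[5], d, b[25];
        // theta: 5-way column XOR (two lop3 0x96 ops per 32-bit half in the PTX)
        for (int x = 0; x < 5; ++x)
            c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        for (int x = 0; x < 5; ++x) {
            d = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
            for (int y = 0; y < 25; y += 5) a[x + y] ^= d;
        }
        // rho + pi: the per-lane rotations (the mov.u32 immediates feeding
        // shf.l.wrap.b32 above) followed by the lane permutation
        const int rho[25] = {0, 1, 62, 28, 27, 36, 44, 6, 55, 20, 3, 10, 43,
                             25, 39, 41, 45, 15, 21, 8, 18, 2, 61, 56, 14};
        for (int x = 0; x < 5; ++x)
            for (int y = 0; y < 5; ++y)
                b[y + 5 * ((2 * x + 3 * y) % 5)] = rotl64(a[x + 5 * y], rho[x + 5 * y]);
        // chi: lop3 immediate 0xD2 is exactly the function a ^ (~b & c)
        for (int y = 0; y < 25; y += 5)
            for (int x = 0; x < 5; ++x)
                a[x + y] = b[x + y] ^ (~b[(x + 1) % 5 + y] & b[(x + 2) % 5 + y]);
        a[0] ^= rc;  // iota: the ld.global.nc from keccak_round_constants
    }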
st.local.v2.u32 [%rd55+136], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+144], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+152], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+160], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+168], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+176], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+184], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+192], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+200], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+208], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+216], {%r29590, %r29590}; + mov.u32 %r29605, -2147483648; + st.local.v2.u32 [%rd55+88], {%r7586, %r29605}; + ld.local.v2.u32 {%r29626, %r29627}, [%rd55+24]; + mov.b64 {%r29624, %r29625}, %rd508; + shr.u64 %rd511, %rd504, 32; + cvt.u32.u64 %r29638, %rd504; + cvt.u32.u64 %r29639, %rd511; + shr.u64 %rd512, %rd509, 32; + cvt.u32.u64 %r29636, %rd509; + cvt.u32.u64 %r29637, %rd512; + shr.u64 %rd513, %rd505, 32; + cvt.u32.u64 %r29634, %rd505; + cvt.u32.u64 %r29635, %rd513; + shr.u64 %rd514, %rd510, 32; + cvt.u32.u64 %r29632, %rd510; + cvt.u32.u64 %r29633, %rd514; + shr.u64 %rd515, %rd506, 32; + cvt.u32.u64 %r29630, %rd506; + cvt.u32.u64 %r29631, %rd515; + shr.u64 %rd516, %rd507, 32; + cvt.u32.u64 %r29628, %rd507; + cvt.u32.u64 %r29629, %rd516; + mov.u32 %r29591, %r29590; + mov.u32 %r29592, %r29590; + mov.u32 %r29593, %r29590; + mov.u32 %r29594, %r29590; + mov.u32 %r29595, %r29590; + mov.u32 %r29596, %r29590; + mov.u32 %r29597, %r29590; + mov.u32 %r29598, %r29590; + mov.u32 %r29599, %r29590; + mov.u32 %r29600, %r29590; + mov.u32 %r29601, %r29590; + mov.u32 %r29602, %r29590; + mov.u32 %r29603, %r29590; + mov.u32 %r29604, %r7586; + mov.u32 %r29606, %r29590; + mov.u32 %r29607, %r29590; + mov.u32 %r29608, %r29590; + mov.u32 %r29609, %r29590; + mov.u32 %r29610, %r29590; + mov.u32 %r29611, %r29590; + mov.u32 %r29612, %r29590; + mov.u32 %r29613, %r29590; + mov.u32 %r29614, %r29590; + mov.u32 %r29615, %r29590; + mov.u32 %r29616, %r29590; + mov.u32 %r29617, %r29590; + mov.u32 %r29618, %r29590; + mov.u32 %r29619, %r29590; + mov.u32 %r29620, %r29590; + mov.u32 %r29621, %r29590; + mov.u32 %r29622, %r29590; + mov.u32 %r29623, %r29590; + mov.u32 %r29640, %r29590; + +$L__BB2_25: // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5766, %r4288; + // xor5 + lop3.b32 %r7627, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r7627, %r7627, %r29620, %r29618, 0x96; + lop3.b32 %r7628, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r7628, %r7628, %r29621, %r29619, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7639, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r7639, %r7639, %r29614, %r29612, 0x96; + lop3.b32 %r7640, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r7640, %r7640, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4297, [matrix+2584]; // begin inline asm - dp4a.u32.u32 %r4296, %r4297, %r5770, %r4292; + // xor5 + lop3.b32 %r7651, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r7651, %r7651, %r29608, %r29606, 0x96; + lop3.b32 %r7652, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r7652, %r7652, %r29609, %r29607, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7663, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r7663, %r7663, %r29600, %r29598, 0x96; + lop3.b32 %r7664, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r7664, %r7664, %r29601, %r29599, 0x96; // end inline asm - ld.const.u32 %r4301, [matrix+2588]; // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5774, %r4296; + // xor5 + lop3.b32 %r7675, %r29628, %r29596, %r29594, 0x96; + lop3.b32 
%r7675, %r7675, %r29592, %r29590, 0x96; + lop3.b32 %r7676, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r7676, %r7676, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4305, [matrix+2592]; // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5778, %r4300; + shf.l.wrap.b32 %r7687, %r7640, %r7639, %r7586; // end inline asm - ld.const.u32 %r4309, [matrix+2596]; // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5782, %r4304; + shf.l.wrap.b32 %r7691, %r7639, %r7640, %r7586; // end inline asm - ld.const.u32 %r4313, [matrix+2600]; + xor.b32 %r8121, %r7687, %r7675; + xor.b32 %r8122, %r7691, %r7676; + xor.b32 %r7954, %r29626, %r8121; + xor.b32 %r7957, %r29627, %r8122; + xor.b32 %r7861, %r29624, %r8121; + xor.b32 %r7860, %r29625, %r8122; + xor.b32 %r7908, %r29622, %r8121; + xor.b32 %r7909, %r29623, %r8122; + xor.b32 %r7813, %r29620, %r8121; + xor.b32 %r7812, %r29621, %r8122; + xor.b32 %r7764, %r29618, %r8121; + xor.b32 %r7765, %r29619, %r8122; // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5786, %r4308; + shf.l.wrap.b32 %r7695, %r7652, %r7651, %r7586; // end inline asm - ld.const.u32 %r4317, [matrix+2604]; // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5790, %r4312; + shf.l.wrap.b32 %r7699, %r7651, %r7652, %r7586; // end inline asm - ld.const.u32 %r4321, [matrix+2608]; + xor.b32 %r8123, %r7695, %r7627; + xor.b32 %r8124, %r7699, %r7628; + xor.b32 %r7916, %r29638, %r8123; + xor.b32 %r7917, %r29639, %r8124; + xor.b32 %r7733, %r29636, %r8123; + xor.b32 %r7732, %r29637, %r8124; + xor.b32 %r7892, %r29616, %r8123; + xor.b32 %r7893, %r29617, %r8124; + xor.b32 %r7853, %r29614, %r8123; + xor.b32 %r7852, %r29615, %r8124; + xor.b32 %r7836, %r29612, %r8123; + xor.b32 %r7837, %r29613, %r8124; // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5794, %r4316; + shf.l.wrap.b32 %r7703, %r7664, %r7663, %r7586; // end inline asm - ld.const.u32 %r4325, [matrix+2612]; // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5798, %r4320; + shf.l.wrap.b32 %r7707, %r7663, %r7664, %r7586; // end inline asm - ld.const.u32 %r4329, [matrix+2616]; + xor.b32 %r8125, %r7703, %r7639; + xor.b32 %r8126, %r7707, %r7640; + xor.b32 %r7773, %r29634, %r8125; + xor.b32 %r7772, %r29635, %r8126; + xor.b32 %r7900, %r29632, %r8125; + xor.b32 %r7901, %r29633, %r8126; + xor.b32 %r7781, %r29610, %r8125; + xor.b32 %r7780, %r29611, %r8126; + xor.b32 %r7884, %r29608, %r8125; + xor.b32 %r7885, %r29609, %r8126; + xor.b32 %r7749, %r29606, %r8125; + xor.b32 %r7748, %r29607, %r8126; // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5802, %r4324; + shf.l.wrap.b32 %r7711, %r7676, %r7675, %r7586; // end inline asm - ld.const.u32 %r4333, [matrix+2620]; // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5806, %r4328; + shf.l.wrap.b32 %r7715, %r7675, %r7676, %r7586; // end inline asm - ld.const.u32 %r4337, [matrix+2624]; + xor.b32 %r8127, %r7711, %r7651; + xor.b32 %r8128, %r7715, %r7652; + xor.b32 %r7868, %r29630, %r8127; + xor.b32 %r7869, %r29631, %r8128; + xor.b32 %r7845, %r29604, %r8127; + xor.b32 %r7844, %r29605, %r8128; + xor.b32 %r7788, %r29602, %r8127; + xor.b32 %r7789, %r29603, %r8128; + xor.b32 %r7876, %r29600, %r8127; + xor.b32 %r7877, %r29601, %r8128; + xor.b32 %r7805, %r29598, %r8127; + xor.b32 %r7804, %r29599, %r8128; // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5746, %r6244; + shf.l.wrap.b32 %r7719, %r7628, %r7627, %r7586; // end inline asm - ld.const.u32 %r4341, [matrix+2628]; // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5750, %r4336; + shf.l.wrap.b32 %r7723, %r7627, %r7628, %r7586; // end inline asm - 
ld.const.u32 %r4345, [matrix+2632]; + xor.b32 %r8129, %r7719, %r7663; + xor.b32 %r8130, %r7723, %r7664; + xor.b32 %r7820, %r29628, %r8129; + xor.b32 %r7821, %r29629, %r8130; + xor.b32 %r7740, %r29596, %r8129; + xor.b32 %r7741, %r29597, %r8130; + xor.b32 %r7757, %r29594, %r8129; + xor.b32 %r7756, %r29595, %r8130; + xor.b32 %r7796, %r29592, %r8129; + xor.b32 %r7797, %r29593, %r8130; + xor.b32 %r7828, %r29590, %r8129; + xor.b32 %r7829, %r29591, %r8130; + mov.u32 %r7734, 44; // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5754, %r4340; + shf.l.wrap.b32 %r7727, %r7733, %r7732, %r7734; // end inline asm - ld.const.u32 %r4349, [matrix+2636]; // begin inline asm - dp4a.u32.u32 %r4348, %r4349, %r5758, %r4344; + shf.l.wrap.b32 %r7731, %r7732, %r7733, %r7734; // end inline asm - ld.const.u32 %r4353, [matrix+2640]; + mov.u32 %r7742, 20; // begin inline asm - dp4a.u32.u32 %r4352, %r4353, %r5762, %r4348; + shf.l.wrap.b32 %r7735, %r7741, %r7740, %r7742; // end inline asm - ld.const.u32 %r4357, [matrix+2644]; // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5766, %r4352; + shf.l.wrap.b32 %r7739, %r7740, %r7741, %r7742; // end inline asm - ld.const.u32 %r4361, [matrix+2648]; + mov.u32 %r7750, 61; // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5770, %r4356; + shf.l.wrap.b32 %r7743, %r7749, %r7748, %r7750; // end inline asm - ld.const.u32 %r4365, [matrix+2652]; // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5774, %r4360; + shf.l.wrap.b32 %r7747, %r7748, %r7749, %r7750; // end inline asm - ld.const.u32 %r4369, [matrix+2656]; + mov.u32 %r7758, 39; // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5778, %r4364; + shf.l.wrap.b32 %r7751, %r7757, %r7756, %r7758; // end inline asm - ld.const.u32 %r4373, [matrix+2660]; // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5782, %r4368; + shf.l.wrap.b32 %r7755, %r7756, %r7757, %r7758; // end inline asm - ld.const.u32 %r4377, [matrix+2664]; + mov.u32 %r7766, 18; // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5786, %r4372; + shf.l.wrap.b32 %r7759, %r7765, %r7764, %r7766; // end inline asm - ld.const.u32 %r4381, [matrix+2668]; // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5790, %r4376; + shf.l.wrap.b32 %r7763, %r7764, %r7765, %r7766; // end inline asm - ld.const.u32 %r4385, [matrix+2672]; + mov.u32 %r7774, 62; // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5794, %r4380; + shf.l.wrap.b32 %r7767, %r7773, %r7772, %r7774; // end inline asm - ld.const.u32 %r4389, [matrix+2676]; // begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5798, %r4384; + shf.l.wrap.b32 %r7771, %r7772, %r7773, %r7774; // end inline asm - ld.const.u32 %r4393, [matrix+2680]; + mov.u32 %r7782, 43; // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5802, %r4388; + shf.l.wrap.b32 %r7775, %r7781, %r7780, %r7782; // end inline asm - ld.const.u32 %r4397, [matrix+2684]; // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5806, %r4392; + shf.l.wrap.b32 %r7779, %r7780, %r7781, %r7782; // end inline asm - shr.u32 %r6054, %r4332, 6; - and.b32 %r6055, %r6054, 240; - shr.u32 %r6056, %r4396, 10; - or.b32 %r6057, %r6056, %r6055; - cvt.u64.u32 %rd223, %r6057; - xor.b64 %rd224, %rd16, %rd223; - ld.const.u32 %r4401, [matrix+2688]; + mov.u32 %r7790, 25; // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5746, %r6244; + shf.l.wrap.b32 %r7783, %r7789, %r7788, %r7790; // end inline asm - ld.const.u32 %r4405, [matrix+2692]; // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5750, %r4400; + shf.l.wrap.b32 %r7787, %r7788, %r7789, %r7790; // end inline asm - ld.const.u32 %r4409, 
[matrix+2696]; + mov.u32 %r7798, 8; // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5754, %r4404; + shf.l.wrap.b32 %r7791, %r7797, %r7796, %r7798; // end inline asm - ld.const.u32 %r4413, [matrix+2700]; // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5758, %r4408; + shf.l.wrap.b32 %r7795, %r7796, %r7797, %r7798; // end inline asm - ld.const.u32 %r4417, [matrix+2704]; + mov.u32 %r7806, 56; // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5762, %r4412; + shf.l.wrap.b32 %r7799, %r7805, %r7804, %r7806; // end inline asm - ld.const.u32 %r4421, [matrix+2708]; // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5766, %r4416; + shf.l.wrap.b32 %r7803, %r7804, %r7805, %r7806; // end inline asm - ld.const.u32 %r4425, [matrix+2712]; + mov.u32 %r7814, 41; // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5770, %r4420; + shf.l.wrap.b32 %r7807, %r7813, %r7812, %r7814; // end inline asm - ld.const.u32 %r4429, [matrix+2716]; // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5774, %r4424; + shf.l.wrap.b32 %r7811, %r7812, %r7813, %r7814; // end inline asm - ld.const.u32 %r4433, [matrix+2720]; + mov.u32 %r7822, 27; // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5778, %r4428; + shf.l.wrap.b32 %r7815, %r7821, %r7820, %r7822; // end inline asm - ld.const.u32 %r4437, [matrix+2724]; // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5782, %r4432; + shf.l.wrap.b32 %r7819, %r7820, %r7821, %r7822; // end inline asm - ld.const.u32 %r4441, [matrix+2728]; + mov.u32 %r7830, 14; // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5786, %r4436; + shf.l.wrap.b32 %r7823, %r7829, %r7828, %r7830; // end inline asm - ld.const.u32 %r4445, [matrix+2732]; // begin inline asm - dp4a.u32.u32 %r4444, %r4445, %r5790, %r4440; + shf.l.wrap.b32 %r7827, %r7828, %r7829, %r7830; // end inline asm - ld.const.u32 %r4449, [matrix+2736]; + mov.u32 %r7838, 2; // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5794, %r4444; + shf.l.wrap.b32 %r7831, %r7837, %r7836, %r7838; // end inline asm - ld.const.u32 %r4453, [matrix+2740]; // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5798, %r4448; + shf.l.wrap.b32 %r7835, %r7836, %r7837, %r7838; // end inline asm - ld.const.u32 %r4457, [matrix+2744]; + mov.u32 %r7846, 55; // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5802, %r4452; + shf.l.wrap.b32 %r7839, %r7845, %r7844, %r7846; // end inline asm - ld.const.u32 %r4461, [matrix+2748]; // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5806, %r4456; + shf.l.wrap.b32 %r7843, %r7844, %r7845, %r7846; // end inline asm - ld.const.u32 %r4465, [matrix+2752]; + mov.u32 %r7854, 45; // begin inline asm - dp4a.u32.u32 %r4464, %r4465, %r5746, %r6244; + shf.l.wrap.b32 %r7847, %r7853, %r7852, %r7854; // end inline asm - ld.const.u32 %r4469, [matrix+2756]; // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5750, %r4464; + shf.l.wrap.b32 %r7851, %r7852, %r7853, %r7854; // end inline asm - ld.const.u32 %r4473, [matrix+2760]; + mov.u32 %r7862, 36; // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5754, %r4468; + shf.l.wrap.b32 %r7855, %r7861, %r7860, %r7862; // end inline asm - ld.const.u32 %r4477, [matrix+2764]; // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5758, %r4472; + shf.l.wrap.b32 %r7859, %r7860, %r7861, %r7862; // end inline asm - ld.const.u32 %r4481, [matrix+2768]; + mov.u32 %r7870, 28; // begin inline asm - dp4a.u32.u32 %r4480, %r4481, %r5762, %r4476; + shf.l.wrap.b32 %r7863, %r7869, %r7868, %r7870; // end inline asm - ld.const.u32 %r4485, [matrix+2772]; // begin inline asm - dp4a.u32.u32 %r4484, %r4485, 
%r5766, %r4480; + shf.l.wrap.b32 %r7867, %r7868, %r7869, %r7870; // end inline asm - ld.const.u32 %r4489, [matrix+2776]; + mov.u32 %r7878, 21; // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5770, %r4484; + shf.l.wrap.b32 %r7871, %r7877, %r7876, %r7878; // end inline asm - ld.const.u32 %r4493, [matrix+2780]; // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5774, %r4488; + shf.l.wrap.b32 %r7875, %r7876, %r7877, %r7878; // end inline asm - ld.const.u32 %r4497, [matrix+2784]; + mov.u32 %r7886, 15; // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5778, %r4492; + shf.l.wrap.b32 %r7879, %r7885, %r7884, %r7886; // end inline asm - ld.const.u32 %r4501, [matrix+2788]; // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5782, %r4496; + shf.l.wrap.b32 %r7883, %r7884, %r7885, %r7886; // end inline asm - ld.const.u32 %r4505, [matrix+2792]; + mov.u32 %r7894, 10; // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5786, %r4500; + shf.l.wrap.b32 %r7887, %r7893, %r7892, %r7894; // end inline asm - ld.const.u32 %r4509, [matrix+2796]; // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5790, %r4504; + shf.l.wrap.b32 %r7891, %r7892, %r7893, %r7894; // end inline asm - ld.const.u32 %r4513, [matrix+2800]; + mov.u32 %r7902, 6; // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5794, %r4508; + shf.l.wrap.b32 %r7895, %r7901, %r7900, %r7902; // end inline asm - ld.const.u32 %r4517, [matrix+2804]; // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5798, %r4512; + shf.l.wrap.b32 %r7899, %r7900, %r7901, %r7902; // end inline asm - ld.const.u32 %r4521, [matrix+2808]; + mov.u32 %r7910, 3; // begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5802, %r4516; + shf.l.wrap.b32 %r7903, %r7909, %r7908, %r7910; // end inline asm - ld.const.u32 %r4525, [matrix+2812]; // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5806, %r4520; + shf.l.wrap.b32 %r7907, %r7908, %r7909, %r7910; // end inline asm - shr.u32 %r6058, %r4460, 6; - and.b32 %r6059, %r6058, 240; - shr.u32 %r6060, %r4524, 10; - or.b32 %r6061, %r6060, %r6059; - cvt.u64.u32 %rd225, %r6061; - xor.b64 %rd226, %rd17, %rd225; - ld.const.u32 %r4529, [matrix+2816]; // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5746, %r6244; + shf.l.wrap.b32 %r7911, %r7917, %r7916, %r7586; // end inline asm - ld.const.u32 %r4533, [matrix+2820]; // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5750, %r4528; + shf.l.wrap.b32 %r7915, %r7916, %r7917, %r7586; // end inline asm - ld.const.u32 %r4537, [matrix+2824]; // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5754, %r4532; + // chi + lop3.b32 %r7919, %r7954, %r7727, %r7775, 0xD2; + lop3.b32 %r7920, %r7957, %r7731, %r7779, 0xD2; // end inline asm - ld.const.u32 %r4541, [matrix+2828]; // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5758, %r4536; + // chi + lop3.b32 %r29638, %r7727, %r7775, %r7871, 0xD2; + lop3.b32 %r29639, %r7731, %r7779, %r7875, 0xD2; // end inline asm - ld.const.u32 %r4545, [matrix+2832]; // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5762, %r4540; + // chi + lop3.b32 %r29634, %r7775, %r7871, %r7823, 0xD2; + lop3.b32 %r29635, %r7779, %r7875, %r7827, 0xD2; // end inline asm - ld.const.u32 %r4549, [matrix+2836]; // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5766, %r4544; + // chi + lop3.b32 %r29630, %r7871, %r7823, %r7954, 0xD2; + lop3.b32 %r29631, %r7875, %r7827, %r7957, 0xD2; // end inline asm - ld.const.u32 %r4553, [matrix+2840]; // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5770, %r4548; + // chi + lop3.b32 %r29628, %r7823, %r7954, %r7727, 0xD2; + lop3.b32 %r29629, 
%r7827, %r7957, %r7731, 0xD2; // end inline asm - ld.const.u32 %r4557, [matrix+2844]; // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5774, %r4552; + // chi + lop3.b32 %r29624, %r7863, %r7735, %r7903, 0xD2; + lop3.b32 %r29625, %r7867, %r7739, %r7907, 0xD2; // end inline asm - ld.const.u32 %r4561, [matrix+2848]; // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5778, %r4556; + // chi + lop3.b32 %r29636, %r7735, %r7903, %r7847, 0xD2; + lop3.b32 %r29637, %r7739, %r7907, %r7851, 0xD2; // end inline asm - ld.const.u32 %r4565, [matrix+2852]; // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5782, %r4560; + // chi + lop3.b32 %r29632, %r7903, %r7847, %r7743, 0xD2; + lop3.b32 %r29633, %r7907, %r7851, %r7747, 0xD2; // end inline asm - ld.const.u32 %r4569, [matrix+2856]; // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5786, %r4564; + // chi + lop3.b32 %r29604, %r7847, %r7743, %r7863, 0xD2; + lop3.b32 %r29605, %r7851, %r7747, %r7867, 0xD2; // end inline asm - ld.const.u32 %r4573, [matrix+2860]; + st.local.v2.u32 [%rd55+88], {%r29604, %r29605}; // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5790, %r4568; + // chi + lop3.b32 %r29596, %r7743, %r7863, %r7735, 0xD2; + lop3.b32 %r29597, %r7747, %r7867, %r7739, 0xD2; // end inline asm - ld.const.u32 %r4577, [matrix+2864]; + st.local.v2.u32 [%rd55+96], {%r29596, %r29597}; // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5794, %r4572; + // chi + lop3.b32 %r29622, %r7911, %r7895, %r7783, 0xD2; + lop3.b32 %r29623, %r7915, %r7899, %r7787, 0xD2; // end inline asm - ld.const.u32 %r4581, [matrix+2868]; + st.local.v2.u32 [%rd55+104], {%r29622, %r29623}; // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5798, %r4576; + // chi + lop3.b32 %r29616, %r7895, %r7783, %r7791, 0xD2; + lop3.b32 %r29617, %r7899, %r7787, %r7795, 0xD2; // end inline asm - ld.const.u32 %r4585, [matrix+2872]; + st.local.v2.u32 [%rd55+112], {%r29616, %r29617}; // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5802, %r4580; + // chi + lop3.b32 %r29610, %r7783, %r7791, %r7759, 0xD2; + lop3.b32 %r29611, %r7787, %r7795, %r7763, 0xD2; // end inline asm - ld.const.u32 %r4589, [matrix+2876]; + st.local.v2.u32 [%rd55+120], {%r29610, %r29611}; // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5806, %r4584; + // chi + lop3.b32 %r29602, %r7791, %r7759, %r7911, 0xD2; + lop3.b32 %r29603, %r7795, %r7763, %r7915, 0xD2; // end inline asm - ld.const.u32 %r4593, [matrix+2880]; + st.local.v2.u32 [%rd55+128], {%r29602, %r29603}; // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5746, %r6244; + // chi + lop3.b32 %r29594, %r7759, %r7911, %r7895, 0xD2; + lop3.b32 %r29595, %r7763, %r7915, %r7899, 0xD2; // end inline asm - ld.const.u32 %r4597, [matrix+2884]; + st.local.v2.u32 [%rd55+136], {%r29594, %r29595}; // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5750, %r4592; + // chi + lop3.b32 %r29620, %r7815, %r7855, %r7887, 0xD2; + lop3.b32 %r29621, %r7819, %r7859, %r7891, 0xD2; // end inline asm - ld.const.u32 %r4601, [matrix+2888]; + st.local.v2.u32 [%rd55+144], {%r29620, %r29621}; // begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5754, %r4596; + // chi + lop3.b32 %r29614, %r7855, %r7887, %r7879, 0xD2; + lop3.b32 %r29615, %r7859, %r7891, %r7883, 0xD2; // end inline asm - ld.const.u32 %r4605, [matrix+2892]; + st.local.v2.u32 [%rd55+152], {%r29614, %r29615}; // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5758, %r4600; + // chi + lop3.b32 %r29608, %r7887, %r7879, %r7799, 0xD2; + lop3.b32 %r29609, %r7891, %r7883, %r7803, 0xD2; // end inline asm - ld.const.u32 %r4609, 
[matrix+2896]; + st.local.v2.u32 [%rd55+160], {%r29608, %r29609}; // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5762, %r4604; + // chi + lop3.b32 %r29600, %r7879, %r7799, %r7815, 0xD2; + lop3.b32 %r29601, %r7883, %r7803, %r7819, 0xD2; // end inline asm - ld.const.u32 %r4613, [matrix+2900]; + st.local.v2.u32 [%rd55+168], {%r29600, %r29601}; // begin inline asm - dp4a.u32.u32 %r4612, %r4613, %r5766, %r4608; + // chi + lop3.b32 %r29592, %r7799, %r7815, %r7855, 0xD2; + lop3.b32 %r29593, %r7803, %r7819, %r7859, 0xD2; // end inline asm - ld.const.u32 %r4617, [matrix+2904]; + st.local.v2.u32 [%rd55+176], {%r29592, %r29593}; // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5770, %r4612; + // chi + lop3.b32 %r29618, %r7767, %r7839, %r7751, 0xD2; + lop3.b32 %r29619, %r7771, %r7843, %r7755, 0xD2; // end inline asm - ld.const.u32 %r4621, [matrix+2908]; + st.local.v2.u32 [%rd55+184], {%r29618, %r29619}; // begin inline asm - dp4a.u32.u32 %r4620, %r4621, %r5774, %r4616; + // chi + lop3.b32 %r29612, %r7839, %r7751, %r7807, 0xD2; + lop3.b32 %r29613, %r7843, %r7755, %r7811, 0xD2; // end inline asm - ld.const.u32 %r4625, [matrix+2912]; + st.local.v2.u32 [%rd55+192], {%r29612, %r29613}; // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5778, %r4620; + // chi + lop3.b32 %r29606, %r7751, %r7807, %r7831, 0xD2; + lop3.b32 %r29607, %r7755, %r7811, %r7835, 0xD2; // end inline asm - ld.const.u32 %r4629, [matrix+2916]; + st.local.v2.u32 [%rd55+200], {%r29606, %r29607}; // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5782, %r4624; + // chi + lop3.b32 %r29598, %r7807, %r7831, %r7767, 0xD2; + lop3.b32 %r29599, %r7811, %r7835, %r7771, 0xD2; // end inline asm - ld.const.u32 %r4633, [matrix+2920]; + st.local.v2.u32 [%rd55+208], {%r29598, %r29599}; // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5786, %r4628; + // chi + lop3.b32 %r29590, %r7831, %r7767, %r7839, 0xD2; + lop3.b32 %r29591, %r7835, %r7771, %r7843, 0xD2; // end inline asm - ld.const.u32 %r4637, [matrix+2924]; + st.local.v2.u32 [%rd55+216], {%r29590, %r29591}; + mul.wide.s32 %rd518, %r29640, 8; + add.s64 %rd517, %rd497, %rd518; // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5790, %r4632; + ld.global.nc.v2.u32 {%r8119,%r8120}, [%rd517]; // end inline asm - ld.const.u32 %r4641, [matrix+2928]; + xor.b32 %r29626, %r7919, %r8119; + xor.b32 %r29627, %r7920, %r8120; + add.s32 %r29640, %r29640, 1; + setp.lt.u32 %p19, %r29640, 23; + @%p19 bra $L__BB2_25; + + mov.u32 %r29673, 0; + mov.u32 %r8230, 1; + st.local.v2.u32 [%rd55+32], {%r29638, %r29639}; + st.local.v2.u32 [%rd55+72], {%r29636, %r29637}; + st.local.v2.u32 [%rd55+40], {%r29634, %r29635}; + st.local.v2.u32 [%rd55+80], {%r29632, %r29633}; + st.local.v2.u32 [%rd55+48], {%r29630, %r29631}; + st.local.v2.u32 [%rd55+56], {%r29628, %r29629}; + st.local.v2.u32 [%rd55+24], {%r29626, %r29627}; // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5794, %r4636; + // xor5 + lop3.b32 %r8131, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r8131, %r8131, %r29620, %r29618, 0x96; + lop3.b32 %r8132, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r8132, %r8132, %r29621, %r29619, 0x96; // end inline asm - ld.const.u32 %r4645, [matrix+2932]; // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5798, %r4640; + // xor5 + lop3.b32 %r8143, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r8143, %r8143, %r29614, %r29612, 0x96; + lop3.b32 %r8144, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r8144, %r8144, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4649, [matrix+2936]; // begin inline asm - dp4a.u32.u32 
%r4648, %r4649, %r5802, %r4644; + // xor5 + lop3.b32 %r8155, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r8155, %r8155, %r29608, %r29606, 0x96; + lop3.b32 %r8156, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r8156, %r8156, %r29609, %r29607, 0x96; // end inline asm - ld.const.u32 %r4653, [matrix+2940]; // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5806, %r4648; + // xor5 + lop3.b32 %r8167, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r8167, %r8167, %r29600, %r29598, 0x96; + lop3.b32 %r8168, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r8168, %r8168, %r29601, %r29599, 0x96; // end inline asm - shr.u32 %r6062, %r4588, 6; - and.b32 %r6063, %r6062, 240; - shr.u32 %r6064, %r4652, 10; - or.b32 %r6065, %r6064, %r6063; - cvt.u64.u32 %rd227, %r6065; - xor.b64 %rd228, %rd18, %rd227; - ld.const.u32 %r4657, [matrix+2944]; // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5746, %r6244; + // xor5 + lop3.b32 %r8179, %r29628, %r29596, %r29594, 0x96; + lop3.b32 %r8179, %r8179, %r29592, %r29590, 0x96; + lop3.b32 %r8180, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r8180, %r8180, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4661, [matrix+2948]; // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5750, %r4656; + shf.l.wrap.b32 %r8191, %r8144, %r8143, %r8230; // end inline asm - ld.const.u32 %r4665, [matrix+2952]; // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5754, %r4660; + shf.l.wrap.b32 %r8195, %r8143, %r8144, %r8230; // end inline asm - ld.const.u32 %r4669, [matrix+2956]; + xor.b32 %r8370, %r8191, %r8179; + xor.b32 %r8371, %r8195, %r8180; + xor.b32 %r8338, %r29626, %r8370; + xor.b32 %r8341, %r29627, %r8371; + xor.b32 %r8301, %r29623, %r8371; + xor.b32 %r8300, %r29622, %r8370; + st.local.v2.u32 [%rd55+104], {%r8300, %r8301}; // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5758, %r4664; + shf.l.wrap.b32 %r8199, %r8156, %r8155, %r8230; // end inline asm - ld.const.u32 %r4673, [matrix+2960]; // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5762, %r4668; + shf.l.wrap.b32 %r8203, %r8155, %r8156, %r8230; // end inline asm - ld.const.u32 %r4677, [matrix+2964]; + xor.b32 %r8372, %r8199, %r8131; + xor.b32 %r8373, %r8203, %r8132; + xor.b32 %r8237, %r29636, %r8372; + xor.b32 %r8236, %r29637, %r8373; + xor.b32 %r8276, %r29615, %r8373; + xor.b32 %r8277, %r29614, %r8372; + st.local.v2.u32 [%rd55+152], {%r8277, %r8276}; // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5766, %r4672; + shf.l.wrap.b32 %r8207, %r8168, %r8167, %r8230; // end inline asm - ld.const.u32 %r4681, [matrix+2968]; // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5770, %r4676; + shf.l.wrap.b32 %r8211, %r8167, %r8168, %r8230; // end inline asm - ld.const.u32 %r4685, [matrix+2972]; + xor.b32 %r8374, %r8207, %r8143; + xor.b32 %r8375, %r8211, %r8144; + xor.b32 %r8260, %r29611, %r8375; + xor.b32 %r8261, %r29610, %r8374; + st.local.v2.u32 [%rd55+120], {%r8261, %r8260}; + xor.b32 %r8252, %r29607, %r8375; + xor.b32 %r8253, %r29606, %r8374; + st.local.v2.u32 [%rd55+200], {%r8253, %r8252}; // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5774, %r4680; + shf.l.wrap.b32 %r8215, %r8180, %r8179, %r8230; // end inline asm - ld.const.u32 %r4689, [matrix+2976]; // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5778, %r4684; + shf.l.wrap.b32 %r8219, %r8179, %r8180, %r8230; // end inline asm - ld.const.u32 %r4693, [matrix+2980]; + xor.b32 %r8376, %r8215, %r8155; + xor.b32 %r8377, %r8219, %r8156; + xor.b32 %r8284, %r29630, %r8376; + xor.b32 %r8285, %r29631, %r8377; + xor.b32 %r8293, %r29601, %r8377; + xor.b32 %r8292, 
%r29600, %r8376; + st.local.v2.u32 [%rd55+168], {%r8292, %r8293}; // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5782, %r4688; + shf.l.wrap.b32 %r8223, %r8132, %r8131, %r8230; // end inline asm - ld.const.u32 %r4697, [matrix+2984]; // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5786, %r4692; + shf.l.wrap.b32 %r8227, %r8131, %r8132, %r8230; // end inline asm - ld.const.u32 %r4701, [matrix+2988]; + xor.b32 %r8378, %r8223, %r8167; + xor.b32 %r8379, %r8227, %r8168; + xor.b32 %r8244, %r29596, %r8378; + xor.b32 %r8245, %r29597, %r8379; + xor.b32 %r8269, %r29591, %r8379; + xor.b32 %r8268, %r29590, %r8378; + st.local.v2.u32 [%rd55+216], {%r8268, %r8269}; // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5790, %r4696; + shf.l.wrap.b32 %r8231, %r8237, %r8236, %r7734; // end inline asm - ld.const.u32 %r4705, [matrix+2992]; // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5794, %r4700; + shf.l.wrap.b32 %r8235, %r8236, %r8237, %r7734; // end inline asm - ld.const.u32 %r4709, [matrix+2996]; // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5798, %r4704; + shf.l.wrap.b32 %r8239, %r8245, %r8244, %r7742; // end inline asm - ld.const.u32 %r4713, [matrix+3000]; // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5802, %r4708; + shf.l.wrap.b32 %r8243, %r8244, %r8245, %r7742; // end inline asm - ld.const.u32 %r4717, [matrix+3004]; // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5806, %r4712; + shf.l.wrap.b32 %r8251, %r8252, %r8253, %r7750; // end inline asm - ld.const.u32 %r4721, [matrix+3008]; // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5746, %r6244; + shf.l.wrap.b32 %r8247, %r8253, %r8252, %r7750; // end inline asm - ld.const.u32 %r4725, [matrix+3012]; + st.local.v2.u32 [%rd55+96], {%r8247, %r8251}; // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5750, %r4720; + shf.l.wrap.b32 %r8255, %r8261, %r8260, %r7782; // end inline asm - ld.const.u32 %r4729, [matrix+3016]; // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5754, %r4724; + shf.l.wrap.b32 %r8259, %r8260, %r8261, %r7782; // end inline asm - ld.const.u32 %r4733, [matrix+3020]; // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5758, %r4728; + shf.l.wrap.b32 %r8263, %r8269, %r8268, %r7830; // end inline asm - ld.const.u32 %r4737, [matrix+3024]; // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5762, %r4732; + shf.l.wrap.b32 %r8267, %r8268, %r8269, %r7830; // end inline asm - ld.const.u32 %r4741, [matrix+3028]; // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5766, %r4736; + shf.l.wrap.b32 %r8275, %r8276, %r8277, %r7854; // end inline asm - ld.const.u32 %r4745, [matrix+3032]; // begin inline asm - dp4a.u32.u32 %r4744, %r4745, %r5770, %r4740; + shf.l.wrap.b32 %r8271, %r8277, %r8276, %r7854; // end inline asm - ld.const.u32 %r4749, [matrix+3036]; + st.local.v2.u32 [%rd55+88], {%r8271, %r8275}; // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5774, %r4744; + shf.l.wrap.b32 %r8279, %r8285, %r8284, %r7870; // end inline asm - ld.const.u32 %r4753, [matrix+3040]; // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5778, %r4748; + shf.l.wrap.b32 %r8283, %r8284, %r8285, %r7870; // end inline asm - ld.const.u32 %r4757, [matrix+3044]; // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5782, %r4752; + shf.l.wrap.b32 %r8287, %r8293, %r8292, %r7878; // end inline asm - ld.const.u32 %r4761, [matrix+3048]; // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5786, %r4756; + shf.l.wrap.b32 %r8291, %r8292, %r8293, %r7878; // end inline asm - ld.const.u32 %r4765, [matrix+3052]; // begin inline asm - dp4a.u32.u32 
%r4764, %r4765, %r5790, %r4760; + shf.l.wrap.b32 %r8295, %r8301, %r8300, %r7910; // end inline asm - ld.const.u32 %r4769, [matrix+3056]; // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5794, %r4764; + shf.l.wrap.b32 %r8299, %r8300, %r8301, %r7910; // end inline asm - ld.const.u32 %r4773, [matrix+3060]; // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5798, %r4768; + // chi + lop3.b32 %r8303, %r8338, %r8231, %r8255, 0xD2; + lop3.b32 %r8304, %r8341, %r8235, %r8259, 0xD2; // end inline asm - ld.const.u32 %r4777, [matrix+3064]; // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5802, %r4772; + // chi + lop3.b32 %r29773, %r8231, %r8255, %r8287, 0xD2; + lop3.b32 %r29774, %r8235, %r8259, %r8291, 0xD2; // end inline asm - ld.const.u32 %r4781, [matrix+3068]; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5806, %r4776; + // chi + lop3.b32 %r29769, %r8255, %r8287, %r8263, 0xD2; + lop3.b32 %r29770, %r8259, %r8291, %r8267, 0xD2; // end inline asm - shr.u32 %r6066, %r4716, 6; - and.b32 %r6067, %r6066, 240; - ld.const.u32 %r4785, [matrix+3072]; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5746, %r6244; + // chi + lop3.b32 %r29765, %r8287, %r8263, %r8338, 0xD2; + lop3.b32 %r29766, %r8291, %r8267, %r8341, 0xD2; // end inline asm - ld.const.u32 %r4789, [matrix+3076]; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5750, %r4784; + // chi + lop3.b32 %r29763, %r8263, %r8338, %r8231, 0xD2; + lop3.b32 %r29764, %r8267, %r8341, %r8235, 0xD2; // end inline asm - ld.const.u32 %r4793, [matrix+3080]; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; // begin inline asm - dp4a.u32.u32 %r4792, %r4793, %r5754, %r4788; + // chi + lop3.b32 %r29759, %r8279, %r8239, %r8295, 0xD2; + lop3.b32 %r29760, %r8283, %r8243, %r8299, 0xD2; // end inline asm - ld.const.u32 %r4797, [matrix+3084]; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5758, %r4792; + // chi + lop3.b32 %r29771, %r8239, %r8295, %r8271, 0xD2; + lop3.b32 %r29772, %r8243, %r8299, %r8275, 0xD2; // end inline asm - ld.const.u32 %r4801, [matrix+3088]; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5762, %r4796; + // chi + lop3.b32 %r29767, %r8295, %r8271, %r8247, 0xD2; + lop3.b32 %r29768, %r8299, %r8275, %r8251, 0xD2; // end inline asm - ld.const.u32 %r4805, [matrix+3092]; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5766, %r4800; + ld.global.nc.v2.u32 {%r8367,%r8368}, [%rd498]; // end inline asm - ld.const.u32 %r4809, [matrix+3096]; + xor.b32 %r29761, %r8303, %r8367; + xor.b32 %r29762, %r8304, %r8368; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + add.s64 %rd57, %rd55, 24; + add.s64 %rd58, %rd3, 24; + +$L__BB2_27: + shl.b32 %r8380, %r29673, 2; + cvt.u64.u32 %rd528, %r8380; + and.b64 %rd529, %rd528, 60; + add.s64 %rd530, %rd58, %rd529; + xor.b32 %r8381, %r30, %r29673; + mul.lo.s32 %r8382, %r8381, 16777619; + ld.local.u32 %r8383, [%rd530]; + xor.b32 %r8384, %r8382, %r8383; + mul.wide.u32 %rd531, %r8384, -954391867; + shr.u64 %rd532, %rd531, 32; + cvt.u32.u64 %r8385, %rd532; + sub.s32 %r8386, %r8384, %r8385; + shr.u32 %r8387, %r8386, 1; + add.s32 %r8388, %r8387, %r8385; + shr.u32 %r8389, %r8388, 20; + mul.lo.s32 %r8390, %r8389, 1179641; + sub.s32 %r8391, %r8384, %r8390; + mul.wide.u32 %rd533, %r8391, 64; + add.s64 %rd534, 
%rd471, %rd533; + mul.lo.s32 %r8392, %r29710, 16777619; + ld.global.u32 %r8393, [%rd534]; + xor.b32 %r29710, %r8392, %r8393; + mul.lo.s32 %r8394, %r29711, 16777619; + ld.global.u32 %r8395, [%rd534+4]; + xor.b32 %r29711, %r8394, %r8395; + mul.lo.s32 %r8396, %r29722, 16777619; + ld.global.u32 %r8397, [%rd534+8]; + mul.lo.s32 %r8398, %r29723, 16777619; + ld.global.u32 %r8399, [%rd534+12]; + xor.b32 %r8400, %r8398, %r8399; + xor.b32 %r29722, %r8396, %r8397; + mov.b64 %rd535, {%r29722, %r8400}; + mul.lo.s32 %r8401, %r29718, 16777619; + ld.global.u32 %r8402, [%rd534+16]; + mul.lo.s32 %r8403, %r29719, 16777619; + ld.global.u32 %r8404, [%rd534+20]; + xor.b32 %r8405, %r8403, %r8404; + xor.b32 %r29718, %r8401, %r8402; + mov.b64 %rd536, {%r29718, %r8405}; + mul.lo.s32 %r8406, %r29714, 16777619; + ld.global.u32 %r8407, [%rd534+24]; + mul.lo.s32 %r8408, %r29715, 16777619; + ld.global.u32 %r8409, [%rd534+28]; + xor.b32 %r8410, %r8408, %r8409; + xor.b32 %r29714, %r8406, %r8407; + mov.b64 %rd537, {%r29714, %r8410}; + mul.lo.s32 %r8411, %r29712, 16777619; + ld.global.u32 %r8412, [%rd534+32]; + mul.lo.s32 %r8413, %r29713, 16777619; + ld.global.u32 %r8414, [%rd534+36]; + xor.b32 %r8415, %r8413, %r8414; + xor.b32 %r29712, %r8411, %r8412; + mov.b64 %rd538, {%r29712, %r8415}; + mul.lo.s32 %r8416, %r29708, 16777619; + ld.global.u32 %r8417, [%rd534+40]; + xor.b32 %r29708, %r8416, %r8417; + mul.lo.s32 %r8418, %r29709, 16777619; + ld.global.u32 %r8419, [%rd534+44]; + xor.b32 %r29709, %r8418, %r8419; + mul.lo.s32 %r8420, %r29720, 16777619; + ld.global.u32 %r8421, [%rd534+48]; + mul.lo.s32 %r8422, %r29721, 16777619; + ld.global.u32 %r8423, [%rd534+52]; + xor.b32 %r8424, %r8422, %r8423; + xor.b32 %r29720, %r8420, %r8421; + mov.b64 %rd539, {%r29720, %r8424}; + mul.lo.s32 %r8425, %r29716, 16777619; + ld.global.u32 %r8426, [%rd534+56]; + mul.lo.s32 %r8427, %r29717, 16777619; + ld.global.u32 %r8428, [%rd534+60]; + xor.b32 %r8429, %r8427, %r8428; + xor.b32 %r29716, %r8425, %r8426; + mov.b64 %rd540, {%r29716, %r8429}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.v2.u32 [%rd3+32], {%r29722, %r8400}; + st.local.v2.u32 [%rd3+40], {%r29718, %r8405}; + st.local.v2.u32 [%rd3+48], {%r29714, %r8410}; + st.local.v2.u32 [%rd3+56], {%r29712, %r8415}; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; + st.local.v2.u32 [%rd3+72], {%r29720, %r8424}; + st.local.v2.u32 [%rd3+80], {%r29716, %r8429}; + add.s64 %rd541, %rd57, %rd529; + xor.b32 %r8430, %r226, %r29673; + mul.lo.s32 %r8431, %r8430, 16777619; + ld.local.u32 %r8432, [%rd541]; + xor.b32 %r8433, %r8431, %r8432; + mul.wide.u32 %rd542, %r8433, -954391867; + shr.u64 %rd543, %rd542, 32; + cvt.u32.u64 %r8434, %rd543; + sub.s32 %r8435, %r8433, %r8434; + shr.u32 %r8436, %r8435, 1; + add.s32 %r8437, %r8436, %r8434; + shr.u32 %r8438, %r8437, 20; + mul.lo.s32 %r8439, %r8438, 1179641; + sub.s32 %r8440, %r8433, %r8439; + mul.wide.u32 %rd544, %r8440, 64; + add.s64 %rd545, %rd471, %rd544; + mul.lo.s32 %r8441, %r29761, 16777619; + ld.global.u32 %r8442, [%rd545]; + xor.b32 %r29761, %r8441, %r8442; + mul.lo.s32 %r8443, %r29762, 16777619; + ld.global.u32 %r8444, [%rd545+4]; + xor.b32 %r29762, %r8443, %r8444; + mul.lo.s32 %r8445, %r29773, 16777619; + ld.global.u32 %r8446, [%rd545+8]; + mul.lo.s32 %r8447, %r29774, 16777619; + ld.global.u32 %r8448, [%rd545+12]; + xor.b32 %r8449, %r8447, %r8448; + xor.b32 %r29773, %r8445, %r8446; + mov.b64 %rd546, {%r29773, %r8449}; + mul.lo.s32 %r8450, %r29769, 16777619; + ld.global.u32 %r8451, [%rd545+16]; + mul.lo.s32 %r8452, %r29770, 16777619; + 
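// FNV-style mix: each 32-bit state word is multiplied by the FNV prime 16777619 + // (0x01000193) and XORed with one word of the 64-byte item fetched from the global + // table; the item index was reduced mod 1179641 by the magic-multiply sequence above. +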
ld.global.u32 %r8453, [%rd545+20]; + xor.b32 %r8454, %r8452, %r8453; + xor.b32 %r29769, %r8450, %r8451; + mov.b64 %rd547, {%r29769, %r8454}; + mul.lo.s32 %r8455, %r29765, 16777619; + ld.global.u32 %r8456, [%rd545+24]; + mul.lo.s32 %r8457, %r29766, 16777619; + ld.global.u32 %r8458, [%rd545+28]; + xor.b32 %r8459, %r8457, %r8458; + xor.b32 %r29765, %r8455, %r8456; + mov.b64 %rd548, {%r29765, %r8459}; + mul.lo.s32 %r8460, %r29763, 16777619; + ld.global.u32 %r8461, [%rd545+32]; + mul.lo.s32 %r8462, %r29764, 16777619; + ld.global.u32 %r8463, [%rd545+36]; + xor.b32 %r8464, %r8462, %r8463; + xor.b32 %r29763, %r8460, %r8461; + mov.b64 %rd549, {%r29763, %r8464}; + mul.lo.s32 %r8465, %r29759, 16777619; + ld.global.u32 %r8466, [%rd545+40]; + xor.b32 %r29759, %r8465, %r8466; + mul.lo.s32 %r8467, %r29760, 16777619; + ld.global.u32 %r8468, [%rd545+44]; + xor.b32 %r29760, %r8467, %r8468; + mul.lo.s32 %r8469, %r29771, 16777619; + ld.global.u32 %r8470, [%rd545+48]; + mul.lo.s32 %r8471, %r29772, 16777619; + ld.global.u32 %r8472, [%rd545+52]; + xor.b32 %r8473, %r8471, %r8472; + xor.b32 %r29771, %r8469, %r8470; + mov.b64 %rd550, {%r29771, %r8473}; + mul.lo.s32 %r8474, %r29767, 16777619; + ld.global.u32 %r8475, [%rd545+56]; + mul.lo.s32 %r8476, %r29768, 16777619; + ld.global.u32 %r8477, [%rd545+60]; + xor.b32 %r8478, %r8476, %r8477; + xor.b32 %r29767, %r8474, %r8475; + mov.b64 %rd551, {%r29767, %r8478}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + st.local.v2.u32 [%rd55+32], {%r29773, %r8449}; + st.local.v2.u32 [%rd55+40], {%r29769, %r8454}; + st.local.v2.u32 [%rd55+48], {%r29765, %r8459}; + st.local.v2.u32 [%rd55+56], {%r29763, %r8464}; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; + st.local.v2.u32 [%rd55+72], {%r29771, %r8473}; + st.local.v2.u32 [%rd55+80], {%r29767, %r8478}; + add.s32 %r29673, %r29673, 1; + setp.lt.u32 %p20, %r29673, 512; + shr.u64 %rd552, %rd535, 32; + cvt.u32.u64 %r29723, %rd552; + shr.u64 %rd553, %rd536, 32; + cvt.u32.u64 %r29719, %rd553; + shr.u64 %rd554, %rd537, 32; + cvt.u32.u64 %r29715, %rd554; + shr.u64 %rd555, %rd538, 32; + cvt.u32.u64 %r29713, %rd555; + shr.u64 %rd556, %rd539, 32; + cvt.u32.u64 %r29721, %rd556; + shr.u64 %rd557, %rd540, 32; + cvt.u32.u64 %r29717, %rd557; + shr.u64 %rd558, %rd546, 32; + cvt.u32.u64 %r29774, %rd558; + shr.u64 %rd559, %rd547, 32; + cvt.u32.u64 %r29770, %rd559; + shr.u64 %rd560, %rd548, 32; + cvt.u32.u64 %r29766, %rd560; + shr.u64 %rd561, %rd549, 32; + cvt.u32.u64 %r29764, %rd561; + shr.u64 %rd562, %rd550, 32; + cvt.u32.u64 %r29772, %rd562; + shr.u64 %rd563, %rd551, 32; + cvt.u32.u64 %r29768, %rd563; + @%p20 bra $L__BB2_27; + + mov.u32 %r29674, 0; + st.local.v2.u32 [%rd3+96], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+104], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+112], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+120], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+128], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+136], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+144], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+152], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+160], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+168], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+176], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+184], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+192], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+200], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+208], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+216], {%r29674, %r29674}; + mov.u32 %r29689, -2147483648; + mov.u32 %r8493, 1; + st.local.v2.u32 [%rd3+88], {%r8493, %r29689}; + mov.u32 
%r29675, %r29674; + mov.u32 %r29676, %r29674; + mov.u32 %r29677, %r29674; + mov.u32 %r29678, %r29674; + mov.u32 %r29679, %r29674; + mov.u32 %r29680, %r29674; + mov.u32 %r29681, %r29674; + mov.u32 %r29682, %r29674; + mov.u32 %r29683, %r29674; + mov.u32 %r29684, %r29674; + mov.u32 %r29685, %r29674; + mov.u32 %r29686, %r29674; + mov.u32 %r29687, %r29674; + mov.u32 %r29688, %r8493; + mov.u32 %r29690, %r29674; + mov.u32 %r29691, %r29674; + mov.u32 %r29692, %r29674; + mov.u32 %r29693, %r29674; + mov.u32 %r29694, %r29674; + mov.u32 %r29695, %r29674; + mov.u32 %r29696, %r29674; + mov.u32 %r29697, %r29674; + mov.u32 %r29698, %r29674; + mov.u32 %r29699, %r29674; + mov.u32 %r29700, %r29674; + mov.u32 %r29701, %r29674; + mov.u32 %r29702, %r29674; + mov.u32 %r29703, %r29674; + mov.u32 %r29704, %r29674; + mov.u32 %r29705, %r29674; + mov.u32 %r29706, %r29674; + mov.u32 %r29707, %r29674; + mov.u32 %r29724, %r29674; + +$L__BB2_29: // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5770, %r4804; + // xor5 + lop3.b32 %r8520, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r8520, %r8520, %r29704, %r29702, 0x96; + lop3.b32 %r8521, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r8521, %r8521, %r29705, %r29703, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8532, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r8532, %r8532, %r29698, %r29696, 0x96; + lop3.b32 %r8533, %r29723, %r29721, %r29701, 0x96; + lop3.b32 %r8533, %r8533, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r4813, [matrix+3100]; // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5774, %r4808; + // xor5 + lop3.b32 %r8544, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r8544, %r8544, %r29692, %r29690, 0x96; + lop3.b32 %r8545, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r8545, %r8545, %r29693, %r29691, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8556, %r29714, %r29688, %r29686, 0x96; + lop3.b32 %r8556, %r8556, %r29684, %r29682, 0x96; + lop3.b32 %r8557, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r8557, %r8557, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r4817, [matrix+3104]; // begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5778, %r4812; + // xor5 + lop3.b32 %r8568, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r8568, %r8568, %r29676, %r29674, 0x96; + lop3.b32 %r8569, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r8569, %r8569, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r4821, [matrix+3108]; // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5782, %r4816; + shf.l.wrap.b32 %r8580, %r8533, %r8532, %r8493; // end inline asm - ld.const.u32 %r4825, [matrix+3112]; // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5786, %r4820; + shf.l.wrap.b32 %r8584, %r8532, %r8533, %r8493; // end inline asm - ld.const.u32 %r4829, [matrix+3116]; + xor.b32 %r9014, %r8580, %r8568; + xor.b32 %r9015, %r8584, %r8569; + xor.b32 %r8847, %r29710, %r9014; + xor.b32 %r8850, %r29711, %r9015; + xor.b32 %r8754, %r29708, %r9014; + xor.b32 %r8753, %r29709, %r9015; + xor.b32 %r8801, %r29706, %r9014; + xor.b32 %r8802, %r29707, %r9015; + xor.b32 %r8706, %r29704, %r9014; + xor.b32 %r8705, %r29705, %r9015; + xor.b32 %r8657, %r29702, %r9014; + xor.b32 %r8658, %r29703, %r9015; // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5790, %r4824; + shf.l.wrap.b32 %r8588, %r8545, %r8544, %r8493; // end inline asm - ld.const.u32 %r4833, [matrix+3120]; // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5794, %r4828; + shf.l.wrap.b32 %r8592, %r8544, %r8545, %r8493; // end inline asm - ld.const.u32 %r4837, 
[matrix+3124]; + xor.b32 %r9016, %r8588, %r8520; + xor.b32 %r9017, %r8592, %r8521; + xor.b32 %r8809, %r29722, %r9016; + xor.b32 %r8810, %r29723, %r9017; + xor.b32 %r8626, %r29720, %r9016; + xor.b32 %r8625, %r29721, %r9017; + xor.b32 %r8785, %r29700, %r9016; + xor.b32 %r8786, %r29701, %r9017; + xor.b32 %r8746, %r29698, %r9016; + xor.b32 %r8745, %r29699, %r9017; + xor.b32 %r8729, %r29696, %r9016; + xor.b32 %r8730, %r29697, %r9017; // begin inline asm - dp4a.u32.u32 %r4836, %r4837, %r5798, %r4832; + shf.l.wrap.b32 %r8596, %r8557, %r8556, %r8493; // end inline asm - ld.const.u32 %r4841, [matrix+3128]; // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5802, %r4836; + shf.l.wrap.b32 %r8600, %r8556, %r8557, %r8493; // end inline asm - ld.const.u32 %r4845, [matrix+3132]; + xor.b32 %r9018, %r8596, %r8532; + xor.b32 %r9019, %r8600, %r8533; + xor.b32 %r8666, %r29718, %r9018; + xor.b32 %r8665, %r29719, %r9019; + xor.b32 %r8793, %r29716, %r9018; + xor.b32 %r8794, %r29717, %r9019; + xor.b32 %r8674, %r29694, %r9018; + xor.b32 %r8673, %r29695, %r9019; + xor.b32 %r8777, %r29692, %r9018; + xor.b32 %r8778, %r29693, %r9019; + xor.b32 %r8642, %r29690, %r9018; + xor.b32 %r8641, %r29691, %r9019; // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5806, %r4840; + shf.l.wrap.b32 %r8604, %r8569, %r8568, %r8493; // end inline asm - ld.const.u32 %r4849, [matrix+3136]; // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5746, %r6244; + shf.l.wrap.b32 %r8608, %r8568, %r8569, %r8493; // end inline asm - ld.const.u32 %r4853, [matrix+3140]; + xor.b32 %r9020, %r8604, %r8544; + xor.b32 %r9021, %r8608, %r8545; + xor.b32 %r8761, %r29714, %r9020; + xor.b32 %r8762, %r29715, %r9021; + xor.b32 %r8738, %r29688, %r9020; + xor.b32 %r8737, %r29689, %r9021; + xor.b32 %r8681, %r29686, %r9020; + xor.b32 %r8682, %r29687, %r9021; + xor.b32 %r8769, %r29684, %r9020; + xor.b32 %r8770, %r29685, %r9021; + xor.b32 %r8698, %r29682, %r9020; + xor.b32 %r8697, %r29683, %r9021; // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5750, %r4848; + shf.l.wrap.b32 %r8612, %r8521, %r8520, %r8493; // end inline asm - ld.const.u32 %r4857, [matrix+3144]; // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5754, %r4852; + shf.l.wrap.b32 %r8616, %r8520, %r8521, %r8493; // end inline asm - ld.const.u32 %r4861, [matrix+3148]; + xor.b32 %r9022, %r8612, %r8556; + xor.b32 %r9023, %r8616, %r8557; + xor.b32 %r8713, %r29712, %r9022; + xor.b32 %r8714, %r29713, %r9023; + xor.b32 %r8633, %r29680, %r9022; + xor.b32 %r8634, %r29681, %r9023; + xor.b32 %r8650, %r29678, %r9022; + xor.b32 %r8649, %r29679, %r9023; + xor.b32 %r8689, %r29676, %r9022; + xor.b32 %r8690, %r29677, %r9023; + xor.b32 %r8721, %r29674, %r9022; + xor.b32 %r8722, %r29675, %r9023; + mov.u32 %r8627, 44; // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5758, %r4856; + shf.l.wrap.b32 %r8620, %r8626, %r8625, %r8627; // end inline asm - ld.const.u32 %r4865, [matrix+3152]; // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5762, %r4860; + shf.l.wrap.b32 %r8624, %r8625, %r8626, %r8627; // end inline asm - ld.const.u32 %r4869, [matrix+3156]; + mov.u32 %r8635, 20; // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5766, %r4864; + shf.l.wrap.b32 %r8628, %r8634, %r8633, %r8635; // end inline asm - ld.const.u32 %r4873, [matrix+3160]; // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5770, %r4868; + shf.l.wrap.b32 %r8632, %r8633, %r8634, %r8635; // end inline asm - ld.const.u32 %r4877, [matrix+3164]; + mov.u32 %r8643, 61; // begin inline asm - dp4a.u32.u32 %r4876, %r4877, %r5774, %r4872; + 
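// Keccak rho step: each 64-bit lane is rotated left by its fixed offset (1, 44, 20, 61, + // 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3), each + // rotation done as a pair of 32-bit funnel shifts (shf.l.wrap.b32) on the lane halves. +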
shf.l.wrap.b32 %r8636, %r8642, %r8641, %r8643; // end inline asm - ld.const.u32 %r4881, [matrix+3168]; // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5778, %r4876; + shf.l.wrap.b32 %r8640, %r8641, %r8642, %r8643; // end inline asm - ld.const.u32 %r4885, [matrix+3172]; + mov.u32 %r8651, 39; // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5782, %r4880; + shf.l.wrap.b32 %r8644, %r8650, %r8649, %r8651; // end inline asm - ld.const.u32 %r4889, [matrix+3176]; // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5786, %r4884; + shf.l.wrap.b32 %r8648, %r8649, %r8650, %r8651; // end inline asm - ld.const.u32 %r4893, [matrix+3180]; + mov.u32 %r8659, 18; // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5790, %r4888; + shf.l.wrap.b32 %r8652, %r8658, %r8657, %r8659; // end inline asm - ld.const.u32 %r4897, [matrix+3184]; // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5794, %r4892; + shf.l.wrap.b32 %r8656, %r8657, %r8658, %r8659; // end inline asm - ld.const.u32 %r4901, [matrix+3188]; + mov.u32 %r8667, 62; // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5798, %r4896; + shf.l.wrap.b32 %r8660, %r8666, %r8665, %r8667; // end inline asm - ld.const.u32 %r4905, [matrix+3192]; // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5802, %r4900; + shf.l.wrap.b32 %r8664, %r8665, %r8666, %r8667; // end inline asm - ld.const.u32 %r4909, [matrix+3196]; + mov.u32 %r8675, 43; // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5806, %r4904; + shf.l.wrap.b32 %r8668, %r8674, %r8673, %r8675; // end inline asm - shr.u32 %r6068, %r4844, 6; - and.b32 %r6069, %r6068, 240; - shr.u32 %r6070, %r4908, 10; - and.b32 %r6071, %r6070, 255; - or.b32 %r6072, %r6071, %r6069; - cvt.u64.u32 %rd229, %r6072; - ld.const.u32 %r4913, [matrix+3200]; // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5746, %r6244; + shf.l.wrap.b32 %r8672, %r8673, %r8674, %r8675; // end inline asm - ld.const.u32 %r4917, [matrix+3204]; + mov.u32 %r8683, 25; // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5750, %r4912; + shf.l.wrap.b32 %r8676, %r8682, %r8681, %r8683; // end inline asm - ld.const.u32 %r4921, [matrix+3208]; // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5754, %r4916; + shf.l.wrap.b32 %r8680, %r8681, %r8682, %r8683; // end inline asm - ld.const.u32 %r4925, [matrix+3212]; + mov.u32 %r8691, 8; // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5758, %r4920; + shf.l.wrap.b32 %r8684, %r8690, %r8689, %r8691; // end inline asm - ld.const.u32 %r4929, [matrix+3216]; // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5762, %r4924; + shf.l.wrap.b32 %r8688, %r8689, %r8690, %r8691; // end inline asm - ld.const.u32 %r4933, [matrix+3220]; + mov.u32 %r8699, 56; // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5766, %r4928; + shf.l.wrap.b32 %r8692, %r8698, %r8697, %r8699; // end inline asm - ld.const.u32 %r4937, [matrix+3224]; // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5770, %r4932; + shf.l.wrap.b32 %r8696, %r8697, %r8698, %r8699; // end inline asm - ld.const.u32 %r4941, [matrix+3228]; + mov.u32 %r8707, 41; // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5774, %r4936; + shf.l.wrap.b32 %r8700, %r8706, %r8705, %r8707; // end inline asm - ld.const.u32 %r4945, [matrix+3232]; // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5778, %r4940; + shf.l.wrap.b32 %r8704, %r8705, %r8706, %r8707; // end inline asm - ld.const.u32 %r4949, [matrix+3236]; + mov.u32 %r8715, 27; // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5782, %r4944; + shf.l.wrap.b32 %r8708, %r8714, %r8713, %r8715; // end inline asm - 
ld.const.u32 %r4953, [matrix+3240]; // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5786, %r4948; + shf.l.wrap.b32 %r8712, %r8713, %r8714, %r8715; // end inline asm - ld.const.u32 %r4957, [matrix+3244]; + mov.u32 %r8723, 14; // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5790, %r4952; + shf.l.wrap.b32 %r8716, %r8722, %r8721, %r8723; // end inline asm - ld.const.u32 %r4961, [matrix+3248]; // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5794, %r4956; + shf.l.wrap.b32 %r8720, %r8721, %r8722, %r8723; // end inline asm - ld.const.u32 %r4965, [matrix+3252]; + mov.u32 %r8731, 2; // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5798, %r4960; + shf.l.wrap.b32 %r8724, %r8730, %r8729, %r8731; // end inline asm - ld.const.u32 %r4969, [matrix+3256]; // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5802, %r4964; + shf.l.wrap.b32 %r8728, %r8729, %r8730, %r8731; // end inline asm - ld.const.u32 %r4973, [matrix+3260]; + mov.u32 %r8739, 55; // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5806, %r4968; + shf.l.wrap.b32 %r8732, %r8738, %r8737, %r8739; // end inline asm - ld.const.u32 %r4977, [matrix+3264]; // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5746, %r6244; + shf.l.wrap.b32 %r8736, %r8737, %r8738, %r8739; // end inline asm - ld.const.u32 %r4981, [matrix+3268]; + mov.u32 %r8747, 45; // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5750, %r4976; + shf.l.wrap.b32 %r8740, %r8746, %r8745, %r8747; // end inline asm - ld.const.u32 %r4985, [matrix+3272]; // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5754, %r4980; + shf.l.wrap.b32 %r8744, %r8745, %r8746, %r8747; // end inline asm - ld.const.u32 %r4989, [matrix+3276]; + mov.u32 %r8755, 36; // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5758, %r4984; + shf.l.wrap.b32 %r8748, %r8754, %r8753, %r8755; // end inline asm - ld.const.u32 %r4993, [matrix+3280]; // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5762, %r4988; + shf.l.wrap.b32 %r8752, %r8753, %r8754, %r8755; // end inline asm - ld.const.u32 %r4997, [matrix+3284]; + mov.u32 %r8763, 28; // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5766, %r4992; + shf.l.wrap.b32 %r8756, %r8762, %r8761, %r8763; // end inline asm - ld.const.u32 %r5001, [matrix+3288]; // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5770, %r4996; + shf.l.wrap.b32 %r8760, %r8761, %r8762, %r8763; // end inline asm - ld.const.u32 %r5005, [matrix+3292]; + mov.u32 %r8771, 21; // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5774, %r5000; + shf.l.wrap.b32 %r8764, %r8770, %r8769, %r8771; // end inline asm - ld.const.u32 %r5009, [matrix+3296]; // begin inline asm - dp4a.u32.u32 %r5008, %r5009, %r5778, %r5004; + shf.l.wrap.b32 %r8768, %r8769, %r8770, %r8771; // end inline asm - ld.const.u32 %r5013, [matrix+3300]; + mov.u32 %r8779, 15; // begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5782, %r5008; + shf.l.wrap.b32 %r8772, %r8778, %r8777, %r8779; // end inline asm - ld.const.u32 %r5017, [matrix+3304]; // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5786, %r5012; + shf.l.wrap.b32 %r8776, %r8777, %r8778, %r8779; // end inline asm - ld.const.u32 %r5021, [matrix+3308]; + mov.u32 %r8787, 10; // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5790, %r5016; + shf.l.wrap.b32 %r8780, %r8786, %r8785, %r8787; // end inline asm - ld.const.u32 %r5025, [matrix+3312]; // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5794, %r5020; + shf.l.wrap.b32 %r8784, %r8785, %r8786, %r8787; // end inline asm - ld.const.u32 %r5029, [matrix+3316]; + mov.u32 %r8795, 6; // begin inline asm - 
dp4a.u32.u32 %r5028, %r5029, %r5798, %r5024; + shf.l.wrap.b32 %r8788, %r8794, %r8793, %r8795; // end inline asm - ld.const.u32 %r5033, [matrix+3320]; // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5802, %r5028; + shf.l.wrap.b32 %r8792, %r8793, %r8794, %r8795; // end inline asm - ld.const.u32 %r5037, [matrix+3324]; + mov.u32 %r8803, 3; // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5806, %r5032; + shf.l.wrap.b32 %r8796, %r8802, %r8801, %r8803; // end inline asm - shr.u32 %r6073, %r4972, 6; - and.b32 %r6074, %r6073, 240; - shr.u32 %r6075, %r5036, 10; - or.b32 %r6076, %r6075, %r6074; - cvt.u64.u32 %rd230, %r6076; - xor.b64 %rd231, %rd201, %rd230; - and.b64 %rd232, %rd9, 255; - xor.b64 %rd233, %rd232, %rd229; - ld.const.u32 %r5041, [matrix+3328]; // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5746, %r6244; + shf.l.wrap.b32 %r8800, %r8801, %r8802, %r8803; // end inline asm - ld.const.u32 %r5045, [matrix+3332]; // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5750, %r5040; + shf.l.wrap.b32 %r8804, %r8810, %r8809, %r8493; // end inline asm - ld.const.u32 %r5049, [matrix+3336]; // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5754, %r5044; + shf.l.wrap.b32 %r8808, %r8809, %r8810, %r8493; // end inline asm - ld.const.u32 %r5053, [matrix+3340]; // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5758, %r5048; + // chi + lop3.b32 %r8812, %r8847, %r8620, %r8668, 0xD2; + lop3.b32 %r8813, %r8850, %r8624, %r8672, 0xD2; // end inline asm - ld.const.u32 %r5057, [matrix+3344]; // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5762, %r5052; + // chi + lop3.b32 %r29722, %r8620, %r8668, %r8764, 0xD2; + lop3.b32 %r29723, %r8624, %r8672, %r8768, 0xD2; // end inline asm - ld.const.u32 %r5061, [matrix+3348]; // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5766, %r5056; + // chi + lop3.b32 %r29718, %r8668, %r8764, %r8716, 0xD2; + lop3.b32 %r29719, %r8672, %r8768, %r8720, 0xD2; // end inline asm - ld.const.u32 %r5065, [matrix+3352]; // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5770, %r5060; + // chi + lop3.b32 %r29714, %r8764, %r8716, %r8847, 0xD2; + lop3.b32 %r29715, %r8768, %r8720, %r8850, 0xD2; // end inline asm - ld.const.u32 %r5069, [matrix+3356]; // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5774, %r5064; + // chi + lop3.b32 %r29712, %r8716, %r8847, %r8620, 0xD2; + lop3.b32 %r29713, %r8720, %r8850, %r8624, 0xD2; // end inline asm - ld.const.u32 %r5073, [matrix+3360]; // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5778, %r5068; + // chi + lop3.b32 %r29708, %r8756, %r8628, %r8796, 0xD2; + lop3.b32 %r29709, %r8760, %r8632, %r8800, 0xD2; // end inline asm - ld.const.u32 %r5077, [matrix+3364]; // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5782, %r5072; + // chi + lop3.b32 %r29720, %r8628, %r8796, %r8740, 0xD2; + lop3.b32 %r29721, %r8632, %r8800, %r8744, 0xD2; // end inline asm - ld.const.u32 %r5081, [matrix+3368]; // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5786, %r5076; + // chi + lop3.b32 %r29716, %r8796, %r8740, %r8636, 0xD2; + lop3.b32 %r29717, %r8800, %r8744, %r8640, 0xD2; // end inline asm - ld.const.u32 %r5085, [matrix+3372]; // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5790, %r5080; + // chi + lop3.b32 %r29688, %r8740, %r8636, %r8756, 0xD2; + lop3.b32 %r29689, %r8744, %r8640, %r8760, 0xD2; // end inline asm - ld.const.u32 %r5089, [matrix+3376]; + st.local.v2.u32 [%rd3+88], {%r29688, %r29689}; // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5794, %r5084; + // chi + lop3.b32 %r29680, %r8636, %r8756, %r8628, 0xD2; + lop3.b32 
%r29681, %r8640, %r8760, %r8632, 0xD2; // end inline asm - ld.const.u32 %r5093, [matrix+3380]; + st.local.v2.u32 [%rd3+96], {%r29680, %r29681}; // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5798, %r5088; + // chi + lop3.b32 %r29706, %r8804, %r8788, %r8676, 0xD2; + lop3.b32 %r29707, %r8808, %r8792, %r8680, 0xD2; // end inline asm - ld.const.u32 %r5097, [matrix+3384]; + st.local.v2.u32 [%rd3+104], {%r29706, %r29707}; // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5802, %r5092; + // chi + lop3.b32 %r29700, %r8788, %r8676, %r8684, 0xD2; + lop3.b32 %r29701, %r8792, %r8680, %r8688, 0xD2; // end inline asm - ld.const.u32 %r5101, [matrix+3388]; + st.local.v2.u32 [%rd3+112], {%r29700, %r29701}; // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5806, %r5096; + // chi + lop3.b32 %r29694, %r8676, %r8684, %r8652, 0xD2; + lop3.b32 %r29695, %r8680, %r8688, %r8656, 0xD2; // end inline asm - ld.const.u32 %r5105, [matrix+3392]; + st.local.v2.u32 [%rd3+120], {%r29694, %r29695}; // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5746, %r6244; + // chi + lop3.b32 %r29686, %r8684, %r8652, %r8804, 0xD2; + lop3.b32 %r29687, %r8688, %r8656, %r8808, 0xD2; // end inline asm - ld.const.u32 %r5109, [matrix+3396]; + st.local.v2.u32 [%rd3+128], {%r29686, %r29687}; // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5750, %r5104; + // chi + lop3.b32 %r29678, %r8652, %r8804, %r8788, 0xD2; + lop3.b32 %r29679, %r8656, %r8808, %r8792, 0xD2; // end inline asm - ld.const.u32 %r5113, [matrix+3400]; + st.local.v2.u32 [%rd3+136], {%r29678, %r29679}; // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5754, %r5108; + // chi + lop3.b32 %r29704, %r8708, %r8748, %r8780, 0xD2; + lop3.b32 %r29705, %r8712, %r8752, %r8784, 0xD2; // end inline asm - ld.const.u32 %r5117, [matrix+3404]; + st.local.v2.u32 [%rd3+144], {%r29704, %r29705}; // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5758, %r5112; + // chi + lop3.b32 %r29698, %r8748, %r8780, %r8772, 0xD2; + lop3.b32 %r29699, %r8752, %r8784, %r8776, 0xD2; // end inline asm - ld.const.u32 %r5121, [matrix+3408]; + st.local.v2.u32 [%rd3+152], {%r29698, %r29699}; // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5762, %r5116; + // chi + lop3.b32 %r29692, %r8780, %r8772, %r8692, 0xD2; + lop3.b32 %r29693, %r8784, %r8776, %r8696, 0xD2; // end inline asm - ld.const.u32 %r5125, [matrix+3412]; + st.local.v2.u32 [%rd3+160], {%r29692, %r29693}; // begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5766, %r5120; + // chi + lop3.b32 %r29684, %r8772, %r8692, %r8708, 0xD2; + lop3.b32 %r29685, %r8776, %r8696, %r8712, 0xD2; // end inline asm - ld.const.u32 %r5129, [matrix+3416]; + st.local.v2.u32 [%rd3+168], {%r29684, %r29685}; // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5770, %r5124; + // chi + lop3.b32 %r29676, %r8692, %r8708, %r8748, 0xD2; + lop3.b32 %r29677, %r8696, %r8712, %r8752, 0xD2; // end inline asm - ld.const.u32 %r5133, [matrix+3420]; + st.local.v2.u32 [%rd3+176], {%r29676, %r29677}; // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5774, %r5128; + // chi + lop3.b32 %r29702, %r8660, %r8732, %r8644, 0xD2; + lop3.b32 %r29703, %r8664, %r8736, %r8648, 0xD2; // end inline asm - ld.const.u32 %r5137, [matrix+3424]; + st.local.v2.u32 [%rd3+184], {%r29702, %r29703}; // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5778, %r5132; + // chi + lop3.b32 %r29696, %r8732, %r8644, %r8700, 0xD2; + lop3.b32 %r29697, %r8736, %r8648, %r8704, 0xD2; // end inline asm - ld.const.u32 %r5141, [matrix+3428]; + st.local.v2.u32 [%rd3+192], {%r29696, %r29697}; // begin inline asm - 
dp4a.u32.u32 %r5140, %r5141, %r5782, %r5136; + // chi + lop3.b32 %r29690, %r8644, %r8700, %r8724, 0xD2; + lop3.b32 %r29691, %r8648, %r8704, %r8728, 0xD2; // end inline asm - ld.const.u32 %r5145, [matrix+3432]; + st.local.v2.u32 [%rd3+200], {%r29690, %r29691}; // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5786, %r5140; + // chi + lop3.b32 %r29682, %r8700, %r8724, %r8660, 0xD2; + lop3.b32 %r29683, %r8704, %r8728, %r8664, 0xD2; // end inline asm - ld.const.u32 %r5149, [matrix+3436]; + st.local.v2.u32 [%rd3+208], {%r29682, %r29683}; // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5790, %r5144; + // chi + lop3.b32 %r29674, %r8724, %r8660, %r8732, 0xD2; + lop3.b32 %r29675, %r8728, %r8664, %r8736, 0xD2; // end inline asm - ld.const.u32 %r5153, [matrix+3440]; + st.local.v2.u32 [%rd3+216], {%r29674, %r29675}; + mul.wide.s32 %rd565, %r29724, 8; + add.s64 %rd564, %rd497, %rd565; // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5794, %r5148; + ld.global.nc.v2.u32 {%r9012,%r9013}, [%rd564]; // end inline asm - ld.const.u32 %r5157, [matrix+3444]; + xor.b32 %r29710, %r8812, %r9012; + xor.b32 %r29711, %r8813, %r9013; + add.s32 %r29724, %r29724, 1; + setp.lt.u32 %p21, %r29724, 23; + @%p21 bra $L__BB2_29; + + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5798, %r5152; + // xor5 + lop3.b32 %r9024, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r9024, %r9024, %r29704, %r29702, 0x96; + lop3.b32 %r9025, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r9025, %r9025, %r29705, %r29703, 0x96; // end inline asm - ld.const.u32 %r5161, [matrix+3448]; // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5802, %r5156; + // xor5 + lop3.b32 %r9036, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r9036, %r9036, %r29698, %r29696, 0x96; + lop3.b32 %r9037, %r29723, %r29721, %r29701, 0x96; + lop3.b32 %r9037, %r9037, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r5165, [matrix+3452]; // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5806, %r5160; + // xor5 + lop3.b32 %r9048, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r9048, %r9048, %r29692, %r29690, 0x96; + lop3.b32 %r9049, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r9049, %r9049, %r29693, %r29691, 0x96; // end inline asm - shr.u32 %r6077, %r5100, 6; - and.b32 %r6078, %r6077, 240; - shr.u32 %r6079, %r5164, 10; - or.b32 %r6080, %r6079, %r6078; - cvt.u64.u32 %rd234, %r6080; - xor.b64 %rd235, %rd202, %rd234; - ld.const.u32 %r5169, [matrix+3456]; // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5746, %r6244; + // xor5 + lop3.b32 %r9060, %r29714, %r29688, %r29686, 0x96; + lop3.b32 %r9060, %r9060, %r29684, %r29682, 0x96; + lop3.b32 %r9061, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r9061, %r9061, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r5173, [matrix+3460]; // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5750, %r5168; + // xor5 + lop3.b32 %r9072, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r9072, %r9072, %r29676, %r29674, 0x96; + lop3.b32 %r9073, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r9073, %r9073, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r5177, [matrix+3464]; + mov.u32 %r9276, 1; // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5754, %r5172; + shf.l.wrap.b32 
%r9084, %r9037, %r9036, %r9276; // end inline asm - ld.const.u32 %r5181, [matrix+3468]; // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5758, %r5176; + shf.l.wrap.b32 %r9088, %r9036, %r9037, %r9276; // end inline asm - ld.const.u32 %r5185, [matrix+3472]; + xor.b32 %r9303, %r9084, %r9072; + xor.b32 %r9304, %r9088, %r9073; + xor.b32 %r9231, %r29710, %r9303; + xor.b32 %r9234, %r29711, %r9304; + xor.b32 %r9194, %r29707, %r9304; + xor.b32 %r9193, %r29706, %r9303; + st.local.v2.u32 [%rd3+104], {%r9193, %r9194}; // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5762, %r5180; + shf.l.wrap.b32 %r9092, %r9049, %r9048, %r9276; // end inline asm - ld.const.u32 %r5189, [matrix+3476]; // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5766, %r5184; + shf.l.wrap.b32 %r9096, %r9048, %r9049, %r9276; // end inline asm - ld.const.u32 %r5193, [matrix+3480]; + xor.b32 %r9305, %r9092, %r9024; + xor.b32 %r9306, %r9096, %r9025; + xor.b32 %r9130, %r29720, %r9305; + xor.b32 %r9129, %r29721, %r9306; + xor.b32 %r9169, %r29699, %r9306; + xor.b32 %r9170, %r29698, %r9305; + st.local.v2.u32 [%rd3+152], {%r9170, %r9169}; // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5770, %r5188; + shf.l.wrap.b32 %r9100, %r9061, %r9060, %r9276; // end inline asm - ld.const.u32 %r5197, [matrix+3484]; // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5774, %r5192; + shf.l.wrap.b32 %r9104, %r9060, %r9061, %r9276; // end inline asm - ld.const.u32 %r5201, [matrix+3488]; + xor.b32 %r9307, %r9100, %r9036; + xor.b32 %r9308, %r9104, %r9037; + xor.b32 %r9153, %r29695, %r9308; + xor.b32 %r9154, %r29694, %r9307; + st.local.v2.u32 [%rd3+120], {%r9154, %r9153}; + xor.b32 %r9145, %r29691, %r9308; + xor.b32 %r9146, %r29690, %r9307; + st.local.v2.u32 [%rd3+200], {%r9146, %r9145}; // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5778, %r5196; + shf.l.wrap.b32 %r9108, %r9073, %r9072, %r9276; // end inline asm - ld.const.u32 %r5205, [matrix+3492]; // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5782, %r5200; + shf.l.wrap.b32 %r9112, %r9072, %r9073, %r9276; // end inline asm - ld.const.u32 %r5209, [matrix+3496]; + xor.b32 %r9309, %r9108, %r9048; + xor.b32 %r9310, %r9112, %r9049; + xor.b32 %r9177, %r29714, %r9309; + xor.b32 %r9178, %r29715, %r9310; + xor.b32 %r9186, %r29685, %r9310; + xor.b32 %r9185, %r29684, %r9309; + st.local.v2.u32 [%rd3+168], {%r9185, %r9186}; // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5786, %r5204; + shf.l.wrap.b32 %r9116, %r9025, %r9024, %r9276; // end inline asm - ld.const.u32 %r5213, [matrix+3500]; // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5790, %r5208; + shf.l.wrap.b32 %r9120, %r9024, %r9025, %r9276; // end inline asm - ld.const.u32 %r5217, [matrix+3504]; + xor.b32 %r9311, %r9116, %r9060; + xor.b32 %r9312, %r9120, %r9061; + xor.b32 %r9137, %r29680, %r9311; + xor.b32 %r9138, %r29681, %r9312; + xor.b32 %r9162, %r29675, %r9312; + xor.b32 %r9161, %r29674, %r9311; + st.local.v2.u32 [%rd3+216], {%r9161, %r9162}; // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5794, %r5212; + shf.l.wrap.b32 %r9124, %r9130, %r9129, %r8627; // end inline asm - ld.const.u32 %r5221, [matrix+3508]; // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5798, %r5216; + shf.l.wrap.b32 %r9128, %r9129, %r9130, %r8627; // end inline asm - ld.const.u32 %r5225, [matrix+3512]; // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5802, %r5220; + shf.l.wrap.b32 %r9132, %r9138, %r9137, %r8635; // end inline asm - ld.const.u32 %r5229, [matrix+3516]; // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5806, %r5224; + 
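// final Keccak round, apparently unrolled after the 23-iteration loop: theta (xor5 as + // lop3.b32 with truth-table byte 0x96 = a^b^c), rho/pi, chi (lop3.b32 with 0xD2 = + // a^(~b&c)), and iota as an XOR with the last round constant loaded from [%rd498]. +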
shf.l.wrap.b32 %r9136, %r9137, %r9138, %r8635; // end inline asm - ld.const.u32 %r5233, [matrix+3520]; // begin inline asm - dp4a.u32.u32 %r5232, %r5233, %r5746, %r6244; + shf.l.wrap.b32 %r9144, %r9145, %r9146, %r8643; // end inline asm - ld.const.u32 %r5237, [matrix+3524]; // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5750, %r5232; + shf.l.wrap.b32 %r9140, %r9146, %r9145, %r8643; // end inline asm - ld.const.u32 %r5241, [matrix+3528]; + st.local.v2.u32 [%rd3+96], {%r9140, %r9144}; // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5754, %r5236; + shf.l.wrap.b32 %r9148, %r9154, %r9153, %r8675; // end inline asm - ld.const.u32 %r5245, [matrix+3532]; // begin inline asm - dp4a.u32.u32 %r5244, %r5245, %r5758, %r5240; + shf.l.wrap.b32 %r9152, %r9153, %r9154, %r8675; // end inline asm - ld.const.u32 %r5249, [matrix+3536]; // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5762, %r5244; + shf.l.wrap.b32 %r9156, %r9162, %r9161, %r8723; // end inline asm - ld.const.u32 %r5253, [matrix+3540]; // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5766, %r5248; + shf.l.wrap.b32 %r9160, %r9161, %r9162, %r8723; // end inline asm - ld.const.u32 %r5257, [matrix+3544]; // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5770, %r5252; + shf.l.wrap.b32 %r9168, %r9169, %r9170, %r8747; // end inline asm - ld.const.u32 %r5261, [matrix+3548]; // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5774, %r5256; + shf.l.wrap.b32 %r9164, %r9170, %r9169, %r8747; // end inline asm - ld.const.u32 %r5265, [matrix+3552]; + st.local.v2.u32 [%rd3+88], {%r9164, %r9168}; // begin inline asm - dp4a.u32.u32 %r5264, %r5265, %r5778, %r5260; + shf.l.wrap.b32 %r9172, %r9178, %r9177, %r8763; // end inline asm - ld.const.u32 %r5269, [matrix+3556]; // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5782, %r5264; + shf.l.wrap.b32 %r9176, %r9177, %r9178, %r8763; // end inline asm - ld.const.u32 %r5273, [matrix+3560]; // begin inline asm - dp4a.u32.u32 %r5272, %r5273, %r5786, %r5268; + shf.l.wrap.b32 %r9180, %r9186, %r9185, %r8771; // end inline asm - ld.const.u32 %r5277, [matrix+3564]; // begin inline asm - dp4a.u32.u32 %r5276, %r5277, %r5790, %r5272; + shf.l.wrap.b32 %r9184, %r9185, %r9186, %r8771; // end inline asm - ld.const.u32 %r5281, [matrix+3568]; // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5794, %r5276; + shf.l.wrap.b32 %r9188, %r9194, %r9193, %r8803; // end inline asm - ld.const.u32 %r5285, [matrix+3572]; // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5798, %r5280; + shf.l.wrap.b32 %r9192, %r9193, %r9194, %r8803; // end inline asm - ld.const.u32 %r5289, [matrix+3576]; // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5802, %r5284; + // chi + lop3.b32 %r9196, %r9231, %r9124, %r9148, 0xD2; + lop3.b32 %r9197, %r9234, %r9128, %r9152, 0xD2; // end inline asm - ld.const.u32 %r5293, [matrix+3580]; // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5806, %r5288; + // chi + lop3.b32 %r9204, %r9124, %r9148, %r9180, 0xD2; + lop3.b32 %r9205, %r9128, %r9152, %r9184, 0xD2; // end inline asm - shr.u32 %r6081, %r5228, 6; - and.b32 %r6082, %r6081, 240; - shr.u32 %r6083, %r5292, 10; - or.b32 %r6084, %r6083, %r6082; - cvt.u64.u32 %rd236, %r6084; - xor.b64 %rd237, %rd203, %rd236; - ld.const.u32 %r5297, [matrix+3584]; + st.local.v2.u32 [%rd3+32], {%r9204, %r9205}; // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5746, %r6244; + // chi + lop3.b32 %r9212, %r9148, %r9180, %r9156, 0xD2; + lop3.b32 %r9213, %r9152, %r9184, %r9160, 0xD2; // end inline asm - ld.const.u32 %r5301, [matrix+3588]; + st.local.v2.u32 
[%rd3+40], {%r9212, %r9213}; // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5750, %r5296; + // chi + lop3.b32 %r9220, %r9180, %r9156, %r9231, 0xD2; + lop3.b32 %r9221, %r9184, %r9160, %r9234, 0xD2; // end inline asm - ld.const.u32 %r5305, [matrix+3592]; + st.local.v2.u32 [%rd3+48], {%r9220, %r9221}; // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5754, %r5300; + // chi + lop3.b32 %r9228, %r9156, %r9231, %r9124, 0xD2; + lop3.b32 %r9229, %r9160, %r9234, %r9128, 0xD2; // end inline asm - ld.const.u32 %r5309, [matrix+3596]; + st.local.v2.u32 [%rd3+56], {%r9228, %r9229}; // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5758, %r5304; + // chi + lop3.b32 %r9236, %r9172, %r9132, %r9188, 0xD2; + lop3.b32 %r9237, %r9176, %r9136, %r9192, 0xD2; // end inline asm - ld.const.u32 %r5313, [matrix+3600]; + st.local.v2.u32 [%rd3+64], {%r9236, %r9237}; // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5762, %r5308; + // chi + lop3.b32 %r9244, %r9132, %r9188, %r9164, 0xD2; + lop3.b32 %r9245, %r9136, %r9192, %r9168, 0xD2; // end inline asm - ld.const.u32 %r5317, [matrix+3604]; + st.local.v2.u32 [%rd3+72], {%r9244, %r9245}; // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5766, %r5312; + // chi + lop3.b32 %r9252, %r9188, %r9164, %r9140, 0xD2; + lop3.b32 %r9253, %r9192, %r9168, %r9144, 0xD2; // end inline asm - ld.const.u32 %r5321, [matrix+3608]; + st.local.v2.u32 [%rd3+80], {%r9252, %r9253}; // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5770, %r5316; + ld.global.nc.v2.u32 {%r9260,%r9261}, [%rd498]; // end inline asm - ld.const.u32 %r5325, [matrix+3612]; + xor.b32 %r9313, %r9197, %r9261; + xor.b32 %r9314, %r9196, %r9260; + mov.b64 %rd1261, {%r9314, %r9313}; + mov.b64 %rd1262, {%r9204, %r9205}; + mov.b64 %rd1263, {%r9212, %r9213}; + mov.b64 %rd62, {%r9220, %r9221}; + mov.b64 %rd1264, {%r9228, %r9229}; + mov.b64 %rd64, {%r9236, %r9237}; + mov.b64 %rd65, {%r9244, %r9245}; + mov.b64 %rd66, {%r9252, %r9253}; + mov.u32 %r29725, 0; + st.local.v2.u32 [%rd3+24], {%r9314, %r9313}; + st.local.v2.u32 [%rd55+96], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+104], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+112], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+120], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+128], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+136], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+144], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+152], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+160], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+168], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+176], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+184], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+192], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+200], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+208], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+216], {%r29725, %r29725}; + mov.u32 %r29740, -2147483648; + st.local.v2.u32 [%rd55+88], {%r9276, %r29740}; + mov.u32 %r29726, %r29725; + mov.u32 %r29727, %r29725; + mov.u32 %r29728, %r29725; + mov.u32 %r29729, %r29725; + mov.u32 %r29730, %r29725; + mov.u32 %r29731, %r29725; + mov.u32 %r29732, %r29725; + mov.u32 %r29733, %r29725; + mov.u32 %r29734, %r29725; + mov.u32 %r29735, %r29725; + mov.u32 %r29736, %r29725; + mov.u32 %r29737, %r29725; + mov.u32 %r29738, %r29725; + mov.u32 %r29739, %r9276; + mov.u32 %r29741, %r29725; + mov.u32 %r29742, %r29725; + mov.u32 %r29743, %r29725; + mov.u32 %r29744, %r29725; + mov.u32 %r29745, %r29725; + mov.u32 %r29746, %r29725; + mov.u32 %r29747, %r29725; + mov.u32 %r29748, %r29725; + mov.u32 %r29749, %r29725; + 
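// reset the register-resident sponge state for the second lane: every word zeroed + // except the padding word 0x8000000000000001 at byte offset 88, before re-entering + // the permutation loop at $L__BB2_31. +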
mov.u32 %r29750, %r29725; + mov.u32 %r29751, %r29725; + mov.u32 %r29752, %r29725; + mov.u32 %r29753, %r29725; + mov.u32 %r29754, %r29725; + mov.u32 %r29755, %r29725; + mov.u32 %r29756, %r29725; + mov.u32 %r29757, %r29725; + mov.u32 %r29758, %r29725; + mov.u32 %r29775, %r29725; + +$L__BB2_31: // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5774, %r5320; + // xor5 + lop3.b32 %r9315, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9315, %r9315, %r29755, %r29753, 0x96; + lop3.b32 %r9316, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9316, %r9316, %r29756, %r29754, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9327, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9327, %r9327, %r29749, %r29747, 0x96; + lop3.b32 %r9328, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9328, %r9328, %r29750, %r29748, 0x96; // end inline asm - ld.const.u32 %r5329, [matrix+3616]; // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5778, %r5324; + // xor5 + lop3.b32 %r9339, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9339, %r9339, %r29743, %r29741, 0x96; + lop3.b32 %r9340, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9340, %r9340, %r29744, %r29742, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9351, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9351, %r9351, %r29735, %r29733, 0x96; + lop3.b32 %r9352, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9352, %r9352, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5333, [matrix+3620]; // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5782, %r5328; + // xor5 + lop3.b32 %r9363, %r29763, %r29731, %r29729, 0x96; + lop3.b32 %r9363, %r9363, %r29727, %r29725, 0x96; + lop3.b32 %r9364, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9364, %r9364, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5337, [matrix+3624]; // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5786, %r5332; + shf.l.wrap.b32 %r9375, %r9328, %r9327, %r9276; // end inline asm - ld.const.u32 %r5341, [matrix+3628]; // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5790, %r5336; + shf.l.wrap.b32 %r9379, %r9327, %r9328, %r9276; // end inline asm - ld.const.u32 %r5345, [matrix+3632]; + xor.b32 %r9809, %r9375, %r9363; + xor.b32 %r9810, %r9379, %r9364; + xor.b32 %r9642, %r29761, %r9809; + xor.b32 %r9645, %r29762, %r9810; + xor.b32 %r9549, %r29759, %r9809; + xor.b32 %r9548, %r29760, %r9810; + xor.b32 %r9596, %r29757, %r9809; + xor.b32 %r9597, %r29758, %r9810; + xor.b32 %r9501, %r29755, %r9809; + xor.b32 %r9500, %r29756, %r9810; + xor.b32 %r9452, %r29753, %r9809; + xor.b32 %r9453, %r29754, %r9810; // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5794, %r5340; + shf.l.wrap.b32 %r9383, %r9340, %r9339, %r9276; // end inline asm - ld.const.u32 %r5349, [matrix+3636]; // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5798, %r5344; + shf.l.wrap.b32 %r9387, %r9339, %r9340, %r9276; // end inline asm - ld.const.u32 %r5353, [matrix+3640]; + xor.b32 %r9811, %r9383, %r9315; + xor.b32 %r9812, %r9387, %r9316; + xor.b32 %r9604, %r29773, %r9811; + xor.b32 %r9605, %r29774, %r9812; + xor.b32 %r9421, %r29771, %r9811; + xor.b32 %r9420, %r29772, %r9812; + xor.b32 %r9580, %r29751, %r9811; + xor.b32 %r9581, %r29752, %r9812; + xor.b32 %r9541, %r29749, %r9811; + xor.b32 %r9540, %r29750, %r9812; + xor.b32 %r9524, %r29747, %r9811; + xor.b32 %r9525, %r29748, %r9812; // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5802, %r5348; + shf.l.wrap.b32 %r9391, %r9352, %r9351, %r9276; // end inline asm - ld.const.u32 %r5357, [matrix+3644]; // begin inline asm - 
dp4a.u32.u32 %r5356, %r5357, %r5806, %r5352; + shf.l.wrap.b32 %r9395, %r9351, %r9352, %r9276; // end inline asm - ld.const.u32 %r5361, [matrix+3648]; + xor.b32 %r9813, %r9391, %r9327; + xor.b32 %r9814, %r9395, %r9328; + xor.b32 %r9461, %r29769, %r9813; + xor.b32 %r9460, %r29770, %r9814; + xor.b32 %r9588, %r29767, %r9813; + xor.b32 %r9589, %r29768, %r9814; + xor.b32 %r9469, %r29745, %r9813; + xor.b32 %r9468, %r29746, %r9814; + xor.b32 %r9572, %r29743, %r9813; + xor.b32 %r9573, %r29744, %r9814; + xor.b32 %r9437, %r29741, %r9813; + xor.b32 %r9436, %r29742, %r9814; // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5746, %r6244; + shf.l.wrap.b32 %r9399, %r9364, %r9363, %r9276; // end inline asm - ld.const.u32 %r5365, [matrix+3652]; // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5750, %r5360; + shf.l.wrap.b32 %r9403, %r9363, %r9364, %r9276; // end inline asm - ld.const.u32 %r5369, [matrix+3656]; + xor.b32 %r9815, %r9399, %r9339; + xor.b32 %r9816, %r9403, %r9340; + xor.b32 %r9556, %r29765, %r9815; + xor.b32 %r9557, %r29766, %r9816; + xor.b32 %r9533, %r29739, %r9815; + xor.b32 %r9532, %r29740, %r9816; + xor.b32 %r9476, %r29737, %r9815; + xor.b32 %r9477, %r29738, %r9816; + xor.b32 %r9564, %r29735, %r9815; + xor.b32 %r9565, %r29736, %r9816; + xor.b32 %r9493, %r29733, %r9815; + xor.b32 %r9492, %r29734, %r9816; // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5754, %r5364; + shf.l.wrap.b32 %r9407, %r9316, %r9315, %r9276; // end inline asm - ld.const.u32 %r5373, [matrix+3660]; // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5758, %r5368; + shf.l.wrap.b32 %r9411, %r9315, %r9316, %r9276; // end inline asm - ld.const.u32 %r5377, [matrix+3664]; + xor.b32 %r9817, %r9407, %r9351; + xor.b32 %r9818, %r9411, %r9352; + xor.b32 %r9508, %r29763, %r9817; + xor.b32 %r9509, %r29764, %r9818; + xor.b32 %r9428, %r29731, %r9817; + xor.b32 %r9429, %r29732, %r9818; + xor.b32 %r9445, %r29729, %r9817; + xor.b32 %r9444, %r29730, %r9818; + xor.b32 %r9484, %r29727, %r9817; + xor.b32 %r9485, %r29728, %r9818; + xor.b32 %r9516, %r29725, %r9817; + xor.b32 %r9517, %r29726, %r9818; + mov.u32 %r9422, 44; // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5762, %r5372; + shf.l.wrap.b32 %r9415, %r9421, %r9420, %r9422; // end inline asm - ld.const.u32 %r5381, [matrix+3668]; // begin inline asm - dp4a.u32.u32 %r5380, %r5381, %r5766, %r5376; + shf.l.wrap.b32 %r9419, %r9420, %r9421, %r9422; // end inline asm - ld.const.u32 %r5385, [matrix+3672]; + mov.u32 %r9430, 20; // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5770, %r5380; + shf.l.wrap.b32 %r9423, %r9429, %r9428, %r9430; // end inline asm - ld.const.u32 %r5389, [matrix+3676]; // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5774, %r5384; + shf.l.wrap.b32 %r9427, %r9428, %r9429, %r9430; // end inline asm - ld.const.u32 %r5393, [matrix+3680]; + mov.u32 %r9438, 61; // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5778, %r5388; + shf.l.wrap.b32 %r9431, %r9437, %r9436, %r9438; // end inline asm - ld.const.u32 %r5397, [matrix+3684]; // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5782, %r5392; + shf.l.wrap.b32 %r9435, %r9436, %r9437, %r9438; // end inline asm - ld.const.u32 %r5401, [matrix+3688]; + mov.u32 %r9446, 39; // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5786, %r5396; + shf.l.wrap.b32 %r9439, %r9445, %r9444, %r9446; // end inline asm - ld.const.u32 %r5405, [matrix+3692]; // begin inline asm - dp4a.u32.u32 %r5404, %r5405, %r5790, %r5400; + shf.l.wrap.b32 %r9443, %r9444, %r9445, %r9446; // end inline asm - ld.const.u32 %r5409, 
[matrix+3696]; + mov.u32 %r9454, 18; // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5794, %r5404; + shf.l.wrap.b32 %r9447, %r9453, %r9452, %r9454; // end inline asm - ld.const.u32 %r5413, [matrix+3700]; // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5798, %r5408; + shf.l.wrap.b32 %r9451, %r9452, %r9453, %r9454; // end inline asm - ld.const.u32 %r5417, [matrix+3704]; + mov.u32 %r9462, 62; // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5802, %r5412; + shf.l.wrap.b32 %r9455, %r9461, %r9460, %r9462; // end inline asm - ld.const.u32 %r5421, [matrix+3708]; // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5806, %r5416; + shf.l.wrap.b32 %r9459, %r9460, %r9461, %r9462; // end inline asm - shr.u32 %r6085, %r5356, 6; - and.b32 %r6086, %r6085, 240; - shr.u32 %r6087, %r5420, 10; - or.b32 %r6088, %r6087, %r6086; - cvt.u64.u32 %rd238, %r6088; - xor.b64 %rd239, %rd204, %rd238; - ld.const.u32 %r5425, [matrix+3712]; + mov.u32 %r9470, 43; // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5746, %r6244; + shf.l.wrap.b32 %r9463, %r9469, %r9468, %r9470; // end inline asm - ld.const.u32 %r5429, [matrix+3716]; // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5750, %r5424; + shf.l.wrap.b32 %r9467, %r9468, %r9469, %r9470; // end inline asm - ld.const.u32 %r5433, [matrix+3720]; + mov.u32 %r9478, 25; // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5754, %r5428; + shf.l.wrap.b32 %r9471, %r9477, %r9476, %r9478; // end inline asm - ld.const.u32 %r5437, [matrix+3724]; // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5758, %r5432; + shf.l.wrap.b32 %r9475, %r9476, %r9477, %r9478; // end inline asm - ld.const.u32 %r5441, [matrix+3728]; + mov.u32 %r9486, 8; // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5762, %r5436; + shf.l.wrap.b32 %r9479, %r9485, %r9484, %r9486; // end inline asm - ld.const.u32 %r5445, [matrix+3732]; // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5766, %r5440; + shf.l.wrap.b32 %r9483, %r9484, %r9485, %r9486; // end inline asm - ld.const.u32 %r5449, [matrix+3736]; + mov.u32 %r9494, 56; // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5770, %r5444; + shf.l.wrap.b32 %r9487, %r9493, %r9492, %r9494; // end inline asm - ld.const.u32 %r5453, [matrix+3740]; // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5774, %r5448; + shf.l.wrap.b32 %r9491, %r9492, %r9493, %r9494; // end inline asm - ld.const.u32 %r5457, [matrix+3744]; + mov.u32 %r9502, 41; // begin inline asm - dp4a.u32.u32 %r5456, %r5457, %r5778, %r5452; + shf.l.wrap.b32 %r9495, %r9501, %r9500, %r9502; // end inline asm - ld.const.u32 %r5461, [matrix+3748]; // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5782, %r5456; + shf.l.wrap.b32 %r9499, %r9500, %r9501, %r9502; // end inline asm - ld.const.u32 %r5465, [matrix+3752]; + mov.u32 %r9510, 27; // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5786, %r5460; + shf.l.wrap.b32 %r9503, %r9509, %r9508, %r9510; // end inline asm - ld.const.u32 %r5469, [matrix+3756]; // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5790, %r5464; + shf.l.wrap.b32 %r9507, %r9508, %r9509, %r9510; // end inline asm - ld.const.u32 %r5473, [matrix+3760]; + mov.u32 %r9518, 14; // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5794, %r5468; + shf.l.wrap.b32 %r9511, %r9517, %r9516, %r9518; // end inline asm - ld.const.u32 %r5477, [matrix+3764]; // begin inline asm - dp4a.u32.u32 %r5476, %r5477, %r5798, %r5472; + shf.l.wrap.b32 %r9515, %r9516, %r9517, %r9518; // end inline asm - ld.const.u32 %r5481, [matrix+3768]; + mov.u32 %r9526, 2; // begin inline asm - dp4a.u32.u32 
%r5480, %r5481, %r5802, %r5476; + shf.l.wrap.b32 %r9519, %r9525, %r9524, %r9526; // end inline asm - ld.const.u32 %r5485, [matrix+3772]; // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5806, %r5480; + shf.l.wrap.b32 %r9523, %r9524, %r9525, %r9526; // end inline asm - ld.const.u32 %r5489, [matrix+3776]; + mov.u32 %r9534, 55; // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5746, %r6244; + shf.l.wrap.b32 %r9527, %r9533, %r9532, %r9534; // end inline asm - ld.const.u32 %r5493, [matrix+3780]; // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5750, %r5488; + shf.l.wrap.b32 %r9531, %r9532, %r9533, %r9534; // end inline asm - ld.const.u32 %r5497, [matrix+3784]; + mov.u32 %r9542, 45; // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5754, %r5492; + shf.l.wrap.b32 %r9535, %r9541, %r9540, %r9542; // end inline asm - ld.const.u32 %r5501, [matrix+3788]; // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5758, %r5496; + shf.l.wrap.b32 %r9539, %r9540, %r9541, %r9542; // end inline asm - ld.const.u32 %r5505, [matrix+3792]; + mov.u32 %r9550, 36; // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5762, %r5500; + shf.l.wrap.b32 %r9543, %r9549, %r9548, %r9550; // end inline asm - ld.const.u32 %r5509, [matrix+3796]; // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5766, %r5504; + shf.l.wrap.b32 %r9547, %r9548, %r9549, %r9550; // end inline asm - ld.const.u32 %r5513, [matrix+3800]; + mov.u32 %r9558, 28; // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5770, %r5508; + shf.l.wrap.b32 %r9551, %r9557, %r9556, %r9558; // end inline asm - ld.const.u32 %r5517, [matrix+3804]; // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5774, %r5512; + shf.l.wrap.b32 %r9555, %r9556, %r9557, %r9558; // end inline asm - ld.const.u32 %r5521, [matrix+3808]; + mov.u32 %r9566, 21; // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5778, %r5516; + shf.l.wrap.b32 %r9559, %r9565, %r9564, %r9566; // end inline asm - ld.const.u32 %r5525, [matrix+3812]; // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5782, %r5520; + shf.l.wrap.b32 %r9563, %r9564, %r9565, %r9566; // end inline asm - ld.const.u32 %r5529, [matrix+3816]; + mov.u32 %r9574, 15; // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5786, %r5524; + shf.l.wrap.b32 %r9567, %r9573, %r9572, %r9574; // end inline asm - ld.const.u32 %r5533, [matrix+3820]; // begin inline asm - dp4a.u32.u32 %r5532, %r5533, %r5790, %r5528; + shf.l.wrap.b32 %r9571, %r9572, %r9573, %r9574; // end inline asm - ld.const.u32 %r5537, [matrix+3824]; + mov.u32 %r9582, 10; // begin inline asm - dp4a.u32.u32 %r5536, %r5537, %r5794, %r5532; + shf.l.wrap.b32 %r9575, %r9581, %r9580, %r9582; // end inline asm - ld.const.u32 %r5541, [matrix+3828]; // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5798, %r5536; + shf.l.wrap.b32 %r9579, %r9580, %r9581, %r9582; // end inline asm - ld.const.u32 %r5545, [matrix+3832]; + mov.u32 %r9590, 6; // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5802, %r5540; + shf.l.wrap.b32 %r9583, %r9589, %r9588, %r9590; // end inline asm - ld.const.u32 %r5549, [matrix+3836]; // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5806, %r5544; + shf.l.wrap.b32 %r9587, %r9588, %r9589, %r9590; // end inline asm - shr.u32 %r6089, %r5484, 6; - and.b32 %r6090, %r6089, 240; - shr.u32 %r6091, %r5548, 10; - or.b32 %r6092, %r6091, %r6090; - cvt.u64.u32 %rd240, %r6092; - xor.b64 %rd241, %rd206, %rd240; - ld.const.u32 %r5553, [matrix+3840]; + mov.u32 %r9598, 3; // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5746, %r6244; + shf.l.wrap.b32 %r9591, %r9597, %r9596, 
%r9598; // end inline asm - ld.const.u32 %r5557, [matrix+3844]; // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5750, %r5552; + shf.l.wrap.b32 %r9595, %r9596, %r9597, %r9598; // end inline asm - ld.const.u32 %r5561, [matrix+3848]; // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5754, %r5556; + shf.l.wrap.b32 %r9599, %r9605, %r9604, %r9276; // end inline asm - ld.const.u32 %r5565, [matrix+3852]; // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5758, %r5560; + shf.l.wrap.b32 %r9603, %r9604, %r9605, %r9276; // end inline asm - ld.const.u32 %r5569, [matrix+3856]; // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5762, %r5564; + // chi + lop3.b32 %r9607, %r9642, %r9415, %r9463, 0xD2; + lop3.b32 %r9608, %r9645, %r9419, %r9467, 0xD2; // end inline asm - ld.const.u32 %r5573, [matrix+3860]; // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5766, %r5568; + // chi + lop3.b32 %r29773, %r9415, %r9463, %r9559, 0xD2; + lop3.b32 %r29774, %r9419, %r9467, %r9563, 0xD2; // end inline asm - ld.const.u32 %r5577, [matrix+3864]; // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5770, %r5572; + // chi + lop3.b32 %r29769, %r9463, %r9559, %r9511, 0xD2; + lop3.b32 %r29770, %r9467, %r9563, %r9515, 0xD2; // end inline asm - ld.const.u32 %r5581, [matrix+3868]; // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5774, %r5576; + // chi + lop3.b32 %r29765, %r9559, %r9511, %r9642, 0xD2; + lop3.b32 %r29766, %r9563, %r9515, %r9645, 0xD2; // end inline asm - ld.const.u32 %r5585, [matrix+3872]; // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5778, %r5580; + // chi + lop3.b32 %r29763, %r9511, %r9642, %r9415, 0xD2; + lop3.b32 %r29764, %r9515, %r9645, %r9419, 0xD2; // end inline asm - ld.const.u32 %r5589, [matrix+3876]; // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5782, %r5584; + // chi + lop3.b32 %r29759, %r9551, %r9423, %r9591, 0xD2; + lop3.b32 %r29760, %r9555, %r9427, %r9595, 0xD2; // end inline asm - ld.const.u32 %r5593, [matrix+3880]; // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5786, %r5588; + // chi + lop3.b32 %r29771, %r9423, %r9591, %r9535, 0xD2; + lop3.b32 %r29772, %r9427, %r9595, %r9539, 0xD2; // end inline asm - ld.const.u32 %r5597, [matrix+3884]; // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5790, %r5592; + // chi + lop3.b32 %r29767, %r9591, %r9535, %r9431, 0xD2; + lop3.b32 %r29768, %r9595, %r9539, %r9435, 0xD2; // end inline asm - ld.const.u32 %r5601, [matrix+3888]; // begin inline asm - dp4a.u32.u32 %r5600, %r5601, %r5794, %r5596; + // chi + lop3.b32 %r29739, %r9535, %r9431, %r9551, 0xD2; + lop3.b32 %r29740, %r9539, %r9435, %r9555, 0xD2; // end inline asm - ld.const.u32 %r5605, [matrix+3892]; + st.local.v2.u32 [%rd55+88], {%r29739, %r29740}; // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5798, %r5600; + // chi + lop3.b32 %r29731, %r9431, %r9551, %r9423, 0xD2; + lop3.b32 %r29732, %r9435, %r9555, %r9427, 0xD2; // end inline asm - ld.const.u32 %r5609, [matrix+3896]; + st.local.v2.u32 [%rd55+96], {%r29731, %r29732}; // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5802, %r5604; + // chi + lop3.b32 %r29757, %r9599, %r9583, %r9471, 0xD2; + lop3.b32 %r29758, %r9603, %r9587, %r9475, 0xD2; // end inline asm - ld.const.u32 %r5613, [matrix+3900]; + st.local.v2.u32 [%rd55+104], {%r29757, %r29758}; // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5806, %r5608; + // chi + lop3.b32 %r29751, %r9583, %r9471, %r9479, 0xD2; + lop3.b32 %r29752, %r9587, %r9475, %r9483, 0xD2; // end inline asm - ld.const.u32 %r5617, [matrix+3904]; + st.local.v2.u32 [%rd55+112], {%r29751, 
%r29752}; // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5746, %r6244; + // chi + lop3.b32 %r29745, %r9471, %r9479, %r9447, 0xD2; + lop3.b32 %r29746, %r9475, %r9483, %r9451, 0xD2; // end inline asm - ld.const.u32 %r5621, [matrix+3908]; + st.local.v2.u32 [%rd55+120], {%r29745, %r29746}; // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5750, %r5616; + // chi + lop3.b32 %r29737, %r9479, %r9447, %r9599, 0xD2; + lop3.b32 %r29738, %r9483, %r9451, %r9603, 0xD2; // end inline asm - ld.const.u32 %r5625, [matrix+3912]; + st.local.v2.u32 [%rd55+128], {%r29737, %r29738}; // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5754, %r5620; + // chi + lop3.b32 %r29729, %r9447, %r9599, %r9583, 0xD2; + lop3.b32 %r29730, %r9451, %r9603, %r9587, 0xD2; // end inline asm - ld.const.u32 %r5629, [matrix+3916]; + st.local.v2.u32 [%rd55+136], {%r29729, %r29730}; // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5758, %r5624; + // chi + lop3.b32 %r29755, %r9503, %r9543, %r9575, 0xD2; + lop3.b32 %r29756, %r9507, %r9547, %r9579, 0xD2; // end inline asm - ld.const.u32 %r5633, [matrix+3920]; + st.local.v2.u32 [%rd55+144], {%r29755, %r29756}; // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5762, %r5628; + // chi + lop3.b32 %r29749, %r9543, %r9575, %r9567, 0xD2; + lop3.b32 %r29750, %r9547, %r9579, %r9571, 0xD2; // end inline asm - ld.const.u32 %r5637, [matrix+3924]; + st.local.v2.u32 [%rd55+152], {%r29749, %r29750}; // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5766, %r5632; + // chi + lop3.b32 %r29743, %r9575, %r9567, %r9487, 0xD2; + lop3.b32 %r29744, %r9579, %r9571, %r9491, 0xD2; // end inline asm - ld.const.u32 %r5641, [matrix+3928]; + st.local.v2.u32 [%rd55+160], {%r29743, %r29744}; // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5770, %r5636; + // chi + lop3.b32 %r29735, %r9567, %r9487, %r9503, 0xD2; + lop3.b32 %r29736, %r9571, %r9491, %r9507, 0xD2; // end inline asm - ld.const.u32 %r5645, [matrix+3932]; + st.local.v2.u32 [%rd55+168], {%r29735, %r29736}; // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5774, %r5640; + // chi + lop3.b32 %r29727, %r9487, %r9503, %r9543, 0xD2; + lop3.b32 %r29728, %r9491, %r9507, %r9547, 0xD2; // end inline asm - ld.const.u32 %r5649, [matrix+3936]; + st.local.v2.u32 [%rd55+176], {%r29727, %r29728}; // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5778, %r5644; + // chi + lop3.b32 %r29753, %r9455, %r9527, %r9439, 0xD2; + lop3.b32 %r29754, %r9459, %r9531, %r9443, 0xD2; // end inline asm - ld.const.u32 %r5653, [matrix+3940]; + st.local.v2.u32 [%rd55+184], {%r29753, %r29754}; // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5782, %r5648; + // chi + lop3.b32 %r29747, %r9527, %r9439, %r9495, 0xD2; + lop3.b32 %r29748, %r9531, %r9443, %r9499, 0xD2; // end inline asm - ld.const.u32 %r5657, [matrix+3944]; + st.local.v2.u32 [%rd55+192], {%r29747, %r29748}; // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5786, %r5652; + // chi + lop3.b32 %r29741, %r9439, %r9495, %r9519, 0xD2; + lop3.b32 %r29742, %r9443, %r9499, %r9523, 0xD2; // end inline asm - ld.const.u32 %r5661, [matrix+3948]; + st.local.v2.u32 [%rd55+200], {%r29741, %r29742}; // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5790, %r5656; + // chi + lop3.b32 %r29733, %r9495, %r9519, %r9455, 0xD2; + lop3.b32 %r29734, %r9499, %r9523, %r9459, 0xD2; // end inline asm - ld.const.u32 %r5665, [matrix+3952]; + st.local.v2.u32 [%rd55+208], {%r29733, %r29734}; // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5794, %r5660; + // chi + lop3.b32 %r29725, %r9519, %r9455, %r9527, 0xD2; + lop3.b32 %r29726, 
%r9523, %r9459, %r9531, 0xD2; // end inline asm - ld.const.u32 %r5669, [matrix+3956]; + st.local.v2.u32 [%rd55+216], {%r29725, %r29726}; + mul.wide.s32 %rd572, %r29775, 8; + add.s64 %rd571, %rd497, %rd572; // begin inline asm - dp4a.u32.u32 %r5668, %r5669, %r5798, %r5664; + ld.global.nc.v2.u32 {%r9807,%r9808}, [%rd571]; // end inline asm - ld.const.u32 %r5673, [matrix+3960]; + xor.b32 %r29761, %r9607, %r9807; + xor.b32 %r29762, %r9608, %r9808; + add.s32 %r29775, %r29775, 1; + setp.lt.u32 %p22, %r29775, 23; + @%p22 bra $L__BB2_31; + + mov.u32 %r9918, 1; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5802, %r5668; + // xor5 + lop3.b32 %r9819, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9819, %r9819, %r29755, %r29753, 0x96; + lop3.b32 %r9820, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9820, %r9820, %r29756, %r29754, 0x96; // end inline asm - ld.const.u32 %r5677, [matrix+3964]; // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5806, %r5672; + // xor5 + lop3.b32 %r9831, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9831, %r9831, %r29749, %r29747, 0x96; + lop3.b32 %r9832, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9832, %r9832, %r29750, %r29748, 0x96; // end inline asm - shr.u32 %r6093, %r5612, 6; - and.b32 %r6094, %r6093, 240; - shr.u32 %r6095, %r5676, 10; - or.b32 %r6096, %r6095, %r6094; - cvt.u64.u32 %rd242, %r6096; - xor.b64 %rd243, %rd208, %rd242; - ld.const.u32 %r5681, [matrix+3968]; // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5746, %r6244; + // xor5 + lop3.b32 %r9843, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9843, %r9843, %r29743, %r29741, 0x96; + lop3.b32 %r9844, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9844, %r9844, %r29744, %r29742, 0x96; // end inline asm - ld.const.u32 %r5685, [matrix+3972]; // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5750, %r5680; + // xor5 + lop3.b32 %r9855, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9855, %r9855, %r29735, %r29733, 0x96; + lop3.b32 %r9856, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9856, %r9856, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5689, [matrix+3976]; // begin inline asm - dp4a.u32.u32 %r5688, %r5689, %r5754, %r5684; + // xor5 + lop3.b32 %r9867, %r29763, %r29731, %r29729, 0x96; + lop3.b32 %r9867, %r9867, %r29727, %r29725, 0x96; + lop3.b32 %r9868, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9868, %r9868, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5693, [matrix+3980]; // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5758, %r5688; + shf.l.wrap.b32 %r9879, %r9832, %r9831, %r9918; // end inline asm - ld.const.u32 %r5697, [matrix+3984]; // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5762, %r5692; + shf.l.wrap.b32 %r9883, %r9831, %r9832, %r9918; // end inline asm - ld.const.u32 %r5701, [matrix+3988]; + xor.b32 %r10057, %r9879, %r9867; + xor.b32 %r10058, %r9883, %r9868; + xor.b32 %r10026, %r29761, %r10057; + xor.b32 %r10029, %r29762, %r10058; + xor.b32 %r9989, %r29758, %r10058; + xor.b32 %r9988, %r29757, %r10057; + st.local.v2.u32 [%rd55+104], {%r9988, %r9989}; // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5766, %r5696; + shf.l.wrap.b32 %r9887, %r9844, %r9843, %r9918; // end inline asm - ld.const.u32 
%r5705, [matrix+3992]; // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5770, %r5700; + shf.l.wrap.b32 %r9891, %r9843, %r9844, %r9918; // end inline asm - ld.const.u32 %r5709, [matrix+3996]; + xor.b32 %r10059, %r9887, %r9819; + xor.b32 %r10060, %r9891, %r9820; + xor.b32 %r9925, %r29771, %r10059; + xor.b32 %r9924, %r29772, %r10060; + xor.b32 %r9964, %r29750, %r10060; + xor.b32 %r9965, %r29749, %r10059; + st.local.v2.u32 [%rd55+152], {%r9965, %r9964}; // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5774, %r5704; + shf.l.wrap.b32 %r9895, %r9856, %r9855, %r9918; // end inline asm - ld.const.u32 %r5713, [matrix+4000]; // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5778, %r5708; + shf.l.wrap.b32 %r9899, %r9855, %r9856, %r9918; // end inline asm - ld.const.u32 %r5717, [matrix+4004]; + xor.b32 %r10061, %r9895, %r9831; + xor.b32 %r10062, %r9899, %r9832; + xor.b32 %r9948, %r29746, %r10062; + xor.b32 %r9949, %r29745, %r10061; + st.local.v2.u32 [%rd55+120], {%r9949, %r9948}; + xor.b32 %r9940, %r29742, %r10062; + xor.b32 %r9941, %r29741, %r10061; + st.local.v2.u32 [%rd55+200], {%r9941, %r9940}; // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5782, %r5712; + shf.l.wrap.b32 %r9903, %r9868, %r9867, %r9918; // end inline asm - ld.const.u32 %r5721, [matrix+4008]; // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5786, %r5716; + shf.l.wrap.b32 %r9907, %r9867, %r9868, %r9918; // end inline asm - ld.const.u32 %r5725, [matrix+4012]; + xor.b32 %r10063, %r9903, %r9843; + xor.b32 %r10064, %r9907, %r9844; + xor.b32 %r9972, %r29765, %r10063; + xor.b32 %r9973, %r29766, %r10064; + xor.b32 %r9981, %r29736, %r10064; + xor.b32 %r9980, %r29735, %r10063; + st.local.v2.u32 [%rd55+168], {%r9980, %r9981}; // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5790, %r5720; + shf.l.wrap.b32 %r9911, %r9820, %r9819, %r9918; // end inline asm - ld.const.u32 %r5729, [matrix+4016]; // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5794, %r5724; + shf.l.wrap.b32 %r9915, %r9819, %r9820, %r9918; // end inline asm - ld.const.u32 %r5733, [matrix+4020]; + xor.b32 %r10065, %r9911, %r9855; + xor.b32 %r10066, %r9915, %r9856; + xor.b32 %r9932, %r29731, %r10065; + xor.b32 %r9933, %r29732, %r10066; + xor.b32 %r9957, %r29726, %r10066; + xor.b32 %r9956, %r29725, %r10065; + st.local.v2.u32 [%rd55+216], {%r9956, %r9957}; // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5798, %r5728; + shf.l.wrap.b32 %r9919, %r9925, %r9924, %r9422; // end inline asm - ld.const.u32 %r5737, [matrix+4024]; // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5802, %r5732; + shf.l.wrap.b32 %r9923, %r9924, %r9925, %r9422; // end inline asm - ld.const.u32 %r5741, [matrix+4028]; // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5806, %r5736; + shf.l.wrap.b32 %r9927, %r9933, %r9932, %r9430; // end inline asm - ld.const.u32 %r5745, [matrix+4032]; // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5746, %r6244; + shf.l.wrap.b32 %r9931, %r9932, %r9933, %r9430; // end inline asm - ld.const.u32 %r5749, [matrix+4036]; // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5750, %r5744; + shf.l.wrap.b32 %r9939, %r9940, %r9941, %r9438; // end inline asm - ld.const.u32 %r5753, [matrix+4040]; // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5754, %r5748; + shf.l.wrap.b32 %r9935, %r9941, %r9940, %r9438; // end inline asm - ld.const.u32 %r5757, [matrix+4044]; + st.local.v2.u32 [%rd55+96], {%r9935, %r9939}; // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5758, %r5752; + shf.l.wrap.b32 %r9943, %r9949, %r9948, %r9470; // end inline asm - 
ld.const.u32 %r5761, [matrix+4048]; // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5762, %r5756; + shf.l.wrap.b32 %r9947, %r9948, %r9949, %r9470; // end inline asm - ld.const.u32 %r5765, [matrix+4052]; // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5766, %r5760; + shf.l.wrap.b32 %r9951, %r9957, %r9956, %r9518; // end inline asm - ld.const.u32 %r5769, [matrix+4056]; // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5770, %r5764; + shf.l.wrap.b32 %r9955, %r9956, %r9957, %r9518; // end inline asm - ld.const.u32 %r5773, [matrix+4060]; // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5774, %r5768; + shf.l.wrap.b32 %r9963, %r9964, %r9965, %r9542; // end inline asm - ld.const.u32 %r5777, [matrix+4064]; // begin inline asm - dp4a.u32.u32 %r5776, %r5777, %r5778, %r5772; + shf.l.wrap.b32 %r9959, %r9965, %r9964, %r9542; // end inline asm - ld.const.u32 %r5781, [matrix+4068]; + st.local.v2.u32 [%rd55+88], {%r9959, %r9963}; // begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5782, %r5776; + shf.l.wrap.b32 %r9967, %r9973, %r9972, %r9558; // end inline asm - ld.const.u32 %r5785, [matrix+4072]; // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5786, %r5780; + shf.l.wrap.b32 %r9971, %r9972, %r9973, %r9558; // end inline asm - ld.const.u32 %r5789, [matrix+4076]; // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5790, %r5784; + shf.l.wrap.b32 %r9975, %r9981, %r9980, %r9566; // end inline asm - ld.const.u32 %r5793, [matrix+4080]; // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5794, %r5788; + shf.l.wrap.b32 %r9979, %r9980, %r9981, %r9566; // end inline asm - ld.const.u32 %r5797, [matrix+4084]; // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5798, %r5792; + shf.l.wrap.b32 %r9983, %r9989, %r9988, %r9598; // end inline asm - ld.const.u32 %r5801, [matrix+4088]; // begin inline asm - dp4a.u32.u32 %r5800, %r5801, %r5802, %r5796; + shf.l.wrap.b32 %r9987, %r9988, %r9989, %r9598; // end inline asm - ld.const.u32 %r5805, [matrix+4092]; // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5806, %r5800; + // chi + lop3.b32 %r9991, %r10026, %r9919, %r9943, 0xD2; + lop3.b32 %r9992, %r10029, %r9923, %r9947, 0xD2; // end inline asm - shr.u32 %r6097, %r5740, 6; - and.b32 %r6098, %r6097, 240; - shr.u32 %r6099, %r5804, 10; - or.b32 %r6100, %r6099, %r6098; - cvt.u64.u32 %rd244, %r6100; - xor.b64 %rd245, %rd210, %rd244; - shl.b32 %r6101, %r5985, 24; - cvt.u64.u32 %rd246, %r6101; - shl.b32 %r6102, %r5980, 16; - and.b32 %r6103, %r6102, 16711680; - cvt.u64.u32 %rd247, %r6103; - shl.b32 %r6104, %r5975, 8; - and.b32 %r6105, %r6104, 65280; - cvt.u64.u32 %rd248, %r6105; - and.b32 %r6106, %r5970, 255; - cvt.u64.u32 %rd249, %r6106; - shl.b32 %r6107, %r6019, 24; - cvt.u64.u32 %rd250, %r6107; - shl.b32 %r6108, %r6014, 16; - and.b32 %r6109, %r6108, 16711680; - cvt.u64.u32 %rd251, %r6109; - shl.b32 %r6110, %r6009, 8; - and.b32 %r6111, %r6110, 65280; - cvt.u64.u32 %rd252, %r6111; - and.b32 %r6112, %r6004, 255; - cvt.u64.u32 %rd253, %r6112; - shl.b32 %r6113, %r6053, 24; - cvt.u64.u32 %rd254, %r6113; - shl.b32 %r6114, %r6048, 16; - and.b32 %r6115, %r6114, 16711680; - cvt.u64.u32 %rd255, %r6115; - shl.b32 %r6116, %r6043, 8; - and.b32 %r6117, %r6116, 65280; - cvt.u64.u32 %rd256, %r6117; - and.b32 %r6118, %r6038, 255; - cvt.u64.u32 %rd257, %r6118; - shr.u32 %r6119, %r2732, 10; - or.b32 %r6120, %r6119, %r5999; - xor.b32 %r6121, %r10, %r6120; - cvt.u64.u32 %rd258, %r6121; - shl.b64 %rd259, %rd258, 56; - shl.b64 %rd260, %rd216, 48; - and.b64 %rd261, %rd260, 71776119061217280; - or.b64 %rd262, %rd259, %rd261; - 
shl.b64 %rd263, %rd214, 40; - and.b64 %rd264, %rd263, 280375465082880; - or.b64 %rd265, %rd262, %rd264; - shl.b64 %rd266, %rd212, 32; - and.b64 %rd267, %rd266, 1095216660480; - or.b64 %rd268, %rd265, %rd267; - or.b64 %rd269, %rd268, %rd246; - or.b64 %rd270, %rd269, %rd247; - or.b64 %rd271, %rd270, %rd248; - or.b64 %rd272, %rd271, %rd249; - xor.b64 %rd73, %rd272, 4239941492252378377; - shr.u32 %r6122, %r3756, 10; - or.b32 %r6123, %r6122, %r6033; - xor.b32 %r6124, %r12, %r6123; - cvt.u64.u32 %rd273, %r6124; - shl.b64 %rd274, %rd273, 56; - shl.b64 %rd275, %rd222, 48; - and.b64 %rd276, %rd275, 71776119061217280; - or.b64 %rd277, %rd274, %rd276; - shl.b64 %rd278, %rd220, 40; - and.b64 %rd279, %rd278, 280375465082880; - or.b64 %rd280, %rd277, %rd279; - shl.b64 %rd281, %rd218, 32; - and.b64 %rd282, %rd281, 1095216660480; - or.b64 %rd283, %rd280, %rd282; - or.b64 %rd284, %rd283, %rd250; - or.b64 %rd285, %rd284, %rd251; - or.b64 %rd286, %rd285, %rd252; - or.b64 %rd287, %rd286, %rd253; - xor.b64 %rd484, %rd287, 8746723911537738262; - shr.u32 %r6125, %r4780, 10; - or.b32 %r6126, %r6125, %r6067; - xor.b32 %r6127, %r14, %r6126; - cvt.u64.u32 %rd288, %r6127; - shl.b64 %rd289, %rd288, 56; - shl.b64 %rd290, %rd228, 48; - and.b64 %rd291, %rd290, 71776119061217280; - or.b64 %rd292, %rd289, %rd291; - shl.b64 %rd293, %rd226, 40; - and.b64 %rd294, %rd293, 280375465082880; - or.b64 %rd295, %rd292, %rd294; - shl.b64 %rd296, %rd224, 32; - and.b64 %rd297, %rd296, 1095216660480; - or.b64 %rd298, %rd295, %rd297; - or.b64 %rd299, %rd298, %rd254; - or.b64 %rd300, %rd299, %rd255; - or.b64 %rd301, %rd300, %rd256; - or.b64 %rd302, %rd301, %rd257; - xor.b64 %rd479, %rd302, 8796936657246353646; - shl.b64 %rd303, %rd245, 56; - shl.b64 %rd304, %rd243, 48; - and.b64 %rd305, %rd304, 71776119061217280; - or.b64 %rd306, %rd303, %rd305; - shl.b64 %rd307, %rd241, 40; - and.b64 %rd308, %rd307, 280375465082880; - or.b64 %rd309, %rd306, %rd308; - shl.b64 %rd310, %rd239, 32; - and.b64 %rd311, %rd310, 1095216660480; - or.b64 %rd312, %rd309, %rd311; - shl.b64 %rd313, %rd237, 24; - and.b64 %rd314, %rd313, 4278190080; - or.b64 %rd315, %rd312, %rd314; - shl.b64 %rd316, %rd235, 16; - and.b64 %rd317, %rd316, 16711680; - shl.b64 %rd318, %rd231, 8; - and.b64 %rd319, %rd318, 65280; - or.b64 %rd320, %rd315, %rd317; - or.b64 %rd321, %rd320, %rd319; - or.b64 %rd322, %rd321, %rd233; - xor.b64 %rd474, %rd322, 1272090201925444760; - mov.u64 %rd488, 8270816933120786537; - mov.u64 %rd487, -850687345431043546; - mov.u64 %rd486, 8596393687355028144; - mov.u64 %rd485, -4073852189716399785; - mov.u64 %rd483, -4539347866060507718; - mov.u64 %rd482, -3233781605604422593; - mov.u64 %rd481, 570094237299545110; - mov.u64 %rd480, 5171152063242093102; - mov.u64 %rd478, 6782861118970774626; - mov.u64 %rd477, 7812475424661425213; - mov.u64 %rd476, 9119540418498120711; - mov.u64 %rd475, -7873636174015165430; - mov.u64 %rd473, -9207053471590684088; - mov.u64 %rd472, 3370482334374859748; - mov.u64 %rd471, -1544774801229058759; - mov.u64 %rd470, 6096431547456407061; - mov.u64 %rd469, -1792185402154627366; - mov.u64 %rd468, -6864424130110145268; - mov.u64 %rd467, 5690099369266491460; - mov.u64 %rd466, -5074726839974049192; - mov.u64 %rd465, 1592359455985097269; - mov.u64 %rd464, RC; - -$L__BB0_9: - xor.b64 %rd323, %rd488, %rd73; - xor.b64 %rd324, %rd323, %rd487; - xor.b64 %rd325, %rd324, %rd486; - xor.b64 %rd326, %rd325, %rd485; - xor.b64 %rd327, %rd483, %rd484; - xor.b64 %rd328, %rd327, %rd482; - xor.b64 %rd329, %rd328, %rd481; - xor.b64 %rd330, %rd329, %rd480; - 
xor.b64 %rd331, %rd478, %rd479; - xor.b64 %rd332, %rd331, %rd477; - xor.b64 %rd333, %rd332, %rd476; - xor.b64 %rd334, %rd333, %rd475; - xor.b64 %rd335, %rd473, %rd474; - xor.b64 %rd336, %rd335, %rd472; - xor.b64 %rd337, %rd336, %rd471; - xor.b64 %rd338, %rd337, %rd470; - xor.b64 %rd339, %rd468, %rd469; - xor.b64 %rd340, %rd339, %rd467; - xor.b64 %rd341, %rd340, %rd466; - xor.b64 %rd342, %rd341, %rd465; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6128}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6129,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6130, %r6129, %r6128, 1; - shf.l.wrap.b32 %r6131, %r6128, %r6129, 1; - mov.b64 %rd343, {%r6131, %r6130}; - xor.b64 %rd344, %rd342, %rd343; - xor.b64 %rd345, %rd344, %rd73; - xor.b64 %rd346, %rd488, %rd344; - xor.b64 %rd347, %rd487, %rd344; - xor.b64 %rd348, %rd486, %rd344; - xor.b64 %rd349, %rd485, %rd344; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6132}, %rd334; - } - { - .reg .b32 %dummy; - mov.b64 {%r6133,%dummy}, %rd334; - } - shf.l.wrap.b32 %r6134, %r6133, %r6132, 1; - shf.l.wrap.b32 %r6135, %r6132, %r6133, 1; - mov.b64 %rd350, {%r6135, %r6134}; - xor.b64 %rd351, %rd350, %rd326; - xor.b64 %rd352, %rd484, %rd351; - xor.b64 %rd353, %rd483, %rd351; - xor.b64 %rd354, %rd482, %rd351; - xor.b64 %rd355, %rd481, %rd351; - xor.b64 %rd356, %rd480, %rd351; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6136}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6137,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6138, %r6137, %r6136, 1; - shf.l.wrap.b32 %r6139, %r6136, %r6137, 1; - mov.b64 %rd357, {%r6139, %r6138}; - xor.b64 %rd358, %rd357, %rd330; - xor.b64 %rd359, %rd479, %rd358; - xor.b64 %rd360, %rd478, %rd358; - xor.b64 %rd361, %rd477, %rd358; - xor.b64 %rd362, %rd476, %rd358; - xor.b64 %rd363, %rd475, %rd358; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6140}, %rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6141,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6142, %r6141, %r6140, 1; - shf.l.wrap.b32 %r6143, %r6140, %r6141, 1; - mov.b64 %rd364, {%r6143, %r6142}; - xor.b64 %rd365, %rd364, %rd334; - xor.b64 %rd366, %rd474, %rd365; - xor.b64 %rd367, %rd473, %rd365; - xor.b64 %rd368, %rd472, %rd365; - xor.b64 %rd369, %rd471, %rd365; - xor.b64 %rd370, %rd470, %rd365; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6144}, %rd326; - } - { - .reg .b32 %dummy; - mov.b64 {%r6145,%dummy}, %rd326; - } - shf.l.wrap.b32 %r6146, %r6145, %r6144, 1; - shf.l.wrap.b32 %r6147, %r6144, %r6145, 1; - mov.b64 %rd371, {%r6147, %r6146}; - xor.b64 %rd372, %rd338, %rd371; - xor.b64 %rd373, %rd469, %rd372; - xor.b64 %rd374, %rd468, %rd372; - xor.b64 %rd375, %rd467, %rd372; - xor.b64 %rd376, %rd466, %rd372; - xor.b64 %rd377, %rd465, %rd372; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6148}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6149,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6150, %r6149, %r6148, 1; - shf.l.wrap.b32 %r6151, %r6148, %r6149, 1; - mov.b64 %rd378, {%r6151, %r6150}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6152}, %rd347; - } - { - .reg .b32 %dummy; - mov.b64 {%r6153,%dummy}, %rd347; - } - shf.l.wrap.b32 %r6154, %r6153, %r6152, 3; - shf.l.wrap.b32 %r6155, %r6152, %r6153, 3; - mov.b64 %rd379, {%r6155, %r6154}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6156}, %rd360; - } - { - .reg .b32 %dummy; - mov.b64 {%r6157,%dummy}, %rd360; - } - shf.l.wrap.b32 %r6158, %r6157, %r6156, 6; - shf.l.wrap.b32 %r6159, %r6156, %r6157, 6; - mov.b64 %rd380, {%r6159, %r6158}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6160}, %rd354; - } - { - .reg .b32 %dummy; - mov.b64 {%r6161,%dummy}, %rd354; - } - 
shf.l.wrap.b32 %r6162, %r6161, %r6160, 10; - shf.l.wrap.b32 %r6163, %r6160, %r6161, 10; - mov.b64 %rd381, {%r6163, %r6162}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6164}, %rd362; - } - { - .reg .b32 %dummy; - mov.b64 {%r6165,%dummy}, %rd362; - } - shf.l.wrap.b32 %r6166, %r6165, %r6164, 15; - shf.l.wrap.b32 %r6167, %r6164, %r6165, 15; - mov.b64 %rd382, {%r6167, %r6166}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6168}, %rd369; - } - { - .reg .b32 %dummy; - mov.b64 {%r6169,%dummy}, %rd369; - } - shf.l.wrap.b32 %r6170, %r6169, %r6168, 21; - shf.l.wrap.b32 %r6171, %r6168, %r6169, 21; - mov.b64 %rd383, {%r6171, %r6170}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6172}, %rd366; - } - { - .reg .b32 %dummy; - mov.b64 {%r6173,%dummy}, %rd366; - } - shf.l.wrap.b32 %r6174, %r6173, %r6172, 28; - shf.l.wrap.b32 %r6175, %r6172, %r6173, 28; - mov.b64 %rd384, {%r6175, %r6174}; - { - .reg .b32 %dummy; - mov.b64 {%r6176,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, %rd346; - } - shf.r.wrap.b32 %r6178, %r6177, %r6176, 28; - shf.r.wrap.b32 %r6179, %r6176, %r6177, 28; - mov.b64 %rd385, {%r6179, %r6178}; - { - .reg .b32 %dummy; - mov.b64 {%r6180,%dummy}, %rd355; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6181}, %rd355; - } - shf.r.wrap.b32 %r6182, %r6181, %r6180, 19; - shf.r.wrap.b32 %r6183, %r6180, %r6181, 19; - mov.b64 %rd386, {%r6183, %r6182}; - { - .reg .b32 %dummy; - mov.b64 {%r6184,%dummy}, %rd367; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6185}, %rd367; - } - shf.r.wrap.b32 %r6186, %r6185, %r6184, 9; - shf.r.wrap.b32 %r6187, %r6184, %r6185, 9; - mov.b64 %rd387, {%r6187, %r6186}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6188}, %rd356; - } - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd356; - } - shf.l.wrap.b32 %r6190, %r6189, %r6188, 2; - shf.l.wrap.b32 %r6191, %r6188, %r6189, 2; - mov.b64 %rd388, {%r6191, %r6190}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6192}, %rd377; - } - { - .reg .b32 %dummy; - mov.b64 {%r6193,%dummy}, %rd377; - } - shf.l.wrap.b32 %r6194, %r6193, %r6192, 14; - shf.l.wrap.b32 %r6195, %r6192, %r6193, 14; - mov.b64 %rd389, {%r6195, %r6194}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6196}, %rd373; - } - { - .reg .b32 %dummy; - mov.b64 {%r6197,%dummy}, %rd373; - } - shf.l.wrap.b32 %r6198, %r6197, %r6196, 27; - shf.l.wrap.b32 %r6199, %r6196, %r6197, 27; - mov.b64 %rd390, {%r6199, %r6198}; - { - .reg .b32 %dummy; - mov.b64 {%r6200,%dummy}, %rd348; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd348; - } - shf.r.wrap.b32 %r6202, %r6201, %r6200, 23; - shf.r.wrap.b32 %r6203, %r6200, %r6201, 23; - mov.b64 %rd391, {%r6203, %r6202}; - { - .reg .b32 %dummy; - mov.b64 {%r6204,%dummy}, %rd370; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6205}, %rd370; - } - shf.r.wrap.b32 %r6206, %r6205, %r6204, 8; - shf.r.wrap.b32 %r6207, %r6204, %r6205, 8; - mov.b64 %rd392, {%r6207, %r6206}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6208}, %rd376; - } - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd376; - } - shf.l.wrap.b32 %r6210, %r6209, %r6208, 8; - shf.l.wrap.b32 %r6211, %r6208, %r6209, 8; - mov.b64 %rd393, {%r6211, %r6210}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6212}, %rd368; - } - { - .reg .b32 %dummy; - mov.b64 {%r6213,%dummy}, %rd368; - } - shf.l.wrap.b32 %r6214, %r6213, %r6212, 25; - shf.l.wrap.b32 %r6215, %r6212, %r6213, 25; - mov.b64 %rd394, {%r6215, %r6214}; - { - .reg .b32 %dummy; - mov.b64 {%r6216,%dummy}, %rd361; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd361; - } - shf.r.wrap.b32 %r6218, %r6217, 
%r6216, 21; - shf.r.wrap.b32 %r6219, %r6216, %r6217, 21; - mov.b64 %rd395, {%r6219, %r6218}; - { - .reg .b32 %dummy; - mov.b64 {%r6220,%dummy}, %rd359; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6221}, %rd359; - } - shf.r.wrap.b32 %r6222, %r6221, %r6220, 2; - shf.r.wrap.b32 %r6223, %r6220, %r6221, 2; - mov.b64 %rd396, {%r6223, %r6222}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6224}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6226, %r6225, %r6224, 18; - shf.l.wrap.b32 %r6227, %r6224, %r6225, 18; - mov.b64 %rd397, {%r6227, %r6226}; - { - .reg .b32 %dummy; - mov.b64 {%r6228,%dummy}, %rd375; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd375; - } - shf.r.wrap.b32 %r6230, %r6229, %r6228, 25; - shf.r.wrap.b32 %r6231, %r6228, %r6229, 25; - mov.b64 %rd398, {%r6231, %r6230}; - { - .reg .b32 %dummy; - mov.b64 {%r6232,%dummy}, %rd363; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6233}, %rd363; - } - shf.r.wrap.b32 %r6234, %r6233, %r6232, 3; - shf.r.wrap.b32 %r6235, %r6232, %r6233, 3; - mov.b64 %rd399, {%r6235, %r6234}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6236}, %rd374; - } - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd374; - } - shf.l.wrap.b32 %r6238, %r6237, %r6236, 20; - shf.l.wrap.b32 %r6239, %r6236, %r6237, 20; - mov.b64 %rd400, {%r6239, %r6238}; - { - .reg .b32 %dummy; - mov.b64 {%r6240,%dummy}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd353; - } - shf.r.wrap.b32 %r6242, %r6241, %r6240, 20; - shf.r.wrap.b32 %r6243, %r6240, %r6241, 20; - mov.b64 %rd401, {%r6243, %r6242}; - not.b64 %rd402, %rd401; - and.b64 %rd403, %rd395, %rd402; - xor.b64 %rd404, %rd403, %rd345; - not.b64 %rd405, %rd395; - and.b64 %rd406, %rd383, %rd405; - xor.b64 %rd484, %rd406, %rd401; - not.b64 %rd407, %rd383; - and.b64 %rd408, %rd389, %rd407; - xor.b64 %rd479, %rd408, %rd395; - not.b64 %rd409, %rd389; - and.b64 %rd410, %rd345, %rd409; - xor.b64 %rd474, %rd410, %rd383; - not.b64 %rd411, %rd345; - and.b64 %rd412, %rd401, %rd411; - xor.b64 %rd469, %rd389, %rd412; - not.b64 %rd413, %rd400; - and.b64 %rd414, %rd379, %rd413; - xor.b64 %rd488, %rd414, %rd384; - not.b64 %rd415, %rd379; - and.b64 %rd416, %rd386, %rd415; - xor.b64 %rd483, %rd416, %rd400; - not.b64 %rd417, %rd386; - and.b64 %rd418, %rd399, %rd417; - xor.b64 %rd478, %rd418, %rd379; - not.b64 %rd419, %rd399; - and.b64 %rd420, %rd384, %rd419; - xor.b64 %rd473, %rd420, %rd386; - not.b64 %rd421, %rd384; - and.b64 %rd422, %rd400, %rd421; - xor.b64 %rd468, %rd399, %rd422; - not.b64 %rd423, %rd380; - and.b64 %rd424, %rd394, %rd423; - xor.b64 %rd487, %rd424, %rd378; - not.b64 %rd425, %rd394; - and.b64 %rd426, %rd393, %rd425; - xor.b64 %rd482, %rd426, %rd380; - not.b64 %rd427, %rd393; - and.b64 %rd428, %rd397, %rd427; - xor.b64 %rd477, %rd428, %rd394; - not.b64 %rd429, %rd397; - and.b64 %rd430, %rd378, %rd429; - xor.b64 %rd472, %rd430, %rd393; - not.b64 %rd431, %rd378; - and.b64 %rd432, %rd380, %rd431; - xor.b64 %rd467, %rd397, %rd432; - not.b64 %rd433, %rd385; - and.b64 %rd434, %rd381, %rd433; - xor.b64 %rd486, %rd434, %rd390; - not.b64 %rd435, %rd381; - and.b64 %rd436, %rd382, %rd435; - xor.b64 %rd481, %rd436, %rd385; - not.b64 %rd437, %rd382; - and.b64 %rd438, %rd392, %rd437; - xor.b64 %rd476, %rd438, %rd381; - not.b64 %rd439, %rd392; - and.b64 %rd440, %rd390, %rd439; - xor.b64 %rd471, %rd440, %rd382; - not.b64 %rd441, %rd390; - and.b64 %rd442, %rd385, %rd441; - xor.b64 %rd466, %rd392, %rd442; - not.b64 %rd443, %rd387; - and.b64 %rd444, %rd398, %rd443; - xor.b64 
%rd485, %rd444, %rd396; - not.b64 %rd445, %rd398; - and.b64 %rd446, %rd391, %rd445; - xor.b64 %rd480, %rd446, %rd387; - not.b64 %rd447, %rd391; - and.b64 %rd448, %rd388, %rd447; - xor.b64 %rd475, %rd448, %rd398; - not.b64 %rd449, %rd388; - and.b64 %rd450, %rd396, %rd449; - xor.b64 %rd470, %rd450, %rd391; - not.b64 %rd451, %rd396; - and.b64 %rd452, %rd387, %rd451; - xor.b64 %rd465, %rd388, %rd452; - ld.global.nc.u64 %rd453, [%rd464]; - xor.b64 %rd73, %rd404, %rd453; - add.s64 %rd464, %rd464, 8; - add.s32 %r6244, %r6244, 1; - setp.ne.s32 %p11, %r6244, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd474, %rd75; - @%p12 bra $L__BB0_12; - bra.uni $L__BB0_11; + // begin inline asm + // chi + lop3.b32 %r9999, %r9919, %r9943, %r9975, 0xD2; + lop3.b32 %r10000, %r9923, %r9947, %r9979, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+32], {%r9999, %r10000}; + // begin inline asm + // chi + lop3.b32 %r10007, %r9943, %r9975, %r9951, 0xD2; + lop3.b32 %r10008, %r9947, %r9979, %r9955, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+40], {%r10007, %r10008}; + // begin inline asm + // chi + lop3.b32 %r10015, %r9975, %r9951, %r10026, 0xD2; + lop3.b32 %r10016, %r9979, %r9955, %r10029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+48], {%r10015, %r10016}; + // begin inline asm + // chi + lop3.b32 %r10023, %r9951, %r10026, %r9919, 0xD2; + lop3.b32 %r10024, %r9955, %r10029, %r9923, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+56], {%r10023, %r10024}; + // begin inline asm + // chi + lop3.b32 %r10031, %r9967, %r9927, %r9983, 0xD2; + lop3.b32 %r10032, %r9971, %r9931, %r9987, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+64], {%r10031, %r10032}; + // begin inline asm + // chi + lop3.b32 %r10039, %r9927, %r9983, %r9959, 0xD2; + lop3.b32 %r10040, %r9931, %r9987, %r9963, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+72], {%r10039, %r10040}; + // begin inline asm + // chi + lop3.b32 %r10047, %r9983, %r9959, %r9935, 0xD2; + lop3.b32 %r10048, %r9987, %r9963, %r9939, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+80], {%r10047, %r10048}; + // begin inline asm + ld.global.nc.v2.u32 {%r10055,%r10056}, [%rd498]; + // end inline asm + xor.b32 %r10067, %r9992, %r10056; + xor.b32 %r10068, %r9991, %r10055; + st.local.v2.u32 [%rd55+24], {%r10068, %r10067}; + st.global.u64 [%rd36], %rd1261; + st.global.u64 [%rd36+8], %rd1262; + st.global.u64 [%rd36+16], %rd1263; + st.global.u64 [%rd36+24], %rd62; + st.global.u64 [%rd36+32], %rd1264; + st.global.u64 [%rd36+40], %rd64; + st.global.u64 [%rd36+48], %rd65; + st.global.u64 [%rd36+56], %rd66; + st.global.v2.u32 [%rd36+64], {%r10068, %r10067}; + st.global.v2.u32 [%rd36+72], {%r9999, %r10000}; + st.global.v2.u32 [%rd36+80], {%r10007, %r10008}; + st.global.v2.u32 [%rd36+88], {%r10015, %r10016}; + st.global.v2.u32 [%rd36+96], {%r10023, %r10024}; + st.global.v2.u32 [%rd36+104], {%r10031, %r10032}; + st.global.v2.u32 [%rd36+112], {%r10039, %r10040}; + st.global.v2.u32 [%rd36+120], {%r10047, %r10048}; + +$L__BB2_44: + shl.b32 %r1678, %r25, 1; + mul.wide.u32 %rd678, %r1678, -954391867; + shr.u64 %rd679, %rd678, 32; + cvt.u32.u64 %r13353, %rd679; + sub.s32 %r13354, %r1678, %r13353; + shr.u32 %r13355, %r13354, 1; + add.s32 %r13356, %r13355, %r13353; + shr.u32 %r13357, %r13356, 20; + mul.lo.s32 %r13358, %r13357, 1179641; + sub.s32 %r13359, %r1678, %r13358; + mul.wide.u32 %rd681, %r13359, 64; + add.s64 %rd128, %rd471, %rd681; + or.b32 %r1679, %r1678, 1; + mul.wide.u32 %rd682, %r1679, -954391867; + shr.u64 %rd683, %rd682, 32; + 
cvt.u32.u64 %r13360, %rd683; + sub.s32 %r13361, %r1679, %r13360; + shr.u32 %r13362, %r13361, 1; + add.s32 %r13363, %r13362, %r13360; + shr.u32 %r13364, %r13363, 20; + mul.lo.s32 %r13365, %r13364, 1179641; + sub.s32 %r13366, %r1679, %r13365; + mul.wide.u32 %rd684, %r13366, 64; + add.s64 %rd129, %rd471, %rd684; + @%p16 bra $L__BB2_58; + + cvta.to.global.u64 %rd685, %rd353; + mul.wide.u32 %rd686, %r25, 128; + add.s64 %rd130, %rd685, %rd686; + ld.global.u64 %rd1265, [%rd130]; + setp.eq.s64 %p29, %rd1265, 0; + @%p29 bra $L__BB2_47; + + ld.global.u64 %rd1268, [%rd130+32]; + ld.global.u64 %rd1267, [%rd130+16]; + ld.global.u64 %rd1266, [%rd130+8]; + bra.uni $L__BB2_69; + +$L__BB2_58: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd788, 1179641; + st.local.u64 [%rd3+8], %rd788; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd789, [%rd128]; + ld.global.u64 %rd790, [%rd128+8]; + ld.global.u64 %rd791, [%rd128+16]; + ld.global.u64 %rd792, [%rd128+24]; + ld.global.u64 %rd793, [%rd128+32]; + ld.global.u64 %rd794, [%rd128+40]; + ld.global.u64 %rd795, [%rd128+48]; + ld.global.u64 %rd796, [%rd128+56]; + st.local.u64 [%rd3+24], %rd789; + st.local.u64 [%rd3+32], %rd790; + st.local.u64 [%rd3+40], %rd791; + st.local.u64 [%rd3+48], %rd792; + st.local.u64 [%rd3+56], %rd793; + st.local.u64 [%rd3+64], %rd794; + st.local.u64 [%rd3+72], %rd795; + st.local.u64 [%rd3+80], %rd796; + cvt.u32.u64 %r16692, %rd789; + xor.b32 %r16693, %r1678, %r16692; + st.local.u32 [%rd3+24], %r16693; + mov.u32 %r30250, 0; + st.local.v2.u32 [%rd3+96], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+104], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+112], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+120], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+128], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+136], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+144], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+152], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+160], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+168], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+176], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+184], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+192], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+200], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+208], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+216], {%r30250, %r30250}; + mov.u32 %r30265, -2147483648; + mov.u32 %r16665, 1; + st.local.v2.u32 [%rd3+88], {%r16665, %r30265}; + ld.local.v2.u32 {%r30286, %r30287}, [%rd3+24]; + mov.b64 {%r30284, %r30285}, %rd794; + shr.u64 %rd797, %rd790, 32; + cvt.u32.u64 %r30298, %rd790; + cvt.u32.u64 %r30299, %rd797; + shr.u64 %rd798, %rd795, 32; + cvt.u32.u64 %r30296, %rd795; + cvt.u32.u64 %r30297, %rd798; + shr.u64 %rd799, %rd791, 32; + cvt.u32.u64 %r30294, %rd791; + cvt.u32.u64 %r30295, %rd799; + shr.u64 %rd800, %rd796, 32; + cvt.u32.u64 %r30292, %rd796; + cvt.u32.u64 %r30293, %rd800; + shr.u64 %rd801, %rd792, 32; + cvt.u32.u64 %r30290, %rd792; + cvt.u32.u64 %r30291, %rd801; + shr.u64 %rd802, %rd793, 32; + cvt.u32.u64 %r30288, %rd793; + cvt.u32.u64 %r30289, %rd802; + mov.u32 %r30251, %r30250; + mov.u32 %r30252, %r30250; + mov.u32 %r30253, %r30250; + mov.u32 %r30254, %r30250; + mov.u32 %r30255, %r30250; + mov.u32 %r30256, %r30250; + mov.u32 %r30257, %r30250; + mov.u32 %r30258, %r30250; + mov.u32 %r30259, %r30250; + mov.u32 %r30260, %r30250; + mov.u32 %r30261, %r30250; + mov.u32 %r30262, %r30250; + mov.u32 %r30263, %r30250; + mov.u32 %r30264, %r16665; + mov.u32 %r30266, %r30250; + mov.u32 %r30267, %r30250; + mov.u32 %r30268, %r30250; + mov.u32 %r30269, %r30250; + mov.u32 
%r30270, %r30250; + mov.u32 %r30271, %r30250; + mov.u32 %r30272, %r30250; + mov.u32 %r30273, %r30250; + mov.u32 %r30274, %r30250; + mov.u32 %r30275, %r30250; + mov.u32 %r30276, %r30250; + mov.u32 %r30277, %r30250; + mov.u32 %r30278, %r30250; + mov.u32 %r30279, %r30250; + mov.u32 %r30280, %r30250; + mov.u32 %r30281, %r30250; + mov.u32 %r30282, %r30250; + mov.u32 %r30283, %r30250; + mov.u32 %r30300, %r30250; + +$L__BB2_59: + // begin inline asm + // xor5 + lop3.b32 %r16696, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r16696, %r16696, %r30280, %r30278, 0x96; + lop3.b32 %r16697, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r16697, %r16697, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16708, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r16708, %r16708, %r30274, %r30272, 0x96; + lop3.b32 %r16709, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r16709, %r16709, %r30275, %r30273, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16720, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r16720, %r16720, %r30268, %r30266, 0x96; + lop3.b32 %r16721, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r16721, %r16721, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16732, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r16732, %r16732, %r30260, %r30258, 0x96; + lop3.b32 %r16733, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r16733, %r16733, %r30261, %r30259, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16744, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r16744, %r16744, %r30252, %r30250, 0x96; + lop3.b32 %r16745, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r16745, %r16745, %r30253, %r30251, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16756, %r16709, %r16708, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16760, %r16708, %r16709, %r16665; + // end inline asm + xor.b32 %r17190, %r16756, %r16744; + xor.b32 %r17191, %r16760, %r16745; + xor.b32 %r17023, %r30286, %r17190; + xor.b32 %r17026, %r30287, %r17191; + xor.b32 %r16930, %r30284, %r17190; + xor.b32 %r16929, %r30285, %r17191; + xor.b32 %r16977, %r30282, %r17190; + xor.b32 %r16978, %r30283, %r17191; + xor.b32 %r16882, %r30280, %r17190; + xor.b32 %r16881, %r30281, %r17191; + xor.b32 %r16833, %r30278, %r17190; + xor.b32 %r16834, %r30279, %r17191; + // begin inline asm + shf.l.wrap.b32 %r16764, %r16721, %r16720, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16768, %r16720, %r16721, %r16665; + // end inline asm + xor.b32 %r17192, %r16764, %r16696; + xor.b32 %r17193, %r16768, %r16697; + xor.b32 %r16985, %r30298, %r17192; + xor.b32 %r16986, %r30299, %r17193; + xor.b32 %r16802, %r30296, %r17192; + xor.b32 %r16801, %r30297, %r17193; + xor.b32 %r16961, %r30276, %r17192; + xor.b32 %r16962, %r30277, %r17193; + xor.b32 %r16922, %r30274, %r17192; + xor.b32 %r16921, %r30275, %r17193; + xor.b32 %r16905, %r30272, %r17192; + xor.b32 %r16906, %r30273, %r17193; + // begin inline asm + shf.l.wrap.b32 %r16772, %r16733, %r16732, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16776, %r16732, %r16733, %r16665; + // end inline asm + xor.b32 %r17194, %r16772, %r16708; + xor.b32 %r17195, %r16776, %r16709; + xor.b32 %r16842, %r30294, %r17194; + xor.b32 %r16841, %r30295, %r17195; + xor.b32 %r16969, %r30292, %r17194; + xor.b32 %r16970, %r30293, %r17195; + xor.b32 %r16850, %r30270, %r17194; + xor.b32 %r16849, %r30271, %r17195; + xor.b32 %r16953, %r30268, %r17194; + xor.b32 %r16954, 
%r30269, %r17195; + xor.b32 %r16818, %r30266, %r17194; + xor.b32 %r16817, %r30267, %r17195; + // begin inline asm + shf.l.wrap.b32 %r16780, %r16745, %r16744, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16784, %r16744, %r16745, %r16665; + // end inline asm + xor.b32 %r17196, %r16780, %r16720; + xor.b32 %r17197, %r16784, %r16721; + xor.b32 %r16937, %r30290, %r17196; + xor.b32 %r16938, %r30291, %r17197; + xor.b32 %r16914, %r30264, %r17196; + xor.b32 %r16913, %r30265, %r17197; + xor.b32 %r16857, %r30262, %r17196; + xor.b32 %r16858, %r30263, %r17197; + xor.b32 %r16945, %r30260, %r17196; + xor.b32 %r16946, %r30261, %r17197; + xor.b32 %r16874, %r30258, %r17196; + xor.b32 %r16873, %r30259, %r17197; + // begin inline asm + shf.l.wrap.b32 %r16788, %r16697, %r16696, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16792, %r16696, %r16697, %r16665; + // end inline asm + xor.b32 %r17198, %r16788, %r16732; + xor.b32 %r17199, %r16792, %r16733; + xor.b32 %r16889, %r30288, %r17198; + xor.b32 %r16890, %r30289, %r17199; + xor.b32 %r16809, %r30256, %r17198; + xor.b32 %r16810, %r30257, %r17199; + xor.b32 %r16826, %r30254, %r17198; + xor.b32 %r16825, %r30255, %r17199; + xor.b32 %r16865, %r30252, %r17198; + xor.b32 %r16866, %r30253, %r17199; + xor.b32 %r16897, %r30250, %r17198; + xor.b32 %r16898, %r30251, %r17199; + mov.u32 %r16803, 44; + // begin inline asm + shf.l.wrap.b32 %r16796, %r16802, %r16801, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16800, %r16801, %r16802, %r16803; + // end inline asm + mov.u32 %r16811, 20; + // begin inline asm + shf.l.wrap.b32 %r16804, %r16810, %r16809, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16808, %r16809, %r16810, %r16811; + // end inline asm + mov.u32 %r16819, 61; + // begin inline asm + shf.l.wrap.b32 %r16812, %r16818, %r16817, %r16819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16816, %r16817, %r16818, %r16819; + // end inline asm + mov.u32 %r16827, 39; + // begin inline asm + shf.l.wrap.b32 %r16820, %r16826, %r16825, %r16827; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16824, %r16825, %r16826, %r16827; + // end inline asm + mov.u32 %r16835, 18; + // begin inline asm + shf.l.wrap.b32 %r16828, %r16834, %r16833, %r16835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16832, %r16833, %r16834, %r16835; + // end inline asm + mov.u32 %r16843, 62; + // begin inline asm + shf.l.wrap.b32 %r16836, %r16842, %r16841, %r16843; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16840, %r16841, %r16842, %r16843; + // end inline asm + mov.u32 %r16851, 43; + // begin inline asm + shf.l.wrap.b32 %r16844, %r16850, %r16849, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16848, %r16849, %r16850, %r16851; + // end inline asm + mov.u32 %r16859, 25; + // begin inline asm + shf.l.wrap.b32 %r16852, %r16858, %r16857, %r16859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16856, %r16857, %r16858, %r16859; + // end inline asm + mov.u32 %r16867, 8; + // begin inline asm + shf.l.wrap.b32 %r16860, %r16866, %r16865, %r16867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16864, %r16865, %r16866, %r16867; + // end inline asm + mov.u32 %r16875, 56; + // begin inline asm + shf.l.wrap.b32 %r16868, %r16874, %r16873, %r16875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16872, %r16873, %r16874, %r16875; + // end inline asm + mov.u32 %r16883, 41; + // begin inline asm + shf.l.wrap.b32 
%r16876, %r16882, %r16881, %r16883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16880, %r16881, %r16882, %r16883; + // end inline asm + mov.u32 %r16891, 27; + // begin inline asm + shf.l.wrap.b32 %r16884, %r16890, %r16889, %r16891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16888, %r16889, %r16890, %r16891; + // end inline asm + mov.u32 %r16899, 14; + // begin inline asm + shf.l.wrap.b32 %r16892, %r16898, %r16897, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16896, %r16897, %r16898, %r16899; + // end inline asm + mov.u32 %r16907, 2; + // begin inline asm + shf.l.wrap.b32 %r16900, %r16906, %r16905, %r16907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16904, %r16905, %r16906, %r16907; + // end inline asm + mov.u32 %r16915, 55; + // begin inline asm + shf.l.wrap.b32 %r16908, %r16914, %r16913, %r16915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16912, %r16913, %r16914, %r16915; + // end inline asm + mov.u32 %r16923, 45; + // begin inline asm + shf.l.wrap.b32 %r16916, %r16922, %r16921, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16920, %r16921, %r16922, %r16923; + // end inline asm + mov.u32 %r16931, 36; + // begin inline asm + shf.l.wrap.b32 %r16924, %r16930, %r16929, %r16931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16928, %r16929, %r16930, %r16931; + // end inline asm + mov.u32 %r16939, 28; + // begin inline asm + shf.l.wrap.b32 %r16932, %r16938, %r16937, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16936, %r16937, %r16938, %r16939; + // end inline asm + mov.u32 %r16947, 21; + // begin inline asm + shf.l.wrap.b32 %r16940, %r16946, %r16945, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16944, %r16945, %r16946, %r16947; + // end inline asm + mov.u32 %r16955, 15; + // begin inline asm + shf.l.wrap.b32 %r16948, %r16954, %r16953, %r16955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16952, %r16953, %r16954, %r16955; + // end inline asm + mov.u32 %r16963, 10; + // begin inline asm + shf.l.wrap.b32 %r16956, %r16962, %r16961, %r16963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16960, %r16961, %r16962, %r16963; + // end inline asm + mov.u32 %r16971, 6; + // begin inline asm + shf.l.wrap.b32 %r16964, %r16970, %r16969, %r16971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16968, %r16969, %r16970, %r16971; + // end inline asm + mov.u32 %r16979, 3; + // begin inline asm + shf.l.wrap.b32 %r16972, %r16978, %r16977, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16976, %r16977, %r16978, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16980, %r16986, %r16985, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16984, %r16985, %r16986, %r16665; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16988, %r17023, %r16796, %r16844, 0xD2; + lop3.b32 %r16989, %r17026, %r16800, %r16848, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30298, %r16796, %r16844, %r16940, 0xD2; + lop3.b32 %r30299, %r16800, %r16848, %r16944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30294, %r16844, %r16940, %r16892, 0xD2; + lop3.b32 %r30295, %r16848, %r16944, %r16896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30290, %r16940, %r16892, %r17023, 0xD2; + lop3.b32 %r30291, %r16944, %r16896, %r17026, 0xD2; + // end inline asm + // begin inline asm + // chi + 
lop3.b32 %r30288, %r16892, %r17023, %r16796, 0xD2; + lop3.b32 %r30289, %r16896, %r17026, %r16800, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30284, %r16932, %r16804, %r16972, 0xD2; + lop3.b32 %r30285, %r16936, %r16808, %r16976, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30296, %r16804, %r16972, %r16916, 0xD2; + lop3.b32 %r30297, %r16808, %r16976, %r16920, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30292, %r16972, %r16916, %r16812, 0xD2; + lop3.b32 %r30293, %r16976, %r16920, %r16816, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30264, %r16916, %r16812, %r16932, 0xD2; + lop3.b32 %r30265, %r16920, %r16816, %r16936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30264, %r30265}; + // begin inline asm + // chi + lop3.b32 %r30256, %r16812, %r16932, %r16804, 0xD2; + lop3.b32 %r30257, %r16816, %r16936, %r16808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30256, %r30257}; + // begin inline asm + // chi + lop3.b32 %r30282, %r16980, %r16964, %r16852, 0xD2; + lop3.b32 %r30283, %r16984, %r16968, %r16856, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30282, %r30283}; + // begin inline asm + // chi + lop3.b32 %r30276, %r16964, %r16852, %r16860, 0xD2; + lop3.b32 %r30277, %r16968, %r16856, %r16864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30276, %r30277}; + // begin inline asm + // chi + lop3.b32 %r30270, %r16852, %r16860, %r16828, 0xD2; + lop3.b32 %r30271, %r16856, %r16864, %r16832, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30270, %r30271}; + // begin inline asm + // chi + lop3.b32 %r30262, %r16860, %r16828, %r16980, 0xD2; + lop3.b32 %r30263, %r16864, %r16832, %r16984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30262, %r30263}; + // begin inline asm + // chi + lop3.b32 %r30254, %r16828, %r16980, %r16964, 0xD2; + lop3.b32 %r30255, %r16832, %r16984, %r16968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30254, %r30255}; + // begin inline asm + // chi + lop3.b32 %r30280, %r16884, %r16924, %r16956, 0xD2; + lop3.b32 %r30281, %r16888, %r16928, %r16960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30280, %r30281}; + // begin inline asm + // chi + lop3.b32 %r30274, %r16924, %r16956, %r16948, 0xD2; + lop3.b32 %r30275, %r16928, %r16960, %r16952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30274, %r30275}; + // begin inline asm + // chi + lop3.b32 %r30268, %r16956, %r16948, %r16868, 0xD2; + lop3.b32 %r30269, %r16960, %r16952, %r16872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30268, %r30269}; + // begin inline asm + // chi + lop3.b32 %r30260, %r16948, %r16868, %r16884, 0xD2; + lop3.b32 %r30261, %r16952, %r16872, %r16888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30260, %r30261}; + // begin inline asm + // chi + lop3.b32 %r30252, %r16868, %r16884, %r16924, 0xD2; + lop3.b32 %r30253, %r16872, %r16888, %r16928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30252, %r30253}; + // begin inline asm + // chi + lop3.b32 %r30278, %r16836, %r16908, %r16820, 0xD2; + lop3.b32 %r30279, %r16840, %r16912, %r16824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30278, %r30279}; + // begin inline asm + // chi + lop3.b32 %r30272, %r16908, %r16820, %r16876, 0xD2; + lop3.b32 %r30273, %r16912, %r16824, %r16880, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30272, %r30273}; + // begin inline asm + // chi + lop3.b32 %r30266, %r16820, 
%r16876, %r16900, 0xD2; + lop3.b32 %r30267, %r16824, %r16880, %r16904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30266, %r30267}; + // begin inline asm + // chi + lop3.b32 %r30258, %r16876, %r16900, %r16836, 0xD2; + lop3.b32 %r30259, %r16880, %r16904, %r16840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30258, %r30259}; + // begin inline asm + // chi + lop3.b32 %r30250, %r16900, %r16836, %r16908, 0xD2; + lop3.b32 %r30251, %r16904, %r16840, %r16912, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30250, %r30251}; + mul.wide.s32 %rd804, %r30300, 8; + mov.u64 %rd805, keccak_round_constants; + cvta.const.u64 %rd806, %rd805; + add.s64 %rd803, %rd806, %rd804; + // begin inline asm + ld.global.nc.v2.u32 {%r17188,%r17189}, [%rd803]; + // end inline asm + xor.b32 %r30286, %r16988, %r17188; + xor.b32 %r30287, %r16989, %r17189; + add.s32 %r30300, %r30300, 1; + setp.lt.u32 %p35, %r30300, 23; + @%p35 bra $L__BB2_59; + + add.u64 %rd178, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30298, %r30299}; + st.local.v2.u32 [%rd3+72], {%r30296, %r30297}; + st.local.v2.u32 [%rd3+40], {%r30294, %r30295}; + st.local.v2.u32 [%rd3+80], {%r30292, %r30293}; + st.local.v2.u32 [%rd3+48], {%r30290, %r30291}; + st.local.v2.u32 [%rd3+56], {%r30288, %r30289}; + st.local.v2.u32 [%rd3+24], {%r30286, %r30287}; + // begin inline asm + // xor5 + lop3.b32 %r17200, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r17200, %r17200, %r30280, %r30278, 0x96; + lop3.b32 %r17201, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r17201, %r17201, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17212, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r17212, %r17212, %r30274, %r30272, 0x96; + lop3.b32 %r17213, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r17213, %r17213, %r30275, %r30273, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17224, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r17224, %r17224, %r30268, %r30266, 0x96; + lop3.b32 %r17225, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r17225, %r17225, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17236, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r17236, %r17236, %r30260, %r30258, 0x96; + lop3.b32 %r17237, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r17237, %r17237, %r30261, %r30259, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17248, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r17248, %r17248, %r30252, %r30250, 0x96; + lop3.b32 %r17249, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r17249, %r17249, %r30253, %r30251, 0x96; + // end inline asm + mov.u32 %r17452, 1; + // begin inline asm + shf.l.wrap.b32 %r17260, %r17213, %r17212, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17264, %r17212, %r17213, %r17452; + // end inline asm + xor.b32 %r17479, %r17260, %r17248; + xor.b32 %r17480, %r17264, %r17249; + xor.b32 %r17407, %r30286, %r17479; + xor.b32 %r17410, %r30287, %r17480; + xor.b32 %r17370, %r30283, %r17480; + xor.b32 %r17369, %r30282, %r17479; + st.local.v2.u32 [%rd3+104], {%r17369, %r17370}; + // begin inline asm + shf.l.wrap.b32 %r17268, %r17225, %r17224, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17272, %r17224, %r17225, %r17452; + // end inline asm + xor.b32 %r17481, %r17268, %r17200; + xor.b32 %r17482, %r17272, %r17201; + xor.b32 %r17306, %r30296, %r17481; + xor.b32 %r17305, %r30297, %r17482; + xor.b32 %r17345, %r30275, %r17482; + xor.b32 %r17346, %r30274, %r17481; + 
st.local.v2.u32 [%rd3+152], {%r17346, %r17345}; + // begin inline asm + shf.l.wrap.b32 %r17276, %r17237, %r17236, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17280, %r17236, %r17237, %r17452; + // end inline asm + xor.b32 %r17483, %r17276, %r17212; + xor.b32 %r17484, %r17280, %r17213; + xor.b32 %r17329, %r30271, %r17484; + xor.b32 %r17330, %r30270, %r17483; + st.local.v2.u32 [%rd3+120], {%r17330, %r17329}; + xor.b32 %r17321, %r30267, %r17484; + xor.b32 %r17322, %r30266, %r17483; + st.local.v2.u32 [%rd3+200], {%r17322, %r17321}; + // begin inline asm + shf.l.wrap.b32 %r17284, %r17249, %r17248, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17288, %r17248, %r17249, %r17452; + // end inline asm + xor.b32 %r17485, %r17284, %r17224; + xor.b32 %r17486, %r17288, %r17225; + xor.b32 %r17353, %r30290, %r17485; + xor.b32 %r17354, %r30291, %r17486; + xor.b32 %r17362, %r30261, %r17486; + xor.b32 %r17361, %r30260, %r17485; + st.local.v2.u32 [%rd3+168], {%r17361, %r17362}; + // begin inline asm + shf.l.wrap.b32 %r17292, %r17201, %r17200, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17296, %r17200, %r17201, %r17452; + // end inline asm + xor.b32 %r17487, %r17292, %r17236; + xor.b32 %r17488, %r17296, %r17237; + xor.b32 %r17313, %r30256, %r17487; + xor.b32 %r17314, %r30257, %r17488; + xor.b32 %r17338, %r30251, %r17488; + xor.b32 %r17337, %r30250, %r17487; + st.local.v2.u32 [%rd3+216], {%r17337, %r17338}; + // begin inline asm + shf.l.wrap.b32 %r17300, %r17306, %r17305, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17304, %r17305, %r17306, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17308, %r17314, %r17313, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17312, %r17313, %r17314, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17320, %r17321, %r17322, %r16819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17316, %r17322, %r17321, %r16819; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r17316, %r17320}; + // begin inline asm + shf.l.wrap.b32 %r17324, %r17330, %r17329, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17328, %r17329, %r17330, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17332, %r17338, %r17337, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17336, %r17337, %r17338, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17344, %r17345, %r17346, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17340, %r17346, %r17345, %r16923; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r17340, %r17344}; + // begin inline asm + shf.l.wrap.b32 %r17348, %r17354, %r17353, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17352, %r17353, %r17354, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17356, %r17362, %r17361, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17360, %r17361, %r17362, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17364, %r17370, %r17369, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17368, %r17369, %r17370, %r16979; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17372, %r17407, %r17300, %r17324, 0xD2; + lop3.b32 %r17373, %r17410, %r17304, %r17328, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r17300, %r17324, %r17356, 0xD2; + lop3.b32 %r30434, 
%r17304, %r17328, %r17360, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30429, %r17324, %r17356, %r17332, 0xD2; + lop3.b32 %r30430, %r17328, %r17360, %r17336, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + // begin inline asm + // chi + lop3.b32 %r30425, %r17356, %r17332, %r17407, 0xD2; + lop3.b32 %r30426, %r17360, %r17336, %r17410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + // begin inline asm + // chi + lop3.b32 %r30423, %r17332, %r17407, %r17300, 0xD2; + lop3.b32 %r30424, %r17336, %r17410, %r17304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + // begin inline asm + // chi + lop3.b32 %r30419, %r17348, %r17308, %r17364, 0xD2; + lop3.b32 %r30420, %r17352, %r17312, %r17368, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + // begin inline asm + // chi + lop3.b32 %r30431, %r17308, %r17364, %r17340, 0xD2; + lop3.b32 %r30432, %r17312, %r17368, %r17344, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + // begin inline asm + // chi + lop3.b32 %r30427, %r17364, %r17340, %r17316, 0xD2; + lop3.b32 %r30428, %r17368, %r17344, %r17320, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + add.s64 %rd807, %rd806, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17436,%r17437}, [%rd807]; + // end inline asm + xor.b32 %r30421, %r17372, %r17436; + xor.b32 %r30422, %r17373, %r17437; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.u64 [%rd178], %rd354; + mov.u64 %rd811, 1179641; + st.local.u64 [%rd178+8], %rd811; + st.local.u32 [%rd178+16], %r1679; + ld.global.u64 %rd812, [%rd129]; + ld.global.u64 %rd813, [%rd129+8]; + ld.global.u64 %rd814, [%rd129+16]; + ld.global.u64 %rd815, [%rd129+24]; + ld.global.u64 %rd816, [%rd129+32]; + ld.global.u64 %rd817, [%rd129+40]; + ld.global.u64 %rd818, [%rd129+48]; + ld.global.u64 %rd819, [%rd129+56]; + st.local.u64 [%rd178+32], %rd813; + st.local.u64 [%rd178+40], %rd814; + st.local.u64 [%rd178+48], %rd815; + st.local.u64 [%rd178+56], %rd816; + st.local.u64 [%rd178+64], %rd817; + st.local.u64 [%rd178+72], %rd818; + st.local.u64 [%rd178+80], %rd819; + cvt.u32.u64 %r17489, %rd812; + xor.b32 %r17490, %r1679, %r17489; + st.local.u64 [%rd178+24], %rd812; + st.local.u32 [%rd178+24], %r17490; + mov.u32 %r30301, 0; + st.local.v2.u32 [%rd178+96], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+104], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+112], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+120], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+128], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+136], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+144], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+152], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+160], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+168], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+176], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+184], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+192], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+200], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+208], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+216], {%r30301, %r30301}; + mov.u32 %r30316, -2147483648; + st.local.v2.u32 [%rd178+88], {%r17452, %r30316}; + ld.local.v2.u32 {%r30337, %r30338}, [%rd178+24]; + mov.b64 {%r30335, %r30336}, %rd817; + shr.u64 %rd820, %rd813, 32; + cvt.u32.u64 %r30349, %rd813; + cvt.u32.u64 %r30350, %rd820; + shr.u64 %rd821, %rd818, 32; + 
cvt.u32.u64 %r30347, %rd818; + cvt.u32.u64 %r30348, %rd821; + shr.u64 %rd822, %rd814, 32; + cvt.u32.u64 %r30345, %rd814; + cvt.u32.u64 %r30346, %rd822; + shr.u64 %rd823, %rd819, 32; + cvt.u32.u64 %r30343, %rd819; + cvt.u32.u64 %r30344, %rd823; + shr.u64 %rd824, %rd815, 32; + cvt.u32.u64 %r30341, %rd815; + cvt.u32.u64 %r30342, %rd824; + shr.u64 %rd825, %rd816, 32; + cvt.u32.u64 %r30339, %rd816; + cvt.u32.u64 %r30340, %rd825; + mov.u32 %r30302, %r30301; + mov.u32 %r30303, %r30301; + mov.u32 %r30304, %r30301; + mov.u32 %r30305, %r30301; + mov.u32 %r30306, %r30301; + mov.u32 %r30307, %r30301; + mov.u32 %r30308, %r30301; + mov.u32 %r30309, %r30301; + mov.u32 %r30310, %r30301; + mov.u32 %r30311, %r30301; + mov.u32 %r30312, %r30301; + mov.u32 %r30313, %r30301; + mov.u32 %r30314, %r30301; + mov.u32 %r30315, %r17452; + mov.u32 %r30317, %r30301; + mov.u32 %r30318, %r30301; + mov.u32 %r30319, %r30301; + mov.u32 %r30320, %r30301; + mov.u32 %r30321, %r30301; + mov.u32 %r30322, %r30301; + mov.u32 %r30323, %r30301; + mov.u32 %r30324, %r30301; + mov.u32 %r30325, %r30301; + mov.u32 %r30326, %r30301; + mov.u32 %r30327, %r30301; + mov.u32 %r30328, %r30301; + mov.u32 %r30329, %r30301; + mov.u32 %r30330, %r30301; + mov.u32 %r30331, %r30301; + mov.u32 %r30332, %r30301; + mov.u32 %r30333, %r30301; + mov.u32 %r30334, %r30301; + mov.u32 %r30351, %r30301; + +$L__BB2_61: + // begin inline asm + // xor5 + lop3.b32 %r17493, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17493, %r17493, %r30331, %r30329, 0x96; + lop3.b32 %r17494, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17494, %r17494, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17505, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r17505, %r17505, %r30325, %r30323, 0x96; + lop3.b32 %r17506, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r17506, %r17506, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17517, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r17517, %r17517, %r30319, %r30317, 0x96; + lop3.b32 %r17518, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r17518, %r17518, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17529, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r17529, %r17529, %r30311, %r30309, 0x96; + lop3.b32 %r17530, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r17530, %r17530, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17541, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r17541, %r17541, %r30303, %r30301, 0x96; + lop3.b32 %r17542, %r30340, %r30308, %r30306, 0x96; + lop3.b32 %r17542, %r17542, %r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17553, %r17506, %r17505, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17557, %r17505, %r17506, %r17452; + // end inline asm + xor.b32 %r17987, %r17553, %r17541; + xor.b32 %r17988, %r17557, %r17542; + xor.b32 %r17820, %r30337, %r17987; + xor.b32 %r17823, %r30338, %r17988; + xor.b32 %r17727, %r30335, %r17987; + xor.b32 %r17726, %r30336, %r17988; + xor.b32 %r17774, %r30333, %r17987; + xor.b32 %r17775, %r30334, %r17988; + xor.b32 %r17679, %r30331, %r17987; + xor.b32 %r17678, %r30332, %r17988; + xor.b32 %r17630, %r30329, %r17987; + xor.b32 %r17631, %r30330, %r17988; + // begin inline asm + shf.l.wrap.b32 %r17561, %r17518, %r17517, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17565, %r17517, %r17518, %r17452; + // end inline asm + xor.b32 %r17989, %r17561, %r17493; + 
xor.b32 %r17990, %r17565, %r17494; + xor.b32 %r17782, %r30349, %r17989; + xor.b32 %r17783, %r30350, %r17990; + xor.b32 %r17599, %r30347, %r17989; + xor.b32 %r17598, %r30348, %r17990; + xor.b32 %r17758, %r30327, %r17989; + xor.b32 %r17759, %r30328, %r17990; + xor.b32 %r17719, %r30325, %r17989; + xor.b32 %r17718, %r30326, %r17990; + xor.b32 %r17702, %r30323, %r17989; + xor.b32 %r17703, %r30324, %r17990; + // begin inline asm + shf.l.wrap.b32 %r17569, %r17530, %r17529, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17573, %r17529, %r17530, %r17452; + // end inline asm + xor.b32 %r17991, %r17569, %r17505; + xor.b32 %r17992, %r17573, %r17506; + xor.b32 %r17639, %r30345, %r17991; + xor.b32 %r17638, %r30346, %r17992; + xor.b32 %r17766, %r30343, %r17991; + xor.b32 %r17767, %r30344, %r17992; + xor.b32 %r17647, %r30321, %r17991; + xor.b32 %r17646, %r30322, %r17992; + xor.b32 %r17750, %r30319, %r17991; + xor.b32 %r17751, %r30320, %r17992; + xor.b32 %r17615, %r30317, %r17991; + xor.b32 %r17614, %r30318, %r17992; + // begin inline asm + shf.l.wrap.b32 %r17577, %r17542, %r17541, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17581, %r17541, %r17542, %r17452; + // end inline asm + xor.b32 %r17993, %r17577, %r17517; + xor.b32 %r17994, %r17581, %r17518; + xor.b32 %r17734, %r30341, %r17993; + xor.b32 %r17735, %r30342, %r17994; + xor.b32 %r17711, %r30315, %r17993; + xor.b32 %r17710, %r30316, %r17994; + xor.b32 %r17654, %r30313, %r17993; + xor.b32 %r17655, %r30314, %r17994; + xor.b32 %r17742, %r30311, %r17993; + xor.b32 %r17743, %r30312, %r17994; + xor.b32 %r17671, %r30309, %r17993; + xor.b32 %r17670, %r30310, %r17994; + // begin inline asm + shf.l.wrap.b32 %r17585, %r17494, %r17493, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17589, %r17493, %r17494, %r17452; + // end inline asm + xor.b32 %r17995, %r17585, %r17529; + xor.b32 %r17996, %r17589, %r17530; + xor.b32 %r17686, %r30339, %r17995; + xor.b32 %r17687, %r30340, %r17996; + xor.b32 %r17606, %r30307, %r17995; + xor.b32 %r17607, %r30308, %r17996; + xor.b32 %r17623, %r30305, %r17995; + xor.b32 %r17622, %r30306, %r17996; + xor.b32 %r17662, %r30303, %r17995; + xor.b32 %r17663, %r30304, %r17996; + xor.b32 %r17694, %r30301, %r17995; + xor.b32 %r17695, %r30302, %r17996; + mov.u32 %r17600, 44; + // begin inline asm + shf.l.wrap.b32 %r17593, %r17599, %r17598, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17597, %r17598, %r17599, %r17600; + // end inline asm + mov.u32 %r17608, 20; + // begin inline asm + shf.l.wrap.b32 %r17601, %r17607, %r17606, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17605, %r17606, %r17607, %r17608; + // end inline asm + mov.u32 %r17616, 61; + // begin inline asm + shf.l.wrap.b32 %r17609, %r17615, %r17614, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17613, %r17614, %r17615, %r17616; + // end inline asm + mov.u32 %r17624, 39; + // begin inline asm + shf.l.wrap.b32 %r17617, %r17623, %r17622, %r17624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17621, %r17622, %r17623, %r17624; + // end inline asm + mov.u32 %r17632, 18; + // begin inline asm + shf.l.wrap.b32 %r17625, %r17631, %r17630, %r17632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17629, %r17630, %r17631, %r17632; + // end inline asm + mov.u32 %r17640, 62; + // begin inline asm + shf.l.wrap.b32 %r17633, %r17639, %r17638, %r17640; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17637, %r17638, 
%r17639, %r17640; + // end inline asm + mov.u32 %r17648, 43; + // begin inline asm + shf.l.wrap.b32 %r17641, %r17647, %r17646, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17645, %r17646, %r17647, %r17648; + // end inline asm + mov.u32 %r17656, 25; + // begin inline asm + shf.l.wrap.b32 %r17649, %r17655, %r17654, %r17656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17653, %r17654, %r17655, %r17656; + // end inline asm + mov.u32 %r17664, 8; + // begin inline asm + shf.l.wrap.b32 %r17657, %r17663, %r17662, %r17664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17661, %r17662, %r17663, %r17664; + // end inline asm + mov.u32 %r17672, 56; + // begin inline asm + shf.l.wrap.b32 %r17665, %r17671, %r17670, %r17672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17669, %r17670, %r17671, %r17672; + // end inline asm + mov.u32 %r17680, 41; + // begin inline asm + shf.l.wrap.b32 %r17673, %r17679, %r17678, %r17680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17677, %r17678, %r17679, %r17680; + // end inline asm + mov.u32 %r17688, 27; + // begin inline asm + shf.l.wrap.b32 %r17681, %r17687, %r17686, %r17688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17685, %r17686, %r17687, %r17688; + // end inline asm + mov.u32 %r17696, 14; + // begin inline asm + shf.l.wrap.b32 %r17689, %r17695, %r17694, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17693, %r17694, %r17695, %r17696; + // end inline asm + mov.u32 %r17704, 2; + // begin inline asm + shf.l.wrap.b32 %r17697, %r17703, %r17702, %r17704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17701, %r17702, %r17703, %r17704; + // end inline asm + mov.u32 %r17712, 55; + // begin inline asm + shf.l.wrap.b32 %r17705, %r17711, %r17710, %r17712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17709, %r17710, %r17711, %r17712; + // end inline asm + mov.u32 %r17720, 45; + // begin inline asm + shf.l.wrap.b32 %r17713, %r17719, %r17718, %r17720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17717, %r17718, %r17719, %r17720; + // end inline asm + mov.u32 %r17728, 36; + // begin inline asm + shf.l.wrap.b32 %r17721, %r17727, %r17726, %r17728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17725, %r17726, %r17727, %r17728; + // end inline asm + mov.u32 %r17736, 28; + // begin inline asm + shf.l.wrap.b32 %r17729, %r17735, %r17734, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17733, %r17734, %r17735, %r17736; + // end inline asm + mov.u32 %r17744, 21; + // begin inline asm + shf.l.wrap.b32 %r17737, %r17743, %r17742, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17741, %r17742, %r17743, %r17744; + // end inline asm + mov.u32 %r17752, 15; + // begin inline asm + shf.l.wrap.b32 %r17745, %r17751, %r17750, %r17752; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17749, %r17750, %r17751, %r17752; + // end inline asm + mov.u32 %r17760, 10; + // begin inline asm + shf.l.wrap.b32 %r17753, %r17759, %r17758, %r17760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17757, %r17758, %r17759, %r17760; + // end inline asm + mov.u32 %r17768, 6; + // begin inline asm + shf.l.wrap.b32 %r17761, %r17767, %r17766, %r17768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17765, %r17766, %r17767, %r17768; + // end inline asm + mov.u32 %r17776, 3; + // begin inline asm + shf.l.wrap.b32 %r17769, %r17775, %r17774, %r17776; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r17773, %r17774, %r17775, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17777, %r17783, %r17782, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17781, %r17782, %r17783, %r17452; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17785, %r17820, %r17593, %r17641, 0xD2; + lop3.b32 %r17786, %r17823, %r17597, %r17645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30349, %r17593, %r17641, %r17737, 0xD2; + lop3.b32 %r30350, %r17597, %r17645, %r17741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30345, %r17641, %r17737, %r17689, 0xD2; + lop3.b32 %r30346, %r17645, %r17741, %r17693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30341, %r17737, %r17689, %r17820, 0xD2; + lop3.b32 %r30342, %r17741, %r17693, %r17823, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30339, %r17689, %r17820, %r17593, 0xD2; + lop3.b32 %r30340, %r17693, %r17823, %r17597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30335, %r17729, %r17601, %r17769, 0xD2; + lop3.b32 %r30336, %r17733, %r17605, %r17773, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30347, %r17601, %r17769, %r17713, 0xD2; + lop3.b32 %r30348, %r17605, %r17773, %r17717, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30343, %r17769, %r17713, %r17609, 0xD2; + lop3.b32 %r30344, %r17773, %r17717, %r17613, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30315, %r17713, %r17609, %r17729, 0xD2; + lop3.b32 %r30316, %r17717, %r17613, %r17733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30315, %r30316}; + // begin inline asm + // chi + lop3.b32 %r30307, %r17609, %r17729, %r17601, 0xD2; + lop3.b32 %r30308, %r17613, %r17733, %r17605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30307, %r30308}; + // begin inline asm + // chi + lop3.b32 %r30333, %r17777, %r17761, %r17649, 0xD2; + lop3.b32 %r30334, %r17781, %r17765, %r17653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30333, %r30334}; + // begin inline asm + // chi + lop3.b32 %r30327, %r17761, %r17649, %r17657, 0xD2; + lop3.b32 %r30328, %r17765, %r17653, %r17661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30327, %r30328}; + // begin inline asm + // chi + lop3.b32 %r30321, %r17649, %r17657, %r17625, 0xD2; + lop3.b32 %r30322, %r17653, %r17661, %r17629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30321, %r30322}; + // begin inline asm + // chi + lop3.b32 %r30313, %r17657, %r17625, %r17777, 0xD2; + lop3.b32 %r30314, %r17661, %r17629, %r17781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30313, %r30314}; + // begin inline asm + // chi + lop3.b32 %r30305, %r17625, %r17777, %r17761, 0xD2; + lop3.b32 %r30306, %r17629, %r17781, %r17765, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30305, %r30306}; + // begin inline asm + // chi + lop3.b32 %r30331, %r17681, %r17721, %r17753, 0xD2; + lop3.b32 %r30332, %r17685, %r17725, %r17757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30331, %r30332}; + // begin inline asm + // chi + lop3.b32 %r30325, %r17721, %r17753, %r17745, 0xD2; + lop3.b32 %r30326, %r17725, %r17757, %r17749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30325, %r30326}; + // begin inline asm + // chi + lop3.b32 %r30319, %r17753, %r17745, %r17665, 0xD2; + lop3.b32 %r30320, %r17757, %r17749, 
%r17669, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30319, %r30320}; + // begin inline asm + // chi + lop3.b32 %r30311, %r17745, %r17665, %r17681, 0xD2; + lop3.b32 %r30312, %r17749, %r17669, %r17685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30311, %r30312}; + // begin inline asm + // chi + lop3.b32 %r30303, %r17665, %r17681, %r17721, 0xD2; + lop3.b32 %r30304, %r17669, %r17685, %r17725, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30303, %r30304}; + // begin inline asm + // chi + lop3.b32 %r30329, %r17633, %r17705, %r17617, 0xD2; + lop3.b32 %r30330, %r17637, %r17709, %r17621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30329, %r30330}; + // begin inline asm + // chi + lop3.b32 %r30323, %r17705, %r17617, %r17673, 0xD2; + lop3.b32 %r30324, %r17709, %r17621, %r17677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30323, %r30324}; + // begin inline asm + // chi + lop3.b32 %r30317, %r17617, %r17673, %r17697, 0xD2; + lop3.b32 %r30318, %r17621, %r17677, %r17701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30317, %r30318}; + // begin inline asm + // chi + lop3.b32 %r30309, %r17673, %r17697, %r17633, 0xD2; + lop3.b32 %r30310, %r17677, %r17701, %r17637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30309, %r30310}; + // begin inline asm + // chi + lop3.b32 %r30301, %r17697, %r17633, %r17705, 0xD2; + lop3.b32 %r30302, %r17701, %r17637, %r17709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30301, %r30302}; + mul.wide.s32 %rd827, %r30351, 8; + add.s64 %rd826, %rd806, %rd827; + // begin inline asm + ld.global.nc.v2.u32 {%r17985,%r17986}, [%rd826]; + // end inline asm + xor.b32 %r30337, %r17785, %r17985; + xor.b32 %r30338, %r17786, %r17986; + add.s32 %r30351, %r30351, 1; + setp.lt.u32 %p36, %r30351, 23; + @%p36 bra $L__BB2_61; + + mov.u32 %r30384, 0; + mov.u32 %r18096, 1; + st.local.v2.u32 [%rd178+32], {%r30349, %r30350}; + st.local.v2.u32 [%rd178+72], {%r30347, %r30348}; + st.local.v2.u32 [%rd178+40], {%r30345, %r30346}; + st.local.v2.u32 [%rd178+80], {%r30343, %r30344}; + st.local.v2.u32 [%rd178+48], {%r30341, %r30342}; + st.local.v2.u32 [%rd178+56], {%r30339, %r30340}; + st.local.v2.u32 [%rd178+24], {%r30337, %r30338}; + // begin inline asm + // xor5 + lop3.b32 %r17997, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17997, %r17997, %r30331, %r30329, 0x96; + lop3.b32 %r17998, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17998, %r17998, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18009, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r18009, %r18009, %r30325, %r30323, 0x96; + lop3.b32 %r18010, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r18010, %r18010, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18021, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r18021, %r18021, %r30319, %r30317, 0x96; + lop3.b32 %r18022, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r18022, %r18022, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18033, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r18033, %r18033, %r30311, %r30309, 0x96; + lop3.b32 %r18034, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r18034, %r18034, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18045, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r18045, %r18045, %r30303, %r30301, 0x96; + lop3.b32 %r18046, %r30340, %r30308, %r30306, 0x96; + lop3.b32 %r18046, %r18046, 
%r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18057, %r18010, %r18009, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18061, %r18009, %r18010, %r18096; + // end inline asm + xor.b32 %r18236, %r18057, %r18045; + xor.b32 %r18237, %r18061, %r18046; + xor.b32 %r18204, %r30337, %r18236; + xor.b32 %r18207, %r30338, %r18237; + xor.b32 %r18167, %r30334, %r18237; + xor.b32 %r18166, %r30333, %r18236; + st.local.v2.u32 [%rd178+104], {%r18166, %r18167}; + // begin inline asm + shf.l.wrap.b32 %r18065, %r18022, %r18021, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18069, %r18021, %r18022, %r18096; + // end inline asm + xor.b32 %r18238, %r18065, %r17997; + xor.b32 %r18239, %r18069, %r17998; + xor.b32 %r18103, %r30347, %r18238; + xor.b32 %r18102, %r30348, %r18239; + xor.b32 %r18142, %r30326, %r18239; + xor.b32 %r18143, %r30325, %r18238; + st.local.v2.u32 [%rd178+152], {%r18143, %r18142}; + // begin inline asm + shf.l.wrap.b32 %r18073, %r18034, %r18033, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18077, %r18033, %r18034, %r18096; + // end inline asm + xor.b32 %r18240, %r18073, %r18009; + xor.b32 %r18241, %r18077, %r18010; + xor.b32 %r18126, %r30322, %r18241; + xor.b32 %r18127, %r30321, %r18240; + st.local.v2.u32 [%rd178+120], {%r18127, %r18126}; + xor.b32 %r18118, %r30318, %r18241; + xor.b32 %r18119, %r30317, %r18240; + st.local.v2.u32 [%rd178+200], {%r18119, %r18118}; + // begin inline asm + shf.l.wrap.b32 %r18081, %r18046, %r18045, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18085, %r18045, %r18046, %r18096; + // end inline asm + xor.b32 %r18242, %r18081, %r18021; + xor.b32 %r18243, %r18085, %r18022; + xor.b32 %r18150, %r30341, %r18242; + xor.b32 %r18151, %r30342, %r18243; + xor.b32 %r18159, %r30312, %r18243; + xor.b32 %r18158, %r30311, %r18242; + st.local.v2.u32 [%rd178+168], {%r18158, %r18159}; + // begin inline asm + shf.l.wrap.b32 %r18089, %r17998, %r17997, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18093, %r17997, %r17998, %r18096; + // end inline asm + xor.b32 %r18244, %r18089, %r18033; + xor.b32 %r18245, %r18093, %r18034; + xor.b32 %r18110, %r30307, %r18244; + xor.b32 %r18111, %r30308, %r18245; + xor.b32 %r18135, %r30302, %r18245; + xor.b32 %r18134, %r30301, %r18244; + st.local.v2.u32 [%rd178+216], {%r18134, %r18135}; + // begin inline asm + shf.l.wrap.b32 %r18097, %r18103, %r18102, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18101, %r18102, %r18103, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18105, %r18111, %r18110, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18109, %r18110, %r18111, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18117, %r18118, %r18119, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18113, %r18119, %r18118, %r17616; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r18113, %r18117}; + // begin inline asm + shf.l.wrap.b32 %r18121, %r18127, %r18126, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18125, %r18126, %r18127, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18129, %r18135, %r18134, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18133, %r18134, %r18135, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18141, %r18142, %r18143, %r17720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r18137, %r18143, %r18142, %r17720; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r18137, %r18141}; + // begin inline asm + shf.l.wrap.b32 %r18145, %r18151, %r18150, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18149, %r18150, %r18151, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18153, %r18159, %r18158, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18157, %r18158, %r18159, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18161, %r18167, %r18166, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18165, %r18166, %r18167, %r17776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18169, %r18204, %r18097, %r18121, 0xD2; + lop3.b32 %r18170, %r18207, %r18101, %r18125, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r18097, %r18121, %r18153, 0xD2; + lop3.b32 %r30485, %r18101, %r18125, %r18157, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30480, %r18121, %r18153, %r18129, 0xD2; + lop3.b32 %r30481, %r18125, %r18157, %r18133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + // begin inline asm + // chi + lop3.b32 %r30476, %r18153, %r18129, %r18204, 0xD2; + lop3.b32 %r30477, %r18157, %r18133, %r18207, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + // begin inline asm + // chi + lop3.b32 %r30474, %r18129, %r18204, %r18097, 0xD2; + lop3.b32 %r30475, %r18133, %r18207, %r18101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + // begin inline asm + // chi + lop3.b32 %r30470, %r18145, %r18105, %r18161, 0xD2; + lop3.b32 %r30471, %r18149, %r18109, %r18165, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + // begin inline asm + // chi + lop3.b32 %r30482, %r18105, %r18161, %r18137, 0xD2; + lop3.b32 %r30483, %r18109, %r18165, %r18141, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30478, %r18161, %r18137, %r18113, 0xD2; + lop3.b32 %r30479, %r18165, %r18141, %r18117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + // begin inline asm + ld.global.nc.v2.u32 {%r18233,%r18234}, [%rd807]; + // end inline asm + xor.b32 %r30472, %r18169, %r18233; + xor.b32 %r30473, %r18170, %r18234; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + add.s64 %rd180, %rd178, 24; + add.s64 %rd181, %rd3, 24; + +$L__BB2_63: + shl.b32 %r18246, %r30384, 2; + cvt.u64.u32 %rd837, %r18246; + and.b64 %rd838, %rd837, 60; + add.s64 %rd839, %rd181, %rd838; + xor.b32 %r18247, %r1678, %r30384; + mul.lo.s32 %r18248, %r18247, 16777619; + ld.local.u32 %r18249, [%rd839]; + xor.b32 %r18250, %r18248, %r18249; + mul.wide.u32 %rd840, %r18250, -954391867; + shr.u64 %rd841, %rd840, 32; + cvt.u32.u64 %r18251, %rd841; + sub.s32 %r18252, %r18250, %r18251; + shr.u32 %r18253, %r18252, 1; + add.s32 %r18254, %r18253, %r18251; + shr.u32 %r18255, %r18254, 20; + mul.lo.s32 %r18256, %r18255, 1179641; + sub.s32 %r18257, %r18250, %r18256; + mul.wide.u32 %rd842, %r18257, 64; + add.s64 %rd843, %rd471, %rd842; + mul.lo.s32 %r18258, %r30421, 16777619; + ld.global.u32 %r18259, [%rd843]; + xor.b32 %r30421, %r18258, %r18259; + mul.lo.s32 %r18260, %r30422, 16777619; + ld.global.u32 %r18261, [%rd843+4]; + xor.b32 %r30422, %r18260, %r18261; + mul.lo.s32 %r18262, %r30433, 16777619; + ld.global.u32 %r18263, [%rd843+8]; + 
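+ // NOTE (annotation, not compiler output): this loop looks like FishHash/ethash-style
+ // dataset-item generation: parent index = fnv1(seed ^ i, mix[i & 15]) mod 1179641, where
+ // fnv1(u, v) = (u * 16777619) ^ v and the mul.wide.u32/shr/mul/sub sequence is a
+ // multiply-by-reciprocal reduction mod 1179641; each 32-bit mix word is then fnv1-mixed
+ // with a word of the 64-byte parent item. 512 parents are folded in per item, and two
+ // items (the states at %rd3 and %rd178, seeded from %r1678/%r1679) are built in one pass.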
mul.lo.s32 %r18264, %r30434, 16777619; + ld.global.u32 %r18265, [%rd843+12]; + xor.b32 %r18266, %r18264, %r18265; + xor.b32 %r30433, %r18262, %r18263; + mov.b64 %rd844, {%r30433, %r18266}; + mul.lo.s32 %r18267, %r30429, 16777619; + ld.global.u32 %r18268, [%rd843+16]; + mul.lo.s32 %r18269, %r30430, 16777619; + ld.global.u32 %r18270, [%rd843+20]; + xor.b32 %r18271, %r18269, %r18270; + xor.b32 %r30429, %r18267, %r18268; + mov.b64 %rd845, {%r30429, %r18271}; + mul.lo.s32 %r18272, %r30425, 16777619; + ld.global.u32 %r18273, [%rd843+24]; + mul.lo.s32 %r18274, %r30426, 16777619; + ld.global.u32 %r18275, [%rd843+28]; + xor.b32 %r18276, %r18274, %r18275; + xor.b32 %r30425, %r18272, %r18273; + mov.b64 %rd846, {%r30425, %r18276}; + mul.lo.s32 %r18277, %r30423, 16777619; + ld.global.u32 %r18278, [%rd843+32]; + mul.lo.s32 %r18279, %r30424, 16777619; + ld.global.u32 %r18280, [%rd843+36]; + xor.b32 %r18281, %r18279, %r18280; + xor.b32 %r30423, %r18277, %r18278; + mov.b64 %rd847, {%r30423, %r18281}; + mul.lo.s32 %r18282, %r30419, 16777619; + ld.global.u32 %r18283, [%rd843+40]; + xor.b32 %r30419, %r18282, %r18283; + mul.lo.s32 %r18284, %r30420, 16777619; + ld.global.u32 %r18285, [%rd843+44]; + xor.b32 %r30420, %r18284, %r18285; + mul.lo.s32 %r18286, %r30431, 16777619; + ld.global.u32 %r18287, [%rd843+48]; + mul.lo.s32 %r18288, %r30432, 16777619; + ld.global.u32 %r18289, [%rd843+52]; + xor.b32 %r18290, %r18288, %r18289; + xor.b32 %r30431, %r18286, %r18287; + mov.b64 %rd848, {%r30431, %r18290}; + mul.lo.s32 %r18291, %r30427, 16777619; + ld.global.u32 %r18292, [%rd843+56]; + mul.lo.s32 %r18293, %r30428, 16777619; + ld.global.u32 %r18294, [%rd843+60]; + xor.b32 %r18295, %r18293, %r18294; + xor.b32 %r30427, %r18291, %r18292; + mov.b64 %rd849, {%r30427, %r18295}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.v2.u32 [%rd3+32], {%r30433, %r18266}; + st.local.v2.u32 [%rd3+40], {%r30429, %r18271}; + st.local.v2.u32 [%rd3+48], {%r30425, %r18276}; + st.local.v2.u32 [%rd3+56], {%r30423, %r18281}; + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + st.local.v2.u32 [%rd3+72], {%r30431, %r18290}; + st.local.v2.u32 [%rd3+80], {%r30427, %r18295}; + add.s64 %rd850, %rd180, %rd838; + xor.b32 %r18296, %r1679, %r30384; + mul.lo.s32 %r18297, %r18296, 16777619; + ld.local.u32 %r18298, [%rd850]; + xor.b32 %r18299, %r18297, %r18298; + mul.wide.u32 %rd851, %r18299, -954391867; + shr.u64 %rd852, %rd851, 32; + cvt.u32.u64 %r18300, %rd852; + sub.s32 %r18301, %r18299, %r18300; + shr.u32 %r18302, %r18301, 1; + add.s32 %r18303, %r18302, %r18300; + shr.u32 %r18304, %r18303, 20; + mul.lo.s32 %r18305, %r18304, 1179641; + sub.s32 %r18306, %r18299, %r18305; + mul.wide.u32 %rd853, %r18306, 64; + add.s64 %rd854, %rd471, %rd853; + mul.lo.s32 %r18307, %r30472, 16777619; + ld.global.u32 %r18308, [%rd854]; + xor.b32 %r30472, %r18307, %r18308; + mul.lo.s32 %r18309, %r30473, 16777619; + ld.global.u32 %r18310, [%rd854+4]; + xor.b32 %r30473, %r18309, %r18310; + mul.lo.s32 %r18311, %r30484, 16777619; + ld.global.u32 %r18312, [%rd854+8]; + mul.lo.s32 %r18313, %r30485, 16777619; + ld.global.u32 %r18314, [%rd854+12]; + xor.b32 %r18315, %r18313, %r18314; + xor.b32 %r30484, %r18311, %r18312; + mov.b64 %rd855, {%r30484, %r18315}; + mul.lo.s32 %r18316, %r30480, 16777619; + ld.global.u32 %r18317, [%rd854+16]; + mul.lo.s32 %r18318, %r30481, 16777619; + ld.global.u32 %r18319, [%rd854+20]; + xor.b32 %r18320, %r18318, %r18319; + xor.b32 %r30480, %r18316, %r18317; + mov.b64 %rd856, {%r30480, %r18320}; + mul.lo.s32 %r18321, %r30476, 16777619; + 
ld.global.u32 %r18322, [%rd854+24]; + mul.lo.s32 %r18323, %r30477, 16777619; + ld.global.u32 %r18324, [%rd854+28]; + xor.b32 %r18325, %r18323, %r18324; + xor.b32 %r30476, %r18321, %r18322; + mov.b64 %rd857, {%r30476, %r18325}; + mul.lo.s32 %r18326, %r30474, 16777619; + ld.global.u32 %r18327, [%rd854+32]; + mul.lo.s32 %r18328, %r30475, 16777619; + ld.global.u32 %r18329, [%rd854+36]; + xor.b32 %r18330, %r18328, %r18329; + xor.b32 %r30474, %r18326, %r18327; + mov.b64 %rd858, {%r30474, %r18330}; + mul.lo.s32 %r18331, %r30470, 16777619; + ld.global.u32 %r18332, [%rd854+40]; + xor.b32 %r30470, %r18331, %r18332; + mul.lo.s32 %r18333, %r30471, 16777619; + ld.global.u32 %r18334, [%rd854+44]; + xor.b32 %r30471, %r18333, %r18334; + mul.lo.s32 %r18335, %r30482, 16777619; + ld.global.u32 %r18336, [%rd854+48]; + mul.lo.s32 %r18337, %r30483, 16777619; + ld.global.u32 %r18338, [%rd854+52]; + xor.b32 %r18339, %r18337, %r18338; + xor.b32 %r30482, %r18335, %r18336; + mov.b64 %rd859, {%r30482, %r18339}; + mul.lo.s32 %r18340, %r30478, 16777619; + ld.global.u32 %r18341, [%rd854+56]; + mul.lo.s32 %r18342, %r30479, 16777619; + ld.global.u32 %r18343, [%rd854+60]; + xor.b32 %r18344, %r18342, %r18343; + xor.b32 %r30478, %r18340, %r18341; + mov.b64 %rd860, {%r30478, %r18344}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + st.local.v2.u32 [%rd178+32], {%r30484, %r18315}; + st.local.v2.u32 [%rd178+40], {%r30480, %r18320}; + st.local.v2.u32 [%rd178+48], {%r30476, %r18325}; + st.local.v2.u32 [%rd178+56], {%r30474, %r18330}; + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + st.local.v2.u32 [%rd178+72], {%r30482, %r18339}; + st.local.v2.u32 [%rd178+80], {%r30478, %r18344}; + add.s32 %r30384, %r30384, 1; + setp.lt.u32 %p37, %r30384, 512; + shr.u64 %rd861, %rd844, 32; + cvt.u32.u64 %r30434, %rd861; + shr.u64 %rd862, %rd845, 32; + cvt.u32.u64 %r30430, %rd862; + shr.u64 %rd863, %rd846, 32; + cvt.u32.u64 %r30426, %rd863; + shr.u64 %rd864, %rd847, 32; + cvt.u32.u64 %r30424, %rd864; + shr.u64 %rd865, %rd848, 32; + cvt.u32.u64 %r30432, %rd865; + shr.u64 %rd866, %rd849, 32; + cvt.u32.u64 %r30428, %rd866; + shr.u64 %rd867, %rd855, 32; + cvt.u32.u64 %r30485, %rd867; + shr.u64 %rd868, %rd856, 32; + cvt.u32.u64 %r30481, %rd868; + shr.u64 %rd869, %rd857, 32; + cvt.u32.u64 %r30477, %rd869; + shr.u64 %rd870, %rd858, 32; + cvt.u32.u64 %r30475, %rd870; + shr.u64 %rd871, %rd859, 32; + cvt.u32.u64 %r30483, %rd871; + shr.u64 %rd872, %rd860, 32; + cvt.u32.u64 %r30479, %rd872; + @%p37 bra $L__BB2_63; + + mov.u32 %r30385, 0; + st.local.v2.u32 [%rd3+96], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+104], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+112], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+120], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+128], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+136], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+144], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+152], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+160], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+168], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+176], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+184], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+192], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+200], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+208], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+216], {%r30385, %r30385}; + mov.u32 %r30400, -2147483648; + mov.u32 %r18359, 1; + st.local.v2.u32 [%rd3+88], {%r18359, %r30400}; + mov.u32 %r30386, %r30385; + mov.u32 %r30387, %r30385; + mov.u32 %r30388, %r30385; + mov.u32 %r30389, %r30385; + mov.u32 %r30390, 
%r30385; + mov.u32 %r30391, %r30385; + mov.u32 %r30392, %r30385; + mov.u32 %r30393, %r30385; + mov.u32 %r30394, %r30385; + mov.u32 %r30395, %r30385; + mov.u32 %r30396, %r30385; + mov.u32 %r30397, %r30385; + mov.u32 %r30398, %r30385; + mov.u32 %r30399, %r18359; + mov.u32 %r30401, %r30385; + mov.u32 %r30402, %r30385; + mov.u32 %r30403, %r30385; + mov.u32 %r30404, %r30385; + mov.u32 %r30405, %r30385; + mov.u32 %r30406, %r30385; + mov.u32 %r30407, %r30385; + mov.u32 %r30408, %r30385; + mov.u32 %r30409, %r30385; + mov.u32 %r30410, %r30385; + mov.u32 %r30411, %r30385; + mov.u32 %r30412, %r30385; + mov.u32 %r30413, %r30385; + mov.u32 %r30414, %r30385; + mov.u32 %r30415, %r30385; + mov.u32 %r30416, %r30385; + mov.u32 %r30417, %r30385; + mov.u32 %r30418, %r30385; + mov.u32 %r30435, %r30385; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r18386, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18386, %r18386, %r30415, %r30413, 0x96; + lop3.b32 %r18387, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18387, %r18387, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18398, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18398, %r18398, %r30409, %r30407, 0x96; + lop3.b32 %r18399, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18399, %r18399, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18410, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18410, %r18410, %r30403, %r30401, 0x96; + lop3.b32 %r18411, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18411, %r18411, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18422, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18422, %r18422, %r30395, %r30393, 0x96; + lop3.b32 %r18423, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18423, %r18423, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18434, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18434, %r18434, %r30387, %r30385, 0x96; + lop3.b32 %r18435, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18435, %r18435, %r30388, %r30386, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18446, %r18399, %r18398, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18450, %r18398, %r18399, %r18359; + // end inline asm + xor.b32 %r18880, %r18446, %r18434; + xor.b32 %r18881, %r18450, %r18435; + xor.b32 %r18713, %r30421, %r18880; + xor.b32 %r18716, %r30422, %r18881; + xor.b32 %r18620, %r30419, %r18880; + xor.b32 %r18619, %r30420, %r18881; + xor.b32 %r18667, %r30417, %r18880; + xor.b32 %r18668, %r30418, %r18881; + xor.b32 %r18572, %r30415, %r18880; + xor.b32 %r18571, %r30416, %r18881; + xor.b32 %r18523, %r30413, %r18880; + xor.b32 %r18524, %r30414, %r18881; + // begin inline asm + shf.l.wrap.b32 %r18454, %r18411, %r18410, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18458, %r18410, %r18411, %r18359; + // end inline asm + xor.b32 %r18882, %r18454, %r18386; + xor.b32 %r18883, %r18458, %r18387; + xor.b32 %r18675, %r30433, %r18882; + xor.b32 %r18676, %r30434, %r18883; + xor.b32 %r18492, %r30431, %r18882; + xor.b32 %r18491, %r30432, %r18883; + xor.b32 %r18651, %r30411, %r18882; + xor.b32 %r18652, %r30412, %r18883; + xor.b32 %r18612, %r30409, %r18882; + xor.b32 %r18611, %r30410, %r18883; + xor.b32 %r18595, %r30407, %r18882; + xor.b32 %r18596, %r30408, %r18883; + // begin inline asm + shf.l.wrap.b32 %r18462, %r18423, %r18422, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18466, %r18422, 
%r18423, %r18359; + // end inline asm + xor.b32 %r18884, %r18462, %r18398; + xor.b32 %r18885, %r18466, %r18399; + xor.b32 %r18532, %r30429, %r18884; + xor.b32 %r18531, %r30430, %r18885; + xor.b32 %r18659, %r30427, %r18884; + xor.b32 %r18660, %r30428, %r18885; + xor.b32 %r18540, %r30405, %r18884; + xor.b32 %r18539, %r30406, %r18885; + xor.b32 %r18643, %r30403, %r18884; + xor.b32 %r18644, %r30404, %r18885; + xor.b32 %r18508, %r30401, %r18884; + xor.b32 %r18507, %r30402, %r18885; + // begin inline asm + shf.l.wrap.b32 %r18470, %r18435, %r18434, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18474, %r18434, %r18435, %r18359; + // end inline asm + xor.b32 %r18886, %r18470, %r18410; + xor.b32 %r18887, %r18474, %r18411; + xor.b32 %r18627, %r30425, %r18886; + xor.b32 %r18628, %r30426, %r18887; + xor.b32 %r18604, %r30399, %r18886; + xor.b32 %r18603, %r30400, %r18887; + xor.b32 %r18547, %r30397, %r18886; + xor.b32 %r18548, %r30398, %r18887; + xor.b32 %r18635, %r30395, %r18886; + xor.b32 %r18636, %r30396, %r18887; + xor.b32 %r18564, %r30393, %r18886; + xor.b32 %r18563, %r30394, %r18887; + // begin inline asm + shf.l.wrap.b32 %r18478, %r18387, %r18386, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18482, %r18386, %r18387, %r18359; + // end inline asm + xor.b32 %r18888, %r18478, %r18422; + xor.b32 %r18889, %r18482, %r18423; + xor.b32 %r18579, %r30423, %r18888; + xor.b32 %r18580, %r30424, %r18889; + xor.b32 %r18499, %r30391, %r18888; + xor.b32 %r18500, %r30392, %r18889; + xor.b32 %r18516, %r30389, %r18888; + xor.b32 %r18515, %r30390, %r18889; + xor.b32 %r18555, %r30387, %r18888; + xor.b32 %r18556, %r30388, %r18889; + xor.b32 %r18587, %r30385, %r18888; + xor.b32 %r18588, %r30386, %r18889; + mov.u32 %r18493, 44; + // begin inline asm + shf.l.wrap.b32 %r18486, %r18492, %r18491, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18490, %r18491, %r18492, %r18493; + // end inline asm + mov.u32 %r18501, 20; + // begin inline asm + shf.l.wrap.b32 %r18494, %r18500, %r18499, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18498, %r18499, %r18500, %r18501; + // end inline asm + mov.u32 %r18509, 61; + // begin inline asm + shf.l.wrap.b32 %r18502, %r18508, %r18507, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18506, %r18507, %r18508, %r18509; + // end inline asm + mov.u32 %r18517, 39; + // begin inline asm + shf.l.wrap.b32 %r18510, %r18516, %r18515, %r18517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18514, %r18515, %r18516, %r18517; + // end inline asm + mov.u32 %r18525, 18; + // begin inline asm + shf.l.wrap.b32 %r18518, %r18524, %r18523, %r18525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18522, %r18523, %r18524, %r18525; + // end inline asm + mov.u32 %r18533, 62; + // begin inline asm + shf.l.wrap.b32 %r18526, %r18532, %r18531, %r18533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18530, %r18531, %r18532, %r18533; + // end inline asm + mov.u32 %r18541, 43; + // begin inline asm + shf.l.wrap.b32 %r18534, %r18540, %r18539, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18538, %r18539, %r18540, %r18541; + // end inline asm + mov.u32 %r18549, 25; + // begin inline asm + shf.l.wrap.b32 %r18542, %r18548, %r18547, %r18549; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18546, %r18547, %r18548, %r18549; + // end inline asm + mov.u32 %r18557, 8; + // begin inline asm + shf.l.wrap.b32 %r18550, %r18556, %r18555, %r18557; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r18554, %r18555, %r18556, %r18557; + // end inline asm + mov.u32 %r18565, 56; + // begin inline asm + shf.l.wrap.b32 %r18558, %r18564, %r18563, %r18565; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18562, %r18563, %r18564, %r18565; + // end inline asm + mov.u32 %r18573, 41; + // begin inline asm + shf.l.wrap.b32 %r18566, %r18572, %r18571, %r18573; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18570, %r18571, %r18572, %r18573; + // end inline asm + mov.u32 %r18581, 27; + // begin inline asm + shf.l.wrap.b32 %r18574, %r18580, %r18579, %r18581; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18578, %r18579, %r18580, %r18581; + // end inline asm + mov.u32 %r18589, 14; + // begin inline asm + shf.l.wrap.b32 %r18582, %r18588, %r18587, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18586, %r18587, %r18588, %r18589; + // end inline asm + mov.u32 %r18597, 2; + // begin inline asm + shf.l.wrap.b32 %r18590, %r18596, %r18595, %r18597; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18594, %r18595, %r18596, %r18597; + // end inline asm + mov.u32 %r18605, 55; + // begin inline asm + shf.l.wrap.b32 %r18598, %r18604, %r18603, %r18605; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18602, %r18603, %r18604, %r18605; + // end inline asm + mov.u32 %r18613, 45; + // begin inline asm + shf.l.wrap.b32 %r18606, %r18612, %r18611, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18610, %r18611, %r18612, %r18613; + // end inline asm + mov.u32 %r18621, 36; + // begin inline asm + shf.l.wrap.b32 %r18614, %r18620, %r18619, %r18621; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18618, %r18619, %r18620, %r18621; + // end inline asm + mov.u32 %r18629, 28; + // begin inline asm + shf.l.wrap.b32 %r18622, %r18628, %r18627, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18626, %r18627, %r18628, %r18629; + // end inline asm + mov.u32 %r18637, 21; + // begin inline asm + shf.l.wrap.b32 %r18630, %r18636, %r18635, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18634, %r18635, %r18636, %r18637; + // end inline asm + mov.u32 %r18645, 15; + // begin inline asm + shf.l.wrap.b32 %r18638, %r18644, %r18643, %r18645; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18642, %r18643, %r18644, %r18645; + // end inline asm + mov.u32 %r18653, 10; + // begin inline asm + shf.l.wrap.b32 %r18646, %r18652, %r18651, %r18653; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18650, %r18651, %r18652, %r18653; + // end inline asm + mov.u32 %r18661, 6; + // begin inline asm + shf.l.wrap.b32 %r18654, %r18660, %r18659, %r18661; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18658, %r18659, %r18660, %r18661; + // end inline asm + mov.u32 %r18669, 3; + // begin inline asm + shf.l.wrap.b32 %r18662, %r18668, %r18667, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18666, %r18667, %r18668, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18670, %r18676, %r18675, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18674, %r18675, %r18676, %r18359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18678, %r18713, %r18486, %r18534, 0xD2; + lop3.b32 %r18679, %r18716, %r18490, %r18538, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r18486, %r18534, %r18630, 0xD2; + lop3.b32 %r30434, %r18490, 
%r18538, %r18634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30429, %r18534, %r18630, %r18582, 0xD2; + lop3.b32 %r30430, %r18538, %r18634, %r18586, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30425, %r18630, %r18582, %r18713, 0xD2; + lop3.b32 %r30426, %r18634, %r18586, %r18716, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30423, %r18582, %r18713, %r18486, 0xD2; + lop3.b32 %r30424, %r18586, %r18716, %r18490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30419, %r18622, %r18494, %r18662, 0xD2; + lop3.b32 %r30420, %r18626, %r18498, %r18666, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30431, %r18494, %r18662, %r18606, 0xD2; + lop3.b32 %r30432, %r18498, %r18666, %r18610, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30427, %r18662, %r18606, %r18502, 0xD2; + lop3.b32 %r30428, %r18666, %r18610, %r18506, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30399, %r18606, %r18502, %r18622, 0xD2; + lop3.b32 %r30400, %r18610, %r18506, %r18626, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30399, %r30400}; + // begin inline asm + // chi + lop3.b32 %r30391, %r18502, %r18622, %r18494, 0xD2; + lop3.b32 %r30392, %r18506, %r18626, %r18498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30391, %r30392}; + // begin inline asm + // chi + lop3.b32 %r30417, %r18670, %r18654, %r18542, 0xD2; + lop3.b32 %r30418, %r18674, %r18658, %r18546, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30417, %r30418}; + // begin inline asm + // chi + lop3.b32 %r30411, %r18654, %r18542, %r18550, 0xD2; + lop3.b32 %r30412, %r18658, %r18546, %r18554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30411, %r30412}; + // begin inline asm + // chi + lop3.b32 %r30405, %r18542, %r18550, %r18518, 0xD2; + lop3.b32 %r30406, %r18546, %r18554, %r18522, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30405, %r30406}; + // begin inline asm + // chi + lop3.b32 %r30397, %r18550, %r18518, %r18670, 0xD2; + lop3.b32 %r30398, %r18554, %r18522, %r18674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30397, %r30398}; + // begin inline asm + // chi + lop3.b32 %r30389, %r18518, %r18670, %r18654, 0xD2; + lop3.b32 %r30390, %r18522, %r18674, %r18658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30389, %r30390}; + // begin inline asm + // chi + lop3.b32 %r30415, %r18574, %r18614, %r18646, 0xD2; + lop3.b32 %r30416, %r18578, %r18618, %r18650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30415, %r30416}; + // begin inline asm + // chi + lop3.b32 %r30409, %r18614, %r18646, %r18638, 0xD2; + lop3.b32 %r30410, %r18618, %r18650, %r18642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30409, %r30410}; + // begin inline asm + // chi + lop3.b32 %r30403, %r18646, %r18638, %r18558, 0xD2; + lop3.b32 %r30404, %r18650, %r18642, %r18562, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30403, %r30404}; + // begin inline asm + // chi + lop3.b32 %r30395, %r18638, %r18558, %r18574, 0xD2; + lop3.b32 %r30396, %r18642, %r18562, %r18578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30395, %r30396}; + // begin inline asm + // chi + lop3.b32 %r30387, %r18558, %r18574, %r18614, 0xD2; + lop3.b32 %r30388, %r18562, %r18578, %r18618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30387, %r30388}; + // begin inline asm + // chi + lop3.b32 %r30413, %r18526, %r18598, %r18510, 0xD2; + 
lop3.b32 %r30414, %r18530, %r18602, %r18514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30413, %r30414}; + // begin inline asm + // chi + lop3.b32 %r30407, %r18598, %r18510, %r18566, 0xD2; + lop3.b32 %r30408, %r18602, %r18514, %r18570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30407, %r30408}; + // begin inline asm + // chi + lop3.b32 %r30401, %r18510, %r18566, %r18590, 0xD2; + lop3.b32 %r30402, %r18514, %r18570, %r18594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30401, %r30402}; + // begin inline asm + // chi + lop3.b32 %r30393, %r18566, %r18590, %r18526, 0xD2; + lop3.b32 %r30394, %r18570, %r18594, %r18530, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30393, %r30394}; + // begin inline asm + // chi + lop3.b32 %r30385, %r18590, %r18526, %r18598, 0xD2; + lop3.b32 %r30386, %r18594, %r18530, %r18602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30385, %r30386}; + mul.wide.s32 %rd874, %r30435, 8; + add.s64 %rd873, %rd806, %rd874; + // begin inline asm + ld.global.nc.v2.u32 {%r18878,%r18879}, [%rd873]; + // end inline asm + xor.b32 %r30421, %r18678, %r18878; + xor.b32 %r30422, %r18679, %r18879; + add.s32 %r30435, %r30435, 1; + setp.lt.u32 %p38, %r30435, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + // begin inline asm + // xor5 + lop3.b32 %r18890, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18890, %r18890, %r30415, %r30413, 0x96; + lop3.b32 %r18891, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18891, %r18891, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18902, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18902, %r18902, %r30409, %r30407, 0x96; + lop3.b32 %r18903, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18903, %r18903, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18914, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18914, %r18914, %r30403, %r30401, 0x96; + lop3.b32 %r18915, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18915, %r18915, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18926, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18926, %r18926, %r30395, %r30393, 0x96; + lop3.b32 %r18927, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18927, %r18927, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18938, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18938, %r18938, %r30387, %r30385, 0x96; + lop3.b32 %r18939, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18939, %r18939, %r30388, %r30386, 0x96; + // end inline asm + mov.u32 %r19142, 1; + // begin inline asm + shf.l.wrap.b32 %r18950, %r18903, %r18902, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18954, %r18902, %r18903, %r19142; + // end inline asm + xor.b32 %r19169, %r18950, %r18938; + xor.b32 %r19170, %r18954, %r18939; + xor.b32 %r19097, %r30421, %r19169; + xor.b32 %r19100, %r30422, %r19170; + xor.b32 %r19060, %r30418, %r19170; + xor.b32 %r19059, %r30417, %r19169; + st.local.v2.u32 [%rd3+104], {%r19059, %r19060}; + // begin inline asm + shf.l.wrap.b32 %r18958, %r18915, %r18914, %r19142; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r18962, %r18914, %r18915, %r19142; + // end inline asm + xor.b32 %r19171, %r18958, %r18890; + xor.b32 %r19172, %r18962, %r18891; + xor.b32 %r18996, %r30431, %r19171; + xor.b32 %r18995, %r30432, %r19172; + xor.b32 %r19035, %r30410, %r19172; + xor.b32 %r19036, %r30409, %r19171; + st.local.v2.u32 [%rd3+152], {%r19036, %r19035}; + // begin inline asm + shf.l.wrap.b32 %r18966, %r18927, %r18926, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18970, %r18926, %r18927, %r19142; + // end inline asm + xor.b32 %r19173, %r18966, %r18902; + xor.b32 %r19174, %r18970, %r18903; + xor.b32 %r19019, %r30406, %r19174; + xor.b32 %r19020, %r30405, %r19173; + st.local.v2.u32 [%rd3+120], {%r19020, %r19019}; + xor.b32 %r19011, %r30402, %r19174; + xor.b32 %r19012, %r30401, %r19173; + st.local.v2.u32 [%rd3+200], {%r19012, %r19011}; + // begin inline asm + shf.l.wrap.b32 %r18974, %r18939, %r18938, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18978, %r18938, %r18939, %r19142; + // end inline asm + xor.b32 %r19175, %r18974, %r18914; + xor.b32 %r19176, %r18978, %r18915; + xor.b32 %r19043, %r30425, %r19175; + xor.b32 %r19044, %r30426, %r19176; + xor.b32 %r19052, %r30396, %r19176; + xor.b32 %r19051, %r30395, %r19175; + st.local.v2.u32 [%rd3+168], {%r19051, %r19052}; + // begin inline asm + shf.l.wrap.b32 %r18982, %r18891, %r18890, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18986, %r18890, %r18891, %r19142; + // end inline asm + xor.b32 %r19177, %r18982, %r18926; + xor.b32 %r19178, %r18986, %r18927; + xor.b32 %r19003, %r30391, %r19177; + xor.b32 %r19004, %r30392, %r19178; + xor.b32 %r19028, %r30386, %r19178; + xor.b32 %r19027, %r30385, %r19177; + st.local.v2.u32 [%rd3+216], {%r19027, %r19028}; + // begin inline asm + shf.l.wrap.b32 %r18990, %r18996, %r18995, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18994, %r18995, %r18996, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18998, %r19004, %r19003, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19002, %r19003, %r19004, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19010, %r19011, %r19012, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19006, %r19012, %r19011, %r18509; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r19006, %r19010}; + // begin inline asm + shf.l.wrap.b32 %r19014, %r19020, %r19019, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19018, %r19019, %r19020, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19022, %r19028, %r19027, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19026, %r19027, %r19028, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19034, %r19035, %r19036, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19030, %r19036, %r19035, %r18613; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r19030, %r19034}; + // begin inline asm + shf.l.wrap.b32 %r19038, %r19044, %r19043, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19042, %r19043, %r19044, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19046, %r19052, %r19051, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19050, %r19051, %r19052, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19054, %r19060, %r19059, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19058, %r19059, 
%r19060, %r18669; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19062, %r19097, %r18990, %r19014, 0xD2; + lop3.b32 %r19063, %r19100, %r18994, %r19018, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19070, %r18990, %r19014, %r19046, 0xD2; + lop3.b32 %r19071, %r18994, %r19018, %r19050, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r19070, %r19071}; + // begin inline asm + // chi + lop3.b32 %r19078, %r19014, %r19046, %r19022, 0xD2; + lop3.b32 %r19079, %r19018, %r19050, %r19026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r19078, %r19079}; + // begin inline asm + // chi + lop3.b32 %r19086, %r19046, %r19022, %r19097, 0xD2; + lop3.b32 %r19087, %r19050, %r19026, %r19100, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r19086, %r19087}; + // begin inline asm + // chi + lop3.b32 %r19094, %r19022, %r19097, %r18990, 0xD2; + lop3.b32 %r19095, %r19026, %r19100, %r18994, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r19094, %r19095}; + // begin inline asm + // chi + lop3.b32 %r19102, %r19038, %r18998, %r19054, 0xD2; + lop3.b32 %r19103, %r19042, %r19002, %r19058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r19102, %r19103}; + // begin inline asm + // chi + lop3.b32 %r19110, %r18998, %r19054, %r19030, 0xD2; + lop3.b32 %r19111, %r19002, %r19058, %r19034, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r19110, %r19111}; + // begin inline asm + // chi + lop3.b32 %r19118, %r19054, %r19030, %r19006, 0xD2; + lop3.b32 %r19119, %r19058, %r19034, %r19010, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r19118, %r19119}; + // begin inline asm + ld.global.nc.v2.u32 {%r19126,%r19127}, [%rd807]; + // end inline asm + xor.b32 %r19179, %r19063, %r19127; + xor.b32 %r19180, %r19062, %r19126; + mov.b64 %rd1265, {%r19180, %r19179}; + mov.b64 %rd1266, {%r19070, %r19071}; + mov.b64 %rd1267, {%r19078, %r19079}; + mov.b64 %rd1268, {%r19094, %r19095}; + mov.u32 %r30436, 0; + st.local.v2.u32 [%rd3+24], {%r19180, %r19179}; + st.local.v2.u32 [%rd178+96], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+104], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+112], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+120], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+128], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+136], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+144], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+152], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+160], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+168], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+176], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+184], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+192], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+200], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+208], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+216], {%r30436, %r30436}; + mov.u32 %r30451, -2147483648; + st.local.v2.u32 [%rd178+88], {%r19142, %r30451}; + mov.u32 %r30437, %r30436; + mov.u32 %r30438, %r30436; + mov.u32 %r30439, %r30436; + mov.u32 %r30440, %r30436; + mov.u32 %r30441, %r30436; + mov.u32 %r30442, %r30436; + mov.u32 %r30443, %r30436; + mov.u32 %r30444, %r30436; + mov.u32 %r30445, %r30436; + mov.u32 %r30446, %r30436; + mov.u32 %r30447, %r30436; + mov.u32 %r30448, %r30436; + mov.u32 %r30449, %r30436; + mov.u32 %r30450, %r19142; + mov.u32 %r30452, %r30436; + mov.u32 %r30453, %r30436; + mov.u32 %r30454, %r30436; + mov.u32 %r30455, %r30436; + mov.u32 %r30456, %r30436; + mov.u32 %r30457, %r30436; + mov.u32 %r30458, %r30436; + mov.u32 %r30459, 
%r30436; + mov.u32 %r30460, %r30436; + mov.u32 %r30461, %r30436; + mov.u32 %r30462, %r30436; + mov.u32 %r30463, %r30436; + mov.u32 %r30464, %r30436; + mov.u32 %r30465, %r30436; + mov.u32 %r30466, %r30436; + mov.u32 %r30467, %r30436; + mov.u32 %r30468, %r30436; + mov.u32 %r30469, %r30436; + mov.u32 %r30486, %r30436; + +$L__BB2_67: + // begin inline asm + // xor5 + lop3.b32 %r19181, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19181, %r19181, %r30466, %r30464, 0x96; + lop3.b32 %r19182, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19182, %r19182, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19193, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19193, %r19193, %r30460, %r30458, 0x96; + lop3.b32 %r19194, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19194, %r19194, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19205, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19205, %r19205, %r30454, %r30452, 0x96; + lop3.b32 %r19206, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19206, %r19206, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19217, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19217, %r19217, %r30446, %r30444, 0x96; + lop3.b32 %r19218, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19218, %r19218, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19229, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19229, %r19229, %r30438, %r30436, 0x96; + lop3.b32 %r19230, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19230, %r19230, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19241, %r19194, %r19193, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19245, %r19193, %r19194, %r19142; + // end inline asm + xor.b32 %r19675, %r19241, %r19229; + xor.b32 %r19676, %r19245, %r19230; + xor.b32 %r19508, %r30472, %r19675; + xor.b32 %r19511, %r30473, %r19676; + xor.b32 %r19415, %r30470, %r19675; + xor.b32 %r19414, %r30471, %r19676; + xor.b32 %r19462, %r30468, %r19675; + xor.b32 %r19463, %r30469, %r19676; + xor.b32 %r19367, %r30466, %r19675; + xor.b32 %r19366, %r30467, %r19676; + xor.b32 %r19318, %r30464, %r19675; + xor.b32 %r19319, %r30465, %r19676; + // begin inline asm + shf.l.wrap.b32 %r19249, %r19206, %r19205, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19253, %r19205, %r19206, %r19142; + // end inline asm + xor.b32 %r19677, %r19249, %r19181; + xor.b32 %r19678, %r19253, %r19182; + xor.b32 %r19470, %r30484, %r19677; + xor.b32 %r19471, %r30485, %r19678; + xor.b32 %r19287, %r30482, %r19677; + xor.b32 %r19286, %r30483, %r19678; + xor.b32 %r19446, %r30462, %r19677; + xor.b32 %r19447, %r30463, %r19678; + xor.b32 %r19407, %r30460, %r19677; + xor.b32 %r19406, %r30461, %r19678; + xor.b32 %r19390, %r30458, %r19677; + xor.b32 %r19391, %r30459, %r19678; + // begin inline asm + shf.l.wrap.b32 %r19257, %r19218, %r19217, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19261, %r19217, %r19218, %r19142; + // end inline asm + xor.b32 %r19679, %r19257, %r19193; + xor.b32 %r19680, %r19261, %r19194; + xor.b32 %r19327, %r30480, %r19679; + xor.b32 %r19326, %r30481, %r19680; + xor.b32 %r19454, %r30478, %r19679; + xor.b32 %r19455, %r30479, %r19680; + xor.b32 %r19335, %r30456, %r19679; + xor.b32 %r19334, %r30457, %r19680; + xor.b32 %r19438, %r30454, %r19679; + xor.b32 %r19439, %r30455, %r19680; + xor.b32 %r19303, %r30452, %r19679; + xor.b32 %r19302, %r30453, %r19680; 
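+ // NOTE (annotation, not compiler output): same round structure as $L__BB2_61/$L__BB2_65 --
+ // lop3 with immediate 0x96 is a 5-input XOR (theta column parity), lop3 with 0xD2 computes
+ // a ^ (~b & c) (chi), and the shf.l.wrap pairs with constants 44, 20, 61, 39, ... apply the
+ // fixed rho rotation offsets to 64-bit lanes held as 32-bit halves.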
+ // begin inline asm + shf.l.wrap.b32 %r19265, %r19230, %r19229, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19269, %r19229, %r19230, %r19142; + // end inline asm + xor.b32 %r19681, %r19265, %r19205; + xor.b32 %r19682, %r19269, %r19206; + xor.b32 %r19422, %r30476, %r19681; + xor.b32 %r19423, %r30477, %r19682; + xor.b32 %r19399, %r30450, %r19681; + xor.b32 %r19398, %r30451, %r19682; + xor.b32 %r19342, %r30448, %r19681; + xor.b32 %r19343, %r30449, %r19682; + xor.b32 %r19430, %r30446, %r19681; + xor.b32 %r19431, %r30447, %r19682; + xor.b32 %r19359, %r30444, %r19681; + xor.b32 %r19358, %r30445, %r19682; + // begin inline asm + shf.l.wrap.b32 %r19273, %r19182, %r19181, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19277, %r19181, %r19182, %r19142; + // end inline asm + xor.b32 %r19683, %r19273, %r19217; + xor.b32 %r19684, %r19277, %r19218; + xor.b32 %r19374, %r30474, %r19683; + xor.b32 %r19375, %r30475, %r19684; + xor.b32 %r19294, %r30442, %r19683; + xor.b32 %r19295, %r30443, %r19684; + xor.b32 %r19311, %r30440, %r19683; + xor.b32 %r19310, %r30441, %r19684; + xor.b32 %r19350, %r30438, %r19683; + xor.b32 %r19351, %r30439, %r19684; + xor.b32 %r19382, %r30436, %r19683; + xor.b32 %r19383, %r30437, %r19684; + mov.u32 %r19288, 44; + // begin inline asm + shf.l.wrap.b32 %r19281, %r19287, %r19286, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19285, %r19286, %r19287, %r19288; + // end inline asm + mov.u32 %r19296, 20; + // begin inline asm + shf.l.wrap.b32 %r19289, %r19295, %r19294, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19293, %r19294, %r19295, %r19296; + // end inline asm + mov.u32 %r19304, 61; + // begin inline asm + shf.l.wrap.b32 %r19297, %r19303, %r19302, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19301, %r19302, %r19303, %r19304; + // end inline asm + mov.u32 %r19312, 39; + // begin inline asm + shf.l.wrap.b32 %r19305, %r19311, %r19310, %r19312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19309, %r19310, %r19311, %r19312; + // end inline asm + mov.u32 %r19320, 18; + // begin inline asm + shf.l.wrap.b32 %r19313, %r19319, %r19318, %r19320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19317, %r19318, %r19319, %r19320; + // end inline asm + mov.u32 %r19328, 62; + // begin inline asm + shf.l.wrap.b32 %r19321, %r19327, %r19326, %r19328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19325, %r19326, %r19327, %r19328; + // end inline asm + mov.u32 %r19336, 43; + // begin inline asm + shf.l.wrap.b32 %r19329, %r19335, %r19334, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19333, %r19334, %r19335, %r19336; + // end inline asm + mov.u32 %r19344, 25; + // begin inline asm + shf.l.wrap.b32 %r19337, %r19343, %r19342, %r19344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19341, %r19342, %r19343, %r19344; + // end inline asm + mov.u32 %r19352, 8; + // begin inline asm + shf.l.wrap.b32 %r19345, %r19351, %r19350, %r19352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19349, %r19350, %r19351, %r19352; + // end inline asm + mov.u32 %r19360, 56; + // begin inline asm + shf.l.wrap.b32 %r19353, %r19359, %r19358, %r19360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19357, %r19358, %r19359, %r19360; + // end inline asm + mov.u32 %r19368, 41; + // begin inline asm + shf.l.wrap.b32 %r19361, %r19367, %r19366, %r19368; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r19365, %r19366, %r19367, %r19368; + // end inline asm + mov.u32 %r19376, 27; + // begin inline asm + shf.l.wrap.b32 %r19369, %r19375, %r19374, %r19376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19373, %r19374, %r19375, %r19376; + // end inline asm + mov.u32 %r19384, 14; + // begin inline asm + shf.l.wrap.b32 %r19377, %r19383, %r19382, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19381, %r19382, %r19383, %r19384; + // end inline asm + mov.u32 %r19392, 2; + // begin inline asm + shf.l.wrap.b32 %r19385, %r19391, %r19390, %r19392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19389, %r19390, %r19391, %r19392; + // end inline asm + mov.u32 %r19400, 55; + // begin inline asm + shf.l.wrap.b32 %r19393, %r19399, %r19398, %r19400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19397, %r19398, %r19399, %r19400; + // end inline asm + mov.u32 %r19408, 45; + // begin inline asm + shf.l.wrap.b32 %r19401, %r19407, %r19406, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19405, %r19406, %r19407, %r19408; + // end inline asm + mov.u32 %r19416, 36; + // begin inline asm + shf.l.wrap.b32 %r19409, %r19415, %r19414, %r19416; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19413, %r19414, %r19415, %r19416; + // end inline asm + mov.u32 %r19424, 28; + // begin inline asm + shf.l.wrap.b32 %r19417, %r19423, %r19422, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19421, %r19422, %r19423, %r19424; + // end inline asm + mov.u32 %r19432, 21; + // begin inline asm + shf.l.wrap.b32 %r19425, %r19431, %r19430, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19429, %r19430, %r19431, %r19432; + // end inline asm + mov.u32 %r19440, 15; + // begin inline asm + shf.l.wrap.b32 %r19433, %r19439, %r19438, %r19440; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19437, %r19438, %r19439, %r19440; + // end inline asm + mov.u32 %r19448, 10; + // begin inline asm + shf.l.wrap.b32 %r19441, %r19447, %r19446, %r19448; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19445, %r19446, %r19447, %r19448; + // end inline asm + mov.u32 %r19456, 6; + // begin inline asm + shf.l.wrap.b32 %r19449, %r19455, %r19454, %r19456; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19453, %r19454, %r19455, %r19456; + // end inline asm + mov.u32 %r19464, 3; + // begin inline asm + shf.l.wrap.b32 %r19457, %r19463, %r19462, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19461, %r19462, %r19463, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19465, %r19471, %r19470, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19469, %r19470, %r19471, %r19142; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19473, %r19508, %r19281, %r19329, 0xD2; + lop3.b32 %r19474, %r19511, %r19285, %r19333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r19281, %r19329, %r19425, 0xD2; + lop3.b32 %r30485, %r19285, %r19333, %r19429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30480, %r19329, %r19425, %r19377, 0xD2; + lop3.b32 %r30481, %r19333, %r19429, %r19381, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30476, %r19425, %r19377, %r19508, 0xD2; + lop3.b32 %r30477, %r19429, %r19381, %r19511, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30474, %r19377, %r19508, %r19281, 0xD2; + lop3.b32 %r30475, %r19381, %r19511, 
%r19285, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30470, %r19417, %r19289, %r19457, 0xD2; + lop3.b32 %r30471, %r19421, %r19293, %r19461, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30482, %r19289, %r19457, %r19401, 0xD2; + lop3.b32 %r30483, %r19293, %r19461, %r19405, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30478, %r19457, %r19401, %r19297, 0xD2; + lop3.b32 %r30479, %r19461, %r19405, %r19301, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30450, %r19401, %r19297, %r19417, 0xD2; + lop3.b32 %r30451, %r19405, %r19301, %r19421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30450, %r30451}; + // begin inline asm + // chi + lop3.b32 %r30442, %r19297, %r19417, %r19289, 0xD2; + lop3.b32 %r30443, %r19301, %r19421, %r19293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30442, %r30443}; + // begin inline asm + // chi + lop3.b32 %r30468, %r19465, %r19449, %r19337, 0xD2; + lop3.b32 %r30469, %r19469, %r19453, %r19341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30468, %r30469}; + // begin inline asm + // chi + lop3.b32 %r30462, %r19449, %r19337, %r19345, 0xD2; + lop3.b32 %r30463, %r19453, %r19341, %r19349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30462, %r30463}; + // begin inline asm + // chi + lop3.b32 %r30456, %r19337, %r19345, %r19313, 0xD2; + lop3.b32 %r30457, %r19341, %r19349, %r19317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30456, %r30457}; + // begin inline asm + // chi + lop3.b32 %r30448, %r19345, %r19313, %r19465, 0xD2; + lop3.b32 %r30449, %r19349, %r19317, %r19469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30448, %r30449}; + // begin inline asm + // chi + lop3.b32 %r30440, %r19313, %r19465, %r19449, 0xD2; + lop3.b32 %r30441, %r19317, %r19469, %r19453, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30440, %r30441}; + // begin inline asm + // chi + lop3.b32 %r30466, %r19369, %r19409, %r19441, 0xD2; + lop3.b32 %r30467, %r19373, %r19413, %r19445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30466, %r30467}; + // begin inline asm + // chi + lop3.b32 %r30460, %r19409, %r19441, %r19433, 0xD2; + lop3.b32 %r30461, %r19413, %r19445, %r19437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30460, %r30461}; + // begin inline asm + // chi + lop3.b32 %r30454, %r19441, %r19433, %r19353, 0xD2; + lop3.b32 %r30455, %r19445, %r19437, %r19357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30454, %r30455}; + // begin inline asm + // chi + lop3.b32 %r30446, %r19433, %r19353, %r19369, 0xD2; + lop3.b32 %r30447, %r19437, %r19357, %r19373, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30446, %r30447}; + // begin inline asm + // chi + lop3.b32 %r30438, %r19353, %r19369, %r19409, 0xD2; + lop3.b32 %r30439, %r19357, %r19373, %r19413, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30438, %r30439}; + // begin inline asm + // chi + lop3.b32 %r30464, %r19321, %r19393, %r19305, 0xD2; + lop3.b32 %r30465, %r19325, %r19397, %r19309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30464, %r30465}; + // begin inline asm + // chi + lop3.b32 %r30458, %r19393, %r19305, %r19361, 0xD2; + lop3.b32 %r30459, %r19397, %r19309, %r19365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30458, %r30459}; + // begin inline asm + // chi + lop3.b32 %r30452, %r19305, %r19361, %r19385, 0xD2; + lop3.b32 %r30453, %r19309, %r19365, 
%r19389, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30452, %r30453}; + // begin inline asm + // chi + lop3.b32 %r30444, %r19361, %r19385, %r19321, 0xD2; + lop3.b32 %r30445, %r19365, %r19389, %r19325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30444, %r30445}; + // begin inline asm + // chi + lop3.b32 %r30436, %r19385, %r19321, %r19393, 0xD2; + lop3.b32 %r30437, %r19389, %r19325, %r19397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30436, %r30437}; + mul.wide.s32 %rd881, %r30486, 8; + add.s64 %rd880, %rd806, %rd881; + // begin inline asm + ld.global.nc.v2.u32 {%r19673,%r19674}, [%rd880]; + // end inline asm + xor.b32 %r30472, %r19473, %r19673; + xor.b32 %r30473, %r19474, %r19674; + add.s32 %r30486, %r30486, 1; + setp.lt.u32 %p39, %r30486, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r19784, 1; + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + // begin inline asm + // xor5 + lop3.b32 %r19685, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19685, %r19685, %r30466, %r30464, 0x96; + lop3.b32 %r19686, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19686, %r19686, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19697, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19697, %r19697, %r30460, %r30458, 0x96; + lop3.b32 %r19698, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19698, %r19698, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19709, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19709, %r19709, %r30454, %r30452, 0x96; + lop3.b32 %r19710, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19710, %r19710, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19721, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19721, %r19721, %r30446, %r30444, 0x96; + lop3.b32 %r19722, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19722, %r19722, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19733, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19733, %r19733, %r30438, %r30436, 0x96; + lop3.b32 %r19734, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19734, %r19734, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19745, %r19698, %r19697, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19749, %r19697, %r19698, %r19784; + // end inline asm + xor.b32 %r19923, %r19745, %r19733; + xor.b32 %r19924, %r19749, %r19734; + xor.b32 %r19892, %r30472, %r19923; + xor.b32 %r19895, %r30473, %r19924; + xor.b32 %r19855, %r30469, %r19924; + xor.b32 %r19854, %r30468, %r19923; + st.local.v2.u32 [%rd178+104], {%r19854, %r19855}; + // begin inline asm + shf.l.wrap.b32 %r19753, %r19710, %r19709, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19757, %r19709, %r19710, %r19784; + // end inline asm + xor.b32 %r19925, %r19753, %r19685; + xor.b32 %r19926, %r19757, %r19686; + xor.b32 %r19791, %r30482, %r19925; + xor.b32 %r19790, %r30483, %r19926; + xor.b32 %r19830, %r30461, %r19926; + xor.b32 %r19831, %r30460, %r19925; + st.local.v2.u32 [%rd178+152], {%r19831, %r19830}; + // begin inline asm + shf.l.wrap.b32 %r19761, %r19722, %r19721, %r19784; + // end inline asm + 
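+ // NOTE: rounds 0..22 of Keccak-f ran in the $L__BB2_67 loop above, each
+ // ending with iota (xor against the per-round constant fetched via
+ // ld.global.nc); the code from here appears to be round 23 unrolled,
+ // redoing theta, the rho/pi rotations and chi inline before the final
+ // round constant is applied.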
// begin inline asm + shf.l.wrap.b32 %r19765, %r19721, %r19722, %r19784; + // end inline asm + xor.b32 %r19927, %r19761, %r19697; + xor.b32 %r19928, %r19765, %r19698; + xor.b32 %r19814, %r30457, %r19928; + xor.b32 %r19815, %r30456, %r19927; + st.local.v2.u32 [%rd178+120], {%r19815, %r19814}; + xor.b32 %r19806, %r30453, %r19928; + xor.b32 %r19807, %r30452, %r19927; + st.local.v2.u32 [%rd178+200], {%r19807, %r19806}; + // begin inline asm + shf.l.wrap.b32 %r19769, %r19734, %r19733, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19773, %r19733, %r19734, %r19784; + // end inline asm + xor.b32 %r19929, %r19769, %r19709; + xor.b32 %r19930, %r19773, %r19710; + xor.b32 %r19838, %r30476, %r19929; + xor.b32 %r19839, %r30477, %r19930; + xor.b32 %r19847, %r30447, %r19930; + xor.b32 %r19846, %r30446, %r19929; + st.local.v2.u32 [%rd178+168], {%r19846, %r19847}; + // begin inline asm + shf.l.wrap.b32 %r19777, %r19686, %r19685, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19781, %r19685, %r19686, %r19784; + // end inline asm + xor.b32 %r19931, %r19777, %r19721; + xor.b32 %r19932, %r19781, %r19722; + xor.b32 %r19798, %r30442, %r19931; + xor.b32 %r19799, %r30443, %r19932; + xor.b32 %r19823, %r30437, %r19932; + xor.b32 %r19822, %r30436, %r19931; + st.local.v2.u32 [%rd178+216], {%r19822, %r19823}; + // begin inline asm + shf.l.wrap.b32 %r19785, %r19791, %r19790, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19789, %r19790, %r19791, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19793, %r19799, %r19798, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19797, %r19798, %r19799, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19805, %r19806, %r19807, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19801, %r19807, %r19806, %r19304; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r19801, %r19805}; + // begin inline asm + shf.l.wrap.b32 %r19809, %r19815, %r19814, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19813, %r19814, %r19815, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19817, %r19823, %r19822, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19821, %r19822, %r19823, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19829, %r19830, %r19831, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19825, %r19831, %r19830, %r19408; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r19825, %r19829}; + // begin inline asm + shf.l.wrap.b32 %r19833, %r19839, %r19838, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19837, %r19838, %r19839, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19841, %r19847, %r19846, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19845, %r19846, %r19847, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19849, %r19855, %r19854, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19853, %r19854, %r19855, %r19464; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19857, %r19892, %r19785, %r19809, 0xD2; + lop3.b32 %r19858, %r19895, %r19789, %r19813, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19865, %r19785, %r19809, %r19841, 0xD2; + lop3.b32 %r19866, %r19789, %r19813, %r19845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r19865, %r19866}; + // begin inline asm + // chi 
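+ // NOTE: chi step; lop3 with immediate 0xD2 computes a ^ (~b & c) in a
+ // single instruction on each 32-bit half of the three input lanes.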
+ lop3.b32 %r19873, %r19809, %r19841, %r19817, 0xD2; + lop3.b32 %r19874, %r19813, %r19845, %r19821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r19873, %r19874}; + // begin inline asm + // chi + lop3.b32 %r19881, %r19841, %r19817, %r19892, 0xD2; + lop3.b32 %r19882, %r19845, %r19821, %r19895, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r19881, %r19882}; + // begin inline asm + // chi + lop3.b32 %r19889, %r19817, %r19892, %r19785, 0xD2; + lop3.b32 %r19890, %r19821, %r19895, %r19789, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r19889, %r19890}; + // begin inline asm + // chi + lop3.b32 %r19897, %r19833, %r19793, %r19849, 0xD2; + lop3.b32 %r19898, %r19837, %r19797, %r19853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r19897, %r19898}; + // begin inline asm + // chi + lop3.b32 %r19905, %r19793, %r19849, %r19825, 0xD2; + lop3.b32 %r19906, %r19797, %r19853, %r19829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r19905, %r19906}; + // begin inline asm + // chi + lop3.b32 %r19913, %r19849, %r19825, %r19801, 0xD2; + lop3.b32 %r19914, %r19853, %r19829, %r19805, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r19913, %r19914}; + // begin inline asm + ld.global.nc.v2.u32 {%r19921,%r19922}, [%rd807]; + // end inline asm + xor.b32 %r19933, %r19858, %r19922; + xor.b32 %r19934, %r19857, %r19921; + st.local.v2.u32 [%rd178+24], {%r19934, %r19933}; + bra.uni $L__BB2_69; + +$L__BB2_47: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd688, 1179641; + st.local.u64 [%rd3+8], %rd688; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd689, [%rd128]; + ld.global.u64 %rd690, [%rd128+8]; + ld.global.u64 %rd691, [%rd128+16]; + ld.global.u64 %rd692, [%rd128+24]; + ld.global.u64 %rd693, [%rd128+32]; + ld.global.u64 %rd694, [%rd128+40]; + ld.global.u64 %rd695, [%rd128+48]; + ld.global.u64 %rd696, [%rd128+56]; + st.local.u64 [%rd3+24], %rd689; + st.local.u64 [%rd3+32], %rd690; + st.local.u64 [%rd3+40], %rd691; + st.local.u64 [%rd3+48], %rd692; + st.local.u64 [%rd3+56], %rd693; + st.local.u64 [%rd3+64], %rd694; + st.local.u64 [%rd3+72], %rd695; + st.local.u64 [%rd3+80], %rd696; + cvt.u32.u64 %r13408, %rd689; + xor.b32 %r13409, %r1678, %r13408; + st.local.u32 [%rd3+24], %r13409; + mov.u32 %r30013, 0; + st.local.v2.u32 [%rd3+96], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+104], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+112], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+120], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+128], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+136], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+144], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+152], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+160], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+168], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+176], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+184], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+192], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+200], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+208], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+216], {%r30013, %r30013}; + mov.u32 %r30028, -2147483648; + mov.u32 %r13381, 1; + st.local.v2.u32 [%rd3+88], {%r13381, %r30028}; + ld.local.v2.u32 {%r30049, %r30050}, [%rd3+24]; + mov.b64 {%r30047, %r30048}, %rd694; + shr.u64 %rd697, %rd690, 32; + cvt.u32.u64 %r30061, %rd690; + cvt.u32.u64 %r30062, %rd697; + shr.u64 %rd698, %rd695, 32; + cvt.u32.u64 %r30059, %rd695; + cvt.u32.u64 %r30060, %rd698; + shr.u64 %rd699, %rd691, 32; + cvt.u32.u64 %r30057, %rd691; + cvt.u32.u64 %r30058, %rd699; + 
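+ // NOTE: $L__BB2_47 appears to seed a fresh sponge state in local memory:
+ // 64 bytes of input go into lanes 0..7 (offsets +24..+80), the low word of
+ // lane 0 is xored with the item index %r1678, words +96..+216 are zeroed,
+ // the padding lane {1, 0x80000000} lands at +88, and each 64-bit lane is
+ // split into 32-bit halves (continued below) for the funnel-shift rotates.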
shr.u64 %rd700, %rd696, 32; + cvt.u32.u64 %r30055, %rd696; + cvt.u32.u64 %r30056, %rd700; + shr.u64 %rd701, %rd692, 32; + cvt.u32.u64 %r30053, %rd692; + cvt.u32.u64 %r30054, %rd701; + shr.u64 %rd702, %rd693, 32; + cvt.u32.u64 %r30051, %rd693; + cvt.u32.u64 %r30052, %rd702; + mov.u32 %r30014, %r30013; + mov.u32 %r30015, %r30013; + mov.u32 %r30016, %r30013; + mov.u32 %r30017, %r30013; + mov.u32 %r30018, %r30013; + mov.u32 %r30019, %r30013; + mov.u32 %r30020, %r30013; + mov.u32 %r30021, %r30013; + mov.u32 %r30022, %r30013; + mov.u32 %r30023, %r30013; + mov.u32 %r30024, %r30013; + mov.u32 %r30025, %r30013; + mov.u32 %r30026, %r30013; + mov.u32 %r30027, %r13381; + mov.u32 %r30029, %r30013; + mov.u32 %r30030, %r30013; + mov.u32 %r30031, %r30013; + mov.u32 %r30032, %r30013; + mov.u32 %r30033, %r30013; + mov.u32 %r30034, %r30013; + mov.u32 %r30035, %r30013; + mov.u32 %r30036, %r30013; + mov.u32 %r30037, %r30013; + mov.u32 %r30038, %r30013; + mov.u32 %r30039, %r30013; + mov.u32 %r30040, %r30013; + mov.u32 %r30041, %r30013; + mov.u32 %r30042, %r30013; + mov.u32 %r30043, %r30013; + mov.u32 %r30044, %r30013; + mov.u32 %r30045, %r30013; + mov.u32 %r30046, %r30013; + mov.u32 %r30063, %r30013; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r13412, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13412, %r13412, %r30043, %r30041, 0x96; + lop3.b32 %r13413, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13413, %r13413, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13424, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13424, %r13424, %r30037, %r30035, 0x96; + lop3.b32 %r13425, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13425, %r13425, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13436, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13436, %r13436, %r30031, %r30029, 0x96; + lop3.b32 %r13437, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13437, %r13437, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13448, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13448, %r13448, %r30023, %r30021, 0x96; + lop3.b32 %r13449, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13449, %r13449, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13460, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13460, %r13460, %r30015, %r30013, 0x96; + lop3.b32 %r13461, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13461, %r13461, %r30016, %r30014, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13472, %r13425, %r13424, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13476, %r13424, %r13425, %r13381; + // end inline asm + xor.b32 %r13906, %r13472, %r13460; + xor.b32 %r13907, %r13476, %r13461; + xor.b32 %r13739, %r30049, %r13906; + xor.b32 %r13742, %r30050, %r13907; + xor.b32 %r13646, %r30047, %r13906; + xor.b32 %r13645, %r30048, %r13907; + xor.b32 %r13693, %r30045, %r13906; + xor.b32 %r13694, %r30046, %r13907; + xor.b32 %r13598, %r30043, %r13906; + xor.b32 %r13597, %r30044, %r13907; + xor.b32 %r13549, %r30041, %r13906; + xor.b32 %r13550, %r30042, %r13907; + // begin inline asm + shf.l.wrap.b32 %r13480, %r13437, %r13436, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13484, %r13436, %r13437, %r13381; + // end inline asm + xor.b32 %r13908, %r13480, %r13412; + xor.b32 %r13909, %r13484, %r13413; + xor.b32 %r13701, %r30061, %r13908; + xor.b32 %r13702, %r30062, %r13909; + xor.b32 %r13518, %r30059, %r13908; + xor.b32 
%r13517, %r30060, %r13909; + xor.b32 %r13677, %r30039, %r13908; + xor.b32 %r13678, %r30040, %r13909; + xor.b32 %r13638, %r30037, %r13908; + xor.b32 %r13637, %r30038, %r13909; + xor.b32 %r13621, %r30035, %r13908; + xor.b32 %r13622, %r30036, %r13909; + // begin inline asm + shf.l.wrap.b32 %r13488, %r13449, %r13448, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13492, %r13448, %r13449, %r13381; + // end inline asm + xor.b32 %r13910, %r13488, %r13424; + xor.b32 %r13911, %r13492, %r13425; + xor.b32 %r13558, %r30057, %r13910; + xor.b32 %r13557, %r30058, %r13911; + xor.b32 %r13685, %r30055, %r13910; + xor.b32 %r13686, %r30056, %r13911; + xor.b32 %r13566, %r30033, %r13910; + xor.b32 %r13565, %r30034, %r13911; + xor.b32 %r13669, %r30031, %r13910; + xor.b32 %r13670, %r30032, %r13911; + xor.b32 %r13534, %r30029, %r13910; + xor.b32 %r13533, %r30030, %r13911; + // begin inline asm + shf.l.wrap.b32 %r13496, %r13461, %r13460, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13500, %r13460, %r13461, %r13381; + // end inline asm + xor.b32 %r13912, %r13496, %r13436; + xor.b32 %r13913, %r13500, %r13437; + xor.b32 %r13653, %r30053, %r13912; + xor.b32 %r13654, %r30054, %r13913; + xor.b32 %r13630, %r30027, %r13912; + xor.b32 %r13629, %r30028, %r13913; + xor.b32 %r13573, %r30025, %r13912; + xor.b32 %r13574, %r30026, %r13913; + xor.b32 %r13661, %r30023, %r13912; + xor.b32 %r13662, %r30024, %r13913; + xor.b32 %r13590, %r30021, %r13912; + xor.b32 %r13589, %r30022, %r13913; + // begin inline asm + shf.l.wrap.b32 %r13504, %r13413, %r13412, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13508, %r13412, %r13413, %r13381; + // end inline asm + xor.b32 %r13914, %r13504, %r13448; + xor.b32 %r13915, %r13508, %r13449; + xor.b32 %r13605, %r30051, %r13914; + xor.b32 %r13606, %r30052, %r13915; + xor.b32 %r13525, %r30019, %r13914; + xor.b32 %r13526, %r30020, %r13915; + xor.b32 %r13542, %r30017, %r13914; + xor.b32 %r13541, %r30018, %r13915; + xor.b32 %r13581, %r30015, %r13914; + xor.b32 %r13582, %r30016, %r13915; + xor.b32 %r13613, %r30013, %r13914; + xor.b32 %r13614, %r30014, %r13915; + mov.u32 %r13519, 44; + // begin inline asm + shf.l.wrap.b32 %r13512, %r13518, %r13517, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13516, %r13517, %r13518, %r13519; + // end inline asm + mov.u32 %r13527, 20; + // begin inline asm + shf.l.wrap.b32 %r13520, %r13526, %r13525, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13524, %r13525, %r13526, %r13527; + // end inline asm + mov.u32 %r13535, 61; + // begin inline asm + shf.l.wrap.b32 %r13528, %r13534, %r13533, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13532, %r13533, %r13534, %r13535; + // end inline asm + mov.u32 %r13543, 39; + // begin inline asm + shf.l.wrap.b32 %r13536, %r13542, %r13541, %r13543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13540, %r13541, %r13542, %r13543; + // end inline asm + mov.u32 %r13551, 18; + // begin inline asm + shf.l.wrap.b32 %r13544, %r13550, %r13549, %r13551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13548, %r13549, %r13550, %r13551; + // end inline asm + mov.u32 %r13559, 62; + // begin inline asm + shf.l.wrap.b32 %r13552, %r13558, %r13557, %r13559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13556, %r13557, %r13558, %r13559; + // end inline asm + mov.u32 %r13567, 43; + // begin inline asm + shf.l.wrap.b32 %r13560, %r13566, %r13565, %r13567; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r13564, %r13565, %r13566, %r13567; + // end inline asm + mov.u32 %r13575, 25; + // begin inline asm + shf.l.wrap.b32 %r13568, %r13574, %r13573, %r13575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13572, %r13573, %r13574, %r13575; + // end inline asm + mov.u32 %r13583, 8; + // begin inline asm + shf.l.wrap.b32 %r13576, %r13582, %r13581, %r13583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13580, %r13581, %r13582, %r13583; + // end inline asm + mov.u32 %r13591, 56; + // begin inline asm + shf.l.wrap.b32 %r13584, %r13590, %r13589, %r13591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13588, %r13589, %r13590, %r13591; + // end inline asm + mov.u32 %r13599, 41; + // begin inline asm + shf.l.wrap.b32 %r13592, %r13598, %r13597, %r13599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13596, %r13597, %r13598, %r13599; + // end inline asm + mov.u32 %r13607, 27; + // begin inline asm + shf.l.wrap.b32 %r13600, %r13606, %r13605, %r13607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13604, %r13605, %r13606, %r13607; + // end inline asm + mov.u32 %r13615, 14; + // begin inline asm + shf.l.wrap.b32 %r13608, %r13614, %r13613, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13612, %r13613, %r13614, %r13615; + // end inline asm + mov.u32 %r13623, 2; + // begin inline asm + shf.l.wrap.b32 %r13616, %r13622, %r13621, %r13623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13620, %r13621, %r13622, %r13623; + // end inline asm + mov.u32 %r13631, 55; + // begin inline asm + shf.l.wrap.b32 %r13624, %r13630, %r13629, %r13631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13628, %r13629, %r13630, %r13631; + // end inline asm + mov.u32 %r13639, 45; + // begin inline asm + shf.l.wrap.b32 %r13632, %r13638, %r13637, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13636, %r13637, %r13638, %r13639; + // end inline asm + mov.u32 %r13647, 36; + // begin inline asm + shf.l.wrap.b32 %r13640, %r13646, %r13645, %r13647; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13644, %r13645, %r13646, %r13647; + // end inline asm + mov.u32 %r13655, 28; + // begin inline asm + shf.l.wrap.b32 %r13648, %r13654, %r13653, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13652, %r13653, %r13654, %r13655; + // end inline asm + mov.u32 %r13663, 21; + // begin inline asm + shf.l.wrap.b32 %r13656, %r13662, %r13661, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13660, %r13661, %r13662, %r13663; + // end inline asm + mov.u32 %r13671, 15; + // begin inline asm + shf.l.wrap.b32 %r13664, %r13670, %r13669, %r13671; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13668, %r13669, %r13670, %r13671; + // end inline asm + mov.u32 %r13679, 10; + // begin inline asm + shf.l.wrap.b32 %r13672, %r13678, %r13677, %r13679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13676, %r13677, %r13678, %r13679; + // end inline asm + mov.u32 %r13687, 6; + // begin inline asm + shf.l.wrap.b32 %r13680, %r13686, %r13685, %r13687; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13684, %r13685, %r13686, %r13687; + // end inline asm + mov.u32 %r13695, 3; + // begin inline asm + shf.l.wrap.b32 %r13688, %r13694, %r13693, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13692, %r13693, %r13694, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13696, %r13702, 
%r13701, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13700, %r13701, %r13702, %r13381; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13704, %r13739, %r13512, %r13560, 0xD2; + lop3.b32 %r13705, %r13742, %r13516, %r13564, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30061, %r13512, %r13560, %r13656, 0xD2; + lop3.b32 %r30062, %r13516, %r13564, %r13660, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30057, %r13560, %r13656, %r13608, 0xD2; + lop3.b32 %r30058, %r13564, %r13660, %r13612, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30053, %r13656, %r13608, %r13739, 0xD2; + lop3.b32 %r30054, %r13660, %r13612, %r13742, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30051, %r13608, %r13739, %r13512, 0xD2; + lop3.b32 %r30052, %r13612, %r13742, %r13516, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30047, %r13648, %r13520, %r13688, 0xD2; + lop3.b32 %r30048, %r13652, %r13524, %r13692, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30059, %r13520, %r13688, %r13632, 0xD2; + lop3.b32 %r30060, %r13524, %r13692, %r13636, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30055, %r13688, %r13632, %r13528, 0xD2; + lop3.b32 %r30056, %r13692, %r13636, %r13532, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30027, %r13632, %r13528, %r13648, 0xD2; + lop3.b32 %r30028, %r13636, %r13532, %r13652, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30027, %r30028}; + // begin inline asm + // chi + lop3.b32 %r30019, %r13528, %r13648, %r13520, 0xD2; + lop3.b32 %r30020, %r13532, %r13652, %r13524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30019, %r30020}; + // begin inline asm + // chi + lop3.b32 %r30045, %r13696, %r13680, %r13568, 0xD2; + lop3.b32 %r30046, %r13700, %r13684, %r13572, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30045, %r30046}; + // begin inline asm + // chi + lop3.b32 %r30039, %r13680, %r13568, %r13576, 0xD2; + lop3.b32 %r30040, %r13684, %r13572, %r13580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30039, %r30040}; + // begin inline asm + // chi + lop3.b32 %r30033, %r13568, %r13576, %r13544, 0xD2; + lop3.b32 %r30034, %r13572, %r13580, %r13548, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30033, %r30034}; + // begin inline asm + // chi + lop3.b32 %r30025, %r13576, %r13544, %r13696, 0xD2; + lop3.b32 %r30026, %r13580, %r13548, %r13700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30025, %r30026}; + // begin inline asm + // chi + lop3.b32 %r30017, %r13544, %r13696, %r13680, 0xD2; + lop3.b32 %r30018, %r13548, %r13700, %r13684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30017, %r30018}; + // begin inline asm + // chi + lop3.b32 %r30043, %r13600, %r13640, %r13672, 0xD2; + lop3.b32 %r30044, %r13604, %r13644, %r13676, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30043, %r30044}; + // begin inline asm + // chi + lop3.b32 %r30037, %r13640, %r13672, %r13664, 0xD2; + lop3.b32 %r30038, %r13644, %r13676, %r13668, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30037, %r30038}; + // begin inline asm + // chi + lop3.b32 %r30031, %r13672, %r13664, %r13584, 0xD2; + lop3.b32 %r30032, %r13676, %r13668, %r13588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30031, %r30032}; + // begin inline asm + // chi + lop3.b32 %r30023, %r13664, %r13584, %r13600, 0xD2; + lop3.b32 
%r30024, %r13668, %r13588, %r13604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30023, %r30024}; + // begin inline asm + // chi + lop3.b32 %r30015, %r13584, %r13600, %r13640, 0xD2; + lop3.b32 %r30016, %r13588, %r13604, %r13644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30015, %r30016}; + // begin inline asm + // chi + lop3.b32 %r30041, %r13552, %r13624, %r13536, 0xD2; + lop3.b32 %r30042, %r13556, %r13628, %r13540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30041, %r30042}; + // begin inline asm + // chi + lop3.b32 %r30035, %r13624, %r13536, %r13592, 0xD2; + lop3.b32 %r30036, %r13628, %r13540, %r13596, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30035, %r30036}; + // begin inline asm + // chi + lop3.b32 %r30029, %r13536, %r13592, %r13616, 0xD2; + lop3.b32 %r30030, %r13540, %r13596, %r13620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30029, %r30030}; + // begin inline asm + // chi + lop3.b32 %r30021, %r13592, %r13616, %r13552, 0xD2; + lop3.b32 %r30022, %r13596, %r13620, %r13556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30021, %r30022}; + // begin inline asm + // chi + lop3.b32 %r30013, %r13616, %r13552, %r13624, 0xD2; + lop3.b32 %r30014, %r13620, %r13556, %r13628, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30013, %r30014}; + mul.wide.s32 %rd704, %r30063, 8; + mov.u64 %rd705, keccak_round_constants; + cvta.const.u64 %rd706, %rd705; + add.s64 %rd703, %rd706, %rd704; + // begin inline asm + ld.global.nc.v2.u32 {%r13904,%r13905}, [%rd703]; + // end inline asm + xor.b32 %r30049, %r13704, %r13904; + xor.b32 %r30050, %r13705, %r13905; + add.s32 %r30063, %r30063, 1; + setp.lt.u32 %p30, %r30063, 23; + @%p30 bra $L__BB2_48; + + add.u64 %rd149, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30061, %r30062}; + st.local.v2.u32 [%rd3+72], {%r30059, %r30060}; + st.local.v2.u32 [%rd3+40], {%r30057, %r30058}; + st.local.v2.u32 [%rd3+80], {%r30055, %r30056}; + st.local.v2.u32 [%rd3+48], {%r30053, %r30054}; + st.local.v2.u32 [%rd3+56], {%r30051, %r30052}; + st.local.v2.u32 [%rd3+24], {%r30049, %r30050}; + // begin inline asm + // xor5 + lop3.b32 %r13916, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13916, %r13916, %r30043, %r30041, 0x96; + lop3.b32 %r13917, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13917, %r13917, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13928, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13928, %r13928, %r30037, %r30035, 0x96; + lop3.b32 %r13929, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13929, %r13929, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13940, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13940, %r13940, %r30031, %r30029, 0x96; + lop3.b32 %r13941, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13941, %r13941, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13952, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13952, %r13952, %r30023, %r30021, 0x96; + lop3.b32 %r13953, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13953, %r13953, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13964, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13964, %r13964, %r30015, %r30013, 0x96; + lop3.b32 %r13965, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13965, %r13965, %r30016, %r30014, 0x96; + // end inline asm + mov.u32 %r14168, 1; + // begin inline asm + shf.l.wrap.b32 %r13976, %r13929, %r13928, %r14168; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r13980, %r13928, %r13929, %r14168; + // end inline asm + xor.b32 %r14195, %r13976, %r13964; + xor.b32 %r14196, %r13980, %r13965; + xor.b32 %r14123, %r30049, %r14195; + xor.b32 %r14126, %r30050, %r14196; + xor.b32 %r14086, %r30046, %r14196; + xor.b32 %r14085, %r30045, %r14195; + st.local.v2.u32 [%rd3+104], {%r14085, %r14086}; + // begin inline asm + shf.l.wrap.b32 %r13984, %r13941, %r13940, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13988, %r13940, %r13941, %r14168; + // end inline asm + xor.b32 %r14197, %r13984, %r13916; + xor.b32 %r14198, %r13988, %r13917; + xor.b32 %r14022, %r30059, %r14197; + xor.b32 %r14021, %r30060, %r14198; + xor.b32 %r14061, %r30038, %r14198; + xor.b32 %r14062, %r30037, %r14197; + st.local.v2.u32 [%rd3+152], {%r14062, %r14061}; + // begin inline asm + shf.l.wrap.b32 %r13992, %r13953, %r13952, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13996, %r13952, %r13953, %r14168; + // end inline asm + xor.b32 %r14199, %r13992, %r13928; + xor.b32 %r14200, %r13996, %r13929; + xor.b32 %r14045, %r30034, %r14200; + xor.b32 %r14046, %r30033, %r14199; + st.local.v2.u32 [%rd3+120], {%r14046, %r14045}; + xor.b32 %r14037, %r30030, %r14200; + xor.b32 %r14038, %r30029, %r14199; + st.local.v2.u32 [%rd3+200], {%r14038, %r14037}; + // begin inline asm + shf.l.wrap.b32 %r14000, %r13965, %r13964, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14004, %r13964, %r13965, %r14168; + // end inline asm + xor.b32 %r14201, %r14000, %r13940; + xor.b32 %r14202, %r14004, %r13941; + xor.b32 %r14069, %r30053, %r14201; + xor.b32 %r14070, %r30054, %r14202; + xor.b32 %r14078, %r30024, %r14202; + xor.b32 %r14077, %r30023, %r14201; + st.local.v2.u32 [%rd3+168], {%r14077, %r14078}; + // begin inline asm + shf.l.wrap.b32 %r14008, %r13917, %r13916, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14012, %r13916, %r13917, %r14168; + // end inline asm + xor.b32 %r14203, %r14008, %r13952; + xor.b32 %r14204, %r14012, %r13953; + xor.b32 %r14029, %r30019, %r14203; + xor.b32 %r14030, %r30020, %r14204; + xor.b32 %r14054, %r30014, %r14204; + xor.b32 %r14053, %r30013, %r14203; + st.local.v2.u32 [%rd3+216], {%r14053, %r14054}; + // begin inline asm + shf.l.wrap.b32 %r14016, %r14022, %r14021, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14020, %r14021, %r14022, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14024, %r14030, %r14029, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14028, %r14029, %r14030, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14036, %r14037, %r14038, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14032, %r14038, %r14037, %r13535; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r14032, %r14036}; + // begin inline asm + shf.l.wrap.b32 %r14040, %r14046, %r14045, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14044, %r14045, %r14046, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14048, %r14054, %r14053, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14052, %r14053, %r14054, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14060, %r14061, %r14062, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14056, %r14062, %r14061, %r13639; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r14056, %r14060}; + // begin inline asm + shf.l.wrap.b32 
%r14064, %r14070, %r14069, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14068, %r14069, %r14070, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14072, %r14078, %r14077, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14076, %r14077, %r14078, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14080, %r14086, %r14085, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14084, %r14085, %r14086, %r13695; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14088, %r14123, %r14016, %r14040, 0xD2; + lop3.b32 %r14089, %r14126, %r14020, %r14044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r14016, %r14040, %r14072, 0xD2; + lop3.b32 %r30197, %r14020, %r14044, %r14076, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30192, %r14040, %r14072, %r14048, 0xD2; + lop3.b32 %r30193, %r14044, %r14076, %r14052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + // begin inline asm + // chi + lop3.b32 %r30188, %r14072, %r14048, %r14123, 0xD2; + lop3.b32 %r30189, %r14076, %r14052, %r14126, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + // begin inline asm + // chi + lop3.b32 %r30186, %r14048, %r14123, %r14016, 0xD2; + lop3.b32 %r30187, %r14052, %r14126, %r14020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + // begin inline asm + // chi + lop3.b32 %r30182, %r14064, %r14024, %r14080, 0xD2; + lop3.b32 %r30183, %r14068, %r14028, %r14084, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + // begin inline asm + // chi + lop3.b32 %r30194, %r14024, %r14080, %r14056, 0xD2; + lop3.b32 %r30195, %r14028, %r14084, %r14060, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30190, %r14080, %r14056, %r14032, 0xD2; + lop3.b32 %r30191, %r14084, %r14060, %r14036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + add.s64 %rd707, %rd706, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14152,%r14153}, [%rd707]; + // end inline asm + xor.b32 %r30184, %r14088, %r14152; + xor.b32 %r30185, %r14089, %r14153; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.u64 [%rd149], %rd354; + mov.u64 %rd711, 1179641; + st.local.u64 [%rd149+8], %rd711; + add.s32 %r1874, %r1678, 1; + st.local.u32 [%rd149+16], %r1874; + ld.global.u64 %rd712, [%rd129]; + ld.global.u64 %rd713, [%rd129+8]; + ld.global.u64 %rd714, [%rd129+16]; + ld.global.u64 %rd715, [%rd129+24]; + ld.global.u64 %rd716, [%rd129+32]; + ld.global.u64 %rd717, [%rd129+40]; + ld.global.u64 %rd718, [%rd129+48]; + ld.global.u64 %rd719, [%rd129+56]; + st.local.u64 [%rd149+32], %rd713; + st.local.u64 [%rd149+40], %rd714; + st.local.u64 [%rd149+48], %rd715; + st.local.u64 [%rd149+56], %rd716; + st.local.u64 [%rd149+64], %rd717; + st.local.u64 [%rd149+72], %rd718; + st.local.u64 [%rd149+80], %rd719; + cvt.u32.u64 %r14205, %rd712; + xor.b32 %r14206, %r1874, %r14205; + st.local.u64 [%rd149+24], %rd712; + st.local.u32 [%rd149+24], %r14206; + mov.u32 %r30064, 0; + st.local.v2.u32 [%rd149+96], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+104], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+112], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+120], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+128], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+136], {%r30064, %r30064}; 
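+ // NOTE: a second, independent sponge state is initialized at %rd149 in the
+ // same pattern, with the item index bumped to %r1678 + 1; its remaining
+ // state words are zeroed below, after which the same 23-iteration round
+ // loop runs at $L__BB2_50 with round 23 again unrolled after the loop.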
+ st.local.v2.u32 [%rd149+144], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+152], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+160], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+168], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+176], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+184], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+192], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+200], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+208], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+216], {%r30064, %r30064}; + mov.u32 %r30079, -2147483648; + st.local.v2.u32 [%rd149+88], {%r14168, %r30079}; + ld.local.v2.u32 {%r30100, %r30101}, [%rd149+24]; + mov.b64 {%r30098, %r30099}, %rd717; + shr.u64 %rd720, %rd713, 32; + cvt.u32.u64 %r30112, %rd713; + cvt.u32.u64 %r30113, %rd720; + shr.u64 %rd721, %rd718, 32; + cvt.u32.u64 %r30110, %rd718; + cvt.u32.u64 %r30111, %rd721; + shr.u64 %rd722, %rd714, 32; + cvt.u32.u64 %r30108, %rd714; + cvt.u32.u64 %r30109, %rd722; + shr.u64 %rd723, %rd719, 32; + cvt.u32.u64 %r30106, %rd719; + cvt.u32.u64 %r30107, %rd723; + shr.u64 %rd724, %rd715, 32; + cvt.u32.u64 %r30104, %rd715; + cvt.u32.u64 %r30105, %rd724; + shr.u64 %rd725, %rd716, 32; + cvt.u32.u64 %r30102, %rd716; + cvt.u32.u64 %r30103, %rd725; + mov.u32 %r30065, %r30064; + mov.u32 %r30066, %r30064; + mov.u32 %r30067, %r30064; + mov.u32 %r30068, %r30064; + mov.u32 %r30069, %r30064; + mov.u32 %r30070, %r30064; + mov.u32 %r30071, %r30064; + mov.u32 %r30072, %r30064; + mov.u32 %r30073, %r30064; + mov.u32 %r30074, %r30064; + mov.u32 %r30075, %r30064; + mov.u32 %r30076, %r30064; + mov.u32 %r30077, %r30064; + mov.u32 %r30078, %r14168; + mov.u32 %r30080, %r30064; + mov.u32 %r30081, %r30064; + mov.u32 %r30082, %r30064; + mov.u32 %r30083, %r30064; + mov.u32 %r30084, %r30064; + mov.u32 %r30085, %r30064; + mov.u32 %r30086, %r30064; + mov.u32 %r30087, %r30064; + mov.u32 %r30088, %r30064; + mov.u32 %r30089, %r30064; + mov.u32 %r30090, %r30064; + mov.u32 %r30091, %r30064; + mov.u32 %r30092, %r30064; + mov.u32 %r30093, %r30064; + mov.u32 %r30094, %r30064; + mov.u32 %r30095, %r30064; + mov.u32 %r30096, %r30064; + mov.u32 %r30097, %r30064; + mov.u32 %r30114, %r30064; + +$L__BB2_50: + // begin inline asm + // xor5 + lop3.b32 %r14209, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14209, %r14209, %r30094, %r30092, 0x96; + lop3.b32 %r14210, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14210, %r14210, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14221, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14221, %r14221, %r30088, %r30086, 0x96; + lop3.b32 %r14222, %r30113, %r30111, %r30091, 0x96; + lop3.b32 %r14222, %r14222, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14233, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14233, %r14233, %r30082, %r30080, 0x96; + lop3.b32 %r14234, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14234, %r14234, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14245, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14245, %r14245, %r30074, %r30072, 0x96; + lop3.b32 %r14246, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14246, %r14246, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14257, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14257, %r14257, %r30066, %r30064, 0x96; + lop3.b32 %r14258, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14258, %r14258, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14269, 
%r14222, %r14221, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14273, %r14221, %r14222, %r14168; + // end inline asm + xor.b32 %r14703, %r14269, %r14257; + xor.b32 %r14704, %r14273, %r14258; + xor.b32 %r14536, %r30100, %r14703; + xor.b32 %r14539, %r30101, %r14704; + xor.b32 %r14443, %r30098, %r14703; + xor.b32 %r14442, %r30099, %r14704; + xor.b32 %r14490, %r30096, %r14703; + xor.b32 %r14491, %r30097, %r14704; + xor.b32 %r14395, %r30094, %r14703; + xor.b32 %r14394, %r30095, %r14704; + xor.b32 %r14346, %r30092, %r14703; + xor.b32 %r14347, %r30093, %r14704; + // begin inline asm + shf.l.wrap.b32 %r14277, %r14234, %r14233, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14281, %r14233, %r14234, %r14168; + // end inline asm + xor.b32 %r14705, %r14277, %r14209; + xor.b32 %r14706, %r14281, %r14210; + xor.b32 %r14498, %r30112, %r14705; + xor.b32 %r14499, %r30113, %r14706; + xor.b32 %r14315, %r30110, %r14705; + xor.b32 %r14314, %r30111, %r14706; + xor.b32 %r14474, %r30090, %r14705; + xor.b32 %r14475, %r30091, %r14706; + xor.b32 %r14435, %r30088, %r14705; + xor.b32 %r14434, %r30089, %r14706; + xor.b32 %r14418, %r30086, %r14705; + xor.b32 %r14419, %r30087, %r14706; + // begin inline asm + shf.l.wrap.b32 %r14285, %r14246, %r14245, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14289, %r14245, %r14246, %r14168; + // end inline asm + xor.b32 %r14707, %r14285, %r14221; + xor.b32 %r14708, %r14289, %r14222; + xor.b32 %r14355, %r30108, %r14707; + xor.b32 %r14354, %r30109, %r14708; + xor.b32 %r14482, %r30106, %r14707; + xor.b32 %r14483, %r30107, %r14708; + xor.b32 %r14363, %r30084, %r14707; + xor.b32 %r14362, %r30085, %r14708; + xor.b32 %r14466, %r30082, %r14707; + xor.b32 %r14467, %r30083, %r14708; + xor.b32 %r14331, %r30080, %r14707; + xor.b32 %r14330, %r30081, %r14708; + // begin inline asm + shf.l.wrap.b32 %r14293, %r14258, %r14257, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14297, %r14257, %r14258, %r14168; + // end inline asm + xor.b32 %r14709, %r14293, %r14233; + xor.b32 %r14710, %r14297, %r14234; + xor.b32 %r14450, %r30104, %r14709; + xor.b32 %r14451, %r30105, %r14710; + xor.b32 %r14427, %r30078, %r14709; + xor.b32 %r14426, %r30079, %r14710; + xor.b32 %r14370, %r30076, %r14709; + xor.b32 %r14371, %r30077, %r14710; + xor.b32 %r14458, %r30074, %r14709; + xor.b32 %r14459, %r30075, %r14710; + xor.b32 %r14387, %r30072, %r14709; + xor.b32 %r14386, %r30073, %r14710; + // begin inline asm + shf.l.wrap.b32 %r14301, %r14210, %r14209, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14305, %r14209, %r14210, %r14168; + // end inline asm + xor.b32 %r14711, %r14301, %r14245; + xor.b32 %r14712, %r14305, %r14246; + xor.b32 %r14402, %r30102, %r14711; + xor.b32 %r14403, %r30103, %r14712; + xor.b32 %r14322, %r30070, %r14711; + xor.b32 %r14323, %r30071, %r14712; + xor.b32 %r14339, %r30068, %r14711; + xor.b32 %r14338, %r30069, %r14712; + xor.b32 %r14378, %r30066, %r14711; + xor.b32 %r14379, %r30067, %r14712; + xor.b32 %r14410, %r30064, %r14711; + xor.b32 %r14411, %r30065, %r14712; + mov.u32 %r14316, 44; + // begin inline asm + shf.l.wrap.b32 %r14309, %r14315, %r14314, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14313, %r14314, %r14315, %r14316; + // end inline asm + mov.u32 %r14324, 20; + // begin inline asm + shf.l.wrap.b32 %r14317, %r14323, %r14322, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14321, %r14322, %r14323, %r14324; + // end inline asm + 
mov.u32 %r14332, 61; + // begin inline asm + shf.l.wrap.b32 %r14325, %r14331, %r14330, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14329, %r14330, %r14331, %r14332; + // end inline asm + mov.u32 %r14340, 39; + // begin inline asm + shf.l.wrap.b32 %r14333, %r14339, %r14338, %r14340; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14337, %r14338, %r14339, %r14340; + // end inline asm + mov.u32 %r14348, 18; + // begin inline asm + shf.l.wrap.b32 %r14341, %r14347, %r14346, %r14348; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14345, %r14346, %r14347, %r14348; + // end inline asm + mov.u32 %r14356, 62; + // begin inline asm + shf.l.wrap.b32 %r14349, %r14355, %r14354, %r14356; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14353, %r14354, %r14355, %r14356; + // end inline asm + mov.u32 %r14364, 43; + // begin inline asm + shf.l.wrap.b32 %r14357, %r14363, %r14362, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14361, %r14362, %r14363, %r14364; + // end inline asm + mov.u32 %r14372, 25; + // begin inline asm + shf.l.wrap.b32 %r14365, %r14371, %r14370, %r14372; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14369, %r14370, %r14371, %r14372; + // end inline asm + mov.u32 %r14380, 8; + // begin inline asm + shf.l.wrap.b32 %r14373, %r14379, %r14378, %r14380; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14377, %r14378, %r14379, %r14380; + // end inline asm + mov.u32 %r14388, 56; + // begin inline asm + shf.l.wrap.b32 %r14381, %r14387, %r14386, %r14388; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14385, %r14386, %r14387, %r14388; + // end inline asm + mov.u32 %r14396, 41; + // begin inline asm + shf.l.wrap.b32 %r14389, %r14395, %r14394, %r14396; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14393, %r14394, %r14395, %r14396; + // end inline asm + mov.u32 %r14404, 27; + // begin inline asm + shf.l.wrap.b32 %r14397, %r14403, %r14402, %r14404; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14401, %r14402, %r14403, %r14404; + // end inline asm + mov.u32 %r14412, 14; + // begin inline asm + shf.l.wrap.b32 %r14405, %r14411, %r14410, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14409, %r14410, %r14411, %r14412; + // end inline asm + mov.u32 %r14420, 2; + // begin inline asm + shf.l.wrap.b32 %r14413, %r14419, %r14418, %r14420; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14417, %r14418, %r14419, %r14420; + // end inline asm + mov.u32 %r14428, 55; + // begin inline asm + shf.l.wrap.b32 %r14421, %r14427, %r14426, %r14428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14425, %r14426, %r14427, %r14428; + // end inline asm + mov.u32 %r14436, 45; + // begin inline asm + shf.l.wrap.b32 %r14429, %r14435, %r14434, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14433, %r14434, %r14435, %r14436; + // end inline asm + mov.u32 %r14444, 36; + // begin inline asm + shf.l.wrap.b32 %r14437, %r14443, %r14442, %r14444; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14441, %r14442, %r14443, %r14444; + // end inline asm + mov.u32 %r14452, 28; + // begin inline asm + shf.l.wrap.b32 %r14445, %r14451, %r14450, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14449, %r14450, %r14451, %r14452; + // end inline asm + mov.u32 %r14460, 21; + // begin inline asm + shf.l.wrap.b32 %r14453, %r14459, %r14458, %r14460; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14457, %r14458, %r14459, %r14460; + // end inline asm + mov.u32 %r14468, 15; + // begin inline asm + shf.l.wrap.b32 %r14461, %r14467, %r14466, %r14468; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14465, %r14466, %r14467, %r14468; + // end inline asm + mov.u32 %r14476, 10; + // begin inline asm + shf.l.wrap.b32 %r14469, %r14475, %r14474, %r14476; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14473, %r14474, %r14475, %r14476; + // end inline asm + mov.u32 %r14484, 6; + // begin inline asm + shf.l.wrap.b32 %r14477, %r14483, %r14482, %r14484; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14481, %r14482, %r14483, %r14484; + // end inline asm + mov.u32 %r14492, 3; + // begin inline asm + shf.l.wrap.b32 %r14485, %r14491, %r14490, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14489, %r14490, %r14491, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14493, %r14499, %r14498, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14497, %r14498, %r14499, %r14168; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14501, %r14536, %r14309, %r14357, 0xD2; + lop3.b32 %r14502, %r14539, %r14313, %r14361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30112, %r14309, %r14357, %r14453, 0xD2; + lop3.b32 %r30113, %r14313, %r14361, %r14457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30108, %r14357, %r14453, %r14405, 0xD2; + lop3.b32 %r30109, %r14361, %r14457, %r14409, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30104, %r14453, %r14405, %r14536, 0xD2; + lop3.b32 %r30105, %r14457, %r14409, %r14539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30102, %r14405, %r14536, %r14309, 0xD2; + lop3.b32 %r30103, %r14409, %r14539, %r14313, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30098, %r14445, %r14317, %r14485, 0xD2; + lop3.b32 %r30099, %r14449, %r14321, %r14489, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30110, %r14317, %r14485, %r14429, 0xD2; + lop3.b32 %r30111, %r14321, %r14489, %r14433, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30106, %r14485, %r14429, %r14325, 0xD2; + lop3.b32 %r30107, %r14489, %r14433, %r14329, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30078, %r14429, %r14325, %r14445, 0xD2; + lop3.b32 %r30079, %r14433, %r14329, %r14449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30078, %r30079}; + // begin inline asm + // chi + lop3.b32 %r30070, %r14325, %r14445, %r14317, 0xD2; + lop3.b32 %r30071, %r14329, %r14449, %r14321, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30070, %r30071}; + // begin inline asm + // chi + lop3.b32 %r30096, %r14493, %r14477, %r14365, 0xD2; + lop3.b32 %r30097, %r14497, %r14481, %r14369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30096, %r30097}; + // begin inline asm + // chi + lop3.b32 %r30090, %r14477, %r14365, %r14373, 0xD2; + lop3.b32 %r30091, %r14481, %r14369, %r14377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30090, %r30091}; + // begin inline asm + // chi + lop3.b32 %r30084, %r14365, %r14373, %r14341, 0xD2; + lop3.b32 %r30085, %r14369, %r14377, %r14345, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30084, %r30085}; + // begin inline asm + // chi + lop3.b32 %r30076, %r14373, %r14341, %r14493, 0xD2; + lop3.b32 %r30077, %r14377, %r14345, %r14497, 0xD2; + // 
end inline asm + st.local.v2.u32 [%rd149+128], {%r30076, %r30077}; + // begin inline asm + // chi + lop3.b32 %r30068, %r14341, %r14493, %r14477, 0xD2; + lop3.b32 %r30069, %r14345, %r14497, %r14481, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30068, %r30069}; + // begin inline asm + // chi + lop3.b32 %r30094, %r14397, %r14437, %r14469, 0xD2; + lop3.b32 %r30095, %r14401, %r14441, %r14473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30094, %r30095}; + // begin inline asm + // chi + lop3.b32 %r30088, %r14437, %r14469, %r14461, 0xD2; + lop3.b32 %r30089, %r14441, %r14473, %r14465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30088, %r30089}; + // begin inline asm + // chi + lop3.b32 %r30082, %r14469, %r14461, %r14381, 0xD2; + lop3.b32 %r30083, %r14473, %r14465, %r14385, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+160], {%r30082, %r30083}; + // begin inline asm + // chi + lop3.b32 %r30074, %r14461, %r14381, %r14397, 0xD2; + lop3.b32 %r30075, %r14465, %r14385, %r14401, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30074, %r30075}; + // begin inline asm + // chi + lop3.b32 %r30066, %r14381, %r14397, %r14437, 0xD2; + lop3.b32 %r30067, %r14385, %r14401, %r14441, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30066, %r30067}; + // begin inline asm + // chi + lop3.b32 %r30092, %r14349, %r14421, %r14333, 0xD2; + lop3.b32 %r30093, %r14353, %r14425, %r14337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30092, %r30093}; + // begin inline asm + // chi + lop3.b32 %r30086, %r14421, %r14333, %r14389, 0xD2; + lop3.b32 %r30087, %r14425, %r14337, %r14393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30086, %r30087}; + // begin inline asm + // chi + lop3.b32 %r30080, %r14333, %r14389, %r14413, 0xD2; + lop3.b32 %r30081, %r14337, %r14393, %r14417, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30080, %r30081}; + // begin inline asm + // chi + lop3.b32 %r30072, %r14389, %r14413, %r14349, 0xD2; + lop3.b32 %r30073, %r14393, %r14417, %r14353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30072, %r30073}; + // begin inline asm + // chi + lop3.b32 %r30064, %r14413, %r14349, %r14421, 0xD2; + lop3.b32 %r30065, %r14417, %r14353, %r14425, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30064, %r30065}; + mul.wide.s32 %rd727, %r30114, 8; + add.s64 %rd726, %rd706, %rd727; + // begin inline asm + ld.global.nc.v2.u32 {%r14701,%r14702}, [%rd726]; + // end inline asm + xor.b32 %r30100, %r14501, %r14701; + xor.b32 %r30101, %r14502, %r14702; + add.s32 %r30114, %r30114, 1; + setp.lt.u32 %p31, %r30114, 23; + @%p31 bra $L__BB2_50; + + mov.u32 %r30147, 0; + mov.u32 %r14812, 1; + st.local.v2.u32 [%rd149+32], {%r30112, %r30113}; + st.local.v2.u32 [%rd149+72], {%r30110, %r30111}; + st.local.v2.u32 [%rd149+40], {%r30108, %r30109}; + st.local.v2.u32 [%rd149+80], {%r30106, %r30107}; + st.local.v2.u32 [%rd149+48], {%r30104, %r30105}; + st.local.v2.u32 [%rd149+56], {%r30102, %r30103}; + st.local.v2.u32 [%rd149+24], {%r30100, %r30101}; + // begin inline asm + // xor5 + lop3.b32 %r14713, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14713, %r14713, %r30094, %r30092, 0x96; + lop3.b32 %r14714, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14714, %r14714, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14725, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14725, %r14725, %r30088, %r30086, 0x96; + lop3.b32 %r14726, %r30113, %r30111, %r30091, 0x96; + 
lop3.b32 %r14726, %r14726, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14737, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14737, %r14737, %r30082, %r30080, 0x96; + lop3.b32 %r14738, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14738, %r14738, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14749, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14749, %r14749, %r30074, %r30072, 0x96; + lop3.b32 %r14750, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14750, %r14750, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14761, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14761, %r14761, %r30066, %r30064, 0x96; + lop3.b32 %r14762, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14762, %r14762, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14773, %r14726, %r14725, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14777, %r14725, %r14726, %r14812; + // end inline asm + xor.b32 %r14952, %r14773, %r14761; + xor.b32 %r14953, %r14777, %r14762; + xor.b32 %r14920, %r30100, %r14952; + xor.b32 %r14923, %r30101, %r14953; + xor.b32 %r14883, %r30097, %r14953; + xor.b32 %r14882, %r30096, %r14952; + st.local.v2.u32 [%rd149+104], {%r14882, %r14883}; + // begin inline asm + shf.l.wrap.b32 %r14781, %r14738, %r14737, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14785, %r14737, %r14738, %r14812; + // end inline asm + xor.b32 %r14954, %r14781, %r14713; + xor.b32 %r14955, %r14785, %r14714; + xor.b32 %r14819, %r30110, %r14954; + xor.b32 %r14818, %r30111, %r14955; + xor.b32 %r14858, %r30089, %r14955; + xor.b32 %r14859, %r30088, %r14954; + st.local.v2.u32 [%rd149+152], {%r14859, %r14858}; + // begin inline asm + shf.l.wrap.b32 %r14789, %r14750, %r14749, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14793, %r14749, %r14750, %r14812; + // end inline asm + xor.b32 %r14956, %r14789, %r14725; + xor.b32 %r14957, %r14793, %r14726; + xor.b32 %r14842, %r30085, %r14957; + xor.b32 %r14843, %r30084, %r14956; + st.local.v2.u32 [%rd149+120], {%r14843, %r14842}; + xor.b32 %r14834, %r30081, %r14957; + xor.b32 %r14835, %r30080, %r14956; + st.local.v2.u32 [%rd149+200], {%r14835, %r14834}; + // begin inline asm + shf.l.wrap.b32 %r14797, %r14762, %r14761, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14801, %r14761, %r14762, %r14812; + // end inline asm + xor.b32 %r14958, %r14797, %r14737; + xor.b32 %r14959, %r14801, %r14738; + xor.b32 %r14866, %r30104, %r14958; + xor.b32 %r14867, %r30105, %r14959; + xor.b32 %r14875, %r30075, %r14959; + xor.b32 %r14874, %r30074, %r14958; + st.local.v2.u32 [%rd149+168], {%r14874, %r14875}; + // begin inline asm + shf.l.wrap.b32 %r14805, %r14714, %r14713, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14809, %r14713, %r14714, %r14812; + // end inline asm + xor.b32 %r14960, %r14805, %r14749; + xor.b32 %r14961, %r14809, %r14750; + xor.b32 %r14826, %r30070, %r14960; + xor.b32 %r14827, %r30071, %r14961; + xor.b32 %r14851, %r30065, %r14961; + xor.b32 %r14850, %r30064, %r14960; + st.local.v2.u32 [%rd149+216], {%r14850, %r14851}; + // begin inline asm + shf.l.wrap.b32 %r14813, %r14819, %r14818, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14817, %r14818, %r14819, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14821, %r14827, %r14826, %r14324; + // end inline asm + // begin inline asm + 
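// NOTE: lop3.b32 with immediate 0x96 above computes a three-way XOR (the theta column parities); the 0xD2 blocks further below compute a ^ (~b & c), i.e. the Keccak chi step, on each 32-bit register half. +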
shf.l.wrap.b32 %r14825, %r14826, %r14827, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14833, %r14834, %r14835, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14829, %r14835, %r14834, %r14332; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r14829, %r14833}; + // begin inline asm + shf.l.wrap.b32 %r14837, %r14843, %r14842, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14841, %r14842, %r14843, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14845, %r14851, %r14850, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14849, %r14850, %r14851, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14857, %r14858, %r14859, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14853, %r14859, %r14858, %r14436; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r14853, %r14857}; + // begin inline asm + shf.l.wrap.b32 %r14861, %r14867, %r14866, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14865, %r14866, %r14867, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14869, %r14875, %r14874, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14873, %r14874, %r14875, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14877, %r14883, %r14882, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14881, %r14882, %r14883, %r14492; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14885, %r14920, %r14813, %r14837, 0xD2; + lop3.b32 %r14886, %r14923, %r14817, %r14841, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r14813, %r14837, %r14869, 0xD2; + lop3.b32 %r30248, %r14817, %r14841, %r14873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30243, %r14837, %r14869, %r14845, 0xD2; + lop3.b32 %r30244, %r14841, %r14873, %r14849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + // begin inline asm + // chi + lop3.b32 %r30239, %r14869, %r14845, %r14920, 0xD2; + lop3.b32 %r30240, %r14873, %r14849, %r14923, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + // begin inline asm + // chi + lop3.b32 %r30237, %r14845, %r14920, %r14813, 0xD2; + lop3.b32 %r30238, %r14849, %r14923, %r14817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + // begin inline asm + // chi + lop3.b32 %r30233, %r14861, %r14821, %r14877, 0xD2; + lop3.b32 %r30234, %r14865, %r14825, %r14881, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + // begin inline asm + // chi + lop3.b32 %r30245, %r14821, %r14877, %r14853, 0xD2; + lop3.b32 %r30246, %r14825, %r14881, %r14857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30241, %r14877, %r14853, %r14829, 0xD2; + lop3.b32 %r30242, %r14881, %r14857, %r14833, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + // begin inline asm + ld.global.nc.v2.u32 {%r14949,%r14950}, [%rd707]; + // end inline asm + xor.b32 %r30235, %r14885, %r14949; + xor.b32 %r30236, %r14886, %r14950; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + add.s64 %rd151, %rd149, 24; + add.s64 %rd152, %rd3, 24; + +$L__BB2_52: + shl.b32 %r14962, %r30147, 2; + cvt.u64.u32 %rd737, %r14962; + and.b64 %rd738, %rd737, 60; + add.s64 %rd739, %rd152, %rd738; + xor.b32 %r14963, 
%r1678, %r30147; + mul.lo.s32 %r14964, %r14963, 16777619; + ld.local.u32 %r14965, [%rd739]; + xor.b32 %r14966, %r14964, %r14965; + mul.wide.u32 %rd740, %r14966, -954391867; + shr.u64 %rd741, %rd740, 32; + cvt.u32.u64 %r14967, %rd741; + sub.s32 %r14968, %r14966, %r14967; + shr.u32 %r14969, %r14968, 1; + add.s32 %r14970, %r14969, %r14967; + shr.u32 %r14971, %r14970, 20; + mul.lo.s32 %r14972, %r14971, 1179641; + sub.s32 %r14973, %r14966, %r14972; + mul.wide.u32 %rd742, %r14973, 64; + add.s64 %rd743, %rd471, %rd742; + mul.lo.s32 %r14974, %r30184, 16777619; + ld.global.u32 %r14975, [%rd743]; + xor.b32 %r30184, %r14974, %r14975; + mul.lo.s32 %r14976, %r30185, 16777619; + ld.global.u32 %r14977, [%rd743+4]; + xor.b32 %r30185, %r14976, %r14977; + mul.lo.s32 %r14978, %r30196, 16777619; + ld.global.u32 %r14979, [%rd743+8]; + mul.lo.s32 %r14980, %r30197, 16777619; + ld.global.u32 %r14981, [%rd743+12]; + xor.b32 %r14982, %r14980, %r14981; + xor.b32 %r30196, %r14978, %r14979; + mov.b64 %rd744, {%r30196, %r14982}; + mul.lo.s32 %r14983, %r30192, 16777619; + ld.global.u32 %r14984, [%rd743+16]; + mul.lo.s32 %r14985, %r30193, 16777619; + ld.global.u32 %r14986, [%rd743+20]; + xor.b32 %r14987, %r14985, %r14986; + xor.b32 %r30192, %r14983, %r14984; + mov.b64 %rd745, {%r30192, %r14987}; + mul.lo.s32 %r14988, %r30188, 16777619; + ld.global.u32 %r14989, [%rd743+24]; + mul.lo.s32 %r14990, %r30189, 16777619; + ld.global.u32 %r14991, [%rd743+28]; + xor.b32 %r14992, %r14990, %r14991; + xor.b32 %r30188, %r14988, %r14989; + mov.b64 %rd746, {%r30188, %r14992}; + mul.lo.s32 %r14993, %r30186, 16777619; + ld.global.u32 %r14994, [%rd743+32]; + mul.lo.s32 %r14995, %r30187, 16777619; + ld.global.u32 %r14996, [%rd743+36]; + xor.b32 %r14997, %r14995, %r14996; + xor.b32 %r30186, %r14993, %r14994; + mov.b64 %rd747, {%r30186, %r14997}; + mul.lo.s32 %r14998, %r30182, 16777619; + ld.global.u32 %r14999, [%rd743+40]; + xor.b32 %r30182, %r14998, %r14999; + mul.lo.s32 %r15000, %r30183, 16777619; + ld.global.u32 %r15001, [%rd743+44]; + xor.b32 %r30183, %r15000, %r15001; + mul.lo.s32 %r15002, %r30194, 16777619; + ld.global.u32 %r15003, [%rd743+48]; + mul.lo.s32 %r15004, %r30195, 16777619; + ld.global.u32 %r15005, [%rd743+52]; + xor.b32 %r15006, %r15004, %r15005; + xor.b32 %r30194, %r15002, %r15003; + mov.b64 %rd748, {%r30194, %r15006}; + mul.lo.s32 %r15007, %r30190, 16777619; + ld.global.u32 %r15008, [%rd743+56]; + mul.lo.s32 %r15009, %r30191, 16777619; + ld.global.u32 %r15010, [%rd743+60]; + xor.b32 %r15011, %r15009, %r15010; + xor.b32 %r30190, %r15007, %r15008; + mov.b64 %rd749, {%r30190, %r15011}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.v2.u32 [%rd3+32], {%r30196, %r14982}; + st.local.v2.u32 [%rd3+40], {%r30192, %r14987}; + st.local.v2.u32 [%rd3+48], {%r30188, %r14992}; + st.local.v2.u32 [%rd3+56], {%r30186, %r14997}; + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + st.local.v2.u32 [%rd3+72], {%r30194, %r15006}; + st.local.v2.u32 [%rd3+80], {%r30190, %r15011}; + add.s64 %rd750, %rd151, %rd738; + xor.b32 %r15012, %r1874, %r30147; + mul.lo.s32 %r15013, %r15012, 16777619; + ld.local.u32 %r15014, [%rd750]; + xor.b32 %r15015, %r15013, %r15014; + mul.wide.u32 %rd751, %r15015, -954391867; + shr.u64 %rd752, %rd751, 32; + cvt.u32.u64 %r15016, %rd752; + sub.s32 %r15017, %r15015, %r15016; + shr.u32 %r15018, %r15017, 1; + add.s32 %r15019, %r15018, %r15016; + shr.u32 %r15020, %r15019, 20; + mul.lo.s32 %r15021, %r15020, 1179641; + sub.s32 %r15022, %r15015, %r15021; + mul.wide.u32 %rd753, %r15022, 64; + add.s64 %rd754, %rd471, 
%rd753; + mul.lo.s32 %r15023, %r30235, 16777619; + ld.global.u32 %r15024, [%rd754]; + xor.b32 %r30235, %r15023, %r15024; + mul.lo.s32 %r15025, %r30236, 16777619; + ld.global.u32 %r15026, [%rd754+4]; + xor.b32 %r30236, %r15025, %r15026; + mul.lo.s32 %r15027, %r30247, 16777619; + ld.global.u32 %r15028, [%rd754+8]; + mul.lo.s32 %r15029, %r30248, 16777619; + ld.global.u32 %r15030, [%rd754+12]; + xor.b32 %r15031, %r15029, %r15030; + xor.b32 %r30247, %r15027, %r15028; + mov.b64 %rd755, {%r30247, %r15031}; + mul.lo.s32 %r15032, %r30243, 16777619; + ld.global.u32 %r15033, [%rd754+16]; + mul.lo.s32 %r15034, %r30244, 16777619; + ld.global.u32 %r15035, [%rd754+20]; + xor.b32 %r15036, %r15034, %r15035; + xor.b32 %r30243, %r15032, %r15033; + mov.b64 %rd756, {%r30243, %r15036}; + mul.lo.s32 %r15037, %r30239, 16777619; + ld.global.u32 %r15038, [%rd754+24]; + mul.lo.s32 %r15039, %r30240, 16777619; + ld.global.u32 %r15040, [%rd754+28]; + xor.b32 %r15041, %r15039, %r15040; + xor.b32 %r30239, %r15037, %r15038; + mov.b64 %rd757, {%r30239, %r15041}; + mul.lo.s32 %r15042, %r30237, 16777619; + ld.global.u32 %r15043, [%rd754+32]; + mul.lo.s32 %r15044, %r30238, 16777619; + ld.global.u32 %r15045, [%rd754+36]; + xor.b32 %r15046, %r15044, %r15045; + xor.b32 %r30237, %r15042, %r15043; + mov.b64 %rd758, {%r30237, %r15046}; + mul.lo.s32 %r15047, %r30233, 16777619; + ld.global.u32 %r15048, [%rd754+40]; + xor.b32 %r30233, %r15047, %r15048; + mul.lo.s32 %r15049, %r30234, 16777619; + ld.global.u32 %r15050, [%rd754+44]; + xor.b32 %r30234, %r15049, %r15050; + mul.lo.s32 %r15051, %r30245, 16777619; + ld.global.u32 %r15052, [%rd754+48]; + mul.lo.s32 %r15053, %r30246, 16777619; + ld.global.u32 %r15054, [%rd754+52]; + xor.b32 %r15055, %r15053, %r15054; + xor.b32 %r30245, %r15051, %r15052; + mov.b64 %rd759, {%r30245, %r15055}; + mul.lo.s32 %r15056, %r30241, 16777619; + ld.global.u32 %r15057, [%rd754+56]; + mul.lo.s32 %r15058, %r30242, 16777619; + ld.global.u32 %r15059, [%rd754+60]; + xor.b32 %r15060, %r15058, %r15059; + xor.b32 %r30241, %r15056, %r15057; + mov.b64 %rd760, {%r30241, %r15060}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + st.local.v2.u32 [%rd149+32], {%r30247, %r15031}; + st.local.v2.u32 [%rd149+40], {%r30243, %r15036}; + st.local.v2.u32 [%rd149+48], {%r30239, %r15041}; + st.local.v2.u32 [%rd149+56], {%r30237, %r15046}; + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + st.local.v2.u32 [%rd149+72], {%r30245, %r15055}; + st.local.v2.u32 [%rd149+80], {%r30241, %r15060}; + add.s32 %r30147, %r30147, 1; + setp.lt.u32 %p32, %r30147, 512; + shr.u64 %rd761, %rd744, 32; + cvt.u32.u64 %r30197, %rd761; + shr.u64 %rd762, %rd745, 32; + cvt.u32.u64 %r30193, %rd762; + shr.u64 %rd763, %rd746, 32; + cvt.u32.u64 %r30189, %rd763; + shr.u64 %rd764, %rd747, 32; + cvt.u32.u64 %r30187, %rd764; + shr.u64 %rd765, %rd748, 32; + cvt.u32.u64 %r30195, %rd765; + shr.u64 %rd766, %rd749, 32; + cvt.u32.u64 %r30191, %rd766; + shr.u64 %rd767, %rd755, 32; + cvt.u32.u64 %r30248, %rd767; + shr.u64 %rd768, %rd756, 32; + cvt.u32.u64 %r30244, %rd768; + shr.u64 %rd769, %rd757, 32; + cvt.u32.u64 %r30240, %rd769; + shr.u64 %rd770, %rd758, 32; + cvt.u32.u64 %r30238, %rd770; + shr.u64 %rd771, %rd759, 32; + cvt.u32.u64 %r30246, %rd771; + shr.u64 %rd772, %rd760, 32; + cvt.u32.u64 %r30242, %rd772; + @%p32 bra $L__BB2_52; + + mov.u32 %r30148, 0; + st.local.v2.u32 [%rd3+96], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+104], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+112], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+120], {%r30148, %r30148}; + 
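// NOTE: the loop at $L__BB2_52 above mixes 512 64-byte dataset rows into two running states using FNV-style multiplies (16777619 is the 32-bit FNV prime); the mul.wide.u32 by -954391867 plus the shift/add sequence appears to reduce the mixed hash modulo 1179641 (presumably the dataset row count) via reciprocal multiplication. +
// NOTE: the paired stores here clear the upper sponge lanes; together with the 0x8000000000000001 word written at offset +88 just below, this looks like the keccak pad10*1 padding ahead of the next permutation. +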
st.local.v2.u32 [%rd3+128], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+136], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+144], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+152], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+160], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+168], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+176], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+184], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+192], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+200], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+208], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+216], {%r30148, %r30148}; + mov.u32 %r30163, -2147483648; + mov.u32 %r15075, 1; + st.local.v2.u32 [%rd3+88], {%r15075, %r30163}; + mov.u32 %r30149, %r30148; + mov.u32 %r30150, %r30148; + mov.u32 %r30151, %r30148; + mov.u32 %r30152, %r30148; + mov.u32 %r30153, %r30148; + mov.u32 %r30154, %r30148; + mov.u32 %r30155, %r30148; + mov.u32 %r30156, %r30148; + mov.u32 %r30157, %r30148; + mov.u32 %r30158, %r30148; + mov.u32 %r30159, %r30148; + mov.u32 %r30160, %r30148; + mov.u32 %r30161, %r30148; + mov.u32 %r30162, %r15075; + mov.u32 %r30164, %r30148; + mov.u32 %r30165, %r30148; + mov.u32 %r30166, %r30148; + mov.u32 %r30167, %r30148; + mov.u32 %r30168, %r30148; + mov.u32 %r30169, %r30148; + mov.u32 %r30170, %r30148; + mov.u32 %r30171, %r30148; + mov.u32 %r30172, %r30148; + mov.u32 %r30173, %r30148; + mov.u32 %r30174, %r30148; + mov.u32 %r30175, %r30148; + mov.u32 %r30176, %r30148; + mov.u32 %r30177, %r30148; + mov.u32 %r30178, %r30148; + mov.u32 %r30179, %r30148; + mov.u32 %r30180, %r30148; + mov.u32 %r30181, %r30148; + mov.u32 %r30198, %r30148; + +$L__BB2_54: + // begin inline asm + // xor5 + lop3.b32 %r15102, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15102, %r15102, %r30178, %r30176, 0x96; + lop3.b32 %r15103, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15103, %r15103, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15114, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15114, %r15114, %r30172, %r30170, 0x96; + lop3.b32 %r15115, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15115, %r15115, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15126, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15126, %r15126, %r30166, %r30164, 0x96; + lop3.b32 %r15127, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15127, %r15127, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15138, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15138, %r15138, %r30158, %r30156, 0x96; + lop3.b32 %r15139, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15139, %r15139, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15150, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15150, %r15150, %r30150, %r30148, 0x96; + lop3.b32 %r15151, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15151, %r15151, %r30151, %r30149, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15162, %r15115, %r15114, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15166, %r15114, %r15115, %r15075; + // end inline asm + xor.b32 %r15596, %r15162, %r15150; + xor.b32 %r15597, %r15166, %r15151; + xor.b32 %r15429, %r30184, %r15596; + xor.b32 %r15432, %r30185, %r15597; + xor.b32 %r15336, %r30182, %r15596; + xor.b32 %r15335, %r30183, %r15597; + xor.b32 %r15383, %r30180, %r15596; + xor.b32 %r15384, %r30181, %r15597; + xor.b32 %r15288, %r30178, %r15596; + xor.b32 %r15287, %r30179, %r15597; + xor.b32 %r15239, %r30176, 
%r15596; + xor.b32 %r15240, %r30177, %r15597; + // begin inline asm + shf.l.wrap.b32 %r15170, %r15127, %r15126, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15174, %r15126, %r15127, %r15075; + // end inline asm + xor.b32 %r15598, %r15170, %r15102; + xor.b32 %r15599, %r15174, %r15103; + xor.b32 %r15391, %r30196, %r15598; + xor.b32 %r15392, %r30197, %r15599; + xor.b32 %r15208, %r30194, %r15598; + xor.b32 %r15207, %r30195, %r15599; + xor.b32 %r15367, %r30174, %r15598; + xor.b32 %r15368, %r30175, %r15599; + xor.b32 %r15328, %r30172, %r15598; + xor.b32 %r15327, %r30173, %r15599; + xor.b32 %r15311, %r30170, %r15598; + xor.b32 %r15312, %r30171, %r15599; + // begin inline asm + shf.l.wrap.b32 %r15178, %r15139, %r15138, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15182, %r15138, %r15139, %r15075; + // end inline asm + xor.b32 %r15600, %r15178, %r15114; + xor.b32 %r15601, %r15182, %r15115; + xor.b32 %r15248, %r30192, %r15600; + xor.b32 %r15247, %r30193, %r15601; + xor.b32 %r15375, %r30190, %r15600; + xor.b32 %r15376, %r30191, %r15601; + xor.b32 %r15256, %r30168, %r15600; + xor.b32 %r15255, %r30169, %r15601; + xor.b32 %r15359, %r30166, %r15600; + xor.b32 %r15360, %r30167, %r15601; + xor.b32 %r15224, %r30164, %r15600; + xor.b32 %r15223, %r30165, %r15601; + // begin inline asm + shf.l.wrap.b32 %r15186, %r15151, %r15150, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15190, %r15150, %r15151, %r15075; + // end inline asm + xor.b32 %r15602, %r15186, %r15126; + xor.b32 %r15603, %r15190, %r15127; + xor.b32 %r15343, %r30188, %r15602; + xor.b32 %r15344, %r30189, %r15603; + xor.b32 %r15320, %r30162, %r15602; + xor.b32 %r15319, %r30163, %r15603; + xor.b32 %r15263, %r30160, %r15602; + xor.b32 %r15264, %r30161, %r15603; + xor.b32 %r15351, %r30158, %r15602; + xor.b32 %r15352, %r30159, %r15603; + xor.b32 %r15280, %r30156, %r15602; + xor.b32 %r15279, %r30157, %r15603; + // begin inline asm + shf.l.wrap.b32 %r15194, %r15103, %r15102, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15198, %r15102, %r15103, %r15075; + // end inline asm + xor.b32 %r15604, %r15194, %r15138; + xor.b32 %r15605, %r15198, %r15139; + xor.b32 %r15295, %r30186, %r15604; + xor.b32 %r15296, %r30187, %r15605; + xor.b32 %r15215, %r30154, %r15604; + xor.b32 %r15216, %r30155, %r15605; + xor.b32 %r15232, %r30152, %r15604; + xor.b32 %r15231, %r30153, %r15605; + xor.b32 %r15271, %r30150, %r15604; + xor.b32 %r15272, %r30151, %r15605; + xor.b32 %r15303, %r30148, %r15604; + xor.b32 %r15304, %r30149, %r15605; + mov.u32 %r15209, 44; + // begin inline asm + shf.l.wrap.b32 %r15202, %r15208, %r15207, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15206, %r15207, %r15208, %r15209; + // end inline asm + mov.u32 %r15217, 20; + // begin inline asm + shf.l.wrap.b32 %r15210, %r15216, %r15215, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15214, %r15215, %r15216, %r15217; + // end inline asm + mov.u32 %r15225, 61; + // begin inline asm + shf.l.wrap.b32 %r15218, %r15224, %r15223, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15222, %r15223, %r15224, %r15225; + // end inline asm + mov.u32 %r15233, 39; + // begin inline asm + shf.l.wrap.b32 %r15226, %r15232, %r15231, %r15233; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15230, %r15231, %r15232, %r15233; + // end inline asm + mov.u32 %r15241, 18; + // begin inline asm + shf.l.wrap.b32 %r15234, %r15240, %r15239, %r15241; + // end inline asm + 
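// NOTE: each 64-bit state lane is held as a pair of 32-bit registers, so every rotation and xor in the round body is issued twice, once per register half. +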
// begin inline asm + shf.l.wrap.b32 %r15238, %r15239, %r15240, %r15241; + // end inline asm + mov.u32 %r15249, 62; + // begin inline asm + shf.l.wrap.b32 %r15242, %r15248, %r15247, %r15249; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15246, %r15247, %r15248, %r15249; + // end inline asm + mov.u32 %r15257, 43; + // begin inline asm + shf.l.wrap.b32 %r15250, %r15256, %r15255, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15254, %r15255, %r15256, %r15257; + // end inline asm + mov.u32 %r15265, 25; + // begin inline asm + shf.l.wrap.b32 %r15258, %r15264, %r15263, %r15265; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15262, %r15263, %r15264, %r15265; + // end inline asm + mov.u32 %r15273, 8; + // begin inline asm + shf.l.wrap.b32 %r15266, %r15272, %r15271, %r15273; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15270, %r15271, %r15272, %r15273; + // end inline asm + mov.u32 %r15281, 56; + // begin inline asm + shf.l.wrap.b32 %r15274, %r15280, %r15279, %r15281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15278, %r15279, %r15280, %r15281; + // end inline asm + mov.u32 %r15289, 41; + // begin inline asm + shf.l.wrap.b32 %r15282, %r15288, %r15287, %r15289; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15286, %r15287, %r15288, %r15289; + // end inline asm + mov.u32 %r15297, 27; + // begin inline asm + shf.l.wrap.b32 %r15290, %r15296, %r15295, %r15297; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15294, %r15295, %r15296, %r15297; + // end inline asm + mov.u32 %r15305, 14; + // begin inline asm + shf.l.wrap.b32 %r15298, %r15304, %r15303, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15302, %r15303, %r15304, %r15305; + // end inline asm + mov.u32 %r15313, 2; + // begin inline asm + shf.l.wrap.b32 %r15306, %r15312, %r15311, %r15313; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15310, %r15311, %r15312, %r15313; + // end inline asm + mov.u32 %r15321, 55; + // begin inline asm + shf.l.wrap.b32 %r15314, %r15320, %r15319, %r15321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15318, %r15319, %r15320, %r15321; + // end inline asm + mov.u32 %r15329, 45; + // begin inline asm + shf.l.wrap.b32 %r15322, %r15328, %r15327, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15326, %r15327, %r15328, %r15329; + // end inline asm + mov.u32 %r15337, 36; + // begin inline asm + shf.l.wrap.b32 %r15330, %r15336, %r15335, %r15337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15334, %r15335, %r15336, %r15337; + // end inline asm + mov.u32 %r15345, 28; + // begin inline asm + shf.l.wrap.b32 %r15338, %r15344, %r15343, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15342, %r15343, %r15344, %r15345; + // end inline asm + mov.u32 %r15353, 21; + // begin inline asm + shf.l.wrap.b32 %r15346, %r15352, %r15351, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15350, %r15351, %r15352, %r15353; + // end inline asm + mov.u32 %r15361, 15; + // begin inline asm + shf.l.wrap.b32 %r15354, %r15360, %r15359, %r15361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15358, %r15359, %r15360, %r15361; + // end inline asm + mov.u32 %r15369, 10; + // begin inline asm + shf.l.wrap.b32 %r15362, %r15368, %r15367, %r15369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15366, %r15367, %r15368, %r15369; + // end inline asm + mov.u32 %r15377, 6; + // begin inline asm + 
shf.l.wrap.b32 %r15370, %r15376, %r15375, %r15377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15374, %r15375, %r15376, %r15377; + // end inline asm + mov.u32 %r15385, 3; + // begin inline asm + shf.l.wrap.b32 %r15378, %r15384, %r15383, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15382, %r15383, %r15384, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15386, %r15392, %r15391, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15390, %r15391, %r15392, %r15075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15394, %r15429, %r15202, %r15250, 0xD2; + lop3.b32 %r15395, %r15432, %r15206, %r15254, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r15202, %r15250, %r15346, 0xD2; + lop3.b32 %r30197, %r15206, %r15254, %r15350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30192, %r15250, %r15346, %r15298, 0xD2; + lop3.b32 %r30193, %r15254, %r15350, %r15302, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30188, %r15346, %r15298, %r15429, 0xD2; + lop3.b32 %r30189, %r15350, %r15302, %r15432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30186, %r15298, %r15429, %r15202, 0xD2; + lop3.b32 %r30187, %r15302, %r15432, %r15206, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30182, %r15338, %r15210, %r15378, 0xD2; + lop3.b32 %r30183, %r15342, %r15214, %r15382, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30194, %r15210, %r15378, %r15322, 0xD2; + lop3.b32 %r30195, %r15214, %r15382, %r15326, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30190, %r15378, %r15322, %r15218, 0xD2; + lop3.b32 %r30191, %r15382, %r15326, %r15222, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30162, %r15322, %r15218, %r15338, 0xD2; + lop3.b32 %r30163, %r15326, %r15222, %r15342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30162, %r30163}; + // begin inline asm + // chi + lop3.b32 %r30154, %r15218, %r15338, %r15210, 0xD2; + lop3.b32 %r30155, %r15222, %r15342, %r15214, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30154, %r30155}; + // begin inline asm + // chi + lop3.b32 %r30180, %r15386, %r15370, %r15258, 0xD2; + lop3.b32 %r30181, %r15390, %r15374, %r15262, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30180, %r30181}; + // begin inline asm + // chi + lop3.b32 %r30174, %r15370, %r15258, %r15266, 0xD2; + lop3.b32 %r30175, %r15374, %r15262, %r15270, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30174, %r30175}; + // begin inline asm + // chi + lop3.b32 %r30168, %r15258, %r15266, %r15234, 0xD2; + lop3.b32 %r30169, %r15262, %r15270, %r15238, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30168, %r30169}; + // begin inline asm + // chi + lop3.b32 %r30160, %r15266, %r15234, %r15386, 0xD2; + lop3.b32 %r30161, %r15270, %r15238, %r15390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30160, %r30161}; + // begin inline asm + // chi + lop3.b32 %r30152, %r15234, %r15386, %r15370, 0xD2; + lop3.b32 %r30153, %r15238, %r15390, %r15374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30152, %r30153}; + // begin inline asm + // chi + lop3.b32 %r30178, %r15290, %r15330, %r15362, 0xD2; + lop3.b32 %r30179, %r15294, %r15334, %r15366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30178, %r30179}; + // begin inline asm + // chi + lop3.b32 %r30172, %r15330, %r15362, 
%r15354, 0xD2; + lop3.b32 %r30173, %r15334, %r15366, %r15358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30172, %r30173}; + // begin inline asm + // chi + lop3.b32 %r30166, %r15362, %r15354, %r15274, 0xD2; + lop3.b32 %r30167, %r15366, %r15358, %r15278, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30166, %r30167}; + // begin inline asm + // chi + lop3.b32 %r30158, %r15354, %r15274, %r15290, 0xD2; + lop3.b32 %r30159, %r15358, %r15278, %r15294, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30158, %r30159}; + // begin inline asm + // chi + lop3.b32 %r30150, %r15274, %r15290, %r15330, 0xD2; + lop3.b32 %r30151, %r15278, %r15294, %r15334, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30150, %r30151}; + // begin inline asm + // chi + lop3.b32 %r30176, %r15242, %r15314, %r15226, 0xD2; + lop3.b32 %r30177, %r15246, %r15318, %r15230, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30176, %r30177}; + // begin inline asm + // chi + lop3.b32 %r30170, %r15314, %r15226, %r15282, 0xD2; + lop3.b32 %r30171, %r15318, %r15230, %r15286, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30170, %r30171}; + // begin inline asm + // chi + lop3.b32 %r30164, %r15226, %r15282, %r15306, 0xD2; + lop3.b32 %r30165, %r15230, %r15286, %r15310, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30164, %r30165}; + // begin inline asm + // chi + lop3.b32 %r30156, %r15282, %r15306, %r15242, 0xD2; + lop3.b32 %r30157, %r15286, %r15310, %r15246, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30156, %r30157}; + // begin inline asm + // chi + lop3.b32 %r30148, %r15306, %r15242, %r15314, 0xD2; + lop3.b32 %r30149, %r15310, %r15246, %r15318, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30148, %r30149}; + mul.wide.s32 %rd774, %r30198, 8; + add.s64 %rd773, %rd706, %rd774; + // begin inline asm + ld.global.nc.v2.u32 {%r15594,%r15595}, [%rd773]; + // end inline asm + xor.b32 %r30184, %r15394, %r15594; + xor.b32 %r30185, %r15395, %r15595; + add.s32 %r30198, %r30198, 1; + setp.lt.u32 %p33, %r30198, 23; + @%p33 bra $L__BB2_54; + + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + // begin inline asm + // xor5 + lop3.b32 %r15606, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15606, %r15606, %r30178, %r30176, 0x96; + lop3.b32 %r15607, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15607, %r15607, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15618, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15618, %r15618, %r30172, %r30170, 0x96; + lop3.b32 %r15619, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15619, %r15619, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15630, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15630, %r15630, %r30166, %r30164, 0x96; + lop3.b32 %r15631, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15631, %r15631, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15642, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15642, %r15642, %r30158, %r30156, 0x96; + lop3.b32 %r15643, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15643, %r15643, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + 
lop3.b32 %r15654, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15654, %r15654, %r30150, %r30148, 0x96; + lop3.b32 %r15655, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15655, %r15655, %r30151, %r30149, 0x96; + // end inline asm + mov.u32 %r15858, 1; + // begin inline asm + shf.l.wrap.b32 %r15666, %r15619, %r15618, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15670, %r15618, %r15619, %r15858; + // end inline asm + xor.b32 %r15885, %r15666, %r15654; + xor.b32 %r15886, %r15670, %r15655; + xor.b32 %r15813, %r30184, %r15885; + xor.b32 %r15816, %r30185, %r15886; + xor.b32 %r15776, %r30181, %r15886; + xor.b32 %r15775, %r30180, %r15885; + st.local.v2.u32 [%rd3+104], {%r15775, %r15776}; + // begin inline asm + shf.l.wrap.b32 %r15674, %r15631, %r15630, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15678, %r15630, %r15631, %r15858; + // end inline asm + xor.b32 %r15887, %r15674, %r15606; + xor.b32 %r15888, %r15678, %r15607; + xor.b32 %r15712, %r30194, %r15887; + xor.b32 %r15711, %r30195, %r15888; + xor.b32 %r15751, %r30173, %r15888; + xor.b32 %r15752, %r30172, %r15887; + st.local.v2.u32 [%rd3+152], {%r15752, %r15751}; + // begin inline asm + shf.l.wrap.b32 %r15682, %r15643, %r15642, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15686, %r15642, %r15643, %r15858; + // end inline asm + xor.b32 %r15889, %r15682, %r15618; + xor.b32 %r15890, %r15686, %r15619; + xor.b32 %r15735, %r30169, %r15890; + xor.b32 %r15736, %r30168, %r15889; + st.local.v2.u32 [%rd3+120], {%r15736, %r15735}; + xor.b32 %r15727, %r30165, %r15890; + xor.b32 %r15728, %r30164, %r15889; + st.local.v2.u32 [%rd3+200], {%r15728, %r15727}; + // begin inline asm + shf.l.wrap.b32 %r15690, %r15655, %r15654, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15694, %r15654, %r15655, %r15858; + // end inline asm + xor.b32 %r15891, %r15690, %r15630; + xor.b32 %r15892, %r15694, %r15631; + xor.b32 %r15759, %r30188, %r15891; + xor.b32 %r15760, %r30189, %r15892; + xor.b32 %r15768, %r30159, %r15892; + xor.b32 %r15767, %r30158, %r15891; + st.local.v2.u32 [%rd3+168], {%r15767, %r15768}; + // begin inline asm + shf.l.wrap.b32 %r15698, %r15607, %r15606, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15702, %r15606, %r15607, %r15858; + // end inline asm + xor.b32 %r15893, %r15698, %r15642; + xor.b32 %r15894, %r15702, %r15643; + xor.b32 %r15719, %r30154, %r15893; + xor.b32 %r15720, %r30155, %r15894; + xor.b32 %r15744, %r30149, %r15894; + xor.b32 %r15743, %r30148, %r15893; + st.local.v2.u32 [%rd3+216], {%r15743, %r15744}; + // begin inline asm + shf.l.wrap.b32 %r15706, %r15712, %r15711, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15710, %r15711, %r15712, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15714, %r15720, %r15719, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15718, %r15719, %r15720, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15726, %r15727, %r15728, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15722, %r15728, %r15727, %r15225; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r15722, %r15726}; + // begin inline asm + shf.l.wrap.b32 %r15730, %r15736, %r15735, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15734, %r15735, %r15736, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15738, %r15744, %r15743, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r15742, %r15743, %r15744, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15750, %r15751, %r15752, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15746, %r15752, %r15751, %r15329; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r15746, %r15750}; + // begin inline asm + shf.l.wrap.b32 %r15754, %r15760, %r15759, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15758, %r15759, %r15760, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15762, %r15768, %r15767, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15766, %r15767, %r15768, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15770, %r15776, %r15775, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15774, %r15775, %r15776, %r15385; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15778, %r15813, %r15706, %r15730, 0xD2; + lop3.b32 %r15779, %r15816, %r15710, %r15734, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15786, %r15706, %r15730, %r15762, 0xD2; + lop3.b32 %r15787, %r15710, %r15734, %r15766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r15786, %r15787}; + // begin inline asm + // chi + lop3.b32 %r15794, %r15730, %r15762, %r15738, 0xD2; + lop3.b32 %r15795, %r15734, %r15766, %r15742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r15794, %r15795}; + // begin inline asm + // chi + lop3.b32 %r15802, %r15762, %r15738, %r15813, 0xD2; + lop3.b32 %r15803, %r15766, %r15742, %r15816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r15802, %r15803}; + // begin inline asm + // chi + lop3.b32 %r15810, %r15738, %r15813, %r15706, 0xD2; + lop3.b32 %r15811, %r15742, %r15816, %r15710, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r15810, %r15811}; + // begin inline asm + // chi + lop3.b32 %r15818, %r15754, %r15714, %r15770, 0xD2; + lop3.b32 %r15819, %r15758, %r15718, %r15774, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r15818, %r15819}; + // begin inline asm + // chi + lop3.b32 %r15826, %r15714, %r15770, %r15746, 0xD2; + lop3.b32 %r15827, %r15718, %r15774, %r15750, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r15826, %r15827}; + // begin inline asm + // chi + lop3.b32 %r15834, %r15770, %r15746, %r15722, 0xD2; + lop3.b32 %r15835, %r15774, %r15750, %r15726, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r15834, %r15835}; + // begin inline asm + ld.global.nc.v2.u32 {%r15842,%r15843}, [%rd707]; + // end inline asm + xor.b32 %r15895, %r15779, %r15843; + xor.b32 %r15896, %r15778, %r15842; + mov.b64 %rd1265, {%r15896, %r15895}; + mov.b64 %rd1266, {%r15786, %r15787}; + mov.b64 %rd1267, {%r15794, %r15795}; + mov.b64 %rd156, {%r15802, %r15803}; + mov.b64 %rd1268, {%r15810, %r15811}; + mov.b64 %rd158, {%r15818, %r15819}; + mov.b64 %rd159, {%r15826, %r15827}; + mov.b64 %rd160, {%r15834, %r15835}; + mov.u32 %r30199, 0; + st.local.v2.u32 [%rd3+24], {%r15896, %r15895}; + st.local.v2.u32 [%rd149+96], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+104], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+112], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+120], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+128], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+136], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+144], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+152], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+160], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+168], {%r30199, %r30199}; + st.local.v2.u32 
[%rd149+176], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+184], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+192], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+200], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+208], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+216], {%r30199, %r30199}; + mov.u32 %r30214, -2147483648; + st.local.v2.u32 [%rd149+88], {%r15858, %r30214}; + mov.u32 %r30200, %r30199; + mov.u32 %r30201, %r30199; + mov.u32 %r30202, %r30199; + mov.u32 %r30203, %r30199; + mov.u32 %r30204, %r30199; + mov.u32 %r30205, %r30199; + mov.u32 %r30206, %r30199; + mov.u32 %r30207, %r30199; + mov.u32 %r30208, %r30199; + mov.u32 %r30209, %r30199; + mov.u32 %r30210, %r30199; + mov.u32 %r30211, %r30199; + mov.u32 %r30212, %r30199; + mov.u32 %r30213, %r15858; + mov.u32 %r30215, %r30199; + mov.u32 %r30216, %r30199; + mov.u32 %r30217, %r30199; + mov.u32 %r30218, %r30199; + mov.u32 %r30219, %r30199; + mov.u32 %r30220, %r30199; + mov.u32 %r30221, %r30199; + mov.u32 %r30222, %r30199; + mov.u32 %r30223, %r30199; + mov.u32 %r30224, %r30199; + mov.u32 %r30225, %r30199; + mov.u32 %r30226, %r30199; + mov.u32 %r30227, %r30199; + mov.u32 %r30228, %r30199; + mov.u32 %r30229, %r30199; + mov.u32 %r30230, %r30199; + mov.u32 %r30231, %r30199; + mov.u32 %r30232, %r30199; + mov.u32 %r30249, %r30199; + +$L__BB2_56: + // begin inline asm + // xor5 + lop3.b32 %r15897, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r15897, %r15897, %r30229, %r30227, 0x96; + lop3.b32 %r15898, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r15898, %r15898, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15909, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r15909, %r15909, %r30223, %r30221, 0x96; + lop3.b32 %r15910, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r15910, %r15910, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15921, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r15921, %r15921, %r30217, %r30215, 0x96; + lop3.b32 %r15922, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r15922, %r15922, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15933, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r15933, %r15933, %r30209, %r30207, 0x96; + lop3.b32 %r15934, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r15934, %r15934, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15945, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r15945, %r15945, %r30201, %r30199, 0x96; + lop3.b32 %r15946, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r15946, %r15946, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15957, %r15910, %r15909, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15961, %r15909, %r15910, %r15858; + // end inline asm + xor.b32 %r16391, %r15957, %r15945; + xor.b32 %r16392, %r15961, %r15946; + xor.b32 %r16224, %r30235, %r16391; + xor.b32 %r16227, %r30236, %r16392; + xor.b32 %r16131, %r30233, %r16391; + xor.b32 %r16130, %r30234, %r16392; + xor.b32 %r16178, %r30231, %r16391; + xor.b32 %r16179, %r30232, %r16392; + xor.b32 %r16083, %r30229, %r16391; + xor.b32 %r16082, %r30230, %r16392; + xor.b32 %r16034, %r30227, %r16391; + xor.b32 %r16035, %r30228, %r16392; + // begin inline asm + shf.l.wrap.b32 %r15965, %r15922, %r15921, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15969, %r15921, %r15922, %r15858; + // end inline asm + xor.b32 %r16393, %r15965, %r15897; + xor.b32 %r16394, %r15969, %r15898; + xor.b32 %r16186, 
%r30247, %r16393; + xor.b32 %r16187, %r30248, %r16394; + xor.b32 %r16003, %r30245, %r16393; + xor.b32 %r16002, %r30246, %r16394; + xor.b32 %r16162, %r30225, %r16393; + xor.b32 %r16163, %r30226, %r16394; + xor.b32 %r16123, %r30223, %r16393; + xor.b32 %r16122, %r30224, %r16394; + xor.b32 %r16106, %r30221, %r16393; + xor.b32 %r16107, %r30222, %r16394; + // begin inline asm + shf.l.wrap.b32 %r15973, %r15934, %r15933, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15977, %r15933, %r15934, %r15858; + // end inline asm + xor.b32 %r16395, %r15973, %r15909; + xor.b32 %r16396, %r15977, %r15910; + xor.b32 %r16043, %r30243, %r16395; + xor.b32 %r16042, %r30244, %r16396; + xor.b32 %r16170, %r30241, %r16395; + xor.b32 %r16171, %r30242, %r16396; + xor.b32 %r16051, %r30219, %r16395; + xor.b32 %r16050, %r30220, %r16396; + xor.b32 %r16154, %r30217, %r16395; + xor.b32 %r16155, %r30218, %r16396; + xor.b32 %r16019, %r30215, %r16395; + xor.b32 %r16018, %r30216, %r16396; + // begin inline asm + shf.l.wrap.b32 %r15981, %r15946, %r15945, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15985, %r15945, %r15946, %r15858; + // end inline asm + xor.b32 %r16397, %r15981, %r15921; + xor.b32 %r16398, %r15985, %r15922; + xor.b32 %r16138, %r30239, %r16397; + xor.b32 %r16139, %r30240, %r16398; + xor.b32 %r16115, %r30213, %r16397; + xor.b32 %r16114, %r30214, %r16398; + xor.b32 %r16058, %r30211, %r16397; + xor.b32 %r16059, %r30212, %r16398; + xor.b32 %r16146, %r30209, %r16397; + xor.b32 %r16147, %r30210, %r16398; + xor.b32 %r16075, %r30207, %r16397; + xor.b32 %r16074, %r30208, %r16398; + // begin inline asm + shf.l.wrap.b32 %r15989, %r15898, %r15897, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15993, %r15897, %r15898, %r15858; + // end inline asm + xor.b32 %r16399, %r15989, %r15933; + xor.b32 %r16400, %r15993, %r15934; + xor.b32 %r16090, %r30237, %r16399; + xor.b32 %r16091, %r30238, %r16400; + xor.b32 %r16010, %r30205, %r16399; + xor.b32 %r16011, %r30206, %r16400; + xor.b32 %r16027, %r30203, %r16399; + xor.b32 %r16026, %r30204, %r16400; + xor.b32 %r16066, %r30201, %r16399; + xor.b32 %r16067, %r30202, %r16400; + xor.b32 %r16098, %r30199, %r16399; + xor.b32 %r16099, %r30200, %r16400; + mov.u32 %r16004, 44; + // begin inline asm + shf.l.wrap.b32 %r15997, %r16003, %r16002, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16001, %r16002, %r16003, %r16004; + // end inline asm + mov.u32 %r16012, 20; + // begin inline asm + shf.l.wrap.b32 %r16005, %r16011, %r16010, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16009, %r16010, %r16011, %r16012; + // end inline asm + mov.u32 %r16020, 61; + // begin inline asm + shf.l.wrap.b32 %r16013, %r16019, %r16018, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16017, %r16018, %r16019, %r16020; + // end inline asm + mov.u32 %r16028, 39; + // begin inline asm + shf.l.wrap.b32 %r16021, %r16027, %r16026, %r16028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16025, %r16026, %r16027, %r16028; + // end inline asm + mov.u32 %r16036, 18; + // begin inline asm + shf.l.wrap.b32 %r16029, %r16035, %r16034, %r16036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16033, %r16034, %r16035, %r16036; + // end inline asm + mov.u32 %r16044, 62; + // begin inline asm + shf.l.wrap.b32 %r16037, %r16043, %r16042, %r16044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16041, %r16042, %r16043, %r16044; + // end inline asm + mov.u32 %r16052, 43; 
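+ // NOTE: after the rho rotations below, the lop3 blocks with immediate 0xD2 apply chi row by row, and the ld.global.nc.v2.u32 load at the bottom of the round xors in what appears to be the per-round constant (iota).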
+ // begin inline asm + shf.l.wrap.b32 %r16045, %r16051, %r16050, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16049, %r16050, %r16051, %r16052; + // end inline asm + mov.u32 %r16060, 25; + // begin inline asm + shf.l.wrap.b32 %r16053, %r16059, %r16058, %r16060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16057, %r16058, %r16059, %r16060; + // end inline asm + mov.u32 %r16068, 8; + // begin inline asm + shf.l.wrap.b32 %r16061, %r16067, %r16066, %r16068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16065, %r16066, %r16067, %r16068; + // end inline asm + mov.u32 %r16076, 56; + // begin inline asm + shf.l.wrap.b32 %r16069, %r16075, %r16074, %r16076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16073, %r16074, %r16075, %r16076; + // end inline asm + mov.u32 %r16084, 41; + // begin inline asm + shf.l.wrap.b32 %r16077, %r16083, %r16082, %r16084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16081, %r16082, %r16083, %r16084; + // end inline asm + mov.u32 %r16092, 27; + // begin inline asm + shf.l.wrap.b32 %r16085, %r16091, %r16090, %r16092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16089, %r16090, %r16091, %r16092; + // end inline asm + mov.u32 %r16100, 14; + // begin inline asm + shf.l.wrap.b32 %r16093, %r16099, %r16098, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16097, %r16098, %r16099, %r16100; + // end inline asm + mov.u32 %r16108, 2; + // begin inline asm + shf.l.wrap.b32 %r16101, %r16107, %r16106, %r16108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16105, %r16106, %r16107, %r16108; + // end inline asm + mov.u32 %r16116, 55; + // begin inline asm + shf.l.wrap.b32 %r16109, %r16115, %r16114, %r16116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16113, %r16114, %r16115, %r16116; + // end inline asm + mov.u32 %r16124, 45; + // begin inline asm + shf.l.wrap.b32 %r16117, %r16123, %r16122, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16121, %r16122, %r16123, %r16124; + // end inline asm + mov.u32 %r16132, 36; + // begin inline asm + shf.l.wrap.b32 %r16125, %r16131, %r16130, %r16132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16129, %r16130, %r16131, %r16132; + // end inline asm + mov.u32 %r16140, 28; + // begin inline asm + shf.l.wrap.b32 %r16133, %r16139, %r16138, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16137, %r16138, %r16139, %r16140; + // end inline asm + mov.u32 %r16148, 21; + // begin inline asm + shf.l.wrap.b32 %r16141, %r16147, %r16146, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16145, %r16146, %r16147, %r16148; + // end inline asm + mov.u32 %r16156, 15; + // begin inline asm + shf.l.wrap.b32 %r16149, %r16155, %r16154, %r16156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16153, %r16154, %r16155, %r16156; + // end inline asm + mov.u32 %r16164, 10; + // begin inline asm + shf.l.wrap.b32 %r16157, %r16163, %r16162, %r16164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16161, %r16162, %r16163, %r16164; + // end inline asm + mov.u32 %r16172, 6; + // begin inline asm + shf.l.wrap.b32 %r16165, %r16171, %r16170, %r16172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16169, %r16170, %r16171, %r16172; + // end inline asm + mov.u32 %r16180, 3; + // begin inline asm + shf.l.wrap.b32 %r16173, %r16179, %r16178, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16177, 
%r16178, %r16179, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16181, %r16187, %r16186, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16185, %r16186, %r16187, %r15858; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16189, %r16224, %r15997, %r16045, 0xD2; + lop3.b32 %r16190, %r16227, %r16001, %r16049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r15997, %r16045, %r16141, 0xD2; + lop3.b32 %r30248, %r16001, %r16049, %r16145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30243, %r16045, %r16141, %r16093, 0xD2; + lop3.b32 %r30244, %r16049, %r16145, %r16097, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30239, %r16141, %r16093, %r16224, 0xD2; + lop3.b32 %r30240, %r16145, %r16097, %r16227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30237, %r16093, %r16224, %r15997, 0xD2; + lop3.b32 %r30238, %r16097, %r16227, %r16001, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30233, %r16133, %r16005, %r16173, 0xD2; + lop3.b32 %r30234, %r16137, %r16009, %r16177, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30245, %r16005, %r16173, %r16117, 0xD2; + lop3.b32 %r30246, %r16009, %r16177, %r16121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30241, %r16173, %r16117, %r16013, 0xD2; + lop3.b32 %r30242, %r16177, %r16121, %r16017, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30213, %r16117, %r16013, %r16133, 0xD2; + lop3.b32 %r30214, %r16121, %r16017, %r16137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30213, %r30214}; + // begin inline asm + // chi + lop3.b32 %r30205, %r16013, %r16133, %r16005, 0xD2; + lop3.b32 %r30206, %r16017, %r16137, %r16009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30205, %r30206}; + // begin inline asm + // chi + lop3.b32 %r30231, %r16181, %r16165, %r16053, 0xD2; + lop3.b32 %r30232, %r16185, %r16169, %r16057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30231, %r30232}; + // begin inline asm + // chi + lop3.b32 %r30225, %r16165, %r16053, %r16061, 0xD2; + lop3.b32 %r30226, %r16169, %r16057, %r16065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30225, %r30226}; + // begin inline asm + // chi + lop3.b32 %r30219, %r16053, %r16061, %r16029, 0xD2; + lop3.b32 %r30220, %r16057, %r16065, %r16033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30219, %r30220}; + // begin inline asm + // chi + lop3.b32 %r30211, %r16061, %r16029, %r16181, 0xD2; + lop3.b32 %r30212, %r16065, %r16033, %r16185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+128], {%r30211, %r30212}; + // begin inline asm + // chi + lop3.b32 %r30203, %r16029, %r16181, %r16165, 0xD2; + lop3.b32 %r30204, %r16033, %r16185, %r16169, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30203, %r30204}; + // begin inline asm + // chi + lop3.b32 %r30229, %r16085, %r16125, %r16157, 0xD2; + lop3.b32 %r30230, %r16089, %r16129, %r16161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30229, %r30230}; + // begin inline asm + // chi + lop3.b32 %r30223, %r16125, %r16157, %r16149, 0xD2; + lop3.b32 %r30224, %r16129, %r16161, %r16153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30223, %r30224}; + // begin inline asm + // chi + lop3.b32 %r30217, %r16157, %r16149, %r16069, 0xD2; + lop3.b32 %r30218, %r16161, %r16153, %r16073, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd149+160], {%r30217, %r30218}; + // begin inline asm + // chi + lop3.b32 %r30209, %r16149, %r16069, %r16085, 0xD2; + lop3.b32 %r30210, %r16153, %r16073, %r16089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30209, %r30210}; + // begin inline asm + // chi + lop3.b32 %r30201, %r16069, %r16085, %r16125, 0xD2; + lop3.b32 %r30202, %r16073, %r16089, %r16129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30201, %r30202}; + // begin inline asm + // chi + lop3.b32 %r30227, %r16037, %r16109, %r16021, 0xD2; + lop3.b32 %r30228, %r16041, %r16113, %r16025, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30227, %r30228}; + // begin inline asm + // chi + lop3.b32 %r30221, %r16109, %r16021, %r16077, 0xD2; + lop3.b32 %r30222, %r16113, %r16025, %r16081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30221, %r30222}; + // begin inline asm + // chi + lop3.b32 %r30215, %r16021, %r16077, %r16101, 0xD2; + lop3.b32 %r30216, %r16025, %r16081, %r16105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30215, %r30216}; + // begin inline asm + // chi + lop3.b32 %r30207, %r16077, %r16101, %r16037, 0xD2; + lop3.b32 %r30208, %r16081, %r16105, %r16041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30207, %r30208}; + // begin inline asm + // chi + lop3.b32 %r30199, %r16101, %r16037, %r16109, 0xD2; + lop3.b32 %r30200, %r16105, %r16041, %r16113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30199, %r30200}; + mul.wide.s32 %rd781, %r30249, 8; + add.s64 %rd780, %rd706, %rd781; + // begin inline asm + ld.global.nc.v2.u32 {%r16389,%r16390}, [%rd780]; + // end inline asm + xor.b32 %r30235, %r16189, %r16389; + xor.b32 %r30236, %r16190, %r16390; + add.s32 %r30249, %r30249, 1; + setp.lt.u32 %p34, %r30249, 23; + @%p34 bra $L__BB2_56; + + mov.u32 %r16500, 1; + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + // begin inline asm + // xor5 + lop3.b32 %r16401, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r16401, %r16401, %r30229, %r30227, 0x96; + lop3.b32 %r16402, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r16402, %r16402, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16413, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r16413, %r16413, %r30223, %r30221, 0x96; + lop3.b32 %r16414, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r16414, %r16414, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16425, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r16425, %r16425, %r30217, %r30215, 0x96; + lop3.b32 %r16426, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r16426, %r16426, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16437, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r16437, %r16437, %r30209, %r30207, 0x96; + lop3.b32 %r16438, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r16438, %r16438, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16449, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r16449, %r16449, %r30201, %r30199, 0x96; + lop3.b32 %r16450, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r16450, %r16450, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r16461, %r16414, %r16413, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16465, %r16413, %r16414, %r16500; + // end inline asm + xor.b32 %r16639, %r16461, %r16449; + xor.b32 %r16640, %r16465, %r16450; + xor.b32 %r16608, %r30235, %r16639; + xor.b32 %r16611, %r30236, %r16640; + xor.b32 %r16571, %r30232, %r16640; + xor.b32 %r16570, %r30231, %r16639; + st.local.v2.u32 [%rd149+104], {%r16570, %r16571}; + // begin inline asm + shf.l.wrap.b32 %r16469, %r16426, %r16425, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16473, %r16425, %r16426, %r16500; + // end inline asm + xor.b32 %r16641, %r16469, %r16401; + xor.b32 %r16642, %r16473, %r16402; + xor.b32 %r16507, %r30245, %r16641; + xor.b32 %r16506, %r30246, %r16642; + xor.b32 %r16546, %r30224, %r16642; + xor.b32 %r16547, %r30223, %r16641; + st.local.v2.u32 [%rd149+152], {%r16547, %r16546}; + // begin inline asm + shf.l.wrap.b32 %r16477, %r16438, %r16437, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16481, %r16437, %r16438, %r16500; + // end inline asm + xor.b32 %r16643, %r16477, %r16413; + xor.b32 %r16644, %r16481, %r16414; + xor.b32 %r16530, %r30220, %r16644; + xor.b32 %r16531, %r30219, %r16643; + st.local.v2.u32 [%rd149+120], {%r16531, %r16530}; + xor.b32 %r16522, %r30216, %r16644; + xor.b32 %r16523, %r30215, %r16643; + st.local.v2.u32 [%rd149+200], {%r16523, %r16522}; + // begin inline asm + shf.l.wrap.b32 %r16485, %r16450, %r16449, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16489, %r16449, %r16450, %r16500; + // end inline asm + xor.b32 %r16645, %r16485, %r16425; + xor.b32 %r16646, %r16489, %r16426; + xor.b32 %r16554, %r30239, %r16645; + xor.b32 %r16555, %r30240, %r16646; + xor.b32 %r16563, %r30210, %r16646; + xor.b32 %r16562, %r30209, %r16645; + st.local.v2.u32 [%rd149+168], {%r16562, %r16563}; + // begin inline asm + shf.l.wrap.b32 %r16493, %r16402, %r16401, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16497, %r16401, %r16402, %r16500; + // end inline asm + xor.b32 %r16647, %r16493, %r16437; + xor.b32 %r16648, %r16497, %r16438; + xor.b32 %r16514, %r30205, %r16647; + xor.b32 %r16515, %r30206, %r16648; + xor.b32 %r16539, %r30200, %r16648; + xor.b32 %r16538, %r30199, %r16647; + st.local.v2.u32 [%rd149+216], {%r16538, %r16539}; + // begin inline asm + shf.l.wrap.b32 %r16501, %r16507, %r16506, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16505, %r16506, %r16507, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16509, %r16515, %r16514, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16513, %r16514, %r16515, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16521, %r16522, %r16523, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16517, %r16523, %r16522, %r16020; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r16517, %r16521}; + // begin inline asm + shf.l.wrap.b32 %r16525, %r16531, %r16530, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16529, %r16530, %r16531, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16533, %r16539, %r16538, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16537, %r16538, %r16539, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16545, %r16546, %r16547, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16541, %r16547, %r16546, %r16124; + // end inline asm + 
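// the loop above covers round indices below 23; the final round is unrolled here, with the remaining rho/pi rotations feeding the chi and iota steps (constant read via %rd707) before the state is written back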
st.local.v2.u32 [%rd149+88], {%r16541, %r16545}; + // begin inline asm + shf.l.wrap.b32 %r16549, %r16555, %r16554, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16553, %r16554, %r16555, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16557, %r16563, %r16562, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16561, %r16562, %r16563, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16565, %r16571, %r16570, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16569, %r16570, %r16571, %r16180; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16573, %r16608, %r16501, %r16525, 0xD2; + lop3.b32 %r16574, %r16611, %r16505, %r16529, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16581, %r16501, %r16525, %r16557, 0xD2; + lop3.b32 %r16582, %r16505, %r16529, %r16561, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r16581, %r16582}; + // begin inline asm + // chi + lop3.b32 %r16589, %r16525, %r16557, %r16533, 0xD2; + lop3.b32 %r16590, %r16529, %r16561, %r16537, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r16589, %r16590}; + // begin inline asm + // chi + lop3.b32 %r16597, %r16557, %r16533, %r16608, 0xD2; + lop3.b32 %r16598, %r16561, %r16537, %r16611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r16597, %r16598}; + // begin inline asm + // chi + lop3.b32 %r16605, %r16533, %r16608, %r16501, 0xD2; + lop3.b32 %r16606, %r16537, %r16611, %r16505, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r16605, %r16606}; + // begin inline asm + // chi + lop3.b32 %r16613, %r16549, %r16509, %r16565, 0xD2; + lop3.b32 %r16614, %r16553, %r16513, %r16569, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r16613, %r16614}; + // begin inline asm + // chi + lop3.b32 %r16621, %r16509, %r16565, %r16541, 0xD2; + lop3.b32 %r16622, %r16513, %r16569, %r16545, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r16621, %r16622}; + // begin inline asm + // chi + lop3.b32 %r16629, %r16565, %r16541, %r16517, 0xD2; + lop3.b32 %r16630, %r16569, %r16545, %r16521, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r16629, %r16630}; + // begin inline asm + ld.global.nc.v2.u32 {%r16637,%r16638}, [%rd707]; + // end inline asm + xor.b32 %r16649, %r16574, %r16638; + xor.b32 %r16650, %r16573, %r16637; + st.local.v2.u32 [%rd149+24], {%r16650, %r16649}; + st.global.u64 [%rd130], %rd1265; + st.global.u64 [%rd130+8], %rd1266; + st.global.u64 [%rd130+16], %rd1267; + st.global.u64 [%rd130+24], %rd156; + st.global.u64 [%rd130+32], %rd1268; + st.global.u64 [%rd130+40], %rd158; + st.global.u64 [%rd130+48], %rd159; + st.global.u64 [%rd130+56], %rd160; + st.global.v2.u32 [%rd130+64], {%r16650, %r16649}; + st.global.v2.u32 [%rd130+72], {%r16581, %r16582}; + st.global.v2.u32 [%rd130+80], {%r16589, %r16590}; + st.global.v2.u32 [%rd130+88], {%r16597, %r16598}; + st.global.v2.u32 [%rd130+96], {%r16605, %r16606}; + st.global.v2.u32 [%rd130+104], {%r16613, %r16614}; + st.global.v2.u32 [%rd130+112], {%r16621, %r16622}; + st.global.v2.u32 [%rd130+120], {%r16629, %r16630}; + +$L__BB2_69: + shl.b32 %r3326, %r29, 1; + mul.wide.u32 %rd887, %r3326, -954391867; + shr.u64 %rd888, %rd887, 32; + cvt.u32.u64 %r19935, %rd888; + sub.s32 %r19936, %r3326, %r19935; + shr.u32 %r19937, %r19936, 1; + add.s32 %r19938, %r19937, %r19935; + shr.u32 %r19939, %r19938, 20; + mul.lo.s32 %r19940, %r19939, 1179641; + sub.s32 %r19941, %r3326, %r19940; + mul.wide.u32 %rd890, %r19941, 
64; + add.s64 %rd222, %rd471, %rd890; + or.b32 %r3327, %r3326, 1; + mul.wide.u32 %rd891, %r3327, -954391867; + shr.u64 %rd892, %rd891, 32; + cvt.u32.u64 %r19942, %rd892; + sub.s32 %r19943, %r3327, %r19942; + shr.u32 %r19944, %r19943, 1; + add.s32 %r19945, %r19944, %r19942; + shr.u32 %r19946, %r19945, 20; + mul.lo.s32 %r19947, %r19946, 1179641; + sub.s32 %r19948, %r3327, %r19947; + mul.wide.u32 %rd893, %r19948, 64; + add.s64 %rd223, %rd471, %rd893; + @%p16 bra $L__BB2_83; + + cvta.to.global.u64 %rd894, %rd353; + mul.wide.u32 %rd895, %r29, 128; + add.s64 %rd224, %rd894, %rd895; + ld.global.u64 %rd1269, [%rd224]; + setp.eq.s64 %p41, %rd1269, 0; + @%p41 bra $L__BB2_72; + + ld.global.u64 %rd1272, [%rd224+32]; + ld.global.u64 %rd1271, [%rd224+16]; + ld.global.u64 %rd1270, [%rd224+8]; + bra.uni $L__BB2_94; + +$L__BB2_83: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd1011, 1179641; + st.local.u64 [%rd3+8], %rd1011; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd1012, [%rd222]; + ld.global.u64 %rd1013, [%rd222+8]; + ld.global.u64 %rd1014, [%rd222+16]; + ld.global.u64 %rd1015, [%rd222+24]; + ld.global.u64 %rd1016, [%rd222+32]; + ld.global.u64 %rd1017, [%rd222+40]; + ld.global.u64 %rd1018, [%rd222+48]; + ld.global.u64 %rd1019, [%rd222+56]; + st.local.u64 [%rd3+24], %rd1012; + st.local.u64 [%rd3+32], %rd1013; + st.local.u64 [%rd3+40], %rd1014; + st.local.u64 [%rd3+48], %rd1015; + st.local.u64 [%rd3+56], %rd1016; + st.local.u64 [%rd3+64], %rd1017; + st.local.u64 [%rd3+72], %rd1018; + st.local.u64 [%rd3+80], %rd1019; + cvt.u32.u64 %r23275, %rd1012; + xor.b32 %r23276, %r3326, %r23275; + st.local.u32 [%rd3+24], %r23276; + mov.u32 %r30724, 0; + st.local.v2.u32 [%rd3+96], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+104], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+112], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+120], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+128], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+136], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+144], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+152], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+160], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+168], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+176], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+184], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+192], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+200], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+208], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+216], {%r30724, %r30724}; + mov.u32 %r30739, -2147483648; + mov.u32 %r23248, 1; + st.local.v2.u32 [%rd3+88], {%r23248, %r30739}; + ld.local.v2.u32 {%r30760, %r30761}, [%rd3+24]; + mov.b64 {%r30758, %r30759}, %rd1017; + shr.u64 %rd1020, %rd1013, 32; + cvt.u32.u64 %r30772, %rd1013; + cvt.u32.u64 %r30773, %rd1020; + shr.u64 %rd1021, %rd1018, 32; + cvt.u32.u64 %r30770, %rd1018; + cvt.u32.u64 %r30771, %rd1021; + shr.u64 %rd1022, %rd1014, 32; + cvt.u32.u64 %r30768, %rd1014; + cvt.u32.u64 %r30769, %rd1022; + shr.u64 %rd1023, %rd1019, 32; + cvt.u32.u64 %r30766, %rd1019; + cvt.u32.u64 %r30767, %rd1023; + shr.u64 %rd1024, %rd1015, 32; + cvt.u32.u64 %r30764, %rd1015; + cvt.u32.u64 %r30765, %rd1024; + shr.u64 %rd1025, %rd1016, 32; + cvt.u32.u64 %r30762, %rd1016; + cvt.u32.u64 %r30763, %rd1025; + mov.u32 %r30725, %r30724; + mov.u32 %r30726, %r30724; + mov.u32 %r30727, %r30724; + mov.u32 %r30728, %r30724; + mov.u32 %r30729, %r30724; + mov.u32 %r30730, %r30724; + mov.u32 %r30731, %r30724; + mov.u32 %r30732, %r30724; + mov.u32 %r30733, %r30724; + mov.u32 %r30734, %r30724; + mov.u32 %r30735, %r30724; + mov.u32 %r30736, 
%r30724; + mov.u32 %r30737, %r30724; + mov.u32 %r30738, %r23248; + mov.u32 %r30740, %r30724; + mov.u32 %r30741, %r30724; + mov.u32 %r30742, %r30724; + mov.u32 %r30743, %r30724; + mov.u32 %r30744, %r30724; + mov.u32 %r30745, %r30724; + mov.u32 %r30746, %r30724; + mov.u32 %r30747, %r30724; + mov.u32 %r30748, %r30724; + mov.u32 %r30749, %r30724; + mov.u32 %r30750, %r30724; + mov.u32 %r30751, %r30724; + mov.u32 %r30752, %r30724; + mov.u32 %r30753, %r30724; + mov.u32 %r30754, %r30724; + mov.u32 %r30755, %r30724; + mov.u32 %r30756, %r30724; + mov.u32 %r30757, %r30724; + mov.u32 %r30774, %r30724; + +$L__BB2_84: + // begin inline asm + // xor5 + lop3.b32 %r23279, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23279, %r23279, %r30754, %r30752, 0x96; + lop3.b32 %r23280, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23280, %r23280, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23291, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23291, %r23291, %r30748, %r30746, 0x96; + lop3.b32 %r23292, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23292, %r23292, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23303, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23303, %r23303, %r30742, %r30740, 0x96; + lop3.b32 %r23304, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23304, %r23304, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23315, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23315, %r23315, %r30734, %r30732, 0x96; + lop3.b32 %r23316, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23316, %r23316, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23327, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23327, %r23327, %r30726, %r30724, 0x96; + lop3.b32 %r23328, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23328, %r23328, %r30727, %r30725, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23339, %r23292, %r23291, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23343, %r23291, %r23292, %r23248; + // end inline asm + xor.b32 %r23773, %r23339, %r23327; + xor.b32 %r23774, %r23343, %r23328; + xor.b32 %r23606, %r30760, %r23773; + xor.b32 %r23609, %r30761, %r23774; + xor.b32 %r23513, %r30758, %r23773; + xor.b32 %r23512, %r30759, %r23774; + xor.b32 %r23560, %r30756, %r23773; + xor.b32 %r23561, %r30757, %r23774; + xor.b32 %r23465, %r30754, %r23773; + xor.b32 %r23464, %r30755, %r23774; + xor.b32 %r23416, %r30752, %r23773; + xor.b32 %r23417, %r30753, %r23774; + // begin inline asm + shf.l.wrap.b32 %r23347, %r23304, %r23303, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23351, %r23303, %r23304, %r23248; + // end inline asm + xor.b32 %r23775, %r23347, %r23279; + xor.b32 %r23776, %r23351, %r23280; + xor.b32 %r23568, %r30772, %r23775; + xor.b32 %r23569, %r30773, %r23776; + xor.b32 %r23385, %r30770, %r23775; + xor.b32 %r23384, %r30771, %r23776; + xor.b32 %r23544, %r30750, %r23775; + xor.b32 %r23545, %r30751, %r23776; + xor.b32 %r23505, %r30748, %r23775; + xor.b32 %r23504, %r30749, %r23776; + xor.b32 %r23488, %r30746, %r23775; + xor.b32 %r23489, %r30747, %r23776; + // begin inline asm + shf.l.wrap.b32 %r23355, %r23316, %r23315, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23359, %r23315, %r23316, %r23248; + // end inline asm + xor.b32 %r23777, %r23355, %r23291; + xor.b32 %r23778, %r23359, %r23292; + xor.b32 %r23425, %r30768, %r23777; + xor.b32 %r23424, %r30769, %r23778; + xor.b32 
%r23552, %r30766, %r23777; + xor.b32 %r23553, %r30767, %r23778; + xor.b32 %r23433, %r30744, %r23777; + xor.b32 %r23432, %r30745, %r23778; + xor.b32 %r23536, %r30742, %r23777; + xor.b32 %r23537, %r30743, %r23778; + xor.b32 %r23401, %r30740, %r23777; + xor.b32 %r23400, %r30741, %r23778; + // begin inline asm + shf.l.wrap.b32 %r23363, %r23328, %r23327, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23367, %r23327, %r23328, %r23248; + // end inline asm + xor.b32 %r23779, %r23363, %r23303; + xor.b32 %r23780, %r23367, %r23304; + xor.b32 %r23520, %r30764, %r23779; + xor.b32 %r23521, %r30765, %r23780; + xor.b32 %r23497, %r30738, %r23779; + xor.b32 %r23496, %r30739, %r23780; + xor.b32 %r23440, %r30736, %r23779; + xor.b32 %r23441, %r30737, %r23780; + xor.b32 %r23528, %r30734, %r23779; + xor.b32 %r23529, %r30735, %r23780; + xor.b32 %r23457, %r30732, %r23779; + xor.b32 %r23456, %r30733, %r23780; + // begin inline asm + shf.l.wrap.b32 %r23371, %r23280, %r23279, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23375, %r23279, %r23280, %r23248; + // end inline asm + xor.b32 %r23781, %r23371, %r23315; + xor.b32 %r23782, %r23375, %r23316; + xor.b32 %r23472, %r30762, %r23781; + xor.b32 %r23473, %r30763, %r23782; + xor.b32 %r23392, %r30730, %r23781; + xor.b32 %r23393, %r30731, %r23782; + xor.b32 %r23409, %r30728, %r23781; + xor.b32 %r23408, %r30729, %r23782; + xor.b32 %r23448, %r30726, %r23781; + xor.b32 %r23449, %r30727, %r23782; + xor.b32 %r23480, %r30724, %r23781; + xor.b32 %r23481, %r30725, %r23782; + mov.u32 %r23386, 44; + // begin inline asm + shf.l.wrap.b32 %r23379, %r23385, %r23384, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23383, %r23384, %r23385, %r23386; + // end inline asm + mov.u32 %r23394, 20; + // begin inline asm + shf.l.wrap.b32 %r23387, %r23393, %r23392, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23391, %r23392, %r23393, %r23394; + // end inline asm + mov.u32 %r23402, 61; + // begin inline asm + shf.l.wrap.b32 %r23395, %r23401, %r23400, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23399, %r23400, %r23401, %r23402; + // end inline asm + mov.u32 %r23410, 39; + // begin inline asm + shf.l.wrap.b32 %r23403, %r23409, %r23408, %r23410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23407, %r23408, %r23409, %r23410; + // end inline asm + mov.u32 %r23418, 18; + // begin inline asm + shf.l.wrap.b32 %r23411, %r23417, %r23416, %r23418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23415, %r23416, %r23417, %r23418; + // end inline asm + mov.u32 %r23426, 62; + // begin inline asm + shf.l.wrap.b32 %r23419, %r23425, %r23424, %r23426; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23423, %r23424, %r23425, %r23426; + // end inline asm + mov.u32 %r23434, 43; + // begin inline asm + shf.l.wrap.b32 %r23427, %r23433, %r23432, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23431, %r23432, %r23433, %r23434; + // end inline asm + mov.u32 %r23442, 25; + // begin inline asm + shf.l.wrap.b32 %r23435, %r23441, %r23440, %r23442; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23439, %r23440, %r23441, %r23442; + // end inline asm + mov.u32 %r23450, 8; + // begin inline asm + shf.l.wrap.b32 %r23443, %r23449, %r23448, %r23450; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23447, %r23448, %r23449, %r23450; + // end inline asm + mov.u32 %r23458, 56; + // begin inline asm + shf.l.wrap.b32 %r23451, %r23457, 
%r23456, %r23458; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23455, %r23456, %r23457, %r23458; + // end inline asm + mov.u32 %r23466, 41; + // begin inline asm + shf.l.wrap.b32 %r23459, %r23465, %r23464, %r23466; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23463, %r23464, %r23465, %r23466; + // end inline asm + mov.u32 %r23474, 27; + // begin inline asm + shf.l.wrap.b32 %r23467, %r23473, %r23472, %r23474; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23471, %r23472, %r23473, %r23474; + // end inline asm + mov.u32 %r23482, 14; + // begin inline asm + shf.l.wrap.b32 %r23475, %r23481, %r23480, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23479, %r23480, %r23481, %r23482; + // end inline asm + mov.u32 %r23490, 2; + // begin inline asm + shf.l.wrap.b32 %r23483, %r23489, %r23488, %r23490; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23487, %r23488, %r23489, %r23490; + // end inline asm + mov.u32 %r23498, 55; + // begin inline asm + shf.l.wrap.b32 %r23491, %r23497, %r23496, %r23498; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23495, %r23496, %r23497, %r23498; + // end inline asm + mov.u32 %r23506, 45; + // begin inline asm + shf.l.wrap.b32 %r23499, %r23505, %r23504, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23503, %r23504, %r23505, %r23506; + // end inline asm + mov.u32 %r23514, 36; + // begin inline asm + shf.l.wrap.b32 %r23507, %r23513, %r23512, %r23514; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23511, %r23512, %r23513, %r23514; + // end inline asm + mov.u32 %r23522, 28; + // begin inline asm + shf.l.wrap.b32 %r23515, %r23521, %r23520, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23519, %r23520, %r23521, %r23522; + // end inline asm + mov.u32 %r23530, 21; + // begin inline asm + shf.l.wrap.b32 %r23523, %r23529, %r23528, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23527, %r23528, %r23529, %r23530; + // end inline asm + mov.u32 %r23538, 15; + // begin inline asm + shf.l.wrap.b32 %r23531, %r23537, %r23536, %r23538; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23535, %r23536, %r23537, %r23538; + // end inline asm + mov.u32 %r23546, 10; + // begin inline asm + shf.l.wrap.b32 %r23539, %r23545, %r23544, %r23546; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23543, %r23544, %r23545, %r23546; + // end inline asm + mov.u32 %r23554, 6; + // begin inline asm + shf.l.wrap.b32 %r23547, %r23553, %r23552, %r23554; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23551, %r23552, %r23553, %r23554; + // end inline asm + mov.u32 %r23562, 3; + // begin inline asm + shf.l.wrap.b32 %r23555, %r23561, %r23560, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23559, %r23560, %r23561, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23563, %r23569, %r23568, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23567, %r23568, %r23569, %r23248; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23571, %r23606, %r23379, %r23427, 0xD2; + lop3.b32 %r23572, %r23609, %r23383, %r23431, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30772, %r23379, %r23427, %r23523, 0xD2; + lop3.b32 %r30773, %r23383, %r23431, %r23527, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30768, %r23427, %r23523, %r23475, 0xD2; + lop3.b32 %r30769, %r23431, %r23527, %r23479, 0xD2; + // end 
inline asm + // begin inline asm + // chi + lop3.b32 %r30764, %r23523, %r23475, %r23606, 0xD2; + lop3.b32 %r30765, %r23527, %r23479, %r23609, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30762, %r23475, %r23606, %r23379, 0xD2; + lop3.b32 %r30763, %r23479, %r23609, %r23383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30758, %r23515, %r23387, %r23555, 0xD2; + lop3.b32 %r30759, %r23519, %r23391, %r23559, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30770, %r23387, %r23555, %r23499, 0xD2; + lop3.b32 %r30771, %r23391, %r23559, %r23503, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30766, %r23555, %r23499, %r23395, 0xD2; + lop3.b32 %r30767, %r23559, %r23503, %r23399, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30738, %r23499, %r23395, %r23515, 0xD2; + lop3.b32 %r30739, %r23503, %r23399, %r23519, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30738, %r30739}; + // begin inline asm + // chi + lop3.b32 %r30730, %r23395, %r23515, %r23387, 0xD2; + lop3.b32 %r30731, %r23399, %r23519, %r23391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30730, %r30731}; + // begin inline asm + // chi + lop3.b32 %r30756, %r23563, %r23547, %r23435, 0xD2; + lop3.b32 %r30757, %r23567, %r23551, %r23439, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30756, %r30757}; + // begin inline asm + // chi + lop3.b32 %r30750, %r23547, %r23435, %r23443, 0xD2; + lop3.b32 %r30751, %r23551, %r23439, %r23447, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30750, %r30751}; + // begin inline asm + // chi + lop3.b32 %r30744, %r23435, %r23443, %r23411, 0xD2; + lop3.b32 %r30745, %r23439, %r23447, %r23415, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30744, %r30745}; + // begin inline asm + // chi + lop3.b32 %r30736, %r23443, %r23411, %r23563, 0xD2; + lop3.b32 %r30737, %r23447, %r23415, %r23567, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30736, %r30737}; + // begin inline asm + // chi + lop3.b32 %r30728, %r23411, %r23563, %r23547, 0xD2; + lop3.b32 %r30729, %r23415, %r23567, %r23551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30728, %r30729}; + // begin inline asm + // chi + lop3.b32 %r30754, %r23467, %r23507, %r23539, 0xD2; + lop3.b32 %r30755, %r23471, %r23511, %r23543, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30754, %r30755}; + // begin inline asm + // chi + lop3.b32 %r30748, %r23507, %r23539, %r23531, 0xD2; + lop3.b32 %r30749, %r23511, %r23543, %r23535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30748, %r30749}; + // begin inline asm + // chi + lop3.b32 %r30742, %r23539, %r23531, %r23451, 0xD2; + lop3.b32 %r30743, %r23543, %r23535, %r23455, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30742, %r30743}; + // begin inline asm + // chi + lop3.b32 %r30734, %r23531, %r23451, %r23467, 0xD2; + lop3.b32 %r30735, %r23535, %r23455, %r23471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30734, %r30735}; + // begin inline asm + // chi + lop3.b32 %r30726, %r23451, %r23467, %r23507, 0xD2; + lop3.b32 %r30727, %r23455, %r23471, %r23511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30726, %r30727}; + // begin inline asm + // chi + lop3.b32 %r30752, %r23419, %r23491, %r23403, 0xD2; + lop3.b32 %r30753, %r23423, %r23495, %r23407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30752, %r30753}; + // begin inline asm + // chi + lop3.b32 %r30746, %r23491, %r23403, 
%r23459, 0xD2; + lop3.b32 %r30747, %r23495, %r23407, %r23463, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30746, %r30747}; + // begin inline asm + // chi + lop3.b32 %r30740, %r23403, %r23459, %r23483, 0xD2; + lop3.b32 %r30741, %r23407, %r23463, %r23487, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30740, %r30741}; + // begin inline asm + // chi + lop3.b32 %r30732, %r23459, %r23483, %r23419, 0xD2; + lop3.b32 %r30733, %r23463, %r23487, %r23423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30732, %r30733}; + // begin inline asm + // chi + lop3.b32 %r30724, %r23483, %r23419, %r23491, 0xD2; + lop3.b32 %r30725, %r23487, %r23423, %r23495, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30724, %r30725}; + mul.wide.s32 %rd1027, %r30774, 8; + mov.u64 %rd1028, keccak_round_constants; + cvta.const.u64 %rd1029, %rd1028; + add.s64 %rd1026, %rd1029, %rd1027; + // begin inline asm + ld.global.nc.v2.u32 {%r23771,%r23772}, [%rd1026]; + // end inline asm + xor.b32 %r30760, %r23571, %r23771; + xor.b32 %r30761, %r23572, %r23772; + add.s32 %r30774, %r30774, 1; + setp.lt.u32 %p47, %r30774, 23; + @%p47 bra $L__BB2_84; + + add.u64 %rd272, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30772, %r30773}; + st.local.v2.u32 [%rd3+72], {%r30770, %r30771}; + st.local.v2.u32 [%rd3+40], {%r30768, %r30769}; + st.local.v2.u32 [%rd3+80], {%r30766, %r30767}; + st.local.v2.u32 [%rd3+48], {%r30764, %r30765}; + st.local.v2.u32 [%rd3+56], {%r30762, %r30763}; + st.local.v2.u32 [%rd3+24], {%r30760, %r30761}; + // begin inline asm + // xor5 + lop3.b32 %r23783, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23783, %r23783, %r30754, %r30752, 0x96; + lop3.b32 %r23784, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23784, %r23784, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23795, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23795, %r23795, %r30748, %r30746, 0x96; + lop3.b32 %r23796, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23796, %r23796, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23807, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23807, %r23807, %r30742, %r30740, 0x96; + lop3.b32 %r23808, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23808, %r23808, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23819, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23819, %r23819, %r30734, %r30732, 0x96; + lop3.b32 %r23820, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23820, %r23820, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23831, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23831, %r23831, %r30726, %r30724, 0x96; + lop3.b32 %r23832, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23832, %r23832, %r30727, %r30725, 0x96; + // end inline asm + mov.u32 %r24035, 1; + // begin inline asm + shf.l.wrap.b32 %r23843, %r23796, %r23795, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23847, %r23795, %r23796, %r24035; + // end inline asm + xor.b32 %r24062, %r23843, %r23831; + xor.b32 %r24063, %r23847, %r23832; + xor.b32 %r23990, %r30760, %r24062; + xor.b32 %r23993, %r30761, %r24063; + xor.b32 %r23953, %r30757, %r24063; + xor.b32 %r23952, %r30756, %r24062; + st.local.v2.u32 [%rd3+104], {%r23952, %r23953}; + // begin inline asm + shf.l.wrap.b32 %r23851, %r23808, %r23807, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23855, %r23807, %r23808, %r24035; + // end inline asm + xor.b32 %r24064, 
%r23851, %r23783; + xor.b32 %r24065, %r23855, %r23784; + xor.b32 %r23889, %r30770, %r24064; + xor.b32 %r23888, %r30771, %r24065; + xor.b32 %r23928, %r30749, %r24065; + xor.b32 %r23929, %r30748, %r24064; + st.local.v2.u32 [%rd3+152], {%r23929, %r23928}; + // begin inline asm + shf.l.wrap.b32 %r23859, %r23820, %r23819, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23863, %r23819, %r23820, %r24035; + // end inline asm + xor.b32 %r24066, %r23859, %r23795; + xor.b32 %r24067, %r23863, %r23796; + xor.b32 %r23912, %r30745, %r24067; + xor.b32 %r23913, %r30744, %r24066; + st.local.v2.u32 [%rd3+120], {%r23913, %r23912}; + xor.b32 %r23904, %r30741, %r24067; + xor.b32 %r23905, %r30740, %r24066; + st.local.v2.u32 [%rd3+200], {%r23905, %r23904}; + // begin inline asm + shf.l.wrap.b32 %r23867, %r23832, %r23831, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23871, %r23831, %r23832, %r24035; + // end inline asm + xor.b32 %r24068, %r23867, %r23807; + xor.b32 %r24069, %r23871, %r23808; + xor.b32 %r23936, %r30764, %r24068; + xor.b32 %r23937, %r30765, %r24069; + xor.b32 %r23945, %r30735, %r24069; + xor.b32 %r23944, %r30734, %r24068; + st.local.v2.u32 [%rd3+168], {%r23944, %r23945}; + // begin inline asm + shf.l.wrap.b32 %r23875, %r23784, %r23783, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23879, %r23783, %r23784, %r24035; + // end inline asm + xor.b32 %r24070, %r23875, %r23819; + xor.b32 %r24071, %r23879, %r23820; + xor.b32 %r23896, %r30730, %r24070; + xor.b32 %r23897, %r30731, %r24071; + xor.b32 %r23921, %r30725, %r24071; + xor.b32 %r23920, %r30724, %r24070; + st.local.v2.u32 [%rd3+216], {%r23920, %r23921}; + // begin inline asm + shf.l.wrap.b32 %r23883, %r23889, %r23888, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23887, %r23888, %r23889, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23891, %r23897, %r23896, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23895, %r23896, %r23897, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23903, %r23904, %r23905, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23899, %r23905, %r23904, %r23402; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r23899, %r23903}; + // begin inline asm + shf.l.wrap.b32 %r23907, %r23913, %r23912, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23911, %r23912, %r23913, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23915, %r23921, %r23920, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23919, %r23920, %r23921, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23927, %r23928, %r23929, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23923, %r23929, %r23928, %r23506; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r23923, %r23927}; + // begin inline asm + shf.l.wrap.b32 %r23931, %r23937, %r23936, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23935, %r23936, %r23937, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23939, %r23945, %r23944, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23943, %r23944, %r23945, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23947, %r23953, %r23952, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23951, %r23952, %r23953, %r23562; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23955, 
%r23990, %r23883, %r23907, 0xD2; + lop3.b32 %r23956, %r23993, %r23887, %r23911, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r23883, %r23907, %r23939, 0xD2; + lop3.b32 %r30908, %r23887, %r23911, %r23943, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30903, %r23907, %r23939, %r23915, 0xD2; + lop3.b32 %r30904, %r23911, %r23943, %r23919, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + // begin inline asm + // chi + lop3.b32 %r30899, %r23939, %r23915, %r23990, 0xD2; + lop3.b32 %r30900, %r23943, %r23919, %r23993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + // begin inline asm + // chi + lop3.b32 %r30897, %r23915, %r23990, %r23883, 0xD2; + lop3.b32 %r30898, %r23919, %r23993, %r23887, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + // begin inline asm + // chi + lop3.b32 %r30893, %r23931, %r23891, %r23947, 0xD2; + lop3.b32 %r30894, %r23935, %r23895, %r23951, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + // begin inline asm + // chi + lop3.b32 %r30905, %r23891, %r23947, %r23923, 0xD2; + lop3.b32 %r30906, %r23895, %r23951, %r23927, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30901, %r23947, %r23923, %r23899, 0xD2; + lop3.b32 %r30902, %r23951, %r23927, %r23903, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + add.s64 %rd1030, %rd1029, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24019,%r24020}, [%rd1030]; + // end inline asm + xor.b32 %r30895, %r23955, %r24019; + xor.b32 %r30896, %r23956, %r24020; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.u64 [%rd272], %rd354; + mov.u64 %rd1034, 1179641; + st.local.u64 [%rd272+8], %rd1034; + st.local.u32 [%rd272+16], %r3327; + ld.global.u64 %rd1035, [%rd223]; + ld.global.u64 %rd1036, [%rd223+8]; + ld.global.u64 %rd1037, [%rd223+16]; + ld.global.u64 %rd1038, [%rd223+24]; + ld.global.u64 %rd1039, [%rd223+32]; + ld.global.u64 %rd1040, [%rd223+40]; + ld.global.u64 %rd1041, [%rd223+48]; + ld.global.u64 %rd1042, [%rd223+56]; + st.local.u64 [%rd272+32], %rd1036; + st.local.u64 [%rd272+40], %rd1037; + st.local.u64 [%rd272+48], %rd1038; + st.local.u64 [%rd272+56], %rd1039; + st.local.u64 [%rd272+64], %rd1040; + st.local.u64 [%rd272+72], %rd1041; + st.local.u64 [%rd272+80], %rd1042; + cvt.u32.u64 %r24072, %rd1035; + xor.b32 %r24073, %r3327, %r24072; + st.local.u64 [%rd272+24], %rd1035; + st.local.u32 [%rd272+24], %r24073; + mov.u32 %r30775, 0; + st.local.v2.u32 [%rd272+96], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+104], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+112], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+120], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+128], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+136], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+144], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+152], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+160], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+168], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+176], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+184], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+192], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+200], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+208], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+216], {%r30775, %r30775}; + mov.u32 %r30790, -2147483648; + st.local.v2.u32 [%rd272+88], 
{%r24035, %r30790}; + ld.local.v2.u32 {%r30811, %r30812}, [%rd272+24]; + mov.b64 {%r30809, %r30810}, %rd1040; + shr.u64 %rd1043, %rd1036, 32; + cvt.u32.u64 %r30823, %rd1036; + cvt.u32.u64 %r30824, %rd1043; + shr.u64 %rd1044, %rd1041, 32; + cvt.u32.u64 %r30821, %rd1041; + cvt.u32.u64 %r30822, %rd1044; + shr.u64 %rd1045, %rd1037, 32; + cvt.u32.u64 %r30819, %rd1037; + cvt.u32.u64 %r30820, %rd1045; + shr.u64 %rd1046, %rd1042, 32; + cvt.u32.u64 %r30817, %rd1042; + cvt.u32.u64 %r30818, %rd1046; + shr.u64 %rd1047, %rd1038, 32; + cvt.u32.u64 %r30815, %rd1038; + cvt.u32.u64 %r30816, %rd1047; + shr.u64 %rd1048, %rd1039, 32; + cvt.u32.u64 %r30813, %rd1039; + cvt.u32.u64 %r30814, %rd1048; + mov.u32 %r30776, %r30775; + mov.u32 %r30777, %r30775; + mov.u32 %r30778, %r30775; + mov.u32 %r30779, %r30775; + mov.u32 %r30780, %r30775; + mov.u32 %r30781, %r30775; + mov.u32 %r30782, %r30775; + mov.u32 %r30783, %r30775; + mov.u32 %r30784, %r30775; + mov.u32 %r30785, %r30775; + mov.u32 %r30786, %r30775; + mov.u32 %r30787, %r30775; + mov.u32 %r30788, %r30775; + mov.u32 %r30789, %r24035; + mov.u32 %r30791, %r30775; + mov.u32 %r30792, %r30775; + mov.u32 %r30793, %r30775; + mov.u32 %r30794, %r30775; + mov.u32 %r30795, %r30775; + mov.u32 %r30796, %r30775; + mov.u32 %r30797, %r30775; + mov.u32 %r30798, %r30775; + mov.u32 %r30799, %r30775; + mov.u32 %r30800, %r30775; + mov.u32 %r30801, %r30775; + mov.u32 %r30802, %r30775; + mov.u32 %r30803, %r30775; + mov.u32 %r30804, %r30775; + mov.u32 %r30805, %r30775; + mov.u32 %r30806, %r30775; + mov.u32 %r30807, %r30775; + mov.u32 %r30808, %r30775; + mov.u32 %r30825, %r30775; + +$L__BB2_86: + // begin inline asm + // xor5 + lop3.b32 %r24076, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24076, %r24076, %r30805, %r30803, 0x96; + lop3.b32 %r24077, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24077, %r24077, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24088, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24088, %r24088, %r30799, %r30797, 0x96; + lop3.b32 %r24089, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24089, %r24089, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24100, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24100, %r24100, %r30793, %r30791, 0x96; + lop3.b32 %r24101, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24101, %r24101, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24112, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24112, %r24112, %r30785, %r30783, 0x96; + lop3.b32 %r24113, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24113, %r24113, %r30786, %r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24124, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24124, %r24124, %r30777, %r30775, 0x96; + lop3.b32 %r24125, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24125, %r24125, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24136, %r24089, %r24088, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24140, %r24088, %r24089, %r24035; + // end inline asm + xor.b32 %r24570, %r24136, %r24124; + xor.b32 %r24571, %r24140, %r24125; + xor.b32 %r24403, %r30811, %r24570; + xor.b32 %r24406, %r30812, %r24571; + xor.b32 %r24310, %r30809, %r24570; + xor.b32 %r24309, %r30810, %r24571; + xor.b32 %r24357, %r30807, %r24570; + xor.b32 %r24358, %r30808, %r24571; + xor.b32 %r24262, %r30805, %r24570; + xor.b32 %r24261, %r30806, %r24571; + xor.b32 %r24213, %r30803, %r24570; + xor.b32 
%r24214, %r30804, %r24571; + // begin inline asm + shf.l.wrap.b32 %r24144, %r24101, %r24100, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24148, %r24100, %r24101, %r24035; + // end inline asm + xor.b32 %r24572, %r24144, %r24076; + xor.b32 %r24573, %r24148, %r24077; + xor.b32 %r24365, %r30823, %r24572; + xor.b32 %r24366, %r30824, %r24573; + xor.b32 %r24182, %r30821, %r24572; + xor.b32 %r24181, %r30822, %r24573; + xor.b32 %r24341, %r30801, %r24572; + xor.b32 %r24342, %r30802, %r24573; + xor.b32 %r24302, %r30799, %r24572; + xor.b32 %r24301, %r30800, %r24573; + xor.b32 %r24285, %r30797, %r24572; + xor.b32 %r24286, %r30798, %r24573; + // begin inline asm + shf.l.wrap.b32 %r24152, %r24113, %r24112, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24156, %r24112, %r24113, %r24035; + // end inline asm + xor.b32 %r24574, %r24152, %r24088; + xor.b32 %r24575, %r24156, %r24089; + xor.b32 %r24222, %r30819, %r24574; + xor.b32 %r24221, %r30820, %r24575; + xor.b32 %r24349, %r30817, %r24574; + xor.b32 %r24350, %r30818, %r24575; + xor.b32 %r24230, %r30795, %r24574; + xor.b32 %r24229, %r30796, %r24575; + xor.b32 %r24333, %r30793, %r24574; + xor.b32 %r24334, %r30794, %r24575; + xor.b32 %r24198, %r30791, %r24574; + xor.b32 %r24197, %r30792, %r24575; + // begin inline asm + shf.l.wrap.b32 %r24160, %r24125, %r24124, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24164, %r24124, %r24125, %r24035; + // end inline asm + xor.b32 %r24576, %r24160, %r24100; + xor.b32 %r24577, %r24164, %r24101; + xor.b32 %r24317, %r30815, %r24576; + xor.b32 %r24318, %r30816, %r24577; + xor.b32 %r24294, %r30789, %r24576; + xor.b32 %r24293, %r30790, %r24577; + xor.b32 %r24237, %r30787, %r24576; + xor.b32 %r24238, %r30788, %r24577; + xor.b32 %r24325, %r30785, %r24576; + xor.b32 %r24326, %r30786, %r24577; + xor.b32 %r24254, %r30783, %r24576; + xor.b32 %r24253, %r30784, %r24577; + // begin inline asm + shf.l.wrap.b32 %r24168, %r24077, %r24076, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24172, %r24076, %r24077, %r24035; + // end inline asm + xor.b32 %r24578, %r24168, %r24112; + xor.b32 %r24579, %r24172, %r24113; + xor.b32 %r24269, %r30813, %r24578; + xor.b32 %r24270, %r30814, %r24579; + xor.b32 %r24189, %r30781, %r24578; + xor.b32 %r24190, %r30782, %r24579; + xor.b32 %r24206, %r30779, %r24578; + xor.b32 %r24205, %r30780, %r24579; + xor.b32 %r24245, %r30777, %r24578; + xor.b32 %r24246, %r30778, %r24579; + xor.b32 %r24277, %r30775, %r24578; + xor.b32 %r24278, %r30776, %r24579; + mov.u32 %r24183, 44; + // begin inline asm + shf.l.wrap.b32 %r24176, %r24182, %r24181, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24180, %r24181, %r24182, %r24183; + // end inline asm + mov.u32 %r24191, 20; + // begin inline asm + shf.l.wrap.b32 %r24184, %r24190, %r24189, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24188, %r24189, %r24190, %r24191; + // end inline asm + mov.u32 %r24199, 61; + // begin inline asm + shf.l.wrap.b32 %r24192, %r24198, %r24197, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24196, %r24197, %r24198, %r24199; + // end inline asm + mov.u32 %r24207, 39; + // begin inline asm + shf.l.wrap.b32 %r24200, %r24206, %r24205, %r24207; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24204, %r24205, %r24206, %r24207; + // end inline asm + mov.u32 %r24215, 18; + // begin inline asm + shf.l.wrap.b32 %r24208, %r24214, %r24213, %r24215; + // end inline asm + // begin inline asm 
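+ // rho: each lane gets its fixed keccak rotation offset (44, 20, 61, 39, 18, ... loaded as immediates), again as paired 32-bit funnel shifts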
+ shf.l.wrap.b32 %r24212, %r24213, %r24214, %r24215; + // end inline asm + mov.u32 %r24223, 62; + // begin inline asm + shf.l.wrap.b32 %r24216, %r24222, %r24221, %r24223; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24220, %r24221, %r24222, %r24223; + // end inline asm + mov.u32 %r24231, 43; + // begin inline asm + shf.l.wrap.b32 %r24224, %r24230, %r24229, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24228, %r24229, %r24230, %r24231; + // end inline asm + mov.u32 %r24239, 25; + // begin inline asm + shf.l.wrap.b32 %r24232, %r24238, %r24237, %r24239; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24236, %r24237, %r24238, %r24239; + // end inline asm + mov.u32 %r24247, 8; + // begin inline asm + shf.l.wrap.b32 %r24240, %r24246, %r24245, %r24247; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24244, %r24245, %r24246, %r24247; + // end inline asm + mov.u32 %r24255, 56; + // begin inline asm + shf.l.wrap.b32 %r24248, %r24254, %r24253, %r24255; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24252, %r24253, %r24254, %r24255; + // end inline asm + mov.u32 %r24263, 41; + // begin inline asm + shf.l.wrap.b32 %r24256, %r24262, %r24261, %r24263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24260, %r24261, %r24262, %r24263; + // end inline asm + mov.u32 %r24271, 27; + // begin inline asm + shf.l.wrap.b32 %r24264, %r24270, %r24269, %r24271; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24268, %r24269, %r24270, %r24271; + // end inline asm + mov.u32 %r24279, 14; + // begin inline asm + shf.l.wrap.b32 %r24272, %r24278, %r24277, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24276, %r24277, %r24278, %r24279; + // end inline asm + mov.u32 %r24287, 2; + // begin inline asm + shf.l.wrap.b32 %r24280, %r24286, %r24285, %r24287; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24284, %r24285, %r24286, %r24287; + // end inline asm + mov.u32 %r24295, 55; + // begin inline asm + shf.l.wrap.b32 %r24288, %r24294, %r24293, %r24295; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24292, %r24293, %r24294, %r24295; + // end inline asm + mov.u32 %r24303, 45; + // begin inline asm + shf.l.wrap.b32 %r24296, %r24302, %r24301, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24300, %r24301, %r24302, %r24303; + // end inline asm + mov.u32 %r24311, 36; + // begin inline asm + shf.l.wrap.b32 %r24304, %r24310, %r24309, %r24311; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24308, %r24309, %r24310, %r24311; + // end inline asm + mov.u32 %r24319, 28; + // begin inline asm + shf.l.wrap.b32 %r24312, %r24318, %r24317, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24316, %r24317, %r24318, %r24319; + // end inline asm + mov.u32 %r24327, 21; + // begin inline asm + shf.l.wrap.b32 %r24320, %r24326, %r24325, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24324, %r24325, %r24326, %r24327; + // end inline asm + mov.u32 %r24335, 15; + // begin inline asm + shf.l.wrap.b32 %r24328, %r24334, %r24333, %r24335; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24332, %r24333, %r24334, %r24335; + // end inline asm + mov.u32 %r24343, 10; + // begin inline asm + shf.l.wrap.b32 %r24336, %r24342, %r24341, %r24343; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24340, %r24341, %r24342, %r24343; + // end inline asm + mov.u32 %r24351, 6; + // begin inline asm + shf.l.wrap.b32 %r24344, 
%r24350, %r24349, %r24351; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24348, %r24349, %r24350, %r24351; + // end inline asm + mov.u32 %r24359, 3; + // begin inline asm + shf.l.wrap.b32 %r24352, %r24358, %r24357, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24356, %r24357, %r24358, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24360, %r24366, %r24365, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24364, %r24365, %r24366, %r24035; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24368, %r24403, %r24176, %r24224, 0xD2; + lop3.b32 %r24369, %r24406, %r24180, %r24228, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30823, %r24176, %r24224, %r24320, 0xD2; + lop3.b32 %r30824, %r24180, %r24228, %r24324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30819, %r24224, %r24320, %r24272, 0xD2; + lop3.b32 %r30820, %r24228, %r24324, %r24276, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30815, %r24320, %r24272, %r24403, 0xD2; + lop3.b32 %r30816, %r24324, %r24276, %r24406, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30813, %r24272, %r24403, %r24176, 0xD2; + lop3.b32 %r30814, %r24276, %r24406, %r24180, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30809, %r24312, %r24184, %r24352, 0xD2; + lop3.b32 %r30810, %r24316, %r24188, %r24356, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30821, %r24184, %r24352, %r24296, 0xD2; + lop3.b32 %r30822, %r24188, %r24356, %r24300, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30817, %r24352, %r24296, %r24192, 0xD2; + lop3.b32 %r30818, %r24356, %r24300, %r24196, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30789, %r24296, %r24192, %r24312, 0xD2; + lop3.b32 %r30790, %r24300, %r24196, %r24316, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30789, %r30790}; + // begin inline asm + // chi + lop3.b32 %r30781, %r24192, %r24312, %r24184, 0xD2; + lop3.b32 %r30782, %r24196, %r24316, %r24188, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30781, %r30782}; + // begin inline asm + // chi + lop3.b32 %r30807, %r24360, %r24344, %r24232, 0xD2; + lop3.b32 %r30808, %r24364, %r24348, %r24236, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30807, %r30808}; + // begin inline asm + // chi + lop3.b32 %r30801, %r24344, %r24232, %r24240, 0xD2; + lop3.b32 %r30802, %r24348, %r24236, %r24244, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30801, %r30802}; + // begin inline asm + // chi + lop3.b32 %r30795, %r24232, %r24240, %r24208, 0xD2; + lop3.b32 %r30796, %r24236, %r24244, %r24212, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30795, %r30796}; + // begin inline asm + // chi + lop3.b32 %r30787, %r24240, %r24208, %r24360, 0xD2; + lop3.b32 %r30788, %r24244, %r24212, %r24364, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30787, %r30788}; + // begin inline asm + // chi + lop3.b32 %r30779, %r24208, %r24360, %r24344, 0xD2; + lop3.b32 %r30780, %r24212, %r24364, %r24348, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30779, %r30780}; + // begin inline asm + // chi + lop3.b32 %r30805, %r24264, %r24304, %r24336, 0xD2; + lop3.b32 %r30806, %r24268, %r24308, %r24340, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30805, %r30806}; + // begin inline asm + // chi + lop3.b32 %r30799, %r24304, %r24336, %r24328, 0xD2; 
+ lop3.b32 %r30800, %r24308, %r24340, %r24332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30799, %r30800}; + // begin inline asm + // chi + lop3.b32 %r30793, %r24336, %r24328, %r24248, 0xD2; + lop3.b32 %r30794, %r24340, %r24332, %r24252, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30793, %r30794}; + // begin inline asm + // chi + lop3.b32 %r30785, %r24328, %r24248, %r24264, 0xD2; + lop3.b32 %r30786, %r24332, %r24252, %r24268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30785, %r30786}; + // begin inline asm + // chi + lop3.b32 %r30777, %r24248, %r24264, %r24304, 0xD2; + lop3.b32 %r30778, %r24252, %r24268, %r24308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30777, %r30778}; + // begin inline asm + // chi + lop3.b32 %r30803, %r24216, %r24288, %r24200, 0xD2; + lop3.b32 %r30804, %r24220, %r24292, %r24204, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+184], {%r30803, %r30804}; + // begin inline asm + // chi + lop3.b32 %r30797, %r24288, %r24200, %r24256, 0xD2; + lop3.b32 %r30798, %r24292, %r24204, %r24260, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30797, %r30798}; + // begin inline asm + // chi + lop3.b32 %r30791, %r24200, %r24256, %r24280, 0xD2; + lop3.b32 %r30792, %r24204, %r24260, %r24284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30791, %r30792}; + // begin inline asm + // chi + lop3.b32 %r30783, %r24256, %r24280, %r24216, 0xD2; + lop3.b32 %r30784, %r24260, %r24284, %r24220, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30783, %r30784}; + // begin inline asm + // chi + lop3.b32 %r30775, %r24280, %r24216, %r24288, 0xD2; + lop3.b32 %r30776, %r24284, %r24220, %r24292, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30775, %r30776}; + mul.wide.s32 %rd1050, %r30825, 8; + add.s64 %rd1049, %rd1029, %rd1050; + // begin inline asm + ld.global.nc.v2.u32 {%r24568,%r24569}, [%rd1049]; + // end inline asm + xor.b32 %r30811, %r24368, %r24568; + xor.b32 %r30812, %r24369, %r24569; + add.s32 %r30825, %r30825, 1; + setp.lt.u32 %p48, %r30825, 23; + @%p48 bra $L__BB2_86; + + mov.u32 %r30858, 0; + mov.u32 %r24679, 1; + st.local.v2.u32 [%rd272+32], {%r30823, %r30824}; + st.local.v2.u32 [%rd272+72], {%r30821, %r30822}; + st.local.v2.u32 [%rd272+40], {%r30819, %r30820}; + st.local.v2.u32 [%rd272+80], {%r30817, %r30818}; + st.local.v2.u32 [%rd272+48], {%r30815, %r30816}; + st.local.v2.u32 [%rd272+56], {%r30813, %r30814}; + st.local.v2.u32 [%rd272+24], {%r30811, %r30812}; + // begin inline asm + // xor5 + lop3.b32 %r24580, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24580, %r24580, %r30805, %r30803, 0x96; + lop3.b32 %r24581, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24581, %r24581, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24592, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24592, %r24592, %r30799, %r30797, 0x96; + lop3.b32 %r24593, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24593, %r24593, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24604, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24604, %r24604, %r30793, %r30791, 0x96; + lop3.b32 %r24605, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24605, %r24605, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24616, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24616, %r24616, %r30785, %r30783, 0x96; + lop3.b32 %r24617, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24617, %r24617, %r30786, 
%r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24628, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24628, %r24628, %r30777, %r30775, 0x96; + lop3.b32 %r24629, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24629, %r24629, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24640, %r24593, %r24592, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24644, %r24592, %r24593, %r24679; + // end inline asm + xor.b32 %r24819, %r24640, %r24628; + xor.b32 %r24820, %r24644, %r24629; + xor.b32 %r24787, %r30811, %r24819; + xor.b32 %r24790, %r30812, %r24820; + xor.b32 %r24750, %r30808, %r24820; + xor.b32 %r24749, %r30807, %r24819; + st.local.v2.u32 [%rd272+104], {%r24749, %r24750}; + // begin inline asm + shf.l.wrap.b32 %r24648, %r24605, %r24604, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24652, %r24604, %r24605, %r24679; + // end inline asm + xor.b32 %r24821, %r24648, %r24580; + xor.b32 %r24822, %r24652, %r24581; + xor.b32 %r24686, %r30821, %r24821; + xor.b32 %r24685, %r30822, %r24822; + xor.b32 %r24725, %r30800, %r24822; + xor.b32 %r24726, %r30799, %r24821; + st.local.v2.u32 [%rd272+152], {%r24726, %r24725}; + // begin inline asm + shf.l.wrap.b32 %r24656, %r24617, %r24616, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24660, %r24616, %r24617, %r24679; + // end inline asm + xor.b32 %r24823, %r24656, %r24592; + xor.b32 %r24824, %r24660, %r24593; + xor.b32 %r24709, %r30796, %r24824; + xor.b32 %r24710, %r30795, %r24823; + st.local.v2.u32 [%rd272+120], {%r24710, %r24709}; + xor.b32 %r24701, %r30792, %r24824; + xor.b32 %r24702, %r30791, %r24823; + st.local.v2.u32 [%rd272+200], {%r24702, %r24701}; + // begin inline asm + shf.l.wrap.b32 %r24664, %r24629, %r24628, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24668, %r24628, %r24629, %r24679; + // end inline asm + xor.b32 %r24825, %r24664, %r24604; + xor.b32 %r24826, %r24668, %r24605; + xor.b32 %r24733, %r30815, %r24825; + xor.b32 %r24734, %r30816, %r24826; + xor.b32 %r24742, %r30786, %r24826; + xor.b32 %r24741, %r30785, %r24825; + st.local.v2.u32 [%rd272+168], {%r24741, %r24742}; + // begin inline asm + shf.l.wrap.b32 %r24672, %r24581, %r24580, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24676, %r24580, %r24581, %r24679; + // end inline asm + xor.b32 %r24827, %r24672, %r24616; + xor.b32 %r24828, %r24676, %r24617; + xor.b32 %r24693, %r30781, %r24827; + xor.b32 %r24694, %r30782, %r24828; + xor.b32 %r24718, %r30776, %r24828; + xor.b32 %r24717, %r30775, %r24827; + st.local.v2.u32 [%rd272+216], {%r24717, %r24718}; + // begin inline asm + shf.l.wrap.b32 %r24680, %r24686, %r24685, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24684, %r24685, %r24686, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24688, %r24694, %r24693, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24692, %r24693, %r24694, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24700, %r24701, %r24702, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24696, %r24702, %r24701, %r24199; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r24696, %r24700}; + // begin inline asm + shf.l.wrap.b32 %r24704, %r24710, %r24709, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24708, %r24709, %r24710, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24712, %r24718, %r24717, 
%r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24716, %r24717, %r24718, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24724, %r24725, %r24726, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24720, %r24726, %r24725, %r24303; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r24720, %r24724}; + // begin inline asm + shf.l.wrap.b32 %r24728, %r24734, %r24733, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24732, %r24733, %r24734, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24736, %r24742, %r24741, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24740, %r24741, %r24742, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24744, %r24750, %r24749, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24748, %r24749, %r24750, %r24359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24752, %r24787, %r24680, %r24704, 0xD2; + lop3.b32 %r24753, %r24790, %r24684, %r24708, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r24680, %r24704, %r24736, 0xD2; + lop3.b32 %r30959, %r24684, %r24708, %r24740, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30954, %r24704, %r24736, %r24712, 0xD2; + lop3.b32 %r30955, %r24708, %r24740, %r24716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + // begin inline asm + // chi + lop3.b32 %r30950, %r24736, %r24712, %r24787, 0xD2; + lop3.b32 %r30951, %r24740, %r24716, %r24790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + // begin inline asm + // chi + lop3.b32 %r30948, %r24712, %r24787, %r24680, 0xD2; + lop3.b32 %r30949, %r24716, %r24790, %r24684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + // begin inline asm + // chi + lop3.b32 %r30944, %r24728, %r24688, %r24744, 0xD2; + lop3.b32 %r30945, %r24732, %r24692, %r24748, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + // begin inline asm + // chi + lop3.b32 %r30956, %r24688, %r24744, %r24720, 0xD2; + lop3.b32 %r30957, %r24692, %r24748, %r24724, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30952, %r24744, %r24720, %r24696, 0xD2; + lop3.b32 %r30953, %r24748, %r24724, %r24700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + // begin inline asm + ld.global.nc.v2.u32 {%r24816,%r24817}, [%rd1030]; + // end inline asm + xor.b32 %r30946, %r24752, %r24816; + xor.b32 %r30947, %r24753, %r24817; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + add.s64 %rd275, %rd272, 24; + add.s64 %rd276, %rd3, 24; + +$L__BB2_88: + shl.b32 %r24829, %r30858, 2; + cvt.u64.u32 %rd1058, %r24829; + and.b64 %rd1059, %rd1058, 60; + add.s64 %rd1060, %rd276, %rd1059; + xor.b32 %r24830, %r3326, %r30858; + mul.lo.s32 %r24831, %r24830, 16777619; + ld.local.u32 %r24832, [%rd1060]; + xor.b32 %r24833, %r24831, %r24832; + mul.wide.u32 %rd1061, %r24833, -954391867; + shr.u64 %rd1062, %rd1061, 32; + cvt.u32.u64 %r24834, %rd1062; + sub.s32 %r24835, %r24833, %r24834; + shr.u32 %r24836, %r24835, 1; + add.s32 %r24837, %r24836, %r24834; + shr.u32 %r24838, %r24837, 20; + mul.lo.s32 %r24839, %r24838, 1179641; + sub.s32 %r24840, %r24833, %r24839; + mul.wide.u32 %rd1063, %r24840, 64; + add.s64 %rd1064, %rd471, %rd1063; + mul.lo.s32 %r24841, %r30895, 
16777619; + ld.global.u32 %r24842, [%rd1064]; + xor.b32 %r30895, %r24841, %r24842; + mul.lo.s32 %r24843, %r30896, 16777619; + ld.global.u32 %r24844, [%rd1064+4]; + xor.b32 %r30896, %r24843, %r24844; + mul.lo.s32 %r24845, %r30907, 16777619; + ld.global.u32 %r24846, [%rd1064+8]; + mul.lo.s32 %r24847, %r30908, 16777619; + ld.global.u32 %r24848, [%rd1064+12]; + xor.b32 %r24849, %r24847, %r24848; + xor.b32 %r30907, %r24845, %r24846; + mov.b64 %rd1065, {%r30907, %r24849}; + mul.lo.s32 %r24850, %r30903, 16777619; + ld.global.u32 %r24851, [%rd1064+16]; + mul.lo.s32 %r24852, %r30904, 16777619; + ld.global.u32 %r24853, [%rd1064+20]; + xor.b32 %r24854, %r24852, %r24853; + xor.b32 %r30903, %r24850, %r24851; + mov.b64 %rd1066, {%r30903, %r24854}; + mul.lo.s32 %r24855, %r30899, 16777619; + ld.global.u32 %r24856, [%rd1064+24]; + mul.lo.s32 %r24857, %r30900, 16777619; + ld.global.u32 %r24858, [%rd1064+28]; + xor.b32 %r24859, %r24857, %r24858; + xor.b32 %r30899, %r24855, %r24856; + mov.b64 %rd1067, {%r30899, %r24859}; + mul.lo.s32 %r24860, %r30897, 16777619; + ld.global.u32 %r24861, [%rd1064+32]; + mul.lo.s32 %r24862, %r30898, 16777619; + ld.global.u32 %r24863, [%rd1064+36]; + xor.b32 %r24864, %r24862, %r24863; + xor.b32 %r30897, %r24860, %r24861; + mov.b64 %rd1068, {%r30897, %r24864}; + mul.lo.s32 %r24865, %r30893, 16777619; + ld.global.u32 %r24866, [%rd1064+40]; + xor.b32 %r30893, %r24865, %r24866; + mul.lo.s32 %r24867, %r30894, 16777619; + ld.global.u32 %r24868, [%rd1064+44]; + xor.b32 %r30894, %r24867, %r24868; + mul.lo.s32 %r24869, %r30905, 16777619; + ld.global.u32 %r24870, [%rd1064+48]; + mul.lo.s32 %r24871, %r30906, 16777619; + ld.global.u32 %r24872, [%rd1064+52]; + xor.b32 %r24873, %r24871, %r24872; + xor.b32 %r30905, %r24869, %r24870; + mov.b64 %rd1069, {%r30905, %r24873}; + mul.lo.s32 %r24874, %r30901, 16777619; + ld.global.u32 %r24875, [%rd1064+56]; + mul.lo.s32 %r24876, %r30902, 16777619; + ld.global.u32 %r24877, [%rd1064+60]; + xor.b32 %r24878, %r24876, %r24877; + xor.b32 %r30901, %r24874, %r24875; + mov.b64 %rd1070, {%r30901, %r24878}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.v2.u32 [%rd3+32], {%r30907, %r24849}; + st.local.v2.u32 [%rd3+40], {%r30903, %r24854}; + st.local.v2.u32 [%rd3+48], {%r30899, %r24859}; + st.local.v2.u32 [%rd3+56], {%r30897, %r24864}; + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + st.local.v2.u32 [%rd3+72], {%r30905, %r24873}; + st.local.v2.u32 [%rd3+80], {%r30901, %r24878}; + add.s64 %rd1071, %rd275, %rd1059; + xor.b32 %r24879, %r3327, %r30858; + mul.lo.s32 %r24880, %r24879, 16777619; + ld.local.u32 %r24881, [%rd1071]; + xor.b32 %r24882, %r24880, %r24881; + mul.wide.u32 %rd1072, %r24882, -954391867; + shr.u64 %rd1073, %rd1072, 32; + cvt.u32.u64 %r24883, %rd1073; + sub.s32 %r24884, %r24882, %r24883; + shr.u32 %r24885, %r24884, 1; + add.s32 %r24886, %r24885, %r24883; + shr.u32 %r24887, %r24886, 20; + mul.lo.s32 %r24888, %r24887, 1179641; + sub.s32 %r24889, %r24882, %r24888; + mul.wide.u32 %rd1074, %r24889, 64; + add.s64 %rd1075, %rd471, %rd1074; + mul.lo.s32 %r24890, %r30946, 16777619; + ld.global.u32 %r24891, [%rd1075]; + xor.b32 %r30946, %r24890, %r24891; + mul.lo.s32 %r24892, %r30947, 16777619; + ld.global.u32 %r24893, [%rd1075+4]; + xor.b32 %r30947, %r24892, %r24893; + mul.lo.s32 %r24894, %r30958, 16777619; + ld.global.u32 %r24895, [%rd1075+8]; + mul.lo.s32 %r24896, %r30959, 16777619; + ld.global.u32 %r24897, [%rd1075+12]; + xor.b32 %r24898, %r24896, %r24897; + xor.b32 %r30958, %r24894, %r24895; + mov.b64 %rd1076, {%r30958, %r24898}; + 
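+ // [annotation, hedged -- editor comment, not compiler output] This $L__BB2_88 body appears
+ // to fold one 64-byte table item into each of two interleaved states (%rd3 and %rd272) per
+ // iteration: the item index comes from an FNV-style mix (multiply by the 32-bit FNV prime
+ // 16777619 = 0x01000193, then xor) reduced mod 1179641 via a multiply-shift reciprocal (the
+ // -954391867 magic), and the loop runs 512 iterations -- consistent with a FishHash-style
+ // cache lookup, though the exact scheme is inferred from the constants, not stated here.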
mul.lo.s32 %r24899, %r30954, 16777619; + ld.global.u32 %r24900, [%rd1075+16]; + mul.lo.s32 %r24901, %r30955, 16777619; + ld.global.u32 %r24902, [%rd1075+20]; + xor.b32 %r24903, %r24901, %r24902; + xor.b32 %r30954, %r24899, %r24900; + mov.b64 %rd1077, {%r30954, %r24903}; + mul.lo.s32 %r24904, %r30950, 16777619; + ld.global.u32 %r24905, [%rd1075+24]; + mul.lo.s32 %r24906, %r30951, 16777619; + ld.global.u32 %r24907, [%rd1075+28]; + xor.b32 %r24908, %r24906, %r24907; + xor.b32 %r30950, %r24904, %r24905; + mov.b64 %rd1078, {%r30950, %r24908}; + mul.lo.s32 %r24909, %r30948, 16777619; + ld.global.u32 %r24910, [%rd1075+32]; + mul.lo.s32 %r24911, %r30949, 16777619; + ld.global.u32 %r24912, [%rd1075+36]; + xor.b32 %r24913, %r24911, %r24912; + xor.b32 %r30948, %r24909, %r24910; + mov.b64 %rd1079, {%r30948, %r24913}; + mul.lo.s32 %r24914, %r30944, 16777619; + ld.global.u32 %r24915, [%rd1075+40]; + xor.b32 %r30944, %r24914, %r24915; + mul.lo.s32 %r24916, %r30945, 16777619; + ld.global.u32 %r24917, [%rd1075+44]; + xor.b32 %r30945, %r24916, %r24917; + mul.lo.s32 %r24918, %r30956, 16777619; + ld.global.u32 %r24919, [%rd1075+48]; + mul.lo.s32 %r24920, %r30957, 16777619; + ld.global.u32 %r24921, [%rd1075+52]; + xor.b32 %r24922, %r24920, %r24921; + xor.b32 %r30956, %r24918, %r24919; + mov.b64 %rd1080, {%r30956, %r24922}; + mul.lo.s32 %r24923, %r30952, 16777619; + ld.global.u32 %r24924, [%rd1075+56]; + mul.lo.s32 %r24925, %r30953, 16777619; + ld.global.u32 %r24926, [%rd1075+60]; + xor.b32 %r24927, %r24925, %r24926; + xor.b32 %r30952, %r24923, %r24924; + mov.b64 %rd1081, {%r30952, %r24927}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + st.local.v2.u32 [%rd272+32], {%r30958, %r24898}; + st.local.v2.u32 [%rd272+40], {%r30954, %r24903}; + st.local.v2.u32 [%rd272+48], {%r30950, %r24908}; + st.local.v2.u32 [%rd272+56], {%r30948, %r24913}; + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + st.local.v2.u32 [%rd272+72], {%r30956, %r24922}; + st.local.v2.u32 [%rd272+80], {%r30952, %r24927}; + add.s32 %r30858, %r30858, 1; + setp.lt.u32 %p49, %r30858, 512; + shr.u64 %rd1082, %rd1065, 32; + cvt.u32.u64 %r30908, %rd1082; + shr.u64 %rd1083, %rd1066, 32; + cvt.u32.u64 %r30904, %rd1083; + shr.u64 %rd1084, %rd1067, 32; + cvt.u32.u64 %r30900, %rd1084; + shr.u64 %rd1085, %rd1068, 32; + cvt.u32.u64 %r30898, %rd1085; + shr.u64 %rd1086, %rd1069, 32; + cvt.u32.u64 %r30906, %rd1086; + shr.u64 %rd1087, %rd1070, 32; + cvt.u32.u64 %r30902, %rd1087; + shr.u64 %rd1088, %rd1076, 32; + cvt.u32.u64 %r30959, %rd1088; + shr.u64 %rd1089, %rd1077, 32; + cvt.u32.u64 %r30955, %rd1089; + shr.u64 %rd1090, %rd1078, 32; + cvt.u32.u64 %r30951, %rd1090; + shr.u64 %rd1091, %rd1079, 32; + cvt.u32.u64 %r30949, %rd1091; + shr.u64 %rd1092, %rd1080, 32; + cvt.u32.u64 %r30957, %rd1092; + shr.u64 %rd1093, %rd1081, 32; + cvt.u32.u64 %r30953, %rd1093; + @%p49 bra $L__BB2_88; + + mov.u32 %r30859, 0; + st.local.v2.u32 [%rd3+96], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+104], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+112], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+120], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+128], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+136], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+144], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+152], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+160], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+168], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+176], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+184], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+192], {%r30859, %r30859}; + st.local.v2.u32 
[%rd3+200], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+208], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+216], {%r30859, %r30859}; + mov.u32 %r30874, -2147483648; + mov.u32 %r24942, 1; + st.local.v2.u32 [%rd3+88], {%r24942, %r30874}; + mov.u32 %r30860, %r30859; + mov.u32 %r30861, %r30859; + mov.u32 %r30862, %r30859; + mov.u32 %r30863, %r30859; + mov.u32 %r30864, %r30859; + mov.u32 %r30865, %r30859; + mov.u32 %r30866, %r30859; + mov.u32 %r30867, %r30859; + mov.u32 %r30868, %r30859; + mov.u32 %r30869, %r30859; + mov.u32 %r30870, %r30859; + mov.u32 %r30871, %r30859; + mov.u32 %r30872, %r30859; + mov.u32 %r30873, %r24942; + mov.u32 %r30875, %r30859; + mov.u32 %r30876, %r30859; + mov.u32 %r30877, %r30859; + mov.u32 %r30878, %r30859; + mov.u32 %r30879, %r30859; + mov.u32 %r30880, %r30859; + mov.u32 %r30881, %r30859; + mov.u32 %r30882, %r30859; + mov.u32 %r30883, %r30859; + mov.u32 %r30884, %r30859; + mov.u32 %r30885, %r30859; + mov.u32 %r30886, %r30859; + mov.u32 %r30887, %r30859; + mov.u32 %r30888, %r30859; + mov.u32 %r30889, %r30859; + mov.u32 %r30890, %r30859; + mov.u32 %r30891, %r30859; + mov.u32 %r30892, %r30859; + mov.u32 %r30909, %r30859; + +$L__BB2_90: + // begin inline asm + // xor5 + lop3.b32 %r24969, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r24969, %r24969, %r30889, %r30887, 0x96; + lop3.b32 %r24970, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r24970, %r24970, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24981, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r24981, %r24981, %r30883, %r30881, 0x96; + lop3.b32 %r24982, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r24982, %r24982, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24993, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r24993, %r24993, %r30877, %r30875, 0x96; + lop3.b32 %r24994, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r24994, %r24994, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25005, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25005, %r25005, %r30869, %r30867, 0x96; + lop3.b32 %r25006, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25006, %r25006, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25017, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25017, %r25017, %r30861, %r30859, 0x96; + lop3.b32 %r25018, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25018, %r25018, %r30862, %r30860, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25029, %r24982, %r24981, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25033, %r24981, %r24982, %r24942; + // end inline asm + xor.b32 %r25463, %r25029, %r25017; + xor.b32 %r25464, %r25033, %r25018; + xor.b32 %r25296, %r30895, %r25463; + xor.b32 %r25299, %r30896, %r25464; + xor.b32 %r25203, %r30893, %r25463; + xor.b32 %r25202, %r30894, %r25464; + xor.b32 %r25250, %r30891, %r25463; + xor.b32 %r25251, %r30892, %r25464; + xor.b32 %r25155, %r30889, %r25463; + xor.b32 %r25154, %r30890, %r25464; + xor.b32 %r25106, %r30887, %r25463; + xor.b32 %r25107, %r30888, %r25464; + // begin inline asm + shf.l.wrap.b32 %r25037, %r24994, %r24993, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25041, %r24993, %r24994, %r24942; + // end inline asm + xor.b32 %r25465, %r25037, %r24969; + xor.b32 %r25466, %r25041, %r24970; + xor.b32 %r25258, %r30907, %r25465; + xor.b32 %r25259, %r30908, %r25466; + xor.b32 %r25075, %r30905, %r25465; + xor.b32 %r25074, %r30906, %r25466; + xor.b32 
%r25234, %r30885, %r25465; + xor.b32 %r25235, %r30886, %r25466; + xor.b32 %r25195, %r30883, %r25465; + xor.b32 %r25194, %r30884, %r25466; + xor.b32 %r25178, %r30881, %r25465; + xor.b32 %r25179, %r30882, %r25466; + // begin inline asm + shf.l.wrap.b32 %r25045, %r25006, %r25005, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25049, %r25005, %r25006, %r24942; + // end inline asm + xor.b32 %r25467, %r25045, %r24981; + xor.b32 %r25468, %r25049, %r24982; + xor.b32 %r25115, %r30903, %r25467; + xor.b32 %r25114, %r30904, %r25468; + xor.b32 %r25242, %r30901, %r25467; + xor.b32 %r25243, %r30902, %r25468; + xor.b32 %r25123, %r30879, %r25467; + xor.b32 %r25122, %r30880, %r25468; + xor.b32 %r25226, %r30877, %r25467; + xor.b32 %r25227, %r30878, %r25468; + xor.b32 %r25091, %r30875, %r25467; + xor.b32 %r25090, %r30876, %r25468; + // begin inline asm + shf.l.wrap.b32 %r25053, %r25018, %r25017, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25057, %r25017, %r25018, %r24942; + // end inline asm + xor.b32 %r25469, %r25053, %r24993; + xor.b32 %r25470, %r25057, %r24994; + xor.b32 %r25210, %r30899, %r25469; + xor.b32 %r25211, %r30900, %r25470; + xor.b32 %r25187, %r30873, %r25469; + xor.b32 %r25186, %r30874, %r25470; + xor.b32 %r25130, %r30871, %r25469; + xor.b32 %r25131, %r30872, %r25470; + xor.b32 %r25218, %r30869, %r25469; + xor.b32 %r25219, %r30870, %r25470; + xor.b32 %r25147, %r30867, %r25469; + xor.b32 %r25146, %r30868, %r25470; + // begin inline asm + shf.l.wrap.b32 %r25061, %r24970, %r24969, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25065, %r24969, %r24970, %r24942; + // end inline asm + xor.b32 %r25471, %r25061, %r25005; + xor.b32 %r25472, %r25065, %r25006; + xor.b32 %r25162, %r30897, %r25471; + xor.b32 %r25163, %r30898, %r25472; + xor.b32 %r25082, %r30865, %r25471; + xor.b32 %r25083, %r30866, %r25472; + xor.b32 %r25099, %r30863, %r25471; + xor.b32 %r25098, %r30864, %r25472; + xor.b32 %r25138, %r30861, %r25471; + xor.b32 %r25139, %r30862, %r25472; + xor.b32 %r25170, %r30859, %r25471; + xor.b32 %r25171, %r30860, %r25472; + mov.u32 %r25076, 44; + // begin inline asm + shf.l.wrap.b32 %r25069, %r25075, %r25074, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25073, %r25074, %r25075, %r25076; + // end inline asm + mov.u32 %r25084, 20; + // begin inline asm + shf.l.wrap.b32 %r25077, %r25083, %r25082, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25081, %r25082, %r25083, %r25084; + // end inline asm + mov.u32 %r25092, 61; + // begin inline asm + shf.l.wrap.b32 %r25085, %r25091, %r25090, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25089, %r25090, %r25091, %r25092; + // end inline asm + mov.u32 %r25100, 39; + // begin inline asm + shf.l.wrap.b32 %r25093, %r25099, %r25098, %r25100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25097, %r25098, %r25099, %r25100; + // end inline asm + mov.u32 %r25108, 18; + // begin inline asm + shf.l.wrap.b32 %r25101, %r25107, %r25106, %r25108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25105, %r25106, %r25107, %r25108; + // end inline asm + mov.u32 %r25116, 62; + // begin inline asm + shf.l.wrap.b32 %r25109, %r25115, %r25114, %r25116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25113, %r25114, %r25115, %r25116; + // end inline asm + mov.u32 %r25124, 43; + // begin inline asm + shf.l.wrap.b32 %r25117, %r25123, %r25122, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r25121, %r25122, %r25123, %r25124; + // end inline asm + mov.u32 %r25132, 25; + // begin inline asm + shf.l.wrap.b32 %r25125, %r25131, %r25130, %r25132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25129, %r25130, %r25131, %r25132; + // end inline asm + mov.u32 %r25140, 8; + // begin inline asm + shf.l.wrap.b32 %r25133, %r25139, %r25138, %r25140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25137, %r25138, %r25139, %r25140; + // end inline asm + mov.u32 %r25148, 56; + // begin inline asm + shf.l.wrap.b32 %r25141, %r25147, %r25146, %r25148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25145, %r25146, %r25147, %r25148; + // end inline asm + mov.u32 %r25156, 41; + // begin inline asm + shf.l.wrap.b32 %r25149, %r25155, %r25154, %r25156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25153, %r25154, %r25155, %r25156; + // end inline asm + mov.u32 %r25164, 27; + // begin inline asm + shf.l.wrap.b32 %r25157, %r25163, %r25162, %r25164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25161, %r25162, %r25163, %r25164; + // end inline asm + mov.u32 %r25172, 14; + // begin inline asm + shf.l.wrap.b32 %r25165, %r25171, %r25170, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25169, %r25170, %r25171, %r25172; + // end inline asm + mov.u32 %r25180, 2; + // begin inline asm + shf.l.wrap.b32 %r25173, %r25179, %r25178, %r25180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25177, %r25178, %r25179, %r25180; + // end inline asm + mov.u32 %r25188, 55; + // begin inline asm + shf.l.wrap.b32 %r25181, %r25187, %r25186, %r25188; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25185, %r25186, %r25187, %r25188; + // end inline asm + mov.u32 %r25196, 45; + // begin inline asm + shf.l.wrap.b32 %r25189, %r25195, %r25194, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25193, %r25194, %r25195, %r25196; + // end inline asm + mov.u32 %r25204, 36; + // begin inline asm + shf.l.wrap.b32 %r25197, %r25203, %r25202, %r25204; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25201, %r25202, %r25203, %r25204; + // end inline asm + mov.u32 %r25212, 28; + // begin inline asm + shf.l.wrap.b32 %r25205, %r25211, %r25210, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25209, %r25210, %r25211, %r25212; + // end inline asm + mov.u32 %r25220, 21; + // begin inline asm + shf.l.wrap.b32 %r25213, %r25219, %r25218, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25217, %r25218, %r25219, %r25220; + // end inline asm + mov.u32 %r25228, 15; + // begin inline asm + shf.l.wrap.b32 %r25221, %r25227, %r25226, %r25228; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25225, %r25226, %r25227, %r25228; + // end inline asm + mov.u32 %r25236, 10; + // begin inline asm + shf.l.wrap.b32 %r25229, %r25235, %r25234, %r25236; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25233, %r25234, %r25235, %r25236; + // end inline asm + mov.u32 %r25244, 6; + // begin inline asm + shf.l.wrap.b32 %r25237, %r25243, %r25242, %r25244; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25241, %r25242, %r25243, %r25244; + // end inline asm + mov.u32 %r25252, 3; + // begin inline asm + shf.l.wrap.b32 %r25245, %r25251, %r25250, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25249, %r25250, %r25251, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25253, %r25259, %r25258, %r24942; + // end inline asm 
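+ // [annotation, hedged -- editor comment, not compiler output] The shf.l.wrap.b32 pairs above
+ // are 64-bit rotates over lanes split into 32-bit halves; the rotation amounts (1, 44, 20,
+ // 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3) match the
+ // Keccak rho offsets, so each $L__BB2_90 iteration looks like one Keccak-f[1600] round:
+ // the xor5 blocks = theta, these rotates = rho/pi, the lop3 0xD2 ops below = chi, and the
+ // xor with a loaded constant at the loop tail = iota (23 looped rounds, last round unrolled).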
+ // begin inline asm + shf.l.wrap.b32 %r25257, %r25258, %r25259, %r24942; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25261, %r25296, %r25069, %r25117, 0xD2; + lop3.b32 %r25262, %r25299, %r25073, %r25121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r25069, %r25117, %r25213, 0xD2; + lop3.b32 %r30908, %r25073, %r25121, %r25217, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30903, %r25117, %r25213, %r25165, 0xD2; + lop3.b32 %r30904, %r25121, %r25217, %r25169, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30899, %r25213, %r25165, %r25296, 0xD2; + lop3.b32 %r30900, %r25217, %r25169, %r25299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30897, %r25165, %r25296, %r25069, 0xD2; + lop3.b32 %r30898, %r25169, %r25299, %r25073, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30893, %r25205, %r25077, %r25245, 0xD2; + lop3.b32 %r30894, %r25209, %r25081, %r25249, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30905, %r25077, %r25245, %r25189, 0xD2; + lop3.b32 %r30906, %r25081, %r25249, %r25193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30901, %r25245, %r25189, %r25085, 0xD2; + lop3.b32 %r30902, %r25249, %r25193, %r25089, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30873, %r25189, %r25085, %r25205, 0xD2; + lop3.b32 %r30874, %r25193, %r25089, %r25209, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30873, %r30874}; + // begin inline asm + // chi + lop3.b32 %r30865, %r25085, %r25205, %r25077, 0xD2; + lop3.b32 %r30866, %r25089, %r25209, %r25081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30865, %r30866}; + // begin inline asm + // chi + lop3.b32 %r30891, %r25253, %r25237, %r25125, 0xD2; + lop3.b32 %r30892, %r25257, %r25241, %r25129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30891, %r30892}; + // begin inline asm + // chi + lop3.b32 %r30885, %r25237, %r25125, %r25133, 0xD2; + lop3.b32 %r30886, %r25241, %r25129, %r25137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30885, %r30886}; + // begin inline asm + // chi + lop3.b32 %r30879, %r25125, %r25133, %r25101, 0xD2; + lop3.b32 %r30880, %r25129, %r25137, %r25105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30879, %r30880}; + // begin inline asm + // chi + lop3.b32 %r30871, %r25133, %r25101, %r25253, 0xD2; + lop3.b32 %r30872, %r25137, %r25105, %r25257, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30871, %r30872}; + // begin inline asm + // chi + lop3.b32 %r30863, %r25101, %r25253, %r25237, 0xD2; + lop3.b32 %r30864, %r25105, %r25257, %r25241, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30863, %r30864}; + // begin inline asm + // chi + lop3.b32 %r30889, %r25157, %r25197, %r25229, 0xD2; + lop3.b32 %r30890, %r25161, %r25201, %r25233, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30889, %r30890}; + // begin inline asm + // chi + lop3.b32 %r30883, %r25197, %r25229, %r25221, 0xD2; + lop3.b32 %r30884, %r25201, %r25233, %r25225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30883, %r30884}; + // begin inline asm + // chi + lop3.b32 %r30877, %r25229, %r25221, %r25141, 0xD2; + lop3.b32 %r30878, %r25233, %r25225, %r25145, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30877, %r30878}; + // begin inline asm + // chi + lop3.b32 %r30869, %r25221, %r25141, %r25157, 0xD2; + lop3.b32 %r30870, %r25225, %r25145, %r25161, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30869, %r30870}; + // begin inline asm + // chi + lop3.b32 %r30861, %r25141, %r25157, %r25197, 0xD2; + lop3.b32 %r30862, %r25145, %r25161, %r25201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30861, %r30862}; + // begin inline asm + // chi + lop3.b32 %r30887, %r25109, %r25181, %r25093, 0xD2; + lop3.b32 %r30888, %r25113, %r25185, %r25097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30887, %r30888}; + // begin inline asm + // chi + lop3.b32 %r30881, %r25181, %r25093, %r25149, 0xD2; + lop3.b32 %r30882, %r25185, %r25097, %r25153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30881, %r30882}; + // begin inline asm + // chi + lop3.b32 %r30875, %r25093, %r25149, %r25173, 0xD2; + lop3.b32 %r30876, %r25097, %r25153, %r25177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30875, %r30876}; + // begin inline asm + // chi + lop3.b32 %r30867, %r25149, %r25173, %r25109, 0xD2; + lop3.b32 %r30868, %r25153, %r25177, %r25113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30867, %r30868}; + // begin inline asm + // chi + lop3.b32 %r30859, %r25173, %r25109, %r25181, 0xD2; + lop3.b32 %r30860, %r25177, %r25113, %r25185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30859, %r30860}; + mul.wide.s32 %rd1097, %r30909, 8; + add.s64 %rd1096, %rd1029, %rd1097; + // begin inline asm + ld.global.nc.v2.u32 {%r25461,%r25462}, [%rd1096]; + // end inline asm + xor.b32 %r30895, %r25261, %r25461; + xor.b32 %r30896, %r25262, %r25462; + add.s32 %r30909, %r30909, 1; + setp.lt.u32 %p50, %r30909, 23; + @%p50 bra $L__BB2_90; + + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + // begin inline asm + // xor5 + lop3.b32 %r25473, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r25473, %r25473, %r30889, %r30887, 0x96; + lop3.b32 %r25474, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r25474, %r25474, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25485, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r25485, %r25485, %r30883, %r30881, 0x96; + lop3.b32 %r25486, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r25486, %r25486, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25497, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r25497, %r25497, %r30877, %r30875, 0x96; + lop3.b32 %r25498, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r25498, %r25498, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25509, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25509, %r25509, %r30869, %r30867, 0x96; + lop3.b32 %r25510, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25510, %r25510, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25521, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25521, %r25521, %r30861, %r30859, 0x96; + lop3.b32 %r25522, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25522, %r25522, %r30862, %r30860, 0x96; + // end inline asm + mov.u32 %r25725, 1; + // begin inline asm + shf.l.wrap.b32 %r25533, %r25486, %r25485, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25537, %r25485, %r25486, %r25725; + // end inline asm + xor.b32 %r25752, %r25533, 
%r25521; + xor.b32 %r25753, %r25537, %r25522; + xor.b32 %r25680, %r30895, %r25752; + xor.b32 %r25683, %r30896, %r25753; + xor.b32 %r25643, %r30892, %r25753; + xor.b32 %r25642, %r30891, %r25752; + st.local.v2.u32 [%rd3+104], {%r25642, %r25643}; + // begin inline asm + shf.l.wrap.b32 %r25541, %r25498, %r25497, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25545, %r25497, %r25498, %r25725; + // end inline asm + xor.b32 %r25754, %r25541, %r25473; + xor.b32 %r25755, %r25545, %r25474; + xor.b32 %r25579, %r30905, %r25754; + xor.b32 %r25578, %r30906, %r25755; + xor.b32 %r25618, %r30884, %r25755; + xor.b32 %r25619, %r30883, %r25754; + st.local.v2.u32 [%rd3+152], {%r25619, %r25618}; + // begin inline asm + shf.l.wrap.b32 %r25549, %r25510, %r25509, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25553, %r25509, %r25510, %r25725; + // end inline asm + xor.b32 %r25756, %r25549, %r25485; + xor.b32 %r25757, %r25553, %r25486; + xor.b32 %r25602, %r30880, %r25757; + xor.b32 %r25603, %r30879, %r25756; + st.local.v2.u32 [%rd3+120], {%r25603, %r25602}; + xor.b32 %r25594, %r30876, %r25757; + xor.b32 %r25595, %r30875, %r25756; + st.local.v2.u32 [%rd3+200], {%r25595, %r25594}; + // begin inline asm + shf.l.wrap.b32 %r25557, %r25522, %r25521, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25561, %r25521, %r25522, %r25725; + // end inline asm + xor.b32 %r25758, %r25557, %r25497; + xor.b32 %r25759, %r25561, %r25498; + xor.b32 %r25626, %r30899, %r25758; + xor.b32 %r25627, %r30900, %r25759; + xor.b32 %r25635, %r30870, %r25759; + xor.b32 %r25634, %r30869, %r25758; + st.local.v2.u32 [%rd3+168], {%r25634, %r25635}; + // begin inline asm + shf.l.wrap.b32 %r25565, %r25474, %r25473, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25569, %r25473, %r25474, %r25725; + // end inline asm + xor.b32 %r25760, %r25565, %r25509; + xor.b32 %r25761, %r25569, %r25510; + xor.b32 %r25586, %r30865, %r25760; + xor.b32 %r25587, %r30866, %r25761; + xor.b32 %r25611, %r30860, %r25761; + xor.b32 %r25610, %r30859, %r25760; + st.local.v2.u32 [%rd3+216], {%r25610, %r25611}; + // begin inline asm + shf.l.wrap.b32 %r25573, %r25579, %r25578, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25577, %r25578, %r25579, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25581, %r25587, %r25586, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25585, %r25586, %r25587, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25593, %r25594, %r25595, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25589, %r25595, %r25594, %r25092; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r25589, %r25593}; + // begin inline asm + shf.l.wrap.b32 %r25597, %r25603, %r25602, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25601, %r25602, %r25603, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25605, %r25611, %r25610, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25609, %r25610, %r25611, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25617, %r25618, %r25619, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25613, %r25619, %r25618, %r25196; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r25613, %r25617}; + // begin inline asm + shf.l.wrap.b32 %r25621, %r25627, %r25626, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25625, %r25626, %r25627, %r25212; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r25629, %r25635, %r25634, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25633, %r25634, %r25635, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25637, %r25643, %r25642, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25641, %r25642, %r25643, %r25252; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25645, %r25680, %r25573, %r25597, 0xD2; + lop3.b32 %r25646, %r25683, %r25577, %r25601, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25653, %r25573, %r25597, %r25629, 0xD2; + lop3.b32 %r25654, %r25577, %r25601, %r25633, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r25653, %r25654}; + // begin inline asm + // chi + lop3.b32 %r25661, %r25597, %r25629, %r25605, 0xD2; + lop3.b32 %r25662, %r25601, %r25633, %r25609, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r25661, %r25662}; + // begin inline asm + // chi + lop3.b32 %r25669, %r25629, %r25605, %r25680, 0xD2; + lop3.b32 %r25670, %r25633, %r25609, %r25683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r25669, %r25670}; + // begin inline asm + // chi + lop3.b32 %r25677, %r25605, %r25680, %r25573, 0xD2; + lop3.b32 %r25678, %r25609, %r25683, %r25577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r25677, %r25678}; + // begin inline asm + // chi + lop3.b32 %r25685, %r25621, %r25581, %r25637, 0xD2; + lop3.b32 %r25686, %r25625, %r25585, %r25641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r25685, %r25686}; + // begin inline asm + // chi + lop3.b32 %r25693, %r25581, %r25637, %r25613, 0xD2; + lop3.b32 %r25694, %r25585, %r25641, %r25617, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r25693, %r25694}; + // begin inline asm + // chi + lop3.b32 %r25701, %r25637, %r25613, %r25589, 0xD2; + lop3.b32 %r25702, %r25641, %r25617, %r25593, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r25701, %r25702}; + // begin inline asm + ld.global.nc.v2.u32 {%r25709,%r25710}, [%rd1030]; + // end inline asm + xor.b32 %r25762, %r25646, %r25710; + xor.b32 %r25763, %r25645, %r25709; + mov.b64 %rd1269, {%r25763, %r25762}; + mov.b64 %rd1270, {%r25653, %r25654}; + mov.b64 %rd1271, {%r25661, %r25662}; + mov.b64 %rd1272, {%r25677, %r25678}; + mov.u32 %r30910, 0; + st.local.v2.u32 [%rd3+24], {%r25763, %r25762}; + st.local.v2.u32 [%rd272+96], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+104], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+112], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+120], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+128], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+136], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+144], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+152], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+160], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+168], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+176], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+184], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+192], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+200], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+208], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+216], {%r30910, %r30910}; + mov.u32 %r30925, -2147483648; + st.local.v2.u32 [%rd272+88], {%r25725, %r30925}; + mov.u32 %r30911, %r30910; + mov.u32 %r30912, %r30910; + mov.u32 %r30913, %r30910; + mov.u32 %r30914, %r30910; + mov.u32 %r30915, %r30910; + mov.u32 %r30916, %r30910; + mov.u32 %r30917, %r30910; + mov.u32 %r30918, %r30910; + mov.u32 %r30919, %r30910; 
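+ // [annotation, hedged -- editor comment, not compiler output] The stores and register moves
+ // around here appear to reset the second sponge buffer (%rd272): state words at +96..+216
+ // are zeroed, and the {1, 0x80000000} pair written at +88 looks like the Keccak pad10*1
+ // word (0x8000000000000001) set up ahead of the $L__BB2_92 round loop.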
+ mov.u32 %r30920, %r30910; + mov.u32 %r30921, %r30910; + mov.u32 %r30922, %r30910; + mov.u32 %r30923, %r30910; + mov.u32 %r30924, %r25725; + mov.u32 %r30926, %r30910; + mov.u32 %r30927, %r30910; + mov.u32 %r30928, %r30910; + mov.u32 %r30929, %r30910; + mov.u32 %r30930, %r30910; + mov.u32 %r30931, %r30910; + mov.u32 %r30932, %r30910; + mov.u32 %r30933, %r30910; + mov.u32 %r30934, %r30910; + mov.u32 %r30935, %r30910; + mov.u32 %r30936, %r30910; + mov.u32 %r30937, %r30910; + mov.u32 %r30938, %r30910; + mov.u32 %r30939, %r30910; + mov.u32 %r30940, %r30910; + mov.u32 %r30941, %r30910; + mov.u32 %r30942, %r30910; + mov.u32 %r30943, %r30910; + mov.u32 %r30960, %r30910; + +$L__BB2_92: + // begin inline asm + // xor5 + lop3.b32 %r25764, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r25764, %r25764, %r30940, %r30938, 0x96; + lop3.b32 %r25765, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r25765, %r25765, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25776, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r25776, %r25776, %r30934, %r30932, 0x96; + lop3.b32 %r25777, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r25777, %r25777, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25788, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r25788, %r25788, %r30928, %r30926, 0x96; + lop3.b32 %r25789, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r25789, %r25789, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25800, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r25800, %r25800, %r30920, %r30918, 0x96; + lop3.b32 %r25801, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r25801, %r25801, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25812, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r25812, %r25812, %r30912, %r30910, 0x96; + lop3.b32 %r25813, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r25813, %r25813, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25824, %r25777, %r25776, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25828, %r25776, %r25777, %r25725; + // end inline asm + xor.b32 %r26258, %r25824, %r25812; + xor.b32 %r26259, %r25828, %r25813; + xor.b32 %r26091, %r30946, %r26258; + xor.b32 %r26094, %r30947, %r26259; + xor.b32 %r25998, %r30944, %r26258; + xor.b32 %r25997, %r30945, %r26259; + xor.b32 %r26045, %r30942, %r26258; + xor.b32 %r26046, %r30943, %r26259; + xor.b32 %r25950, %r30940, %r26258; + xor.b32 %r25949, %r30941, %r26259; + xor.b32 %r25901, %r30938, %r26258; + xor.b32 %r25902, %r30939, %r26259; + // begin inline asm + shf.l.wrap.b32 %r25832, %r25789, %r25788, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25836, %r25788, %r25789, %r25725; + // end inline asm + xor.b32 %r26260, %r25832, %r25764; + xor.b32 %r26261, %r25836, %r25765; + xor.b32 %r26053, %r30958, %r26260; + xor.b32 %r26054, %r30959, %r26261; + xor.b32 %r25870, %r30956, %r26260; + xor.b32 %r25869, %r30957, %r26261; + xor.b32 %r26029, %r30936, %r26260; + xor.b32 %r26030, %r30937, %r26261; + xor.b32 %r25990, %r30934, %r26260; + xor.b32 %r25989, %r30935, %r26261; + xor.b32 %r25973, %r30932, %r26260; + xor.b32 %r25974, %r30933, %r26261; + // begin inline asm + shf.l.wrap.b32 %r25840, %r25801, %r25800, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25844, %r25800, %r25801, %r25725; + // end inline asm + xor.b32 %r26262, %r25840, %r25776; + xor.b32 %r26263, %r25844, %r25777; + xor.b32 
%r25910, %r30954, %r26262; + xor.b32 %r25909, %r30955, %r26263; + xor.b32 %r26037, %r30952, %r26262; + xor.b32 %r26038, %r30953, %r26263; + xor.b32 %r25918, %r30930, %r26262; + xor.b32 %r25917, %r30931, %r26263; + xor.b32 %r26021, %r30928, %r26262; + xor.b32 %r26022, %r30929, %r26263; + xor.b32 %r25886, %r30926, %r26262; + xor.b32 %r25885, %r30927, %r26263; + // begin inline asm + shf.l.wrap.b32 %r25848, %r25813, %r25812, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25852, %r25812, %r25813, %r25725; + // end inline asm + xor.b32 %r26264, %r25848, %r25788; + xor.b32 %r26265, %r25852, %r25789; + xor.b32 %r26005, %r30950, %r26264; + xor.b32 %r26006, %r30951, %r26265; + xor.b32 %r25982, %r30924, %r26264; + xor.b32 %r25981, %r30925, %r26265; + xor.b32 %r25925, %r30922, %r26264; + xor.b32 %r25926, %r30923, %r26265; + xor.b32 %r26013, %r30920, %r26264; + xor.b32 %r26014, %r30921, %r26265; + xor.b32 %r25942, %r30918, %r26264; + xor.b32 %r25941, %r30919, %r26265; + // begin inline asm + shf.l.wrap.b32 %r25856, %r25765, %r25764, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25860, %r25764, %r25765, %r25725; + // end inline asm + xor.b32 %r26266, %r25856, %r25800; + xor.b32 %r26267, %r25860, %r25801; + xor.b32 %r25957, %r30948, %r26266; + xor.b32 %r25958, %r30949, %r26267; + xor.b32 %r25877, %r30916, %r26266; + xor.b32 %r25878, %r30917, %r26267; + xor.b32 %r25894, %r30914, %r26266; + xor.b32 %r25893, %r30915, %r26267; + xor.b32 %r25933, %r30912, %r26266; + xor.b32 %r25934, %r30913, %r26267; + xor.b32 %r25965, %r30910, %r26266; + xor.b32 %r25966, %r30911, %r26267; + mov.u32 %r25871, 44; + // begin inline asm + shf.l.wrap.b32 %r25864, %r25870, %r25869, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25868, %r25869, %r25870, %r25871; + // end inline asm + mov.u32 %r25879, 20; + // begin inline asm + shf.l.wrap.b32 %r25872, %r25878, %r25877, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25876, %r25877, %r25878, %r25879; + // end inline asm + mov.u32 %r25887, 61; + // begin inline asm + shf.l.wrap.b32 %r25880, %r25886, %r25885, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25884, %r25885, %r25886, %r25887; + // end inline asm + mov.u32 %r25895, 39; + // begin inline asm + shf.l.wrap.b32 %r25888, %r25894, %r25893, %r25895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25892, %r25893, %r25894, %r25895; + // end inline asm + mov.u32 %r25903, 18; + // begin inline asm + shf.l.wrap.b32 %r25896, %r25902, %r25901, %r25903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25900, %r25901, %r25902, %r25903; + // end inline asm + mov.u32 %r25911, 62; + // begin inline asm + shf.l.wrap.b32 %r25904, %r25910, %r25909, %r25911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25908, %r25909, %r25910, %r25911; + // end inline asm + mov.u32 %r25919, 43; + // begin inline asm + shf.l.wrap.b32 %r25912, %r25918, %r25917, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25916, %r25917, %r25918, %r25919; + // end inline asm + mov.u32 %r25927, 25; + // begin inline asm + shf.l.wrap.b32 %r25920, %r25926, %r25925, %r25927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25924, %r25925, %r25926, %r25927; + // end inline asm + mov.u32 %r25935, 8; + // begin inline asm + shf.l.wrap.b32 %r25928, %r25934, %r25933, %r25935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25932, %r25933, %r25934, %r25935; + // end inline asm + mov.u32 
%r25943, 56; + // begin inline asm + shf.l.wrap.b32 %r25936, %r25942, %r25941, %r25943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25940, %r25941, %r25942, %r25943; + // end inline asm + mov.u32 %r25951, 41; + // begin inline asm + shf.l.wrap.b32 %r25944, %r25950, %r25949, %r25951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25948, %r25949, %r25950, %r25951; + // end inline asm + mov.u32 %r25959, 27; + // begin inline asm + shf.l.wrap.b32 %r25952, %r25958, %r25957, %r25959; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25956, %r25957, %r25958, %r25959; + // end inline asm + mov.u32 %r25967, 14; + // begin inline asm + shf.l.wrap.b32 %r25960, %r25966, %r25965, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25964, %r25965, %r25966, %r25967; + // end inline asm + mov.u32 %r25975, 2; + // begin inline asm + shf.l.wrap.b32 %r25968, %r25974, %r25973, %r25975; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25972, %r25973, %r25974, %r25975; + // end inline asm + mov.u32 %r25983, 55; + // begin inline asm + shf.l.wrap.b32 %r25976, %r25982, %r25981, %r25983; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25980, %r25981, %r25982, %r25983; + // end inline asm + mov.u32 %r25991, 45; + // begin inline asm + shf.l.wrap.b32 %r25984, %r25990, %r25989, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25988, %r25989, %r25990, %r25991; + // end inline asm + mov.u32 %r25999, 36; + // begin inline asm + shf.l.wrap.b32 %r25992, %r25998, %r25997, %r25999; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25996, %r25997, %r25998, %r25999; + // end inline asm + mov.u32 %r26007, 28; + // begin inline asm + shf.l.wrap.b32 %r26000, %r26006, %r26005, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26004, %r26005, %r26006, %r26007; + // end inline asm + mov.u32 %r26015, 21; + // begin inline asm + shf.l.wrap.b32 %r26008, %r26014, %r26013, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26012, %r26013, %r26014, %r26015; + // end inline asm + mov.u32 %r26023, 15; + // begin inline asm + shf.l.wrap.b32 %r26016, %r26022, %r26021, %r26023; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26020, %r26021, %r26022, %r26023; + // end inline asm + mov.u32 %r26031, 10; + // begin inline asm + shf.l.wrap.b32 %r26024, %r26030, %r26029, %r26031; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26028, %r26029, %r26030, %r26031; + // end inline asm + mov.u32 %r26039, 6; + // begin inline asm + shf.l.wrap.b32 %r26032, %r26038, %r26037, %r26039; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26036, %r26037, %r26038, %r26039; + // end inline asm + mov.u32 %r26047, 3; + // begin inline asm + shf.l.wrap.b32 %r26040, %r26046, %r26045, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26044, %r26045, %r26046, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26048, %r26054, %r26053, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26052, %r26053, %r26054, %r25725; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26056, %r26091, %r25864, %r25912, 0xD2; + lop3.b32 %r26057, %r26094, %r25868, %r25916, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r25864, %r25912, %r26008, 0xD2; + lop3.b32 %r30959, %r25868, %r25916, %r26012, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30954, %r25912, %r26008, %r25960, 
0xD2; + lop3.b32 %r30955, %r25916, %r26012, %r25964, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30950, %r26008, %r25960, %r26091, 0xD2; + lop3.b32 %r30951, %r26012, %r25964, %r26094, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30948, %r25960, %r26091, %r25864, 0xD2; + lop3.b32 %r30949, %r25964, %r26094, %r25868, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30944, %r26000, %r25872, %r26040, 0xD2; + lop3.b32 %r30945, %r26004, %r25876, %r26044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30956, %r25872, %r26040, %r25984, 0xD2; + lop3.b32 %r30957, %r25876, %r26044, %r25988, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30952, %r26040, %r25984, %r25880, 0xD2; + lop3.b32 %r30953, %r26044, %r25988, %r25884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30924, %r25984, %r25880, %r26000, 0xD2; + lop3.b32 %r30925, %r25988, %r25884, %r26004, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30924, %r30925}; + // begin inline asm + // chi + lop3.b32 %r30916, %r25880, %r26000, %r25872, 0xD2; + lop3.b32 %r30917, %r25884, %r26004, %r25876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30916, %r30917}; + // begin inline asm + // chi + lop3.b32 %r30942, %r26048, %r26032, %r25920, 0xD2; + lop3.b32 %r30943, %r26052, %r26036, %r25924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30942, %r30943}; + // begin inline asm + // chi + lop3.b32 %r30936, %r26032, %r25920, %r25928, 0xD2; + lop3.b32 %r30937, %r26036, %r25924, %r25932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30936, %r30937}; + // begin inline asm + // chi + lop3.b32 %r30930, %r25920, %r25928, %r25896, 0xD2; + lop3.b32 %r30931, %r25924, %r25932, %r25900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30930, %r30931}; + // begin inline asm + // chi + lop3.b32 %r30922, %r25928, %r25896, %r26048, 0xD2; + lop3.b32 %r30923, %r25932, %r25900, %r26052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30922, %r30923}; + // begin inline asm + // chi + lop3.b32 %r30914, %r25896, %r26048, %r26032, 0xD2; + lop3.b32 %r30915, %r25900, %r26052, %r26036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30914, %r30915}; + // begin inline asm + // chi + lop3.b32 %r30940, %r25952, %r25992, %r26024, 0xD2; + lop3.b32 %r30941, %r25956, %r25996, %r26028, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30940, %r30941}; + // begin inline asm + // chi + lop3.b32 %r30934, %r25992, %r26024, %r26016, 0xD2; + lop3.b32 %r30935, %r25996, %r26028, %r26020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30934, %r30935}; + // begin inline asm + // chi + lop3.b32 %r30928, %r26024, %r26016, %r25936, 0xD2; + lop3.b32 %r30929, %r26028, %r26020, %r25940, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30928, %r30929}; + // begin inline asm + // chi + lop3.b32 %r30920, %r26016, %r25936, %r25952, 0xD2; + lop3.b32 %r30921, %r26020, %r25940, %r25956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30920, %r30921}; + // begin inline asm + // chi + lop3.b32 %r30912, %r25936, %r25952, %r25992, 0xD2; + lop3.b32 %r30913, %r25940, %r25956, %r25996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30912, %r30913}; + // begin inline asm + // chi + lop3.b32 %r30938, %r25904, %r25976, %r25888, 0xD2; + lop3.b32 %r30939, %r25908, %r25980, %r25892, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd272+184], {%r30938, %r30939}; + // begin inline asm + // chi + lop3.b32 %r30932, %r25976, %r25888, %r25944, 0xD2; + lop3.b32 %r30933, %r25980, %r25892, %r25948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30932, %r30933}; + // begin inline asm + // chi + lop3.b32 %r30926, %r25888, %r25944, %r25968, 0xD2; + lop3.b32 %r30927, %r25892, %r25948, %r25972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30926, %r30927}; + // begin inline asm + // chi + lop3.b32 %r30918, %r25944, %r25968, %r25904, 0xD2; + lop3.b32 %r30919, %r25948, %r25972, %r25908, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30918, %r30919}; + // begin inline asm + // chi + lop3.b32 %r30910, %r25968, %r25904, %r25976, 0xD2; + lop3.b32 %r30911, %r25972, %r25908, %r25980, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30910, %r30911}; + mul.wide.s32 %rd1108, %r30960, 8; + add.s64 %rd1107, %rd1029, %rd1108; + // begin inline asm + ld.global.nc.v2.u32 {%r26256,%r26257}, [%rd1107]; + // end inline asm + xor.b32 %r30946, %r26056, %r26256; + xor.b32 %r30947, %r26057, %r26257; + add.s32 %r30960, %r30960, 1; + setp.lt.u32 %p51, %r30960, 23; + @%p51 bra $L__BB2_92; + + mov.u32 %r26367, 1; + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + // begin inline asm + // xor5 + lop3.b32 %r26268, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r26268, %r26268, %r30940, %r30938, 0x96; + lop3.b32 %r26269, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r26269, %r26269, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26280, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r26280, %r26280, %r30934, %r30932, 0x96; + lop3.b32 %r26281, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r26281, %r26281, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26292, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r26292, %r26292, %r30928, %r30926, 0x96; + lop3.b32 %r26293, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r26293, %r26293, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26304, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r26304, %r26304, %r30920, %r30918, 0x96; + lop3.b32 %r26305, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r26305, %r26305, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26316, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r26316, %r26316, %r30912, %r30910, 0x96; + lop3.b32 %r26317, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r26317, %r26317, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26328, %r26281, %r26280, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26332, %r26280, %r26281, %r26367; + // end inline asm + xor.b32 %r26506, %r26328, %r26316; + xor.b32 %r26507, %r26332, %r26317; + xor.b32 %r26475, %r30946, %r26506; + xor.b32 %r26478, %r30947, %r26507; + xor.b32 %r26438, %r30943, %r26507; + xor.b32 %r26437, %r30942, %r26506; + st.local.v2.u32 [%rd272+104], {%r26437, %r26438}; + // begin inline asm + shf.l.wrap.b32 %r26336, %r26293, %r26292, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26340, %r26292, %r26293, %r26367; + // end inline asm 
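+ // [annotation, hedged -- editor comment, not compiler output] With the 23-iteration
+ // $L__BB2_92 loop done and the state written back above, the code below appears to unroll
+ // the final Keccak-f round: theta (xor5 plus rotate-by-1) is already computed, rho/pi and
+ // chi follow, and the closing iota xors in the constant at [%rd1030] before the resulting
+ // lanes are stored to [%rd272+24..80].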
+ xor.b32 %r26508, %r26336, %r26268; + xor.b32 %r26509, %r26340, %r26269; + xor.b32 %r26374, %r30956, %r26508; + xor.b32 %r26373, %r30957, %r26509; + xor.b32 %r26413, %r30935, %r26509; + xor.b32 %r26414, %r30934, %r26508; + st.local.v2.u32 [%rd272+152], {%r26414, %r26413}; + // begin inline asm + shf.l.wrap.b32 %r26344, %r26305, %r26304, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26348, %r26304, %r26305, %r26367; + // end inline asm + xor.b32 %r26510, %r26344, %r26280; + xor.b32 %r26511, %r26348, %r26281; + xor.b32 %r26397, %r30931, %r26511; + xor.b32 %r26398, %r30930, %r26510; + st.local.v2.u32 [%rd272+120], {%r26398, %r26397}; + xor.b32 %r26389, %r30927, %r26511; + xor.b32 %r26390, %r30926, %r26510; + st.local.v2.u32 [%rd272+200], {%r26390, %r26389}; + // begin inline asm + shf.l.wrap.b32 %r26352, %r26317, %r26316, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26356, %r26316, %r26317, %r26367; + // end inline asm + xor.b32 %r26512, %r26352, %r26292; + xor.b32 %r26513, %r26356, %r26293; + xor.b32 %r26421, %r30950, %r26512; + xor.b32 %r26422, %r30951, %r26513; + xor.b32 %r26430, %r30921, %r26513; + xor.b32 %r26429, %r30920, %r26512; + st.local.v2.u32 [%rd272+168], {%r26429, %r26430}; + // begin inline asm + shf.l.wrap.b32 %r26360, %r26269, %r26268, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26364, %r26268, %r26269, %r26367; + // end inline asm + xor.b32 %r26514, %r26360, %r26304; + xor.b32 %r26515, %r26364, %r26305; + xor.b32 %r26381, %r30916, %r26514; + xor.b32 %r26382, %r30917, %r26515; + xor.b32 %r26406, %r30911, %r26515; + xor.b32 %r26405, %r30910, %r26514; + st.local.v2.u32 [%rd272+216], {%r26405, %r26406}; + // begin inline asm + shf.l.wrap.b32 %r26368, %r26374, %r26373, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26372, %r26373, %r26374, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26376, %r26382, %r26381, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26380, %r26381, %r26382, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26388, %r26389, %r26390, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26384, %r26390, %r26389, %r25887; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r26384, %r26388}; + // begin inline asm + shf.l.wrap.b32 %r26392, %r26398, %r26397, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26396, %r26397, %r26398, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26400, %r26406, %r26405, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26404, %r26405, %r26406, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26412, %r26413, %r26414, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26408, %r26414, %r26413, %r25991; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r26408, %r26412}; + // begin inline asm + shf.l.wrap.b32 %r26416, %r26422, %r26421, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26420, %r26421, %r26422, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26424, %r26430, %r26429, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26428, %r26429, %r26430, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26432, %r26438, %r26437, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26436, %r26437, %r26438, %r26047; + // end inline asm + // begin inline asm + 
// chi + lop3.b32 %r26440, %r26475, %r26368, %r26392, 0xD2; + lop3.b32 %r26441, %r26478, %r26372, %r26396, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26448, %r26368, %r26392, %r26424, 0xD2; + lop3.b32 %r26449, %r26372, %r26396, %r26428, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r26448, %r26449}; + // begin inline asm + // chi + lop3.b32 %r26456, %r26392, %r26424, %r26400, 0xD2; + lop3.b32 %r26457, %r26396, %r26428, %r26404, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r26456, %r26457}; + // begin inline asm + // chi + lop3.b32 %r26464, %r26424, %r26400, %r26475, 0xD2; + lop3.b32 %r26465, %r26428, %r26404, %r26478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r26464, %r26465}; + // begin inline asm + // chi + lop3.b32 %r26472, %r26400, %r26475, %r26368, 0xD2; + lop3.b32 %r26473, %r26404, %r26478, %r26372, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r26472, %r26473}; + // begin inline asm + // chi + lop3.b32 %r26480, %r26416, %r26376, %r26432, 0xD2; + lop3.b32 %r26481, %r26420, %r26380, %r26436, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r26480, %r26481}; + // begin inline asm + // chi + lop3.b32 %r26488, %r26376, %r26432, %r26408, 0xD2; + lop3.b32 %r26489, %r26380, %r26436, %r26412, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r26488, %r26489}; + // begin inline asm + // chi + lop3.b32 %r26496, %r26432, %r26408, %r26384, 0xD2; + lop3.b32 %r26497, %r26436, %r26412, %r26388, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r26496, %r26497}; + // begin inline asm + ld.global.nc.v2.u32 {%r26504,%r26505}, [%rd1030]; + // end inline asm + xor.b32 %r26516, %r26441, %r26505; + xor.b32 %r26517, %r26440, %r26504; + st.local.v2.u32 [%rd272+24], {%r26517, %r26516}; + bra.uni $L__BB2_94; + +$L__BB2_72: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd898, 1179641; + st.local.u64 [%rd3+8], %rd898; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd899, [%rd222]; + ld.global.u64 %rd900, [%rd222+8]; + ld.global.u64 %rd901, [%rd222+16]; + ld.global.u64 %rd902, [%rd222+24]; + ld.global.u64 %rd903, [%rd222+32]; + ld.global.u64 %rd904, [%rd222+40]; + ld.global.u64 %rd905, [%rd222+48]; + ld.global.u64 %rd906, [%rd222+56]; + st.local.u64 [%rd3+24], %rd899; + st.local.u64 [%rd3+32], %rd900; + st.local.u64 [%rd3+40], %rd901; + st.local.u64 [%rd3+48], %rd902; + st.local.u64 [%rd3+56], %rd903; + st.local.u64 [%rd3+64], %rd904; + st.local.u64 [%rd3+72], %rd905; + st.local.u64 [%rd3+80], %rd906; + cvt.u32.u64 %r19990, %rd899; + xor.b32 %r19991, %r3326, %r19990; + st.local.u32 [%rd3+24], %r19991; + mov.u32 %r30487, 0; + st.local.v2.u32 [%rd3+96], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+104], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+112], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+120], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+128], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+136], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+144], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+152], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+160], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+168], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+176], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+184], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+192], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+200], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+208], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+216], {%r30487, %r30487}; + mov.u32 %r30502, -2147483648; + mov.u32 %r19963, 1; + st.local.v2.u32 [%rd3+88], {%r19963, 
%r30502}; + ld.local.v2.u32 {%r30523, %r30524}, [%rd3+24]; + mov.b64 {%r30521, %r30522}, %rd904; + shr.u64 %rd907, %rd900, 32; + cvt.u32.u64 %r30535, %rd900; + cvt.u32.u64 %r30536, %rd907; + shr.u64 %rd908, %rd905, 32; + cvt.u32.u64 %r30533, %rd905; + cvt.u32.u64 %r30534, %rd908; + shr.u64 %rd909, %rd901, 32; + cvt.u32.u64 %r30531, %rd901; + cvt.u32.u64 %r30532, %rd909; + shr.u64 %rd910, %rd906, 32; + cvt.u32.u64 %r30529, %rd906; + cvt.u32.u64 %r30530, %rd910; + shr.u64 %rd911, %rd902, 32; + cvt.u32.u64 %r30527, %rd902; + cvt.u32.u64 %r30528, %rd911; + shr.u64 %rd912, %rd903, 32; + cvt.u32.u64 %r30525, %rd903; + cvt.u32.u64 %r30526, %rd912; + mov.u32 %r30488, %r30487; + mov.u32 %r30489, %r30487; + mov.u32 %r30490, %r30487; + mov.u32 %r30491, %r30487; + mov.u32 %r30492, %r30487; + mov.u32 %r30493, %r30487; + mov.u32 %r30494, %r30487; + mov.u32 %r30495, %r30487; + mov.u32 %r30496, %r30487; + mov.u32 %r30497, %r30487; + mov.u32 %r30498, %r30487; + mov.u32 %r30499, %r30487; + mov.u32 %r30500, %r30487; + mov.u32 %r30501, %r19963; + mov.u32 %r30503, %r30487; + mov.u32 %r30504, %r30487; + mov.u32 %r30505, %r30487; + mov.u32 %r30506, %r30487; + mov.u32 %r30507, %r30487; + mov.u32 %r30508, %r30487; + mov.u32 %r30509, %r30487; + mov.u32 %r30510, %r30487; + mov.u32 %r30511, %r30487; + mov.u32 %r30512, %r30487; + mov.u32 %r30513, %r30487; + mov.u32 %r30514, %r30487; + mov.u32 %r30515, %r30487; + mov.u32 %r30516, %r30487; + mov.u32 %r30517, %r30487; + mov.u32 %r30518, %r30487; + mov.u32 %r30519, %r30487; + mov.u32 %r30520, %r30487; + mov.u32 %r30537, %r30487; + +$L__BB2_73: + // begin inline asm + // xor5 + lop3.b32 %r19994, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r19994, %r19994, %r30517, %r30515, 0x96; + lop3.b32 %r19995, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r19995, %r19995, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20006, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20006, %r20006, %r30511, %r30509, 0x96; + lop3.b32 %r20007, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20007, %r20007, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20018, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20018, %r20018, %r30505, %r30503, 0x96; + lop3.b32 %r20019, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20019, %r20019, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20030, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20030, %r20030, %r30497, %r30495, 0x96; + lop3.b32 %r20031, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20031, %r20031, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20042, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20042, %r20042, %r30489, %r30487, 0x96; + lop3.b32 %r20043, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20043, %r20043, %r30490, %r30488, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20054, %r20007, %r20006, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20058, %r20006, %r20007, %r19963; + // end inline asm + xor.b32 %r20488, %r20054, %r20042; + xor.b32 %r20489, %r20058, %r20043; + xor.b32 %r20321, %r30523, %r20488; + xor.b32 %r20324, %r30524, %r20489; + xor.b32 %r20228, %r30521, %r20488; + xor.b32 %r20227, %r30522, %r20489; + xor.b32 %r20275, %r30519, %r20488; + xor.b32 %r20276, %r30520, %r20489; + xor.b32 %r20180, %r30517, %r20488; + xor.b32 %r20179, %r30518, %r20489; + xor.b32 %r20131, %r30515, %r20488; + xor.b32 %r20132, %r30516, %r20489; + // begin 
inline asm + shf.l.wrap.b32 %r20062, %r20019, %r20018, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20066, %r20018, %r20019, %r19963; + // end inline asm + xor.b32 %r20490, %r20062, %r19994; + xor.b32 %r20491, %r20066, %r19995; + xor.b32 %r20283, %r30535, %r20490; + xor.b32 %r20284, %r30536, %r20491; + xor.b32 %r20100, %r30533, %r20490; + xor.b32 %r20099, %r30534, %r20491; + xor.b32 %r20259, %r30513, %r20490; + xor.b32 %r20260, %r30514, %r20491; + xor.b32 %r20220, %r30511, %r20490; + xor.b32 %r20219, %r30512, %r20491; + xor.b32 %r20203, %r30509, %r20490; + xor.b32 %r20204, %r30510, %r20491; + // begin inline asm + shf.l.wrap.b32 %r20070, %r20031, %r20030, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20074, %r20030, %r20031, %r19963; + // end inline asm + xor.b32 %r20492, %r20070, %r20006; + xor.b32 %r20493, %r20074, %r20007; + xor.b32 %r20140, %r30531, %r20492; + xor.b32 %r20139, %r30532, %r20493; + xor.b32 %r20267, %r30529, %r20492; + xor.b32 %r20268, %r30530, %r20493; + xor.b32 %r20148, %r30507, %r20492; + xor.b32 %r20147, %r30508, %r20493; + xor.b32 %r20251, %r30505, %r20492; + xor.b32 %r20252, %r30506, %r20493; + xor.b32 %r20116, %r30503, %r20492; + xor.b32 %r20115, %r30504, %r20493; + // begin inline asm + shf.l.wrap.b32 %r20078, %r20043, %r20042, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20082, %r20042, %r20043, %r19963; + // end inline asm + xor.b32 %r20494, %r20078, %r20018; + xor.b32 %r20495, %r20082, %r20019; + xor.b32 %r20235, %r30527, %r20494; + xor.b32 %r20236, %r30528, %r20495; + xor.b32 %r20212, %r30501, %r20494; + xor.b32 %r20211, %r30502, %r20495; + xor.b32 %r20155, %r30499, %r20494; + xor.b32 %r20156, %r30500, %r20495; + xor.b32 %r20243, %r30497, %r20494; + xor.b32 %r20244, %r30498, %r20495; + xor.b32 %r20172, %r30495, %r20494; + xor.b32 %r20171, %r30496, %r20495; + // begin inline asm + shf.l.wrap.b32 %r20086, %r19995, %r19994, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20090, %r19994, %r19995, %r19963; + // end inline asm + xor.b32 %r20496, %r20086, %r20030; + xor.b32 %r20497, %r20090, %r20031; + xor.b32 %r20187, %r30525, %r20496; + xor.b32 %r20188, %r30526, %r20497; + xor.b32 %r20107, %r30493, %r20496; + xor.b32 %r20108, %r30494, %r20497; + xor.b32 %r20124, %r30491, %r20496; + xor.b32 %r20123, %r30492, %r20497; + xor.b32 %r20163, %r30489, %r20496; + xor.b32 %r20164, %r30490, %r20497; + xor.b32 %r20195, %r30487, %r20496; + xor.b32 %r20196, %r30488, %r20497; + mov.u32 %r20101, 44; + // begin inline asm + shf.l.wrap.b32 %r20094, %r20100, %r20099, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20098, %r20099, %r20100, %r20101; + // end inline asm + mov.u32 %r20109, 20; + // begin inline asm + shf.l.wrap.b32 %r20102, %r20108, %r20107, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20106, %r20107, %r20108, %r20109; + // end inline asm + mov.u32 %r20117, 61; + // begin inline asm + shf.l.wrap.b32 %r20110, %r20116, %r20115, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20114, %r20115, %r20116, %r20117; + // end inline asm + mov.u32 %r20125, 39; + // begin inline asm + shf.l.wrap.b32 %r20118, %r20124, %r20123, %r20125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20122, %r20123, %r20124, %r20125; + // end inline asm + mov.u32 %r20133, 18; + // begin inline asm + shf.l.wrap.b32 %r20126, %r20132, %r20131, %r20133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20130, %r20131, 
%r20132, %r20133; + // end inline asm + mov.u32 %r20141, 62; + // begin inline asm + shf.l.wrap.b32 %r20134, %r20140, %r20139, %r20141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20138, %r20139, %r20140, %r20141; + // end inline asm + mov.u32 %r20149, 43; + // begin inline asm + shf.l.wrap.b32 %r20142, %r20148, %r20147, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20146, %r20147, %r20148, %r20149; + // end inline asm + mov.u32 %r20157, 25; + // begin inline asm + shf.l.wrap.b32 %r20150, %r20156, %r20155, %r20157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20154, %r20155, %r20156, %r20157; + // end inline asm + mov.u32 %r20165, 8; + // begin inline asm + shf.l.wrap.b32 %r20158, %r20164, %r20163, %r20165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20162, %r20163, %r20164, %r20165; + // end inline asm + mov.u32 %r20173, 56; + // begin inline asm + shf.l.wrap.b32 %r20166, %r20172, %r20171, %r20173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20170, %r20171, %r20172, %r20173; + // end inline asm + mov.u32 %r20181, 41; + // begin inline asm + shf.l.wrap.b32 %r20174, %r20180, %r20179, %r20181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20178, %r20179, %r20180, %r20181; + // end inline asm + mov.u32 %r20189, 27; + // begin inline asm + shf.l.wrap.b32 %r20182, %r20188, %r20187, %r20189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20186, %r20187, %r20188, %r20189; + // end inline asm + mov.u32 %r20197, 14; + // begin inline asm + shf.l.wrap.b32 %r20190, %r20196, %r20195, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20194, %r20195, %r20196, %r20197; + // end inline asm + mov.u32 %r20205, 2; + // begin inline asm + shf.l.wrap.b32 %r20198, %r20204, %r20203, %r20205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20202, %r20203, %r20204, %r20205; + // end inline asm + mov.u32 %r20213, 55; + // begin inline asm + shf.l.wrap.b32 %r20206, %r20212, %r20211, %r20213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20210, %r20211, %r20212, %r20213; + // end inline asm + mov.u32 %r20221, 45; + // begin inline asm + shf.l.wrap.b32 %r20214, %r20220, %r20219, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20218, %r20219, %r20220, %r20221; + // end inline asm + mov.u32 %r20229, 36; + // begin inline asm + shf.l.wrap.b32 %r20222, %r20228, %r20227, %r20229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20226, %r20227, %r20228, %r20229; + // end inline asm + mov.u32 %r20237, 28; + // begin inline asm + shf.l.wrap.b32 %r20230, %r20236, %r20235, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20234, %r20235, %r20236, %r20237; + // end inline asm + mov.u32 %r20245, 21; + // begin inline asm + shf.l.wrap.b32 %r20238, %r20244, %r20243, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20242, %r20243, %r20244, %r20245; + // end inline asm + mov.u32 %r20253, 15; + // begin inline asm + shf.l.wrap.b32 %r20246, %r20252, %r20251, %r20253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20250, %r20251, %r20252, %r20253; + // end inline asm + mov.u32 %r20261, 10; + // begin inline asm + shf.l.wrap.b32 %r20254, %r20260, %r20259, %r20261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20258, %r20259, %r20260, %r20261; + // end inline asm + mov.u32 %r20269, 6; + // begin inline asm + shf.l.wrap.b32 %r20262, %r20268, %r20267, %r20269; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r20266, %r20267, %r20268, %r20269; + // end inline asm + mov.u32 %r20277, 3; + // begin inline asm + shf.l.wrap.b32 %r20270, %r20276, %r20275, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20274, %r20275, %r20276, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20278, %r20284, %r20283, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20282, %r20283, %r20284, %r19963; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20286, %r20321, %r20094, %r20142, 0xD2; + lop3.b32 %r20287, %r20324, %r20098, %r20146, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30535, %r20094, %r20142, %r20238, 0xD2; + lop3.b32 %r30536, %r20098, %r20146, %r20242, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30531, %r20142, %r20238, %r20190, 0xD2; + lop3.b32 %r30532, %r20146, %r20242, %r20194, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30527, %r20238, %r20190, %r20321, 0xD2; + lop3.b32 %r30528, %r20242, %r20194, %r20324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30525, %r20190, %r20321, %r20094, 0xD2; + lop3.b32 %r30526, %r20194, %r20324, %r20098, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30521, %r20230, %r20102, %r20270, 0xD2; + lop3.b32 %r30522, %r20234, %r20106, %r20274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30533, %r20102, %r20270, %r20214, 0xD2; + lop3.b32 %r30534, %r20106, %r20274, %r20218, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30529, %r20270, %r20214, %r20110, 0xD2; + lop3.b32 %r30530, %r20274, %r20218, %r20114, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30501, %r20214, %r20110, %r20230, 0xD2; + lop3.b32 %r30502, %r20218, %r20114, %r20234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30501, %r30502}; + // begin inline asm + // chi + lop3.b32 %r30493, %r20110, %r20230, %r20102, 0xD2; + lop3.b32 %r30494, %r20114, %r20234, %r20106, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30493, %r30494}; + // begin inline asm + // chi + lop3.b32 %r30519, %r20278, %r20262, %r20150, 0xD2; + lop3.b32 %r30520, %r20282, %r20266, %r20154, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30519, %r30520}; + // begin inline asm + // chi + lop3.b32 %r30513, %r20262, %r20150, %r20158, 0xD2; + lop3.b32 %r30514, %r20266, %r20154, %r20162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30513, %r30514}; + // begin inline asm + // chi + lop3.b32 %r30507, %r20150, %r20158, %r20126, 0xD2; + lop3.b32 %r30508, %r20154, %r20162, %r20130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30507, %r30508}; + // begin inline asm + // chi + lop3.b32 %r30499, %r20158, %r20126, %r20278, 0xD2; + lop3.b32 %r30500, %r20162, %r20130, %r20282, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30499, %r30500}; + // begin inline asm + // chi + lop3.b32 %r30491, %r20126, %r20278, %r20262, 0xD2; + lop3.b32 %r30492, %r20130, %r20282, %r20266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30491, %r30492}; + // begin inline asm + // chi + lop3.b32 %r30517, %r20182, %r20222, %r20254, 0xD2; + lop3.b32 %r30518, %r20186, %r20226, %r20258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30517, %r30518}; + // begin inline asm + // chi + lop3.b32 %r30511, %r20222, %r20254, %r20246, 0xD2; + lop3.b32 %r30512, %r20226, %r20258, %r20250, 0xD2; 
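+ // keccak-f[1600] round internals: lop3.b32 with immLut 0xD2 computes a ^ (~b & c), the chi step, on each 32-bit half of a 64-bit lane + // the xor5 groups (immLut 0x96, a 3-input XOR) form the theta column parities; the shf.l.wrap pairs rotate the split lanes by the rho offsets loaded above (44, 20, 61, ...); the trailing xor with keccak_round_constants[i] after each round is iota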
+ // end inline asm + st.local.v2.u32 [%rd3+152], {%r30511, %r30512}; + // begin inline asm + // chi + lop3.b32 %r30505, %r20254, %r20246, %r20166, 0xD2; + lop3.b32 %r30506, %r20258, %r20250, %r20170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30505, %r30506}; + // begin inline asm + // chi + lop3.b32 %r30497, %r20246, %r20166, %r20182, 0xD2; + lop3.b32 %r30498, %r20250, %r20170, %r20186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30497, %r30498}; + // begin inline asm + // chi + lop3.b32 %r30489, %r20166, %r20182, %r20222, 0xD2; + lop3.b32 %r30490, %r20170, %r20186, %r20226, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30489, %r30490}; + // begin inline asm + // chi + lop3.b32 %r30515, %r20134, %r20206, %r20118, 0xD2; + lop3.b32 %r30516, %r20138, %r20210, %r20122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30515, %r30516}; + // begin inline asm + // chi + lop3.b32 %r30509, %r20206, %r20118, %r20174, 0xD2; + lop3.b32 %r30510, %r20210, %r20122, %r20178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30509, %r30510}; + // begin inline asm + // chi + lop3.b32 %r30503, %r20118, %r20174, %r20198, 0xD2; + lop3.b32 %r30504, %r20122, %r20178, %r20202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30503, %r30504}; + // begin inline asm + // chi + lop3.b32 %r30495, %r20174, %r20198, %r20134, 0xD2; + lop3.b32 %r30496, %r20178, %r20202, %r20138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30495, %r30496}; + // begin inline asm + // chi + lop3.b32 %r30487, %r20198, %r20134, %r20206, 0xD2; + lop3.b32 %r30488, %r20202, %r20138, %r20210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30487, %r30488}; + mul.wide.s32 %rd916, %r30537, 8; + mov.u64 %rd917, keccak_round_constants; + cvta.const.u64 %rd918, %rd917; + add.s64 %rd913, %rd918, %rd916; + // begin inline asm + ld.global.nc.v2.u32 {%r20486,%r20487}, [%rd913]; + // end inline asm + xor.b32 %r30523, %r20286, %r20486; + xor.b32 %r30524, %r20287, %r20487; + add.s32 %r30537, %r30537, 1; + setp.lt.u32 %p42, %r30537, 23; + @%p42 bra $L__BB2_73; + + st.local.v2.u32 [%rd3+32], {%r30535, %r30536}; + st.local.v2.u32 [%rd3+72], {%r30533, %r30534}; + st.local.v2.u32 [%rd3+40], {%r30531, %r30532}; + st.local.v2.u32 [%rd3+80], {%r30529, %r30530}; + st.local.v2.u32 [%rd3+48], {%r30527, %r30528}; + st.local.v2.u32 [%rd3+56], {%r30525, %r30526}; + st.local.v2.u32 [%rd3+24], {%r30523, %r30524}; + // begin inline asm + // xor5 + lop3.b32 %r20498, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r20498, %r20498, %r30517, %r30515, 0x96; + lop3.b32 %r20499, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r20499, %r20499, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20510, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20510, %r20510, %r30511, %r30509, 0x96; + lop3.b32 %r20511, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20511, %r20511, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20522, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20522, %r20522, %r30505, %r30503, 0x96; + lop3.b32 %r20523, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20523, %r20523, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20534, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20534, %r20534, %r30497, %r30495, 0x96; + lop3.b32 %r20535, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20535, %r20535, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // 
xor5 + lop3.b32 %r20546, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20546, %r20546, %r30489, %r30487, 0x96; + lop3.b32 %r20547, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20547, %r20547, %r30490, %r30488, 0x96; + // end inline asm + mov.u32 %r20750, 1; + // begin inline asm + shf.l.wrap.b32 %r20558, %r20511, %r20510, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20562, %r20510, %r20511, %r20750; + // end inline asm + xor.b32 %r20777, %r20558, %r20546; + xor.b32 %r20778, %r20562, %r20547; + xor.b32 %r20705, %r30523, %r20777; + xor.b32 %r20708, %r30524, %r20778; + xor.b32 %r20668, %r30520, %r20778; + xor.b32 %r20667, %r30519, %r20777; + st.local.v2.u32 [%rd3+104], {%r20667, %r20668}; + // begin inline asm + shf.l.wrap.b32 %r20566, %r20523, %r20522, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20570, %r20522, %r20523, %r20750; + // end inline asm + xor.b32 %r20779, %r20566, %r20498; + xor.b32 %r20780, %r20570, %r20499; + xor.b32 %r20604, %r30533, %r20779; + xor.b32 %r20603, %r30534, %r20780; + xor.b32 %r20643, %r30512, %r20780; + xor.b32 %r20644, %r30511, %r20779; + st.local.v2.u32 [%rd3+152], {%r20644, %r20643}; + // begin inline asm + shf.l.wrap.b32 %r20574, %r20535, %r20534, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20578, %r20534, %r20535, %r20750; + // end inline asm + xor.b32 %r20781, %r20574, %r20510; + xor.b32 %r20782, %r20578, %r20511; + xor.b32 %r20627, %r30508, %r20782; + xor.b32 %r20628, %r30507, %r20781; + st.local.v2.u32 [%rd3+120], {%r20628, %r20627}; + xor.b32 %r20619, %r30504, %r20782; + xor.b32 %r20620, %r30503, %r20781; + st.local.v2.u32 [%rd3+200], {%r20620, %r20619}; + // begin inline asm + shf.l.wrap.b32 %r20582, %r20547, %r20546, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20586, %r20546, %r20547, %r20750; + // end inline asm + xor.b32 %r20783, %r20582, %r20522; + xor.b32 %r20784, %r20586, %r20523; + xor.b32 %r20651, %r30527, %r20783; + xor.b32 %r20652, %r30528, %r20784; + xor.b32 %r20660, %r30498, %r20784; + xor.b32 %r20659, %r30497, %r20783; + st.local.v2.u32 [%rd3+168], {%r20659, %r20660}; + // begin inline asm + shf.l.wrap.b32 %r20590, %r20499, %r20498, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20594, %r20498, %r20499, %r20750; + // end inline asm + xor.b32 %r20785, %r20590, %r20534; + xor.b32 %r20786, %r20594, %r20535; + xor.b32 %r20611, %r30493, %r20785; + xor.b32 %r20612, %r30494, %r20786; + xor.b32 %r20636, %r30488, %r20786; + xor.b32 %r20635, %r30487, %r20785; + st.local.v2.u32 [%rd3+216], {%r20635, %r20636}; + // begin inline asm + shf.l.wrap.b32 %r20598, %r20604, %r20603, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20602, %r20603, %r20604, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20606, %r20612, %r20611, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20610, %r20611, %r20612, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20618, %r20619, %r20620, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20614, %r20620, %r20619, %r20117; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r20614, %r20618}; + // begin inline asm + shf.l.wrap.b32 %r20622, %r20628, %r20627, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20626, %r20627, %r20628, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20630, %r20636, %r20635, %r20197; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r20634, %r20635, %r20636, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20642, %r20643, %r20644, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20638, %r20644, %r20643, %r20221; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r20638, %r20642}; + // begin inline asm + shf.l.wrap.b32 %r20646, %r20652, %r20651, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20650, %r20651, %r20652, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20654, %r20660, %r20659, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20658, %r20659, %r20660, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20662, %r20668, %r20667, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20666, %r20667, %r20668, %r20277; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20670, %r20705, %r20598, %r20622, 0xD2; + lop3.b32 %r20671, %r20708, %r20602, %r20626, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r20598, %r20622, %r20654, 0xD2; + lop3.b32 %r30671, %r20602, %r20626, %r20658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30666, %r20622, %r20654, %r20630, 0xD2; + lop3.b32 %r30667, %r20626, %r20658, %r20634, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + // begin inline asm + // chi + lop3.b32 %r30662, %r20654, %r20630, %r20705, 0xD2; + lop3.b32 %r30663, %r20658, %r20634, %r20708, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + // begin inline asm + // chi + lop3.b32 %r30660, %r20630, %r20705, %r20598, 0xD2; + lop3.b32 %r30661, %r20634, %r20708, %r20602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + // begin inline asm + // chi + lop3.b32 %r30656, %r20646, %r20606, %r20662, 0xD2; + lop3.b32 %r30657, %r20650, %r20610, %r20666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + // begin inline asm + // chi + lop3.b32 %r30668, %r20606, %r20662, %r20638, 0xD2; + lop3.b32 %r30669, %r20610, %r20666, %r20642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30664, %r20662, %r20638, %r20614, 0xD2; + lop3.b32 %r30665, %r20666, %r20642, %r20618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + add.s64 %rd919, %rd918, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20734,%r20735}, [%rd919]; + // end inline asm + xor.b32 %r30658, %r20670, %r20734; + xor.b32 %r30659, %r20671, %r20735; + add.u64 %rd925, %SPL, 1912; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.u64 [%rd925], %rd354; + mov.u64 %rd926, 1179641; + st.local.u64 [%rd925+8], %rd926; + add.s32 %r20787, %r3326, 1; + st.local.u32 [%rd925+16], %r20787; + ld.global.u64 %rd927, [%rd223]; + ld.global.u64 %rd928, [%rd223+8]; + ld.global.u64 %rd929, [%rd223+16]; + ld.global.u64 %rd930, [%rd223+24]; + ld.global.u64 %rd931, [%rd223+32]; + ld.global.u64 %rd932, [%rd223+40]; + ld.global.u64 %rd933, [%rd223+48]; + ld.global.u64 %rd934, [%rd223+56]; + st.local.u64 [%rd925+32], %rd928; + st.local.u64 [%rd925+40], %rd929; + st.local.u64 [%rd925+48], %rd930; + st.local.u64 [%rd925+56], %rd931; + st.local.u64 [%rd925+64], %rd932; + st.local.u64 [%rd925+72], %rd933; + st.local.u64 [%rd925+80], %rd934; + cvt.u32.u64 %r20788, %rd927; + xor.b32 %r20789, %r20787, %r20788; + st.local.u64 
[%rd925+24], %rd927; + st.local.u32 [%rd925+24], %r20789; + mov.u32 %r30538, 0; + st.local.v2.u32 [%rd925+96], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+104], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+112], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+120], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+128], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+136], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+144], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+152], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+160], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+168], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+176], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+184], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+192], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+200], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+208], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+216], {%r30538, %r30538}; + mov.u32 %r30553, -2147483648; + st.local.v2.u32 [%rd925+88], {%r20750, %r30553}; + ld.local.v2.u32 {%r30574, %r30575}, [%rd925+24]; + mov.b64 {%r30572, %r30573}, %rd932; + shr.u64 %rd935, %rd928, 32; + cvt.u32.u64 %r30586, %rd928; + cvt.u32.u64 %r30587, %rd935; + shr.u64 %rd936, %rd933, 32; + cvt.u32.u64 %r30584, %rd933; + cvt.u32.u64 %r30585, %rd936; + shr.u64 %rd937, %rd929, 32; + cvt.u32.u64 %r30582, %rd929; + cvt.u32.u64 %r30583, %rd937; + shr.u64 %rd938, %rd934, 32; + cvt.u32.u64 %r30580, %rd934; + cvt.u32.u64 %r30581, %rd938; + shr.u64 %rd939, %rd930, 32; + cvt.u32.u64 %r30578, %rd930; + cvt.u32.u64 %r30579, %rd939; + shr.u64 %rd940, %rd931, 32; + cvt.u32.u64 %r30576, %rd931; + cvt.u32.u64 %r30577, %rd940; + mov.u32 %r30539, %r30538; + mov.u32 %r30540, %r30538; + mov.u32 %r30541, %r30538; + mov.u32 %r30542, %r30538; + mov.u32 %r30543, %r30538; + mov.u32 %r30544, %r30538; + mov.u32 %r30545, %r30538; + mov.u32 %r30546, %r30538; + mov.u32 %r30547, %r30538; + mov.u32 %r30548, %r30538; + mov.u32 %r30549, %r30538; + mov.u32 %r30550, %r30538; + mov.u32 %r30551, %r30538; + mov.u32 %r30552, %r20750; + mov.u32 %r30554, %r30538; + mov.u32 %r30555, %r30538; + mov.u32 %r30556, %r30538; + mov.u32 %r30557, %r30538; + mov.u32 %r30558, %r30538; + mov.u32 %r30559, %r30538; + mov.u32 %r30560, %r30538; + mov.u32 %r30561, %r30538; + mov.u32 %r30562, %r30538; + mov.u32 %r30563, %r30538; + mov.u32 %r30564, %r30538; + mov.u32 %r30565, %r30538; + mov.u32 %r30566, %r30538; + mov.u32 %r30567, %r30538; + mov.u32 %r30568, %r30538; + mov.u32 %r30569, %r30538; + mov.u32 %r30570, %r30538; + mov.u32 %r30571, %r30538; + mov.u32 %r30588, %r30538; + +$L__BB2_75: + // begin inline asm + // xor5 + lop3.b32 %r20792, %r30574, %r30572, %r30570, 0x96; + lop3.b32 %r20792, %r20792, %r30568, %r30566, 0x96; + lop3.b32 %r20793, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r20793, %r20793, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20804, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r20804, %r20804, %r30562, %r30560, 0x96; + lop3.b32 %r20805, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r20805, %r20805, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20816, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r20816, %r20816, %r30556, %r30554, 0x96; + lop3.b32 %r20817, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r20817, %r20817, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20828, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r20828, %r20828, %r30548, %r30546, 0x96; + lop3.b32 %r20829, %r30579, %r30553, %r30551, 
0x96; + lop3.b32 %r20829, %r20829, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20840, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r20840, %r20840, %r30540, %r30538, 0x96; + lop3.b32 %r20841, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r20841, %r20841, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20852, %r20805, %r20804, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20856, %r20804, %r20805, %r20750; + // end inline asm + xor.b32 %r21286, %r20852, %r20840; + xor.b32 %r21287, %r20856, %r20841; + xor.b32 %r21119, %r30574, %r21286; + xor.b32 %r21122, %r30575, %r21287; + xor.b32 %r21026, %r30572, %r21286; + xor.b32 %r21025, %r30573, %r21287; + xor.b32 %r21073, %r30570, %r21286; + xor.b32 %r21074, %r30571, %r21287; + xor.b32 %r20978, %r30568, %r21286; + xor.b32 %r20977, %r30569, %r21287; + xor.b32 %r20929, %r30566, %r21286; + xor.b32 %r20930, %r30567, %r21287; + // begin inline asm + shf.l.wrap.b32 %r20860, %r20817, %r20816, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20864, %r20816, %r20817, %r20750; + // end inline asm + xor.b32 %r21288, %r20860, %r20792; + xor.b32 %r21289, %r20864, %r20793; + xor.b32 %r21081, %r30586, %r21288; + xor.b32 %r21082, %r30587, %r21289; + xor.b32 %r20898, %r30584, %r21288; + xor.b32 %r20897, %r30585, %r21289; + xor.b32 %r21057, %r30564, %r21288; + xor.b32 %r21058, %r30565, %r21289; + xor.b32 %r21018, %r30562, %r21288; + xor.b32 %r21017, %r30563, %r21289; + xor.b32 %r21001, %r30560, %r21288; + xor.b32 %r21002, %r30561, %r21289; + // begin inline asm + shf.l.wrap.b32 %r20868, %r20829, %r20828, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20872, %r20828, %r20829, %r20750; + // end inline asm + xor.b32 %r21290, %r20868, %r20804; + xor.b32 %r21291, %r20872, %r20805; + xor.b32 %r20938, %r30582, %r21290; + xor.b32 %r20937, %r30583, %r21291; + xor.b32 %r21065, %r30580, %r21290; + xor.b32 %r21066, %r30581, %r21291; + xor.b32 %r20946, %r30558, %r21290; + xor.b32 %r20945, %r30559, %r21291; + xor.b32 %r21049, %r30556, %r21290; + xor.b32 %r21050, %r30557, %r21291; + xor.b32 %r20914, %r30554, %r21290; + xor.b32 %r20913, %r30555, %r21291; + // begin inline asm + shf.l.wrap.b32 %r20876, %r20841, %r20840, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20880, %r20840, %r20841, %r20750; + // end inline asm + xor.b32 %r21292, %r20876, %r20816; + xor.b32 %r21293, %r20880, %r20817; + xor.b32 %r21033, %r30578, %r21292; + xor.b32 %r21034, %r30579, %r21293; + xor.b32 %r21010, %r30552, %r21292; + xor.b32 %r21009, %r30553, %r21293; + xor.b32 %r20953, %r30550, %r21292; + xor.b32 %r20954, %r30551, %r21293; + xor.b32 %r21041, %r30548, %r21292; + xor.b32 %r21042, %r30549, %r21293; + xor.b32 %r20970, %r30546, %r21292; + xor.b32 %r20969, %r30547, %r21293; + // begin inline asm + shf.l.wrap.b32 %r20884, %r20793, %r20792, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20888, %r20792, %r20793, %r20750; + // end inline asm + xor.b32 %r21294, %r20884, %r20828; + xor.b32 %r21295, %r20888, %r20829; + xor.b32 %r20985, %r30576, %r21294; + xor.b32 %r20986, %r30577, %r21295; + xor.b32 %r20905, %r30544, %r21294; + xor.b32 %r20906, %r30545, %r21295; + xor.b32 %r20922, %r30542, %r21294; + xor.b32 %r20921, %r30543, %r21295; + xor.b32 %r20961, %r30540, %r21294; + xor.b32 %r20962, %r30541, %r21295; + xor.b32 %r20993, %r30538, %r21294; + xor.b32 %r20994, %r30539, %r21295; + mov.u32 %r20899, 44; + // begin inline 
asm + shf.l.wrap.b32 %r20892, %r20898, %r20897, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20896, %r20897, %r20898, %r20899; + // end inline asm + mov.u32 %r20907, 20; + // begin inline asm + shf.l.wrap.b32 %r20900, %r20906, %r20905, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20904, %r20905, %r20906, %r20907; + // end inline asm + mov.u32 %r20915, 61; + // begin inline asm + shf.l.wrap.b32 %r20908, %r20914, %r20913, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20912, %r20913, %r20914, %r20915; + // end inline asm + mov.u32 %r20923, 39; + // begin inline asm + shf.l.wrap.b32 %r20916, %r20922, %r20921, %r20923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20920, %r20921, %r20922, %r20923; + // end inline asm + mov.u32 %r20931, 18; + // begin inline asm + shf.l.wrap.b32 %r20924, %r20930, %r20929, %r20931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20928, %r20929, %r20930, %r20931; + // end inline asm + mov.u32 %r20939, 62; + // begin inline asm + shf.l.wrap.b32 %r20932, %r20938, %r20937, %r20939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20936, %r20937, %r20938, %r20939; + // end inline asm + mov.u32 %r20947, 43; + // begin inline asm + shf.l.wrap.b32 %r20940, %r20946, %r20945, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20944, %r20945, %r20946, %r20947; + // end inline asm + mov.u32 %r20955, 25; + // begin inline asm + shf.l.wrap.b32 %r20948, %r20954, %r20953, %r20955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20952, %r20953, %r20954, %r20955; + // end inline asm + mov.u32 %r20963, 8; + // begin inline asm + shf.l.wrap.b32 %r20956, %r20962, %r20961, %r20963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20960, %r20961, %r20962, %r20963; + // end inline asm + mov.u32 %r20971, 56; + // begin inline asm + shf.l.wrap.b32 %r20964, %r20970, %r20969, %r20971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20968, %r20969, %r20970, %r20971; + // end inline asm + mov.u32 %r20979, 41; + // begin inline asm + shf.l.wrap.b32 %r20972, %r20978, %r20977, %r20979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20976, %r20977, %r20978, %r20979; + // end inline asm + mov.u32 %r20987, 27; + // begin inline asm + shf.l.wrap.b32 %r20980, %r20986, %r20985, %r20987; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20984, %r20985, %r20986, %r20987; + // end inline asm + mov.u32 %r20995, 14; + // begin inline asm + shf.l.wrap.b32 %r20988, %r20994, %r20993, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20992, %r20993, %r20994, %r20995; + // end inline asm + mov.u32 %r21003, 2; + // begin inline asm + shf.l.wrap.b32 %r20996, %r21002, %r21001, %r21003; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21000, %r21001, %r21002, %r21003; + // end inline asm + mov.u32 %r21011, 55; + // begin inline asm + shf.l.wrap.b32 %r21004, %r21010, %r21009, %r21011; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21008, %r21009, %r21010, %r21011; + // end inline asm + mov.u32 %r21019, 45; + // begin inline asm + shf.l.wrap.b32 %r21012, %r21018, %r21017, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21016, %r21017, %r21018, %r21019; + // end inline asm + mov.u32 %r21027, 36; + // begin inline asm + shf.l.wrap.b32 %r21020, %r21026, %r21025, %r21027; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21024, %r21025, %r21026, 
%r21027; + // end inline asm + mov.u32 %r21035, 28; + // begin inline asm + shf.l.wrap.b32 %r21028, %r21034, %r21033, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21032, %r21033, %r21034, %r21035; + // end inline asm + mov.u32 %r21043, 21; + // begin inline asm + shf.l.wrap.b32 %r21036, %r21042, %r21041, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21040, %r21041, %r21042, %r21043; + // end inline asm + mov.u32 %r21051, 15; + // begin inline asm + shf.l.wrap.b32 %r21044, %r21050, %r21049, %r21051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21048, %r21049, %r21050, %r21051; + // end inline asm + mov.u32 %r21059, 10; + // begin inline asm + shf.l.wrap.b32 %r21052, %r21058, %r21057, %r21059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21056, %r21057, %r21058, %r21059; + // end inline asm + mov.u32 %r21067, 6; + // begin inline asm + shf.l.wrap.b32 %r21060, %r21066, %r21065, %r21067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21064, %r21065, %r21066, %r21067; + // end inline asm + mov.u32 %r21075, 3; + // begin inline asm + shf.l.wrap.b32 %r21068, %r21074, %r21073, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21072, %r21073, %r21074, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21076, %r21082, %r21081, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21080, %r21081, %r21082, %r20750; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21084, %r21119, %r20892, %r20940, 0xD2; + lop3.b32 %r21085, %r21122, %r20896, %r20944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30586, %r20892, %r20940, %r21036, 0xD2; + lop3.b32 %r30587, %r20896, %r20944, %r21040, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30582, %r20940, %r21036, %r20988, 0xD2; + lop3.b32 %r30583, %r20944, %r21040, %r20992, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30578, %r21036, %r20988, %r21119, 0xD2; + lop3.b32 %r30579, %r21040, %r20992, %r21122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30576, %r20988, %r21119, %r20892, 0xD2; + lop3.b32 %r30577, %r20992, %r21122, %r20896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30572, %r21028, %r20900, %r21068, 0xD2; + lop3.b32 %r30573, %r21032, %r20904, %r21072, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30584, %r20900, %r21068, %r21012, 0xD2; + lop3.b32 %r30585, %r20904, %r21072, %r21016, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30580, %r21068, %r21012, %r20908, 0xD2; + lop3.b32 %r30581, %r21072, %r21016, %r20912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30552, %r21012, %r20908, %r21028, 0xD2; + lop3.b32 %r30553, %r21016, %r20912, %r21032, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30552, %r30553}; + // begin inline asm + // chi + lop3.b32 %r30544, %r20908, %r21028, %r20900, 0xD2; + lop3.b32 %r30545, %r20912, %r21032, %r20904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30544, %r30545}; + // begin inline asm + // chi + lop3.b32 %r30570, %r21076, %r21060, %r20948, 0xD2; + lop3.b32 %r30571, %r21080, %r21064, %r20952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30570, %r30571}; + // begin inline asm + // chi + lop3.b32 %r30564, %r21060, %r20948, %r20956, 0xD2; + lop3.b32 %r30565, %r21064, %r20952, %r20960, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd925+112], {%r30564, %r30565}; + // begin inline asm + // chi + lop3.b32 %r30558, %r20948, %r20956, %r20924, 0xD2; + lop3.b32 %r30559, %r20952, %r20960, %r20928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30558, %r30559}; + // begin inline asm + // chi + lop3.b32 %r30550, %r20956, %r20924, %r21076, 0xD2; + lop3.b32 %r30551, %r20960, %r20928, %r21080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30550, %r30551}; + // begin inline asm + // chi + lop3.b32 %r30542, %r20924, %r21076, %r21060, 0xD2; + lop3.b32 %r30543, %r20928, %r21080, %r21064, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30542, %r30543}; + // begin inline asm + // chi + lop3.b32 %r30568, %r20980, %r21020, %r21052, 0xD2; + lop3.b32 %r30569, %r20984, %r21024, %r21056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30568, %r30569}; + // begin inline asm + // chi + lop3.b32 %r30562, %r21020, %r21052, %r21044, 0xD2; + lop3.b32 %r30563, %r21024, %r21056, %r21048, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30562, %r30563}; + // begin inline asm + // chi + lop3.b32 %r30556, %r21052, %r21044, %r20964, 0xD2; + lop3.b32 %r30557, %r21056, %r21048, %r20968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30556, %r30557}; + // begin inline asm + // chi + lop3.b32 %r30548, %r21044, %r20964, %r20980, 0xD2; + lop3.b32 %r30549, %r21048, %r20968, %r20984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30548, %r30549}; + // begin inline asm + // chi + lop3.b32 %r30540, %r20964, %r20980, %r21020, 0xD2; + lop3.b32 %r30541, %r20968, %r20984, %r21024, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30540, %r30541}; + // begin inline asm + // chi + lop3.b32 %r30566, %r20932, %r21004, %r20916, 0xD2; + lop3.b32 %r30567, %r20936, %r21008, %r20920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30566, %r30567}; + // begin inline asm + // chi + lop3.b32 %r30560, %r21004, %r20916, %r20972, 0xD2; + lop3.b32 %r30561, %r21008, %r20920, %r20976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30560, %r30561}; + // begin inline asm + // chi + lop3.b32 %r30554, %r20916, %r20972, %r20996, 0xD2; + lop3.b32 %r30555, %r20920, %r20976, %r21000, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30554, %r30555}; + // begin inline asm + // chi + lop3.b32 %r30546, %r20972, %r20996, %r20932, 0xD2; + lop3.b32 %r30547, %r20976, %r21000, %r20936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30546, %r30547}; + // begin inline asm + // chi + lop3.b32 %r30538, %r20996, %r20932, %r21004, 0xD2; + lop3.b32 %r30539, %r21000, %r20936, %r21008, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30538, %r30539}; + mul.wide.s32 %rd944, %r30588, 8; + add.s64 %rd941, %rd918, %rd944; + // begin inline asm + ld.global.nc.v2.u32 {%r21284,%r21285}, [%rd941]; + // end inline asm + xor.b32 %r30574, %r21084, %r21284; + xor.b32 %r30575, %r21085, %r21285; + add.s32 %r30588, %r30588, 1; + setp.lt.u32 %p43, %r30588, 23; + @%p43 bra $L__BB2_75; + + mov.u32 %r30621, 0; + mov.u32 %r21395, 1; + st.local.v2.u32 [%rd925+32], {%r30586, %r30587}; + st.local.v2.u32 [%rd925+72], {%r30584, %r30585}; + st.local.v2.u32 [%rd925+40], {%r30582, %r30583}; + st.local.v2.u32 [%rd925+80], {%r30580, %r30581}; + st.local.v2.u32 [%rd925+48], {%r30578, %r30579}; + st.local.v2.u32 [%rd925+56], {%r30576, %r30577}; + st.local.v2.u32 [%rd925+24], {%r30574, %r30575}; + // begin inline asm + // xor5 + lop3.b32 %r21296, %r30574, %r30572, 
%r30570, 0x96; + lop3.b32 %r21296, %r21296, %r30568, %r30566, 0x96; + lop3.b32 %r21297, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r21297, %r21297, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21308, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r21308, %r21308, %r30562, %r30560, 0x96; + lop3.b32 %r21309, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r21309, %r21309, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21320, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r21320, %r21320, %r30556, %r30554, 0x96; + lop3.b32 %r21321, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r21321, %r21321, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21332, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r21332, %r21332, %r30548, %r30546, 0x96; + lop3.b32 %r21333, %r30579, %r30553, %r30551, 0x96; + lop3.b32 %r21333, %r21333, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21344, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r21344, %r21344, %r30540, %r30538, 0x96; + lop3.b32 %r21345, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r21345, %r21345, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21356, %r21309, %r21308, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21360, %r21308, %r21309, %r21395; + // end inline asm + xor.b32 %r21535, %r21356, %r21344; + xor.b32 %r21536, %r21360, %r21345; + xor.b32 %r21503, %r30574, %r21535; + xor.b32 %r21506, %r30575, %r21536; + xor.b32 %r21466, %r30571, %r21536; + xor.b32 %r21465, %r30570, %r21535; + st.local.v2.u32 [%rd925+104], {%r21465, %r21466}; + // begin inline asm + shf.l.wrap.b32 %r21364, %r21321, %r21320, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21368, %r21320, %r21321, %r21395; + // end inline asm + xor.b32 %r21537, %r21364, %r21296; + xor.b32 %r21538, %r21368, %r21297; + xor.b32 %r21402, %r30584, %r21537; + xor.b32 %r21401, %r30585, %r21538; + xor.b32 %r21441, %r30563, %r21538; + xor.b32 %r21442, %r30562, %r21537; + st.local.v2.u32 [%rd925+152], {%r21442, %r21441}; + // begin inline asm + shf.l.wrap.b32 %r21372, %r21333, %r21332, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21376, %r21332, %r21333, %r21395; + // end inline asm + xor.b32 %r21539, %r21372, %r21308; + xor.b32 %r21540, %r21376, %r21309; + xor.b32 %r21425, %r30559, %r21540; + xor.b32 %r21426, %r30558, %r21539; + st.local.v2.u32 [%rd925+120], {%r21426, %r21425}; + xor.b32 %r21417, %r30555, %r21540; + xor.b32 %r21418, %r30554, %r21539; + st.local.v2.u32 [%rd925+200], {%r21418, %r21417}; + // begin inline asm + shf.l.wrap.b32 %r21380, %r21345, %r21344, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21384, %r21344, %r21345, %r21395; + // end inline asm + xor.b32 %r21541, %r21380, %r21320; + xor.b32 %r21542, %r21384, %r21321; + xor.b32 %r21449, %r30578, %r21541; + xor.b32 %r21450, %r30579, %r21542; + xor.b32 %r21458, %r30549, %r21542; + xor.b32 %r21457, %r30548, %r21541; + st.local.v2.u32 [%rd925+168], {%r21457, %r21458}; + // begin inline asm + shf.l.wrap.b32 %r21388, %r21297, %r21296, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21392, %r21296, %r21297, %r21395; + // end inline asm + xor.b32 %r21543, %r21388, %r21332; + xor.b32 %r21544, %r21392, %r21333; + xor.b32 %r21409, %r30544, %r21543; + xor.b32 %r21410, %r30545, %r21544; + xor.b32 %r21434, %r30539, %r21544; + xor.b32 %r21433, 
%r30538, %r21543; + st.local.v2.u32 [%rd925+216], {%r21433, %r21434}; + // begin inline asm + shf.l.wrap.b32 %r21396, %r21402, %r21401, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21400, %r21401, %r21402, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21404, %r21410, %r21409, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21408, %r21409, %r21410, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21416, %r21417, %r21418, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21412, %r21418, %r21417, %r20915; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r21412, %r21416}; + // begin inline asm + shf.l.wrap.b32 %r21420, %r21426, %r21425, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21424, %r21425, %r21426, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21428, %r21434, %r21433, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21432, %r21433, %r21434, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21440, %r21441, %r21442, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21436, %r21442, %r21441, %r21019; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r21436, %r21440}; + // begin inline asm + shf.l.wrap.b32 %r21444, %r21450, %r21449, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21448, %r21449, %r21450, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21452, %r21458, %r21457, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21456, %r21457, %r21458, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21460, %r21466, %r21465, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21464, %r21465, %r21466, %r21075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21468, %r21503, %r21396, %r21420, 0xD2; + lop3.b32 %r21469, %r21506, %r21400, %r21424, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r21396, %r21420, %r21452, 0xD2; + lop3.b32 %r30722, %r21400, %r21424, %r21456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30717, %r21420, %r21452, %r21428, 0xD2; + lop3.b32 %r30718, %r21424, %r21456, %r21432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + // begin inline asm + // chi + lop3.b32 %r30713, %r21452, %r21428, %r21503, 0xD2; + lop3.b32 %r30714, %r21456, %r21432, %r21506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + // begin inline asm + // chi + lop3.b32 %r30711, %r21428, %r21503, %r21396, 0xD2; + lop3.b32 %r30712, %r21432, %r21506, %r21400, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + // begin inline asm + // chi + lop3.b32 %r30707, %r21444, %r21404, %r21460, 0xD2; + lop3.b32 %r30708, %r21448, %r21408, %r21464, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + // begin inline asm + // chi + lop3.b32 %r30719, %r21404, %r21460, %r21436, 0xD2; + lop3.b32 %r30720, %r21408, %r21464, %r21440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30715, %r21460, %r21436, %r21412, 0xD2; + lop3.b32 %r30716, %r21464, %r21440, %r21416, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + // begin inline asm + ld.global.nc.v2.u32 
{%r21532,%r21533}, [%rd919]; + // end inline asm + xor.b32 %r30709, %r21468, %r21532; + xor.b32 %r30710, %r21469, %r21533; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + add.s64 %rd244, %rd925, 24; + add.s64 %rd245, %rd3, 24; + +$L__BB2_77: + shl.b32 %r21545, %r30621, 2; + cvt.u64.u32 %rd952, %r21545; + and.b64 %rd953, %rd952, 60; + add.s64 %rd954, %rd245, %rd953; + xor.b32 %r21546, %r3326, %r30621; + mul.lo.s32 %r21547, %r21546, 16777619; + ld.local.u32 %r21548, [%rd954]; + xor.b32 %r21549, %r21547, %r21548; + mul.wide.u32 %rd955, %r21549, -954391867; + shr.u64 %rd956, %rd955, 32; + cvt.u32.u64 %r21550, %rd956; + sub.s32 %r21551, %r21549, %r21550; + shr.u32 %r21552, %r21551, 1; + add.s32 %r21553, %r21552, %r21550; + shr.u32 %r21554, %r21553, 20; + mul.lo.s32 %r21555, %r21554, 1179641; + sub.s32 %r21556, %r21549, %r21555; + mul.wide.u32 %rd957, %r21556, 64; + add.s64 %rd958, %rd471, %rd957; + mul.lo.s32 %r21557, %r30658, 16777619; + ld.global.u32 %r21558, [%rd958]; + xor.b32 %r30658, %r21557, %r21558; + mul.lo.s32 %r21559, %r30659, 16777619; + ld.global.u32 %r21560, [%rd958+4]; + xor.b32 %r30659, %r21559, %r21560; + mul.lo.s32 %r21561, %r30670, 16777619; + ld.global.u32 %r21562, [%rd958+8]; + mul.lo.s32 %r21563, %r30671, 16777619; + ld.global.u32 %r21564, [%rd958+12]; + xor.b32 %r21565, %r21563, %r21564; + xor.b32 %r30670, %r21561, %r21562; + mov.b64 %rd959, {%r30670, %r21565}; + mul.lo.s32 %r21566, %r30666, 16777619; + ld.global.u32 %r21567, [%rd958+16]; + mul.lo.s32 %r21568, %r30667, 16777619; + ld.global.u32 %r21569, [%rd958+20]; + xor.b32 %r21570, %r21568, %r21569; + xor.b32 %r30666, %r21566, %r21567; + mov.b64 %rd960, {%r30666, %r21570}; + mul.lo.s32 %r21571, %r30662, 16777619; + ld.global.u32 %r21572, [%rd958+24]; + mul.lo.s32 %r21573, %r30663, 16777619; + ld.global.u32 %r21574, [%rd958+28]; + xor.b32 %r21575, %r21573, %r21574; + xor.b32 %r30662, %r21571, %r21572; + mov.b64 %rd961, {%r30662, %r21575}; + mul.lo.s32 %r21576, %r30660, 16777619; + ld.global.u32 %r21577, [%rd958+32]; + mul.lo.s32 %r21578, %r30661, 16777619; + ld.global.u32 %r21579, [%rd958+36]; + xor.b32 %r21580, %r21578, %r21579; + xor.b32 %r30660, %r21576, %r21577; + mov.b64 %rd962, {%r30660, %r21580}; + mul.lo.s32 %r21581, %r30656, 16777619; + ld.global.u32 %r21582, [%rd958+40]; + xor.b32 %r30656, %r21581, %r21582; + mul.lo.s32 %r21583, %r30657, 16777619; + ld.global.u32 %r21584, [%rd958+44]; + xor.b32 %r30657, %r21583, %r21584; + mul.lo.s32 %r21585, %r30668, 16777619; + ld.global.u32 %r21586, [%rd958+48]; + mul.lo.s32 %r21587, %r30669, 16777619; + ld.global.u32 %r21588, [%rd958+52]; + xor.b32 %r21589, %r21587, %r21588; + xor.b32 %r30668, %r21585, %r21586; + mov.b64 %rd963, {%r30668, %r21589}; + mul.lo.s32 %r21590, %r30664, 16777619; + ld.global.u32 %r21591, [%rd958+56]; + mul.lo.s32 %r21592, %r30665, 16777619; + ld.global.u32 %r21593, [%rd958+60]; + xor.b32 %r21594, %r21592, %r21593; + xor.b32 %r30664, %r21590, %r21591; + mov.b64 %rd964, {%r30664, %r21594}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.v2.u32 [%rd3+32], {%r30670, %r21565}; + st.local.v2.u32 [%rd3+40], {%r30666, %r21570}; + st.local.v2.u32 [%rd3+48], {%r30662, %r21575}; + st.local.v2.u32 [%rd3+56], {%r30660, %r21580}; + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + st.local.v2.u32 [%rd3+72], {%r30668, %r21589}; + st.local.v2.u32 [%rd3+80], {%r30664, %r21594}; + add.s64 %rd965, %rd244, %rd953; + xor.b32 %r21595, %r20787, %r30621; + mul.lo.s32 %r21596, %r21595, 16777619; + ld.local.u32 %r21597, [%rd965]; + xor.b32 %r21598, %r21596, 
%r21597; + mul.wide.u32 %rd966, %r21598, -954391867; + shr.u64 %rd967, %rd966, 32; + cvt.u32.u64 %r21599, %rd967; + sub.s32 %r21600, %r21598, %r21599; + shr.u32 %r21601, %r21600, 1; + add.s32 %r21602, %r21601, %r21599; + shr.u32 %r21603, %r21602, 20; + mul.lo.s32 %r21604, %r21603, 1179641; + sub.s32 %r21605, %r21598, %r21604; + mul.wide.u32 %rd968, %r21605, 64; + add.s64 %rd969, %rd471, %rd968; + mul.lo.s32 %r21606, %r30709, 16777619; + ld.global.u32 %r21607, [%rd969]; + xor.b32 %r30709, %r21606, %r21607; + mul.lo.s32 %r21608, %r30710, 16777619; + ld.global.u32 %r21609, [%rd969+4]; + xor.b32 %r30710, %r21608, %r21609; + mul.lo.s32 %r21610, %r30721, 16777619; + ld.global.u32 %r21611, [%rd969+8]; + mul.lo.s32 %r21612, %r30722, 16777619; + ld.global.u32 %r21613, [%rd969+12]; + xor.b32 %r21614, %r21612, %r21613; + xor.b32 %r30721, %r21610, %r21611; + mov.b64 %rd970, {%r30721, %r21614}; + mul.lo.s32 %r21615, %r30717, 16777619; + ld.global.u32 %r21616, [%rd969+16]; + mul.lo.s32 %r21617, %r30718, 16777619; + ld.global.u32 %r21618, [%rd969+20]; + xor.b32 %r21619, %r21617, %r21618; + xor.b32 %r30717, %r21615, %r21616; + mov.b64 %rd971, {%r30717, %r21619}; + mul.lo.s32 %r21620, %r30713, 16777619; + ld.global.u32 %r21621, [%rd969+24]; + mul.lo.s32 %r21622, %r30714, 16777619; + ld.global.u32 %r21623, [%rd969+28]; + xor.b32 %r21624, %r21622, %r21623; + xor.b32 %r30713, %r21620, %r21621; + mov.b64 %rd972, {%r30713, %r21624}; + mul.lo.s32 %r21625, %r30711, 16777619; + ld.global.u32 %r21626, [%rd969+32]; + mul.lo.s32 %r21627, %r30712, 16777619; + ld.global.u32 %r21628, [%rd969+36]; + xor.b32 %r21629, %r21627, %r21628; + xor.b32 %r30711, %r21625, %r21626; + mov.b64 %rd973, {%r30711, %r21629}; + mul.lo.s32 %r21630, %r30707, 16777619; + ld.global.u32 %r21631, [%rd969+40]; + xor.b32 %r30707, %r21630, %r21631; + mul.lo.s32 %r21632, %r30708, 16777619; + ld.global.u32 %r21633, [%rd969+44]; + xor.b32 %r30708, %r21632, %r21633; + mul.lo.s32 %r21634, %r30719, 16777619; + ld.global.u32 %r21635, [%rd969+48]; + mul.lo.s32 %r21636, %r30720, 16777619; + ld.global.u32 %r21637, [%rd969+52]; + xor.b32 %r21638, %r21636, %r21637; + xor.b32 %r30719, %r21634, %r21635; + mov.b64 %rd974, {%r30719, %r21638}; + mul.lo.s32 %r21639, %r30715, 16777619; + ld.global.u32 %r21640, [%rd969+56]; + mul.lo.s32 %r21641, %r30716, 16777619; + ld.global.u32 %r21642, [%rd969+60]; + xor.b32 %r21643, %r21641, %r21642; + xor.b32 %r30715, %r21639, %r21640; + mov.b64 %rd975, {%r30715, %r21643}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + st.local.v2.u32 [%rd925+32], {%r30721, %r21614}; + st.local.v2.u32 [%rd925+40], {%r30717, %r21619}; + st.local.v2.u32 [%rd925+48], {%r30713, %r21624}; + st.local.v2.u32 [%rd925+56], {%r30711, %r21629}; + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + st.local.v2.u32 [%rd925+72], {%r30719, %r21638}; + st.local.v2.u32 [%rd925+80], {%r30715, %r21643}; + add.s32 %r30621, %r30621, 1; + setp.lt.u32 %p44, %r30621, 512; + shr.u64 %rd976, %rd959, 32; + cvt.u32.u64 %r30671, %rd976; + shr.u64 %rd977, %rd960, 32; + cvt.u32.u64 %r30667, %rd977; + shr.u64 %rd978, %rd961, 32; + cvt.u32.u64 %r30663, %rd978; + shr.u64 %rd979, %rd962, 32; + cvt.u32.u64 %r30661, %rd979; + shr.u64 %rd980, %rd963, 32; + cvt.u32.u64 %r30669, %rd980; + shr.u64 %rd981, %rd964, 32; + cvt.u32.u64 %r30665, %rd981; + shr.u64 %rd982, %rd970, 32; + cvt.u32.u64 %r30722, %rd982; + shr.u64 %rd983, %rd971, 32; + cvt.u32.u64 %r30718, %rd983; + shr.u64 %rd984, %rd972, 32; + cvt.u32.u64 %r30714, %rd984; + shr.u64 %rd985, %rd973, 32; + cvt.u32.u64 %r30712, 
%rd985; + shr.u64 %rd986, %rd974, 32; + cvt.u32.u64 %r30720, %rd986; + shr.u64 %rd987, %rd975, 32; + cvt.u32.u64 %r30716, %rd987; + @%p44 bra $L__BB2_77; + + mov.u32 %r30622, 0; + st.local.v2.u32 [%rd3+96], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+104], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+112], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+120], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+128], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+136], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+144], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+152], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+160], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+168], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+176], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+184], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+192], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+200], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+208], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+216], {%r30622, %r30622}; + mov.u32 %r30637, -2147483648; + mov.u32 %r21658, 1; + st.local.v2.u32 [%rd3+88], {%r21658, %r30637}; + mov.u32 %r30623, %r30622; + mov.u32 %r30624, %r30622; + mov.u32 %r30625, %r30622; + mov.u32 %r30626, %r30622; + mov.u32 %r30627, %r30622; + mov.u32 %r30628, %r30622; + mov.u32 %r30629, %r30622; + mov.u32 %r30630, %r30622; + mov.u32 %r30631, %r30622; + mov.u32 %r30632, %r30622; + mov.u32 %r30633, %r30622; + mov.u32 %r30634, %r30622; + mov.u32 %r30635, %r30622; + mov.u32 %r30636, %r21658; + mov.u32 %r30638, %r30622; + mov.u32 %r30639, %r30622; + mov.u32 %r30640, %r30622; + mov.u32 %r30641, %r30622; + mov.u32 %r30642, %r30622; + mov.u32 %r30643, %r30622; + mov.u32 %r30644, %r30622; + mov.u32 %r30645, %r30622; + mov.u32 %r30646, %r30622; + mov.u32 %r30647, %r30622; + mov.u32 %r30648, %r30622; + mov.u32 %r30649, %r30622; + mov.u32 %r30650, %r30622; + mov.u32 %r30651, %r30622; + mov.u32 %r30652, %r30622; + mov.u32 %r30653, %r30622; + mov.u32 %r30654, %r30622; + mov.u32 %r30655, %r30622; + mov.u32 %r30672, %r30622; + +$L__BB2_79: + // begin inline asm + // xor5 + lop3.b32 %r21685, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r21685, %r21685, %r30652, %r30650, 0x96; + lop3.b32 %r21686, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r21686, %r21686, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21697, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r21697, %r21697, %r30646, %r30644, 0x96; + lop3.b32 %r21698, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r21698, %r21698, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21709, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r21709, %r21709, %r30640, %r30638, 0x96; + lop3.b32 %r21710, %r30667, %r30665, %r30643, 0x96; + lop3.b32 %r21710, %r21710, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21721, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r21721, %r21721, %r30632, %r30630, 0x96; + lop3.b32 %r21722, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r21722, %r21722, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21733, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r21733, %r21733, %r30624, %r30622, 0x96; + lop3.b32 %r21734, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r21734, %r21734, %r30625, %r30623, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21745, %r21698, %r21697, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21749, %r21697, %r21698, %r21658; + // end inline asm + xor.b32 %r22179, 
%r21745, %r21733; + xor.b32 %r22180, %r21749, %r21734; + xor.b32 %r22012, %r30658, %r22179; + xor.b32 %r22015, %r30659, %r22180; + xor.b32 %r21919, %r30656, %r22179; + xor.b32 %r21918, %r30657, %r22180; + xor.b32 %r21966, %r30654, %r22179; + xor.b32 %r21967, %r30655, %r22180; + xor.b32 %r21871, %r30652, %r22179; + xor.b32 %r21870, %r30653, %r22180; + xor.b32 %r21822, %r30650, %r22179; + xor.b32 %r21823, %r30651, %r22180; + // begin inline asm + shf.l.wrap.b32 %r21753, %r21710, %r21709, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21757, %r21709, %r21710, %r21658; + // end inline asm + xor.b32 %r22181, %r21753, %r21685; + xor.b32 %r22182, %r21757, %r21686; + xor.b32 %r21974, %r30670, %r22181; + xor.b32 %r21975, %r30671, %r22182; + xor.b32 %r21791, %r30668, %r22181; + xor.b32 %r21790, %r30669, %r22182; + xor.b32 %r21950, %r30648, %r22181; + xor.b32 %r21951, %r30649, %r22182; + xor.b32 %r21911, %r30646, %r22181; + xor.b32 %r21910, %r30647, %r22182; + xor.b32 %r21894, %r30644, %r22181; + xor.b32 %r21895, %r30645, %r22182; + // begin inline asm + shf.l.wrap.b32 %r21761, %r21722, %r21721, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21765, %r21721, %r21722, %r21658; + // end inline asm + xor.b32 %r22183, %r21761, %r21697; + xor.b32 %r22184, %r21765, %r21698; + xor.b32 %r21831, %r30666, %r22183; + xor.b32 %r21830, %r30667, %r22184; + xor.b32 %r21958, %r30664, %r22183; + xor.b32 %r21959, %r30665, %r22184; + xor.b32 %r21839, %r30642, %r22183; + xor.b32 %r21838, %r30643, %r22184; + xor.b32 %r21942, %r30640, %r22183; + xor.b32 %r21943, %r30641, %r22184; + xor.b32 %r21807, %r30638, %r22183; + xor.b32 %r21806, %r30639, %r22184; + // begin inline asm + shf.l.wrap.b32 %r21769, %r21734, %r21733, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21773, %r21733, %r21734, %r21658; + // end inline asm + xor.b32 %r22185, %r21769, %r21709; + xor.b32 %r22186, %r21773, %r21710; + xor.b32 %r21926, %r30662, %r22185; + xor.b32 %r21927, %r30663, %r22186; + xor.b32 %r21903, %r30636, %r22185; + xor.b32 %r21902, %r30637, %r22186; + xor.b32 %r21846, %r30634, %r22185; + xor.b32 %r21847, %r30635, %r22186; + xor.b32 %r21934, %r30632, %r22185; + xor.b32 %r21935, %r30633, %r22186; + xor.b32 %r21863, %r30630, %r22185; + xor.b32 %r21862, %r30631, %r22186; + // begin inline asm + shf.l.wrap.b32 %r21777, %r21686, %r21685, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21781, %r21685, %r21686, %r21658; + // end inline asm + xor.b32 %r22187, %r21777, %r21721; + xor.b32 %r22188, %r21781, %r21722; + xor.b32 %r21878, %r30660, %r22187; + xor.b32 %r21879, %r30661, %r22188; + xor.b32 %r21798, %r30628, %r22187; + xor.b32 %r21799, %r30629, %r22188; + xor.b32 %r21815, %r30626, %r22187; + xor.b32 %r21814, %r30627, %r22188; + xor.b32 %r21854, %r30624, %r22187; + xor.b32 %r21855, %r30625, %r22188; + xor.b32 %r21886, %r30622, %r22187; + xor.b32 %r21887, %r30623, %r22188; + mov.u32 %r21792, 44; + // begin inline asm + shf.l.wrap.b32 %r21785, %r21791, %r21790, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21789, %r21790, %r21791, %r21792; + // end inline asm + mov.u32 %r21800, 20; + // begin inline asm + shf.l.wrap.b32 %r21793, %r21799, %r21798, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21797, %r21798, %r21799, %r21800; + // end inline asm + mov.u32 %r21808, 61; + // begin inline asm + shf.l.wrap.b32 %r21801, %r21807, %r21806, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21805, 
%r21806, %r21807, %r21808; + // end inline asm + mov.u32 %r21816, 39; + // begin inline asm + shf.l.wrap.b32 %r21809, %r21815, %r21814, %r21816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21813, %r21814, %r21815, %r21816; + // end inline asm + mov.u32 %r21824, 18; + // begin inline asm + shf.l.wrap.b32 %r21817, %r21823, %r21822, %r21824; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21821, %r21822, %r21823, %r21824; + // end inline asm + mov.u32 %r21832, 62; + // begin inline asm + shf.l.wrap.b32 %r21825, %r21831, %r21830, %r21832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21829, %r21830, %r21831, %r21832; + // end inline asm + mov.u32 %r21840, 43; + // begin inline asm + shf.l.wrap.b32 %r21833, %r21839, %r21838, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21837, %r21838, %r21839, %r21840; + // end inline asm + mov.u32 %r21848, 25; + // begin inline asm + shf.l.wrap.b32 %r21841, %r21847, %r21846, %r21848; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21845, %r21846, %r21847, %r21848; + // end inline asm + mov.u32 %r21856, 8; + // begin inline asm + shf.l.wrap.b32 %r21849, %r21855, %r21854, %r21856; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21853, %r21854, %r21855, %r21856; + // end inline asm + mov.u32 %r21864, 56; + // begin inline asm + shf.l.wrap.b32 %r21857, %r21863, %r21862, %r21864; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21861, %r21862, %r21863, %r21864; + // end inline asm + mov.u32 %r21872, 41; + // begin inline asm + shf.l.wrap.b32 %r21865, %r21871, %r21870, %r21872; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21869, %r21870, %r21871, %r21872; + // end inline asm + mov.u32 %r21880, 27; + // begin inline asm + shf.l.wrap.b32 %r21873, %r21879, %r21878, %r21880; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21877, %r21878, %r21879, %r21880; + // end inline asm + mov.u32 %r21888, 14; + // begin inline asm + shf.l.wrap.b32 %r21881, %r21887, %r21886, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21885, %r21886, %r21887, %r21888; + // end inline asm + mov.u32 %r21896, 2; + // begin inline asm + shf.l.wrap.b32 %r21889, %r21895, %r21894, %r21896; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21893, %r21894, %r21895, %r21896; + // end inline asm + mov.u32 %r21904, 55; + // begin inline asm + shf.l.wrap.b32 %r21897, %r21903, %r21902, %r21904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21901, %r21902, %r21903, %r21904; + // end inline asm + mov.u32 %r21912, 45; + // begin inline asm + shf.l.wrap.b32 %r21905, %r21911, %r21910, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21909, %r21910, %r21911, %r21912; + // end inline asm + mov.u32 %r21920, 36; + // begin inline asm + shf.l.wrap.b32 %r21913, %r21919, %r21918, %r21920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21917, %r21918, %r21919, %r21920; + // end inline asm + mov.u32 %r21928, 28; + // begin inline asm + shf.l.wrap.b32 %r21921, %r21927, %r21926, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21925, %r21926, %r21927, %r21928; + // end inline asm + mov.u32 %r21936, 21; + // begin inline asm + shf.l.wrap.b32 %r21929, %r21935, %r21934, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21933, %r21934, %r21935, %r21936; + // end inline asm + mov.u32 %r21944, 15; + // begin inline asm + shf.l.wrap.b32 %r21937, %r21943, %r21942, %r21944; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r21941, %r21942, %r21943, %r21944; + // end inline asm + mov.u32 %r21952, 10; + // begin inline asm + shf.l.wrap.b32 %r21945, %r21951, %r21950, %r21952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21949, %r21950, %r21951, %r21952; + // end inline asm + mov.u32 %r21960, 6; + // begin inline asm + shf.l.wrap.b32 %r21953, %r21959, %r21958, %r21960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21957, %r21958, %r21959, %r21960; + // end inline asm + mov.u32 %r21968, 3; + // begin inline asm + shf.l.wrap.b32 %r21961, %r21967, %r21966, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21965, %r21966, %r21967, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21969, %r21975, %r21974, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21973, %r21974, %r21975, %r21658; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21977, %r22012, %r21785, %r21833, 0xD2; + lop3.b32 %r21978, %r22015, %r21789, %r21837, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r21785, %r21833, %r21929, 0xD2; + lop3.b32 %r30671, %r21789, %r21837, %r21933, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30666, %r21833, %r21929, %r21881, 0xD2; + lop3.b32 %r30667, %r21837, %r21933, %r21885, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30662, %r21929, %r21881, %r22012, 0xD2; + lop3.b32 %r30663, %r21933, %r21885, %r22015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30660, %r21881, %r22012, %r21785, 0xD2; + lop3.b32 %r30661, %r21885, %r22015, %r21789, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30656, %r21921, %r21793, %r21961, 0xD2; + lop3.b32 %r30657, %r21925, %r21797, %r21965, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30668, %r21793, %r21961, %r21905, 0xD2; + lop3.b32 %r30669, %r21797, %r21965, %r21909, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30664, %r21961, %r21905, %r21801, 0xD2; + lop3.b32 %r30665, %r21965, %r21909, %r21805, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30636, %r21905, %r21801, %r21921, 0xD2; + lop3.b32 %r30637, %r21909, %r21805, %r21925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30636, %r30637}; + // begin inline asm + // chi + lop3.b32 %r30628, %r21801, %r21921, %r21793, 0xD2; + lop3.b32 %r30629, %r21805, %r21925, %r21797, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30628, %r30629}; + // begin inline asm + // chi + lop3.b32 %r30654, %r21969, %r21953, %r21841, 0xD2; + lop3.b32 %r30655, %r21973, %r21957, %r21845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30654, %r30655}; + // begin inline asm + // chi + lop3.b32 %r30648, %r21953, %r21841, %r21849, 0xD2; + lop3.b32 %r30649, %r21957, %r21845, %r21853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30648, %r30649}; + // begin inline asm + // chi + lop3.b32 %r30642, %r21841, %r21849, %r21817, 0xD2; + lop3.b32 %r30643, %r21845, %r21853, %r21821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30642, %r30643}; + // begin inline asm + // chi + lop3.b32 %r30634, %r21849, %r21817, %r21969, 0xD2; + lop3.b32 %r30635, %r21853, %r21821, %r21973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30634, %r30635}; + // begin inline asm + // chi + lop3.b32 %r30626, %r21817, %r21969, %r21953, 0xD2; + lop3.b32 %r30627, %r21821, 
%r21973, %r21957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30626, %r30627}; + // begin inline asm + // chi + lop3.b32 %r30652, %r21873, %r21913, %r21945, 0xD2; + lop3.b32 %r30653, %r21877, %r21917, %r21949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30652, %r30653}; + // begin inline asm + // chi + lop3.b32 %r30646, %r21913, %r21945, %r21937, 0xD2; + lop3.b32 %r30647, %r21917, %r21949, %r21941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30646, %r30647}; + // begin inline asm + // chi + lop3.b32 %r30640, %r21945, %r21937, %r21857, 0xD2; + lop3.b32 %r30641, %r21949, %r21941, %r21861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30640, %r30641}; + // begin inline asm + // chi + lop3.b32 %r30632, %r21937, %r21857, %r21873, 0xD2; + lop3.b32 %r30633, %r21941, %r21861, %r21877, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30632, %r30633}; + // begin inline asm + // chi + lop3.b32 %r30624, %r21857, %r21873, %r21913, 0xD2; + lop3.b32 %r30625, %r21861, %r21877, %r21917, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30624, %r30625}; + // begin inline asm + // chi + lop3.b32 %r30650, %r21825, %r21897, %r21809, 0xD2; + lop3.b32 %r30651, %r21829, %r21901, %r21813, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30650, %r30651}; + // begin inline asm + // chi + lop3.b32 %r30644, %r21897, %r21809, %r21865, 0xD2; + lop3.b32 %r30645, %r21901, %r21813, %r21869, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30644, %r30645}; + // begin inline asm + // chi + lop3.b32 %r30638, %r21809, %r21865, %r21889, 0xD2; + lop3.b32 %r30639, %r21813, %r21869, %r21893, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30638, %r30639}; + // begin inline asm + // chi + lop3.b32 %r30630, %r21865, %r21889, %r21825, 0xD2; + lop3.b32 %r30631, %r21869, %r21893, %r21829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30630, %r30631}; + // begin inline asm + // chi + lop3.b32 %r30622, %r21889, %r21825, %r21897, 0xD2; + lop3.b32 %r30623, %r21893, %r21829, %r21901, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30622, %r30623}; + mul.wide.s32 %rd991, %r30672, 8; + add.s64 %rd990, %rd918, %rd991; + // begin inline asm + ld.global.nc.v2.u32 {%r22177,%r22178}, [%rd990]; + // end inline asm + xor.b32 %r30658, %r21977, %r22177; + xor.b32 %r30659, %r21978, %r22178; + add.s32 %r30672, %r30672, 1; + setp.lt.u32 %p45, %r30672, 23; + @%p45 bra $L__BB2_79; + + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + // begin inline asm + // xor5 + lop3.b32 %r22189, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r22189, %r22189, %r30652, %r30650, 0x96; + lop3.b32 %r22190, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r22190, %r22190, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22201, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r22201, %r22201, %r30646, %r30644, 0x96; + lop3.b32 %r22202, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r22202, %r22202, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22213, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r22213, %r22213, %r30640, %r30638, 0x96; + lop3.b32 %r22214, %r30667, %r30665, %r30643, 0x96; + 
lop3.b32 %r22214, %r22214, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22225, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r22225, %r22225, %r30632, %r30630, 0x96; + lop3.b32 %r22226, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r22226, %r22226, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22237, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r22237, %r22237, %r30624, %r30622, 0x96; + lop3.b32 %r22238, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r22238, %r22238, %r30625, %r30623, 0x96; + // end inline asm + mov.u32 %r22441, 1; + // begin inline asm + shf.l.wrap.b32 %r22249, %r22202, %r22201, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22253, %r22201, %r22202, %r22441; + // end inline asm + xor.b32 %r22468, %r22249, %r22237; + xor.b32 %r22469, %r22253, %r22238; + xor.b32 %r22396, %r30658, %r22468; + xor.b32 %r22399, %r30659, %r22469; + xor.b32 %r22359, %r30655, %r22469; + xor.b32 %r22358, %r30654, %r22468; + st.local.v2.u32 [%rd3+104], {%r22358, %r22359}; + // begin inline asm + shf.l.wrap.b32 %r22257, %r22214, %r22213, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22261, %r22213, %r22214, %r22441; + // end inline asm + xor.b32 %r22470, %r22257, %r22189; + xor.b32 %r22471, %r22261, %r22190; + xor.b32 %r22295, %r30668, %r22470; + xor.b32 %r22294, %r30669, %r22471; + xor.b32 %r22334, %r30647, %r22471; + xor.b32 %r22335, %r30646, %r22470; + st.local.v2.u32 [%rd3+152], {%r22335, %r22334}; + // begin inline asm + shf.l.wrap.b32 %r22265, %r22226, %r22225, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22269, %r22225, %r22226, %r22441; + // end inline asm + xor.b32 %r22472, %r22265, %r22201; + xor.b32 %r22473, %r22269, %r22202; + xor.b32 %r22318, %r30643, %r22473; + xor.b32 %r22319, %r30642, %r22472; + st.local.v2.u32 [%rd3+120], {%r22319, %r22318}; + xor.b32 %r22310, %r30639, %r22473; + xor.b32 %r22311, %r30638, %r22472; + st.local.v2.u32 [%rd3+200], {%r22311, %r22310}; + // begin inline asm + shf.l.wrap.b32 %r22273, %r22238, %r22237, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22277, %r22237, %r22238, %r22441; + // end inline asm + xor.b32 %r22474, %r22273, %r22213; + xor.b32 %r22475, %r22277, %r22214; + xor.b32 %r22342, %r30662, %r22474; + xor.b32 %r22343, %r30663, %r22475; + xor.b32 %r22351, %r30633, %r22475; + xor.b32 %r22350, %r30632, %r22474; + st.local.v2.u32 [%rd3+168], {%r22350, %r22351}; + // begin inline asm + shf.l.wrap.b32 %r22281, %r22190, %r22189, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22285, %r22189, %r22190, %r22441; + // end inline asm + xor.b32 %r22476, %r22281, %r22225; + xor.b32 %r22477, %r22285, %r22226; + xor.b32 %r22302, %r30628, %r22476; + xor.b32 %r22303, %r30629, %r22477; + xor.b32 %r22327, %r30623, %r22477; + xor.b32 %r22326, %r30622, %r22476; + st.local.v2.u32 [%rd3+216], {%r22326, %r22327}; + // begin inline asm + shf.l.wrap.b32 %r22289, %r22295, %r22294, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22293, %r22294, %r22295, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22297, %r22303, %r22302, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22301, %r22302, %r22303, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22309, %r22310, %r22311, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22305, %r22311, %r22310, %r21808; + // end inline asm + 
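// The $L__BB2_79 loop above runs rounds 0-22 of Keccak-f[1600] and this straight-line block is the unrolled round 23: lop3 with immediate 0x96 is a three-way XOR (theta column parity), paired shf.l.wrap.b32 funnel shifts form the 64-bit rho rotations on the split 32-bit lane halves, lop3 with 0xD2 computes a ^ (~b & c), i.e. chi, and the ld.global.nc round-constant loads supply iota. +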
st.local.v2.u32 [%rd3+96], {%r22305, %r22309}; + // begin inline asm + shf.l.wrap.b32 %r22313, %r22319, %r22318, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22317, %r22318, %r22319, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22321, %r22327, %r22326, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22325, %r22326, %r22327, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22333, %r22334, %r22335, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22329, %r22335, %r22334, %r21912; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r22329, %r22333}; + // begin inline asm + shf.l.wrap.b32 %r22337, %r22343, %r22342, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22341, %r22342, %r22343, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22345, %r22351, %r22350, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22349, %r22350, %r22351, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22353, %r22359, %r22358, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22357, %r22358, %r22359, %r21968; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22361, %r22396, %r22289, %r22313, 0xD2; + lop3.b32 %r22362, %r22399, %r22293, %r22317, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22369, %r22289, %r22313, %r22345, 0xD2; + lop3.b32 %r22370, %r22293, %r22317, %r22349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r22369, %r22370}; + // begin inline asm + // chi + lop3.b32 %r22377, %r22313, %r22345, %r22321, 0xD2; + lop3.b32 %r22378, %r22317, %r22349, %r22325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r22377, %r22378}; + // begin inline asm + // chi + lop3.b32 %r22385, %r22345, %r22321, %r22396, 0xD2; + lop3.b32 %r22386, %r22349, %r22325, %r22399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r22385, %r22386}; + // begin inline asm + // chi + lop3.b32 %r22393, %r22321, %r22396, %r22289, 0xD2; + lop3.b32 %r22394, %r22325, %r22399, %r22293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r22393, %r22394}; + // begin inline asm + // chi + lop3.b32 %r22401, %r22337, %r22297, %r22353, 0xD2; + lop3.b32 %r22402, %r22341, %r22301, %r22357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r22401, %r22402}; + // begin inline asm + // chi + lop3.b32 %r22409, %r22297, %r22353, %r22329, 0xD2; + lop3.b32 %r22410, %r22301, %r22357, %r22333, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r22409, %r22410}; + // begin inline asm + // chi + lop3.b32 %r22417, %r22353, %r22329, %r22305, 0xD2; + lop3.b32 %r22418, %r22357, %r22333, %r22309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r22417, %r22418}; + // begin inline asm + ld.global.nc.v2.u32 {%r22425,%r22426}, [%rd919]; + // end inline asm + xor.b32 %r22478, %r22362, %r22426; + xor.b32 %r22479, %r22361, %r22425; + mov.b64 %rd1269, {%r22479, %r22478}; + mov.b64 %rd1270, {%r22369, %r22370}; + mov.b64 %rd1271, {%r22377, %r22378}; + mov.b64 %rd250, {%r22385, %r22386}; + mov.b64 %rd1272, {%r22393, %r22394}; + mov.b64 %rd252, {%r22401, %r22402}; + mov.b64 %rd253, {%r22409, %r22410}; + mov.b64 %rd254, {%r22417, %r22418}; + mov.u32 %r30673, 0; + st.local.v2.u32 [%rd3+24], {%r22479, %r22478}; + st.local.v2.u32 [%rd925+96], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+104], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+112], {%r30673, %r30673}; + st.local.v2.u32 
[%rd925+120], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+128], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+136], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+144], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+152], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+160], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+168], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+176], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+184], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+192], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+200], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+208], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+216], {%r30673, %r30673}; + mov.u32 %r30688, -2147483648; + st.local.v2.u32 [%rd925+88], {%r22441, %r30688}; + mov.u32 %r30674, %r30673; + mov.u32 %r30675, %r30673; + mov.u32 %r30676, %r30673; + mov.u32 %r30677, %r30673; + mov.u32 %r30678, %r30673; + mov.u32 %r30679, %r30673; + mov.u32 %r30680, %r30673; + mov.u32 %r30681, %r30673; + mov.u32 %r30682, %r30673; + mov.u32 %r30683, %r30673; + mov.u32 %r30684, %r30673; + mov.u32 %r30685, %r30673; + mov.u32 %r30686, %r30673; + mov.u32 %r30687, %r22441; + mov.u32 %r30689, %r30673; + mov.u32 %r30690, %r30673; + mov.u32 %r30691, %r30673; + mov.u32 %r30692, %r30673; + mov.u32 %r30693, %r30673; + mov.u32 %r30694, %r30673; + mov.u32 %r30695, %r30673; + mov.u32 %r30696, %r30673; + mov.u32 %r30697, %r30673; + mov.u32 %r30698, %r30673; + mov.u32 %r30699, %r30673; + mov.u32 %r30700, %r30673; + mov.u32 %r30701, %r30673; + mov.u32 %r30702, %r30673; + mov.u32 %r30703, %r30673; + mov.u32 %r30704, %r30673; + mov.u32 %r30705, %r30673; + mov.u32 %r30706, %r30673; + mov.u32 %r30723, %r30673; + +$L__BB2_81: + // begin inline asm + // xor5 + lop3.b32 %r22480, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22480, %r22480, %r30703, %r30701, 0x96; + lop3.b32 %r22481, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22481, %r22481, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22492, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22492, %r22492, %r30697, %r30695, 0x96; + lop3.b32 %r22493, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22493, %r22493, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22504, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r22504, %r22504, %r30691, %r30689, 0x96; + lop3.b32 %r22505, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r22505, %r22505, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22516, %r30713, %r30687, %r30685, 0x96; + lop3.b32 %r22516, %r22516, %r30683, %r30681, 0x96; + lop3.b32 %r22517, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r22517, %r22517, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22528, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r22528, %r22528, %r30675, %r30673, 0x96; + lop3.b32 %r22529, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r22529, %r22529, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22540, %r22493, %r22492, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22544, %r22492, %r22493, %r22441; + // end inline asm + xor.b32 %r22974, %r22540, %r22528; + xor.b32 %r22975, %r22544, %r22529; + xor.b32 %r22807, %r30709, %r22974; + xor.b32 %r22810, %r30710, %r22975; + xor.b32 %r22714, %r30707, %r22974; + xor.b32 %r22713, %r30708, %r22975; + xor.b32 %r22761, %r30705, %r22974; + xor.b32 %r22762, %r30706, %r22975; + xor.b32 %r22666, %r30703, %r22974; + xor.b32 %r22665, %r30704, 
%r22975; + xor.b32 %r22617, %r30701, %r22974; + xor.b32 %r22618, %r30702, %r22975; + // begin inline asm + shf.l.wrap.b32 %r22548, %r22505, %r22504, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22552, %r22504, %r22505, %r22441; + // end inline asm + xor.b32 %r22976, %r22548, %r22480; + xor.b32 %r22977, %r22552, %r22481; + xor.b32 %r22769, %r30721, %r22976; + xor.b32 %r22770, %r30722, %r22977; + xor.b32 %r22586, %r30719, %r22976; + xor.b32 %r22585, %r30720, %r22977; + xor.b32 %r22745, %r30699, %r22976; + xor.b32 %r22746, %r30700, %r22977; + xor.b32 %r22706, %r30697, %r22976; + xor.b32 %r22705, %r30698, %r22977; + xor.b32 %r22689, %r30695, %r22976; + xor.b32 %r22690, %r30696, %r22977; + // begin inline asm + shf.l.wrap.b32 %r22556, %r22517, %r22516, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22560, %r22516, %r22517, %r22441; + // end inline asm + xor.b32 %r22978, %r22556, %r22492; + xor.b32 %r22979, %r22560, %r22493; + xor.b32 %r22626, %r30717, %r22978; + xor.b32 %r22625, %r30718, %r22979; + xor.b32 %r22753, %r30715, %r22978; + xor.b32 %r22754, %r30716, %r22979; + xor.b32 %r22634, %r30693, %r22978; + xor.b32 %r22633, %r30694, %r22979; + xor.b32 %r22737, %r30691, %r22978; + xor.b32 %r22738, %r30692, %r22979; + xor.b32 %r22602, %r30689, %r22978; + xor.b32 %r22601, %r30690, %r22979; + // begin inline asm + shf.l.wrap.b32 %r22564, %r22529, %r22528, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22568, %r22528, %r22529, %r22441; + // end inline asm + xor.b32 %r22980, %r22564, %r22504; + xor.b32 %r22981, %r22568, %r22505; + xor.b32 %r22721, %r30713, %r22980; + xor.b32 %r22722, %r30714, %r22981; + xor.b32 %r22698, %r30687, %r22980; + xor.b32 %r22697, %r30688, %r22981; + xor.b32 %r22641, %r30685, %r22980; + xor.b32 %r22642, %r30686, %r22981; + xor.b32 %r22729, %r30683, %r22980; + xor.b32 %r22730, %r30684, %r22981; + xor.b32 %r22658, %r30681, %r22980; + xor.b32 %r22657, %r30682, %r22981; + // begin inline asm + shf.l.wrap.b32 %r22572, %r22481, %r22480, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22576, %r22480, %r22481, %r22441; + // end inline asm + xor.b32 %r22982, %r22572, %r22516; + xor.b32 %r22983, %r22576, %r22517; + xor.b32 %r22673, %r30711, %r22982; + xor.b32 %r22674, %r30712, %r22983; + xor.b32 %r22593, %r30679, %r22982; + xor.b32 %r22594, %r30680, %r22983; + xor.b32 %r22610, %r30677, %r22982; + xor.b32 %r22609, %r30678, %r22983; + xor.b32 %r22649, %r30675, %r22982; + xor.b32 %r22650, %r30676, %r22983; + xor.b32 %r22681, %r30673, %r22982; + xor.b32 %r22682, %r30674, %r22983; + mov.u32 %r22587, 44; + // begin inline asm + shf.l.wrap.b32 %r22580, %r22586, %r22585, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22584, %r22585, %r22586, %r22587; + // end inline asm + mov.u32 %r22595, 20; + // begin inline asm + shf.l.wrap.b32 %r22588, %r22594, %r22593, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22592, %r22593, %r22594, %r22595; + // end inline asm + mov.u32 %r22603, 61; + // begin inline asm + shf.l.wrap.b32 %r22596, %r22602, %r22601, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22600, %r22601, %r22602, %r22603; + // end inline asm + mov.u32 %r22611, 39; + // begin inline asm + shf.l.wrap.b32 %r22604, %r22610, %r22609, %r22611; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22608, %r22609, %r22610, %r22611; + // end inline asm + mov.u32 %r22619, 18; + // begin inline asm + shf.l.wrap.b32 %r22612, %r22618, 
%r22617, %r22619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22616, %r22617, %r22618, %r22619; + // end inline asm + mov.u32 %r22627, 62; + // begin inline asm + shf.l.wrap.b32 %r22620, %r22626, %r22625, %r22627; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22624, %r22625, %r22626, %r22627; + // end inline asm + mov.u32 %r22635, 43; + // begin inline asm + shf.l.wrap.b32 %r22628, %r22634, %r22633, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22632, %r22633, %r22634, %r22635; + // end inline asm + mov.u32 %r22643, 25; + // begin inline asm + shf.l.wrap.b32 %r22636, %r22642, %r22641, %r22643; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22640, %r22641, %r22642, %r22643; + // end inline asm + mov.u32 %r22651, 8; + // begin inline asm + shf.l.wrap.b32 %r22644, %r22650, %r22649, %r22651; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22648, %r22649, %r22650, %r22651; + // end inline asm + mov.u32 %r22659, 56; + // begin inline asm + shf.l.wrap.b32 %r22652, %r22658, %r22657, %r22659; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22656, %r22657, %r22658, %r22659; + // end inline asm + mov.u32 %r22667, 41; + // begin inline asm + shf.l.wrap.b32 %r22660, %r22666, %r22665, %r22667; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22664, %r22665, %r22666, %r22667; + // end inline asm + mov.u32 %r22675, 27; + // begin inline asm + shf.l.wrap.b32 %r22668, %r22674, %r22673, %r22675; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22672, %r22673, %r22674, %r22675; + // end inline asm + mov.u32 %r22683, 14; + // begin inline asm + shf.l.wrap.b32 %r22676, %r22682, %r22681, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22680, %r22681, %r22682, %r22683; + // end inline asm + mov.u32 %r22691, 2; + // begin inline asm + shf.l.wrap.b32 %r22684, %r22690, %r22689, %r22691; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22688, %r22689, %r22690, %r22691; + // end inline asm + mov.u32 %r22699, 55; + // begin inline asm + shf.l.wrap.b32 %r22692, %r22698, %r22697, %r22699; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22696, %r22697, %r22698, %r22699; + // end inline asm + mov.u32 %r22707, 45; + // begin inline asm + shf.l.wrap.b32 %r22700, %r22706, %r22705, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22704, %r22705, %r22706, %r22707; + // end inline asm + mov.u32 %r22715, 36; + // begin inline asm + shf.l.wrap.b32 %r22708, %r22714, %r22713, %r22715; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22712, %r22713, %r22714, %r22715; + // end inline asm + mov.u32 %r22723, 28; + // begin inline asm + shf.l.wrap.b32 %r22716, %r22722, %r22721, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22720, %r22721, %r22722, %r22723; + // end inline asm + mov.u32 %r22731, 21; + // begin inline asm + shf.l.wrap.b32 %r22724, %r22730, %r22729, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22728, %r22729, %r22730, %r22731; + // end inline asm + mov.u32 %r22739, 15; + // begin inline asm + shf.l.wrap.b32 %r22732, %r22738, %r22737, %r22739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22736, %r22737, %r22738, %r22739; + // end inline asm + mov.u32 %r22747, 10; + // begin inline asm + shf.l.wrap.b32 %r22740, %r22746, %r22745, %r22747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22744, %r22745, %r22746, %r22747; + // end inline asm + mov.u32 
%r22755, 6; + // begin inline asm + shf.l.wrap.b32 %r22748, %r22754, %r22753, %r22755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22752, %r22753, %r22754, %r22755; + // end inline asm + mov.u32 %r22763, 3; + // begin inline asm + shf.l.wrap.b32 %r22756, %r22762, %r22761, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22760, %r22761, %r22762, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22764, %r22770, %r22769, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22768, %r22769, %r22770, %r22441; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22772, %r22807, %r22580, %r22628, 0xD2; + lop3.b32 %r22773, %r22810, %r22584, %r22632, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r22580, %r22628, %r22724, 0xD2; + lop3.b32 %r30722, %r22584, %r22632, %r22728, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30717, %r22628, %r22724, %r22676, 0xD2; + lop3.b32 %r30718, %r22632, %r22728, %r22680, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30713, %r22724, %r22676, %r22807, 0xD2; + lop3.b32 %r30714, %r22728, %r22680, %r22810, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30711, %r22676, %r22807, %r22580, 0xD2; + lop3.b32 %r30712, %r22680, %r22810, %r22584, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30707, %r22716, %r22588, %r22756, 0xD2; + lop3.b32 %r30708, %r22720, %r22592, %r22760, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30719, %r22588, %r22756, %r22700, 0xD2; + lop3.b32 %r30720, %r22592, %r22760, %r22704, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30715, %r22756, %r22700, %r22596, 0xD2; + lop3.b32 %r30716, %r22760, %r22704, %r22600, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30687, %r22700, %r22596, %r22716, 0xD2; + lop3.b32 %r30688, %r22704, %r22600, %r22720, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30687, %r30688}; + // begin inline asm + // chi + lop3.b32 %r30679, %r22596, %r22716, %r22588, 0xD2; + lop3.b32 %r30680, %r22600, %r22720, %r22592, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30679, %r30680}; + // begin inline asm + // chi + lop3.b32 %r30705, %r22764, %r22748, %r22636, 0xD2; + lop3.b32 %r30706, %r22768, %r22752, %r22640, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30705, %r30706}; + // begin inline asm + // chi + lop3.b32 %r30699, %r22748, %r22636, %r22644, 0xD2; + lop3.b32 %r30700, %r22752, %r22640, %r22648, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+112], {%r30699, %r30700}; + // begin inline asm + // chi + lop3.b32 %r30693, %r22636, %r22644, %r22612, 0xD2; + lop3.b32 %r30694, %r22640, %r22648, %r22616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30693, %r30694}; + // begin inline asm + // chi + lop3.b32 %r30685, %r22644, %r22612, %r22764, 0xD2; + lop3.b32 %r30686, %r22648, %r22616, %r22768, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30685, %r30686}; + // begin inline asm + // chi + lop3.b32 %r30677, %r22612, %r22764, %r22748, 0xD2; + lop3.b32 %r30678, %r22616, %r22768, %r22752, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30677, %r30678}; + // begin inline asm + // chi + lop3.b32 %r30703, %r22668, %r22708, %r22740, 0xD2; + lop3.b32 %r30704, %r22672, %r22712, %r22744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30703, %r30704}; + // begin inline asm + 
// chi + lop3.b32 %r30697, %r22708, %r22740, %r22732, 0xD2; + lop3.b32 %r30698, %r22712, %r22744, %r22736, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30697, %r30698}; + // begin inline asm + // chi + lop3.b32 %r30691, %r22740, %r22732, %r22652, 0xD2; + lop3.b32 %r30692, %r22744, %r22736, %r22656, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30691, %r30692}; + // begin inline asm + // chi + lop3.b32 %r30683, %r22732, %r22652, %r22668, 0xD2; + lop3.b32 %r30684, %r22736, %r22656, %r22672, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30683, %r30684}; + // begin inline asm + // chi + lop3.b32 %r30675, %r22652, %r22668, %r22708, 0xD2; + lop3.b32 %r30676, %r22656, %r22672, %r22712, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30675, %r30676}; + // begin inline asm + // chi + lop3.b32 %r30701, %r22620, %r22692, %r22604, 0xD2; + lop3.b32 %r30702, %r22624, %r22696, %r22608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30701, %r30702}; + // begin inline asm + // chi + lop3.b32 %r30695, %r22692, %r22604, %r22660, 0xD2; + lop3.b32 %r30696, %r22696, %r22608, %r22664, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30695, %r30696}; + // begin inline asm + // chi + lop3.b32 %r30689, %r22604, %r22660, %r22684, 0xD2; + lop3.b32 %r30690, %r22608, %r22664, %r22688, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30689, %r30690}; + // begin inline asm + // chi + lop3.b32 %r30681, %r22660, %r22684, %r22620, 0xD2; + lop3.b32 %r30682, %r22664, %r22688, %r22624, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30681, %r30682}; + // begin inline asm + // chi + lop3.b32 %r30673, %r22684, %r22620, %r22692, 0xD2; + lop3.b32 %r30674, %r22688, %r22624, %r22696, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30673, %r30674}; + mul.wide.s32 %rd1002, %r30723, 8; + add.s64 %rd1001, %rd918, %rd1002; + // begin inline asm + ld.global.nc.v2.u32 {%r22972,%r22973}, [%rd1001]; + // end inline asm + xor.b32 %r30709, %r22772, %r22972; + xor.b32 %r30710, %r22773, %r22973; + add.s32 %r30723, %r30723, 1; + setp.lt.u32 %p46, %r30723, 23; + @%p46 bra $L__BB2_81; -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd479, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; + mov.u32 %r23083, 1; + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + // begin inline asm + // xor5 + lop3.b32 %r22984, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22984, %r22984, %r30703, %r30701, 0x96; + lop3.b32 %r22985, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22985, %r22985, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22996, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22996, %r22996, %r30697, %r30695, 0x96; + lop3.b32 %r22997, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22997, %r22997, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23008, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r23008, %r23008, %r30691, %r30689, 0x96; + lop3.b32 %r23009, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r23009, %r23009, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23020, %r30713, %r30687, 
%r30685, 0x96; + lop3.b32 %r23020, %r23020, %r30683, %r30681, 0x96; + lop3.b32 %r23021, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r23021, %r23021, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23032, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r23032, %r23032, %r30675, %r30673, 0x96; + lop3.b32 %r23033, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r23033, %r23033, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23044, %r22997, %r22996, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23048, %r22996, %r22997, %r23083; + // end inline asm + xor.b32 %r23222, %r23044, %r23032; + xor.b32 %r23223, %r23048, %r23033; + xor.b32 %r23191, %r30709, %r23222; + xor.b32 %r23194, %r30710, %r23223; + xor.b32 %r23154, %r30706, %r23223; + xor.b32 %r23153, %r30705, %r23222; + st.local.v2.u32 [%rd925+104], {%r23153, %r23154}; + // begin inline asm + shf.l.wrap.b32 %r23052, %r23009, %r23008, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23056, %r23008, %r23009, %r23083; + // end inline asm + xor.b32 %r23224, %r23052, %r22984; + xor.b32 %r23225, %r23056, %r22985; + xor.b32 %r23090, %r30719, %r23224; + xor.b32 %r23089, %r30720, %r23225; + xor.b32 %r23129, %r30698, %r23225; + xor.b32 %r23130, %r30697, %r23224; + st.local.v2.u32 [%rd925+152], {%r23130, %r23129}; + // begin inline asm + shf.l.wrap.b32 %r23060, %r23021, %r23020, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23064, %r23020, %r23021, %r23083; + // end inline asm + xor.b32 %r23226, %r23060, %r22996; + xor.b32 %r23227, %r23064, %r22997; + xor.b32 %r23113, %r30694, %r23227; + xor.b32 %r23114, %r30693, %r23226; + st.local.v2.u32 [%rd925+120], {%r23114, %r23113}; + xor.b32 %r23105, %r30690, %r23227; + xor.b32 %r23106, %r30689, %r23226; + st.local.v2.u32 [%rd925+200], {%r23106, %r23105}; + // begin inline asm + shf.l.wrap.b32 %r23068, %r23033, %r23032, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23072, %r23032, %r23033, %r23083; + // end inline asm + xor.b32 %r23228, %r23068, %r23008; + xor.b32 %r23229, %r23072, %r23009; + xor.b32 %r23137, %r30713, %r23228; + xor.b32 %r23138, %r30714, %r23229; + xor.b32 %r23146, %r30684, %r23229; + xor.b32 %r23145, %r30683, %r23228; + st.local.v2.u32 [%rd925+168], {%r23145, %r23146}; + // begin inline asm + shf.l.wrap.b32 %r23076, %r22985, %r22984, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23080, %r22984, %r22985, %r23083; + // end inline asm + xor.b32 %r23230, %r23076, %r23020; + xor.b32 %r23231, %r23080, %r23021; + xor.b32 %r23097, %r30679, %r23230; + xor.b32 %r23098, %r30680, %r23231; + xor.b32 %r23122, %r30674, %r23231; + xor.b32 %r23121, %r30673, %r23230; + st.local.v2.u32 [%rd925+216], {%r23121, %r23122}; + // begin inline asm + shf.l.wrap.b32 %r23084, %r23090, %r23089, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23088, %r23089, %r23090, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23092, %r23098, %r23097, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23096, %r23097, %r23098, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23104, %r23105, %r23106, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23100, %r23106, %r23105, %r22603; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r23100, %r23104}; + // begin inline asm + shf.l.wrap.b32 %r23108, %r23114, %r23113, %r22635; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r23112, %r23113, %r23114, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23116, %r23122, %r23121, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23120, %r23121, %r23122, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23128, %r23129, %r23130, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23124, %r23130, %r23129, %r22707; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r23124, %r23128}; + // begin inline asm + shf.l.wrap.b32 %r23132, %r23138, %r23137, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23136, %r23137, %r23138, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23140, %r23146, %r23145, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23144, %r23145, %r23146, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23148, %r23154, %r23153, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23152, %r23153, %r23154, %r22763; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23156, %r23191, %r23084, %r23108, 0xD2; + lop3.b32 %r23157, %r23194, %r23088, %r23112, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23164, %r23084, %r23108, %r23140, 0xD2; + lop3.b32 %r23165, %r23088, %r23112, %r23144, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r23164, %r23165}; + // begin inline asm + // chi + lop3.b32 %r23172, %r23108, %r23140, %r23116, 0xD2; + lop3.b32 %r23173, %r23112, %r23144, %r23120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r23172, %r23173}; + // begin inline asm + // chi + lop3.b32 %r23180, %r23140, %r23116, %r23191, 0xD2; + lop3.b32 %r23181, %r23144, %r23120, %r23194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r23180, %r23181}; + // begin inline asm + // chi + lop3.b32 %r23188, %r23116, %r23191, %r23084, 0xD2; + lop3.b32 %r23189, %r23120, %r23194, %r23088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r23188, %r23189}; + // begin inline asm + // chi + lop3.b32 %r23196, %r23132, %r23092, %r23148, 0xD2; + lop3.b32 %r23197, %r23136, %r23096, %r23152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r23196, %r23197}; + // begin inline asm + // chi + lop3.b32 %r23204, %r23092, %r23148, %r23124, 0xD2; + lop3.b32 %r23205, %r23096, %r23152, %r23128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r23204, %r23205}; + // begin inline asm + // chi + lop3.b32 %r23212, %r23148, %r23124, %r23100, 0xD2; + lop3.b32 %r23213, %r23152, %r23128, %r23104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r23212, %r23213}; + // begin inline asm + ld.global.nc.v2.u32 {%r23220,%r23221}, [%rd919]; + // end inline asm + xor.b32 %r23232, %r23157, %r23221; + xor.b32 %r23233, %r23156, %r23220; + st.local.v2.u32 [%rd925+24], {%r23233, %r23232}; + st.global.u64 [%rd224], %rd1269; + st.global.u64 [%rd224+8], %rd1270; + st.global.u64 [%rd224+16], %rd1271; + st.global.u64 [%rd224+24], %rd250; + st.global.u64 [%rd224+32], %rd1272; + st.global.u64 [%rd224+40], %rd252; + st.global.u64 [%rd224+48], %rd253; + st.global.u64 [%rd224+56], %rd254; + st.global.v2.u32 [%rd224+64], {%r23233, %r23232}; + st.global.v2.u32 [%rd224+72], {%r23164, %r23165}; + st.global.v2.u32 [%rd224+80], {%r23172, %r23173}; + st.global.v2.u32 [%rd224+88], {%r23180, %r23181}; + st.global.v2.u32 [%rd224+96], {%r23188, %r23189}; + st.global.v2.u32 [%rd224+104], {%r23196, %r23197}; + st.global.v2.u32 
[%rd224+112], {%r23204, %r23205}; + st.global.v2.u32 [%rd224+120], {%r23212, %r23213}; -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd484, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; +$L__BB2_94: + mul.lo.s32 %r26518, %r20, 16777619; + mov.b64 {%r26519, %r26520}, %rd1265; + mul.lo.s32 %r26521, %r19, 16777619; + xor.b32 %r26522, %r26518, %r26519; + xor.b32 %r26523, %r26521, %r26520; + mov.b64 %rd1116, {%r26522, %r26523}; + mov.b64 {%r26524, %r26525}, %rd1269; + xor.b32 %r26526, %r26525, %r19; + xor.b32 %r26527, %r26524, %r20; + mov.b64 %rd1117, {%r26527, %r26526}; + mov.b64 {%r26528, %r26529}, %rd1259; + mul.lo.s32 %r26530, %r26528, 16777619; + mov.b64 {%r26531, %r26532}, %rd1266; + mul.lo.s32 %r26533, %r26529, 16777619; + xor.b32 %r26534, %r26533, %r26532; + xor.b32 %r26535, %r26530, %r26531; + mov.b64 %rd1118, {%r26535, %r26534}; + mov.b64 {%r26536, %r26537}, %rd1270; + xor.b32 %r26538, %r26537, %r26529; + xor.b32 %r26539, %r26536, %r26528; + mov.b64 %rd1119, {%r26539, %r26538}; + mul.lo.s32 %r26540, %r24, 16777619; + mov.b64 {%r26541, %r26542}, %rd1267; + mul.lo.s32 %r26543, %r23, 16777619; + xor.b32 %r26544, %r26543, %r26542; + xor.b32 %r26545, %r26540, %r26541; + mov.b64 %rd1120, {%r26545, %r26544}; + mov.b64 {%r26546, %r26547}, %rd1271; + xor.b32 %r26548, %r26547, %r23; + xor.b32 %r26549, %r26546, %r24; + mov.b64 %rd1121, {%r26549, %r26548}; + mul.lo.s32 %r26550, %r28, 16777619; + mov.b64 {%r26551, %r26552}, %rd1268; + mul.lo.s32 %r26553, %r27, 16777619; + xor.b32 %r26554, %r26553, %r26552; + xor.b32 %r26555, %r26550, %r26551; + mov.b64 %rd1122, {%r26555, %r26554}; + mov.b64 {%r26556, %r26557}, %rd1272; + xor.b32 %r26558, %r26557, %r27; + xor.b32 %r26559, %r26556, %r28; + mov.b64 %rd1123, {%r26559, %r26558}; + mul.lo.s64 %rd1124, %rd1261, %rd1116; + add.s64 %rd1260, %rd1124, %rd1117; + mul.lo.s64 %rd1125, %rd1262, %rd1118; + add.s64 %rd1259, %rd1125, %rd1119; + mul.lo.s64 %rd1126, %rd1263, %rd1120; + add.s64 %rd1258, %rd1126, %rd1121; + mul.lo.s64 %rd1127, %rd1264, %rd1122; + add.s64 %rd1257, %rd1127, %rd1123; + add.s32 %r29538, %r29538, 1; + setp.lt.u32 %p52, %r29538, 32; + @%p52 bra $L__BB2_19; -$L__BB0_16: - ld.const.u64 %rd454, [target]; - setp.lt.u64 %p16, %rd73, %rd454; - bra.uni $L__BB0_17; + add.u64 %rd1250, %SPL, 2016; + add.u64 %rd1242, %SP, 2016; + add.u64 %rd1241, %SP, 0; + mov.u64 %rd1128, 0; + mov.b64 {%r26560, %r26561}, %rd1260; + mul.lo.s32 %r26562, %r26560, 16777619; + xor.b32 %r26563, %r26562, %r26561; + mul.lo.s32 %r26564, %r26563, 16777619; + mov.b64 {%r26565, %r26566}, %rd1259; + xor.b32 %r26567, %r26564, %r26565; + mul.lo.s32 %r26568, %r26567, 16777619; + xor.b32 %r26569, %r26568, %r26566; + mov.b32 {%rs498, %rs499}, %r26569; + mov.u32 %r26570, 0; + st.local.v4.u32 [%rd1250+32], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+48], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+64], {%r26570, %r26570, %r26570, %r26570}; + cvt.u64.u16 %rd1131, %rs498; + and.b64 %rd1132, %rd1131, 255; + or.b64 %rd1133, %rd26, %rd1132; + st.local.v2.u64 [%rd1250], {%rd1133, %rd23}; + st.local.v2.u64 [%rd1250+16], {%rd24, %rd25}; + mov.u32 %r26571, -1150833019; + mov.u32 %r26572, 1779033703; + st.local.v2.u32 [%rd3], {%r26572, %r26571}; + mov.u32 %r26573, -1521486534; + mov.u32 %r26574, 1013904242; + st.local.v2.u32 [%rd3+8], {%r26574, %r26573}; + mov.u32 %r26575, -1694144372; + mov.u32 %r26576, 1359893119; + st.local.v2.u32 [%rd3+16], {%r26576, %r26575}; + mov.u32 %r26577, 1541459225; + mov.u32 %r26578, 528734635; + 
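// $L__BB2_94 above is the tail of a 32-pass mixing loop (back-edge to $L__BB2_19) that folds lanes together with the 32-bit FNV prime 16777619 (0x01000193); the eight mov.u32 constants just above are the BLAKE3 IV words 0x6A09E667..0x5BE0CD19 (shared with SHA-256), seeding the hasher state for the blake3_hasher_update call below. +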
st.local.v2.u32 [%rd3+24], {%r26578, %r26577}; + st.local.v2.u32 [%rd3+32], {%r26572, %r26571}; + st.local.v2.u32 [%rd3+40], {%r26574, %r26573}; + st.local.v2.u32 [%rd3+48], {%r26576, %r26575}; + st.local.v2.u32 [%rd3+56], {%r26578, %r26577}; + st.local.u64 [%rd3+64], %rd1128; + st.local.v2.u32 [%rd3+72], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+80], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+88], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+96], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+104], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+112], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+120], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+128], {%r26570, %r26570}; + mov.u16 %rs500, 0; + st.local.v2.u8 [%rd3+136], {%rs500, %rs500}; + st.local.u8 [%rd3+138], %rs500; + st.local.u8 [%rd3+144], %rs500; + { // callseq 13, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1241; + .param .b64 param1; + st.param.b64 [param1+0], %rd1242; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 13 + ld.local.u8 %rd1275, [%rd3+144]; + setp.eq.s64 %p53, %rd1275, 0; + @%p53 bra $L__BB2_103; -$L__BB0_11: - setp.lt.u64 %p16, %rd474, %rd75; - bra.uni $L__BB0_17; + ld.local.v2.u8 {%rs864, %rs502}, [%rd3+136]; + cvt.u32.u16 %r26579, %rs502; + mul.wide.u32 %rd1135, %r26579, 64; + cvt.u64.u16 %rd1136, %rs864; + neg.s64 %rd1137, %rd1136; + setp.eq.s64 %p54, %rd1135, %rd1137; + @%p54 bra $L__BB2_98; + bra.uni $L__BB2_97; -$L__BB0_13: - setp.lt.u64 %p16, %rd479, %rd76; - bra.uni $L__BB0_17; +$L__BB2_98: + add.s64 %rd1275, %rd1275, -2; + shl.b64 %rd1139, %rd1275, 5; + add.s64 %rd1142, %rd3, %rd1139; + ld.local.u8 %rs667, [%rd3+138]; + mov.u64 %rd1276, 0; + or.b16 %rs734, %rs667, 4; + ld.local.v2.u32 {%r30976, %r30975}, [%rd3]; + ld.local.v2.u32 {%r30974, %r30973}, [%rd3+8]; + ld.local.v2.u32 {%r30972, %r30971}, [%rd3+16]; + ld.local.v2.u32 {%r30970, %r30969}, [%rd3+24]; + ld.local.u8 %rs800, [%rd1142+145]; + ld.local.u8 %rs801, [%rd1142+146]; + ld.local.u8 %rs802, [%rd1142+147]; + ld.local.u8 %rs803, [%rd1142+148]; + ld.local.u8 %rs804, [%rd1142+149]; + ld.local.u8 %rs805, [%rd1142+150]; + ld.local.u8 %rs806, [%rd1142+151]; + ld.local.u8 %rs807, [%rd1142+152]; + ld.local.u8 %rs808, [%rd1142+153]; + ld.local.u8 %rs809, [%rd1142+154]; + ld.local.u8 %rs810, [%rd1142+155]; + ld.local.u8 %rs811, [%rd1142+156]; + ld.local.u8 %rs812, [%rd1142+157]; + ld.local.u8 %rs813, [%rd1142+158]; + ld.local.u8 %rs814, [%rd1142+159]; + ld.local.u8 %rs815, [%rd1142+160]; + ld.local.u8 %rs816, [%rd1142+161]; + ld.local.u8 %rs817, [%rd1142+162]; + ld.local.u8 %rs818, [%rd1142+163]; + ld.local.u8 %rs819, [%rd1142+164]; + ld.local.u8 %rs820, [%rd1142+165]; + ld.local.u8 %rs821, [%rd1142+166]; + ld.local.u8 %rs822, [%rd1142+167]; + ld.local.u8 %rs823, [%rd1142+168]; + ld.local.u8 %rs824, [%rd1142+169]; + ld.local.u8 %rs825, [%rd1142+170]; + ld.local.u8 %rs826, [%rd1142+171]; + ld.local.u8 %rs827, [%rd1142+172]; + ld.local.u8 %rs828, [%rd1142+173]; + ld.local.u8 %rs829, [%rd1142+174]; + ld.local.u8 %rs830, [%rd1142+175]; + ld.local.u8 %rs831, [%rd1142+176]; + ld.local.u8 %rs832, [%rd1142+177]; + ld.local.u8 %rs833, [%rd1142+178]; + ld.local.u8 %rs834, [%rd1142+179]; + ld.local.u8 %rs835, [%rd1142+180]; + ld.local.u8 %rs836, [%rd1142+181]; + ld.local.u8 %rs837, [%rd1142+182]; + ld.local.u8 %rs838, [%rd1142+183]; + ld.local.u8 %rs839, [%rd1142+184]; + ld.local.u8 %rs840, [%rd1142+185]; + ld.local.u8 %rs841, [%rd1142+186]; + ld.local.u8 %rs842, [%rd1142+187]; + 
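// $L__BB2_98 reloads a 64-byte block from the hasher's local state ([%rd1142+145] through [%rd1142+208]) one ld.local.u8 at a time, presumably because the buffer's alignment is not guaranteed; the run of byte loads continues below. +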
ld.local.u8 %rs843, [%rd1142+188]; + ld.local.u8 %rs844, [%rd1142+189]; + ld.local.u8 %rs845, [%rd1142+190]; + ld.local.u8 %rs846, [%rd1142+191]; + ld.local.u8 %rs847, [%rd1142+192]; + ld.local.u8 %rs848, [%rd1142+193]; + ld.local.u8 %rs849, [%rd1142+194]; + ld.local.u8 %rs850, [%rd1142+195]; + ld.local.u8 %rs851, [%rd1142+196]; + ld.local.u8 %rs852, [%rd1142+197]; + ld.local.u8 %rs853, [%rd1142+198]; + ld.local.u8 %rs854, [%rd1142+199]; + ld.local.v4.u16 {%rs855, %rs857, %rs859, %rs861}, [%rd1142+200]; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 %rs860, %rs859, 8; + shr.u16 %rs862, %rs861, 8; + ld.local.u8 %rs863, [%rd1142+208]; + mov.u16 %rs864, 64; + bra.uni $L__BB2_99; -$L__BB0_15: - setp.lt.u64 %p16, %rd484, %rd77; +$L__BB2_103: + ld.local.v4.u8 {%rs570, %rs571, %rs572, %rs573}, [%rd3+136]; + setp.eq.s16 %p58, %rs571, 0; + selp.u16 %rs575, 1, 0, %p58; + ld.local.v2.u32 {%r28596, %r28597}, [%rd3+32]; + ld.local.v2.u32 {%r28600, %r28601}, [%rd3+40]; + ld.local.v2.u32 {%r28604, %r28605}, [%rd3+48]; + ld.local.v2.u32 {%r28608, %r28609}, [%rd3+56]; + ld.local.v4.u16 {%rs576, %rs577, %rs578, %rs579}, [%rd3+72]; + shr.u16 %rs581, %rs576, 8; + shr.u16 %rs583, %rs577, 8; + shr.u16 %rs585, %rs578, 8; + shr.u16 %rs587, %rs579, 8; + ld.local.v4.u16 {%rs588, %rs589, %rs590, %rs591}, [%rd3+80]; + shr.u16 %rs593, %rs588, 8; + shr.u16 %rs595, %rs589, 8; + shr.u16 %rs597, %rs590, 8; + shr.u16 %rs599, %rs591, 8; + ld.local.v4.u16 {%rs600, %rs601, %rs602, %rs603}, [%rd3+88]; + shr.u16 %rs605, %rs600, 8; + shr.u16 %rs607, %rs601, 8; + shr.u16 %rs609, %rs602, 8; + shr.u16 %rs611, %rs603, 8; + ld.local.v4.u16 {%rs612, %rs613, %rs614, %rs615}, [%rd3+96]; + shr.u16 %rs617, %rs612, 8; + shr.u16 %rs619, %rs613, 8; + shr.u16 %rs621, %rs614, 8; + shr.u16 %rs623, %rs615, 8; + ld.local.v4.u16 {%rs624, %rs625, %rs626, %rs627}, [%rd3+104]; + shr.u16 %rs629, %rs624, 8; + shr.u16 %rs631, %rs625, 8; + shr.u16 %rs633, %rs626, 8; + shr.u16 %rs635, %rs627, 8; + ld.local.v4.u16 {%rs636, %rs637, %rs638, %rs639}, [%rd3+112]; + shr.u16 %rs641, %rs636, 8; + shr.u16 %rs643, %rs637, 8; + shr.u16 %rs645, %rs638, 8; + shr.u16 %rs647, %rs639, 8; + ld.local.v4.u16 {%rs648, %rs649, %rs650, %rs651}, [%rd3+120]; + shr.u16 %rs653, %rs648, 8; + shr.u16 %rs655, %rs649, 8; + ld.local.v2.u8 {%rs657, %rs658}, [%rd3+126]; + ld.local.u16 %r28612, [%rd3+132]; + ld.local.v2.u8 {%rs661, %rs662}, [%rd3+134]; + or.b16 %rs665, %rs572, %rs575; + or.b16 %rs666, %rs665, 10; + cvt.u32.u16 %r28613, %rs576; + and.b32 %r28614, %r28613, 255; + cvt.u32.u16 %r28615, %rs581; + prmt.b32 %r28616, %r28615, %r28614, 30212; + cvt.u32.u16 %r28617, %rs577; + prmt.b32 %r28618, %r28617, %r28616, 28756; + cvt.u32.u16 %r28619, %rs583; + prmt.b32 %r28620, %r28619, %r28618, 1620; + cvt.u32.u16 %r28621, %rs578; + and.b32 %r28622, %r28621, 255; + cvt.u32.u16 %r28623, %rs585; + prmt.b32 %r28624, %r28623, %r28622, 30212; + cvt.u32.u16 %r28625, %rs579; + prmt.b32 %r28626, %r28625, %r28624, 28756; + cvt.u32.u16 %r28627, %rs587; + prmt.b32 %r28628, %r28627, %r28626, 1620; + cvt.u32.u16 %r28629, %rs588; + and.b32 %r28630, %r28629, 255; + cvt.u32.u16 %r28631, %rs593; + prmt.b32 %r28632, %r28631, %r28630, 30212; + cvt.u32.u16 %r28633, %rs589; + prmt.b32 %r28634, %r28633, %r28632, 28756; + cvt.u32.u16 %r28635, %rs595; + prmt.b32 %r28636, %r28635, %r28634, 1620; + cvt.u32.u16 %r28637, %rs590; + and.b32 %r28638, %r28637, 255; + cvt.u32.u16 %r28639, %rs597; + prmt.b32 %r28640, %r28639, %r28638, 30212; + cvt.u32.u16 %r28641, %rs591; + prmt.b32 %r28642, %r28641, 
%r28640, 28756; + cvt.u32.u16 %r28643, %rs599; + prmt.b32 %r28644, %r28643, %r28642, 1620; + cvt.u32.u16 %r28645, %rs600; + and.b32 %r28646, %r28645, 255; + cvt.u32.u16 %r28647, %rs605; + prmt.b32 %r28648, %r28647, %r28646, 30212; + cvt.u32.u16 %r28649, %rs601; + prmt.b32 %r28650, %r28649, %r28648, 28756; + cvt.u32.u16 %r28651, %rs607; + prmt.b32 %r28652, %r28651, %r28650, 1620; + cvt.u32.u16 %r28653, %rs602; + and.b32 %r28654, %r28653, 255; + cvt.u32.u16 %r28655, %rs609; + prmt.b32 %r28656, %r28655, %r28654, 30212; + cvt.u32.u16 %r28657, %rs603; + prmt.b32 %r28658, %r28657, %r28656, 28756; + cvt.u32.u16 %r28659, %rs611; + prmt.b32 %r28660, %r28659, %r28658, 1620; + cvt.u32.u16 %r28661, %rs612; + and.b32 %r28662, %r28661, 255; + cvt.u32.u16 %r28663, %rs617; + prmt.b32 %r28664, %r28663, %r28662, 30212; + cvt.u32.u16 %r28665, %rs613; + prmt.b32 %r28666, %r28665, %r28664, 28756; + cvt.u32.u16 %r28667, %rs619; + prmt.b32 %r28668, %r28667, %r28666, 1620; + cvt.u32.u16 %r28669, %rs614; + and.b32 %r28670, %r28669, 255; + cvt.u32.u16 %r28671, %rs621; + prmt.b32 %r28672, %r28671, %r28670, 30212; + cvt.u32.u16 %r28673, %rs615; + prmt.b32 %r28674, %r28673, %r28672, 28756; + cvt.u32.u16 %r28675, %rs623; + prmt.b32 %r28676, %r28675, %r28674, 1620; + cvt.u32.u16 %r28677, %rs624; + and.b32 %r28678, %r28677, 255; + cvt.u32.u16 %r28679, %rs629; + prmt.b32 %r28680, %r28679, %r28678, 30212; + cvt.u32.u16 %r28681, %rs625; + prmt.b32 %r28682, %r28681, %r28680, 28756; + cvt.u32.u16 %r28683, %rs631; + prmt.b32 %r28684, %r28683, %r28682, 1620; + cvt.u32.u16 %r28685, %rs626; + and.b32 %r28686, %r28685, 255; + cvt.u32.u16 %r28687, %rs633; + prmt.b32 %r28688, %r28687, %r28686, 30212; + cvt.u32.u16 %r28689, %rs627; + prmt.b32 %r28690, %r28689, %r28688, 28756; + cvt.u32.u16 %r28691, %rs635; + prmt.b32 %r28692, %r28691, %r28690, 1620; + cvt.u32.u16 %r28693, %rs636; + and.b32 %r28694, %r28693, 255; + cvt.u32.u16 %r28695, %rs641; + prmt.b32 %r28696, %r28695, %r28694, 30212; + cvt.u32.u16 %r28697, %rs637; + prmt.b32 %r28698, %r28697, %r28696, 28756; + cvt.u32.u16 %r28699, %rs643; + prmt.b32 %r28700, %r28699, %r28698, 1620; + cvt.u32.u16 %r28701, %rs638; + and.b32 %r28702, %r28701, 255; + cvt.u32.u16 %r28703, %rs645; + prmt.b32 %r28704, %r28703, %r28702, 30212; + cvt.u32.u16 %r28705, %rs639; + prmt.b32 %r28706, %r28705, %r28704, 28756; + cvt.u32.u16 %r28707, %rs647; + prmt.b32 %r28708, %r28707, %r28706, 1620; + cvt.u32.u16 %r28709, %rs648; + and.b32 %r28710, %r28709, 255; + cvt.u32.u16 %r28711, %rs653; + prmt.b32 %r28712, %r28711, %r28710, 30212; + cvt.u32.u16 %r28713, %rs649; + prmt.b32 %r28714, %r28713, %r28712, 28756; + cvt.u32.u16 %r28715, %rs655; + prmt.b32 %r28716, %r28715, %r28714, 1620; + cvt.u32.u16 %r28717, %rs650; + and.b32 %r28718, %r28717, 255; + ld.local.u8 %r28719, [%rd3+125]; + prmt.b32 %r28720, %r28719, %r28718, 30212; + cvt.u32.u16 %r28721, %rs657; + prmt.b32 %r28722, %r28721, %r28720, 28756; + cvt.u32.u16 %r28723, %rs658; + prmt.b32 %r28724, %r28723, %r28722, 1620; + ld.local.u32 %r28725, [%rd3+128]; + cvt.u32.u16 %r28726, %rs661; + prmt.b32 %r28727, %r28726, %r28612, 28756; + cvt.u32.u16 %r28728, %rs662; + prmt.b32 %r28729, %r28728, %r28727, 1620; + cvt.u32.u16 %r28730, %rs570; + cvt.u32.u16 %r28731, %rs666; + and.b32 %r28732, %r28731, 255; + add.s32 %r28733, %r28604, %r28596; + add.s32 %r28734, %r28733, %r28620; + add.s32 %r28735, %r28628, %r28734; + add.s32 %r28736, %r28605, %r28597; + add.s32 %r28737, %r28736, %r28636; + add.s32 %r28738, %r28644, %r28737; + add.s32 %r28739, %r28608, %r28600; + 
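// Inlined BLAKE3 compression follows: the prmt.b32 selector chains (30212, 28756, 1620) assemble 32-bit message words from individual bytes, and each G quarter-round is the usual add/xor/rotate pattern, with the right-rotations by 16, 12, 8 and 7 expressed as left funnel shifts (shf.l.wrap) by 16, 20, 24 and 25. +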
add.s32 %r28740, %r28739, %r28652; + xor.b32 %r28741, %r28740, %r28730; + shr.u32 %r28742, %r28740, 16; + shl.b32 %r28743, %r28741, 16; + or.b32 %r28744, %r28743, %r28742; + add.s32 %r28745, %r28744, 1013904242; + xor.b32 %r28746, %r28745, %r28608; + shf.l.wrap.b32 %r28747, %r28746, %r28746, 20; + add.s32 %r28748, %r28660, %r28740; + add.s32 %r28749, %r28748, %r28747; + xor.b32 %r28750, %r28749, %r28744; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 24; + add.s32 %r28752, %r28751, %r28745; + xor.b32 %r28753, %r28752, %r28747; + shf.l.wrap.b32 %r28754, %r28753, %r28753, 25; + add.s32 %r28755, %r28609, %r28601; + add.s32 %r28756, %r28755, %r28668; + xor.b32 %r28757, %r28756, %r28732; + shr.u32 %r28758, %r28756, 16; + shl.b32 %r28759, %r28757, 16; + or.b32 %r28760, %r28759, %r28758; + add.s32 %r28761, %r28760, -1521486534; + xor.b32 %r28762, %r28761, %r28609; + shf.l.wrap.b32 %r28763, %r28762, %r28762, 20; + add.s32 %r28764, %r28676, %r28756; + add.s32 %r28765, %r28764, %r28763; + xor.b32 %r28766, %r28765, %r28760; + shf.l.wrap.b32 %r28767, %r28766, %r28766, 24; + add.s32 %r28768, %r28767, %r28761; + xor.b32 %r28769, %r28768, %r28763; + shf.l.wrap.b32 %r28770, %r28769, %r28769, 25; + add.s32 %r28771, %r28700, %r28754; + add.s32 %r28772, %r28770, %r28749; + add.s32 %r28773, %r28772, %r28716; + add.s32 %r28774, %r28724, %r28773; + add.s32 %r28775, %r28725, %r28765; + shf.l.wrap.b32 %r28776, %r28734, %r28734, 16; + add.s32 %r28777, %r28776, 1779033703; + xor.b32 %r28778, %r28777, %r28604; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28735, %r28779; + xor.b32 %r28781, %r28780, %r28776; + shf.l.wrap.b32 %r28782, %r28781, %r28781, 24; + add.s32 %r28783, %r28782, %r28777; + xor.b32 %r28784, %r28783, %r28779; + shf.l.wrap.b32 %r28785, %r28784, %r28784, 25; + shf.l.wrap.b32 %r28786, %r28737, %r28737, 16; + add.s32 %r28787, %r28786, -1150833019; + xor.b32 %r28788, %r28787, %r28605; + shf.l.wrap.b32 %r28789, %r28788, %r28788, 20; + add.s32 %r28790, %r28738, %r28789; + xor.b32 %r28791, %r28790, %r28786; + shf.l.wrap.b32 %r28792, %r28791, %r28791, 24; + add.s32 %r28793, %r28792, %r28787; + xor.b32 %r28794, %r28793, %r28789; + shf.l.wrap.b32 %r28795, %r28794, %r28794, 25; + add.s32 %r28796, %r28780, %r28684; + add.s32 %r28797, %r28796, %r28795; + xor.b32 %r28798, %r28797, %r28767; + shf.l.wrap.b32 %r28799, %r28798, %r28798, 16; + add.s32 %r28800, %r28799, %r28752; + xor.b32 %r28801, %r28800, %r28795; + shf.l.wrap.b32 %r28802, %r28801, %r28801, 20; + add.s32 %r28803, %r28797, %r28692; + add.s32 %r28804, %r28803, %r28802; + xor.b32 %r28805, %r28804, %r28799; + shf.l.wrap.b32 %r28806, %r28805, %r28805, 24; + add.s32 %r28807, %r28806, %r28800; + xor.b32 %r28808, %r28807, %r28802; + shf.l.wrap.b32 %r28809, %r28808, %r28808, 25; + add.s32 %r28810, %r28771, %r28790; + xor.b32 %r28811, %r28782, %r28810; + shf.l.wrap.b32 %r28812, %r28811, %r28811, 16; + add.s32 %r28813, %r28812, %r28768; + xor.b32 %r28814, %r28813, %r28754; + shf.l.wrap.b32 %r28815, %r28814, %r28814, 20; + add.s32 %r28816, %r28810, %r28708; + add.s32 %r28817, %r28816, %r28815; + xor.b32 %r28818, %r28817, %r28812; + shf.l.wrap.b32 %r28819, %r28818, %r28818, 24; + add.s32 %r28820, %r28819, %r28813; + xor.b32 %r28821, %r28820, %r28815; + shf.l.wrap.b32 %r28822, %r28821, %r28821, 25; + xor.b32 %r28823, %r28792, %r28773; + shf.l.wrap.b32 %r28824, %r28823, %r28823, 16; + add.s32 %r28825, %r28824, %r28783; + xor.b32 %r28826, %r28825, %r28770; + shf.l.wrap.b32 %r28827, %r28826, %r28826, 20; + add.s32 %r28828, %r28774, %r28827; + 
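// (the quarter-round pattern above repeats with a permuted message-word schedule; BLAKE3's seven compression rounds appear fully unrolled here) +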
xor.b32 %r28829, %r28828, %r28824; + shf.l.wrap.b32 %r28830, %r28829, %r28829, 24; + add.s32 %r28831, %r28830, %r28825; + xor.b32 %r28832, %r28831, %r28827; + shf.l.wrap.b32 %r28833, %r28832, %r28832, 25; + add.s32 %r28834, %r28775, %r28785; + xor.b32 %r28835, %r28834, %r28751; + shf.l.wrap.b32 %r28836, %r28835, %r28835, 16; + add.s32 %r28837, %r28836, %r28793; + xor.b32 %r28838, %r28837, %r28785; + shf.l.wrap.b32 %r28839, %r28838, %r28838, 20; + add.s32 %r28840, %r28834, %r28729; + add.s32 %r28841, %r28840, %r28839; + xor.b32 %r28842, %r28841, %r28836; + shf.l.wrap.b32 %r28843, %r28842, %r28842, 24; + add.s32 %r28844, %r28843, %r28837; + xor.b32 %r28845, %r28844, %r28839; + shf.l.wrap.b32 %r28846, %r28845, %r28845, 25; + add.s32 %r28847, %r28804, %r28636; + add.s32 %r28848, %r28847, %r28846; + xor.b32 %r28849, %r28848, %r28819; + shf.l.wrap.b32 %r28850, %r28849, %r28849, 16; + add.s32 %r28851, %r28850, %r28831; + xor.b32 %r28852, %r28851, %r28846; + shf.l.wrap.b32 %r28853, %r28852, %r28852, 20; + add.s32 %r28854, %r28848, %r28668; + add.s32 %r28855, %r28854, %r28853; + xor.b32 %r28856, %r28855, %r28850; + shf.l.wrap.b32 %r28857, %r28856, %r28856, 24; + add.s32 %r28858, %r28857, %r28851; + xor.b32 %r28859, %r28858, %r28853; + shf.l.wrap.b32 %r28860, %r28859, %r28859, 25; + add.s32 %r28861, %r28817, %r28644; + add.s32 %r28862, %r28861, %r28809; + xor.b32 %r28863, %r28862, %r28830; + shf.l.wrap.b32 %r28864, %r28863, %r28863, 16; + add.s32 %r28865, %r28864, %r28844; + xor.b32 %r28866, %r28865, %r28809; + shf.l.wrap.b32 %r28867, %r28866, %r28866, 20; + add.s32 %r28868, %r28862, %r28700; + add.s32 %r28869, %r28868, %r28867; + xor.b32 %r28870, %r28869, %r28864; + shf.l.wrap.b32 %r28871, %r28870, %r28870, 24; + add.s32 %r28872, %r28871, %r28865; + xor.b32 %r28873, %r28872, %r28867; + shf.l.wrap.b32 %r28874, %r28873, %r28873, 25; + add.s32 %r28875, %r28828, %r28676; + add.s32 %r28876, %r28875, %r28822; + xor.b32 %r28877, %r28843, %r28876; + shf.l.wrap.b32 %r28878, %r28877, %r28877, 16; + add.s32 %r28879, %r28878, %r28807; + xor.b32 %r28880, %r28879, %r28822; + shf.l.wrap.b32 %r28881, %r28880, %r28880, 20; + add.s32 %r28882, %r28876, %r28620; + add.s32 %r28883, %r28882, %r28881; + xor.b32 %r28884, %r28883, %r28878; + shf.l.wrap.b32 %r28885, %r28884, %r28884, 24; + add.s32 %r28886, %r28885, %r28879; + xor.b32 %r28887, %r28886, %r28881; + shf.l.wrap.b32 %r28888, %r28887, %r28887, 25; + add.s32 %r28889, %r28841, %r28652; + add.s32 %r28890, %r28889, %r28833; + xor.b32 %r28891, %r28806, %r28890; + shf.l.wrap.b32 %r28892, %r28891, %r28891, 16; + add.s32 %r28893, %r28892, %r28820; + xor.b32 %r28894, %r28893, %r28833; + shf.l.wrap.b32 %r28895, %r28894, %r28894, 20; + add.s32 %r28896, %r28890, %r28724; + add.s32 %r28897, %r28896, %r28895; + xor.b32 %r28898, %r28897, %r28892; + shf.l.wrap.b32 %r28899, %r28898, %r28898, 24; + add.s32 %r28900, %r28899, %r28893; + xor.b32 %r28901, %r28900, %r28895; + shf.l.wrap.b32 %r28902, %r28901, %r28901, 25; + add.s32 %r28903, %r28855, %r28628; + add.s32 %r28904, %r28903, %r28874; + xor.b32 %r28905, %r28904, %r28899; + shf.l.wrap.b32 %r28906, %r28905, %r28905, 16; + add.s32 %r28907, %r28906, %r28886; + xor.b32 %r28908, %r28907, %r28874; + shf.l.wrap.b32 %r28909, %r28908, %r28908, 20; + add.s32 %r28910, %r28904, %r28708; + add.s32 %r28911, %r28910, %r28909; + xor.b32 %r28912, %r28911, %r28906; + shf.l.wrap.b32 %r28913, %r28912, %r28912, 24; + add.s32 %r28914, %r28913, %r28907; + xor.b32 %r28915, %r28914, %r28909; + shf.l.wrap.b32 %r28916, %r28915, %r28915, 25; + add.s32 
%r28917, %r28888, %r28716; + add.s32 %r28918, %r28917, %r28869; + xor.b32 %r28919, %r28857, %r28918; + shf.l.wrap.b32 %r28920, %r28919, %r28919, 16; + add.s32 %r28921, %r28920, %r28900; + xor.b32 %r28922, %r28921, %r28888; + shf.l.wrap.b32 %r28923, %r28922, %r28922, 20; + add.s32 %r28924, %r28918, %r28660; + add.s32 %r28925, %r28924, %r28923; + xor.b32 %r28926, %r28925, %r28920; + shf.l.wrap.b32 %r28927, %r28926, %r28926, 24; + add.s32 %r28928, %r28927, %r28921; + xor.b32 %r28929, %r28928, %r28923; + shf.l.wrap.b32 %r28930, %r28929, %r28929, 25; + add.s32 %r28931, %r28883, %r28692; + add.s32 %r28932, %r28931, %r28902; + xor.b32 %r28933, %r28871, %r28932; + shf.l.wrap.b32 %r28934, %r28933, %r28933, 16; + add.s32 %r28935, %r28934, %r28858; + xor.b32 %r28936, %r28935, %r28902; + shf.l.wrap.b32 %r28937, %r28936, %r28936, 20; + add.s32 %r28938, %r28932, %r28725; + add.s32 %r28939, %r28938, %r28937; + xor.b32 %r28940, %r28939, %r28934; + shf.l.wrap.b32 %r28941, %r28940, %r28940, 24; + add.s32 %r28942, %r28941, %r28935; + xor.b32 %r28943, %r28942, %r28937; + shf.l.wrap.b32 %r28944, %r28943, %r28943, 25; + add.s32 %r28945, %r28897, %r28729; + add.s32 %r28946, %r28945, %r28860; + xor.b32 %r28947, %r28946, %r28885; + shf.l.wrap.b32 %r28948, %r28947, %r28947, 16; + add.s32 %r28949, %r28948, %r28872; + xor.b32 %r28950, %r28949, %r28860; + shf.l.wrap.b32 %r28951, %r28950, %r28950, 20; + add.s32 %r28952, %r28946, %r28684; + add.s32 %r28953, %r28952, %r28951; + xor.b32 %r28954, %r28953, %r28948; + shf.l.wrap.b32 %r28955, %r28954, %r28954, 24; + add.s32 %r28956, %r28955, %r28949; + xor.b32 %r28957, %r28956, %r28951; + shf.l.wrap.b32 %r28958, %r28957, %r28957, 25; + add.s32 %r28959, %r28911, %r28644; + add.s32 %r28960, %r28959, %r28958; + xor.b32 %r28961, %r28960, %r28927; + shf.l.wrap.b32 %r28962, %r28961, %r28961, 16; + add.s32 %r28963, %r28962, %r28942; + xor.b32 %r28964, %r28963, %r28958; + shf.l.wrap.b32 %r28965, %r28964, %r28964, 20; + add.s32 %r28966, %r28960, %r28652; + add.s32 %r28967, %r28966, %r28965; + xor.b32 %r28968, %r28967, %r28962; + shf.l.wrap.b32 %r28969, %r28968, %r28968, 24; + add.s32 %r28970, %r28969, %r28963; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 25; + add.s32 %r28973, %r28925, %r28700; + add.s32 %r28974, %r28973, %r28916; + xor.b32 %r28975, %r28974, %r28941; + shf.l.wrap.b32 %r28976, %r28975, %r28975, 16; + add.s32 %r28977, %r28976, %r28956; + xor.b32 %r28978, %r28977, %r28916; + shf.l.wrap.b32 %r28979, %r28978, %r28978, 20; + add.s32 %r28980, %r28974, %r28716; + add.s32 %r28981, %r28980, %r28979; + xor.b32 %r28982, %r28981, %r28976; + shf.l.wrap.b32 %r28983, %r28982, %r28982, 24; + add.s32 %r28984, %r28983, %r28977; + xor.b32 %r28985, %r28984, %r28979; + shf.l.wrap.b32 %r28986, %r28985, %r28985, 25; + add.s32 %r28987, %r28939, %r28724; + add.s32 %r28988, %r28987, %r28930; + xor.b32 %r28989, %r28955, %r28988; + shf.l.wrap.b32 %r28990, %r28989, %r28989, 16; + add.s32 %r28991, %r28990, %r28914; + xor.b32 %r28992, %r28991, %r28930; + shf.l.wrap.b32 %r28993, %r28992, %r28992, 20; + add.s32 %r28994, %r28988, %r28636; + add.s32 %r28995, %r28994, %r28993; + xor.b32 %r28996, %r28995, %r28990; + shf.l.wrap.b32 %r28997, %r28996, %r28996, 24; + add.s32 %r28998, %r28997, %r28991; + xor.b32 %r28999, %r28998, %r28993; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 25; + add.s32 %r29001, %r28953, %r28676; + add.s32 %r29002, %r29001, %r28944; + xor.b32 %r29003, %r28913, %r29002; + shf.l.wrap.b32 %r29004, %r29003, %r29003, 16; + add.s32 %r29005, %r29004, 
%r28928; + xor.b32 %r29006, %r29005, %r28944; + shf.l.wrap.b32 %r29007, %r29006, %r29006, 20; + add.s32 %r29008, %r29002, %r28725; + add.s32 %r29009, %r29008, %r29007; + xor.b32 %r29010, %r29009, %r29004; + shf.l.wrap.b32 %r29011, %r29010, %r29010, 24; + add.s32 %r29012, %r29011, %r29005; + xor.b32 %r29013, %r29012, %r29007; + shf.l.wrap.b32 %r29014, %r29013, %r29013, 25; + add.s32 %r29015, %r28967, %r28668; + add.s32 %r29016, %r29015, %r28986; + xor.b32 %r29017, %r29016, %r29011; + shf.l.wrap.b32 %r29018, %r29017, %r29017, 16; + add.s32 %r29019, %r29018, %r28998; + xor.b32 %r29020, %r29019, %r28986; + shf.l.wrap.b32 %r29021, %r29020, %r29020, 20; + add.s32 %r29022, %r29016, %r28660; + add.s32 %r29023, %r29022, %r29021; + xor.b32 %r29024, %r29023, %r29018; + shf.l.wrap.b32 %r29025, %r29024, %r29024, 24; + add.s32 %r29026, %r29025, %r29019; + xor.b32 %r29027, %r29026, %r29021; + shf.l.wrap.b32 %r29028, %r29027, %r29027, 25; + add.s32 %r29029, %r29000, %r28692; + add.s32 %r29030, %r29029, %r28981; + xor.b32 %r29031, %r28969, %r29030; + shf.l.wrap.b32 %r29032, %r29031, %r29031, 16; + add.s32 %r29033, %r29032, %r29012; + xor.b32 %r29034, %r29033, %r29000; + shf.l.wrap.b32 %r29035, %r29034, %r29034, 20; + add.s32 %r29036, %r29030, %r28620; + add.s32 %r29037, %r29036, %r29035; + xor.b32 %r29038, %r29037, %r29032; + shf.l.wrap.b32 %r29039, %r29038, %r29038, 24; + add.s32 %r29040, %r29039, %r29033; + xor.b32 %r29041, %r29040, %r29035; + shf.l.wrap.b32 %r29042, %r29041, %r29041, 25; + add.s32 %r29043, %r28995, %r28708; + add.s32 %r29044, %r29043, %r29014; + xor.b32 %r29045, %r28983, %r29044; + shf.l.wrap.b32 %r29046, %r29045, %r29045, 16; + add.s32 %r29047, %r29046, %r28970; + xor.b32 %r29048, %r29047, %r29014; + shf.l.wrap.b32 %r29049, %r29048, %r29048, 20; + add.s32 %r29050, %r29044, %r28729; + add.s32 %r29051, %r29050, %r29049; + xor.b32 %r29052, %r29051, %r29046; + shf.l.wrap.b32 %r29053, %r29052, %r29052, 24; + add.s32 %r29054, %r29053, %r29047; + xor.b32 %r29055, %r29054, %r29049; + shf.l.wrap.b32 %r29056, %r29055, %r29055, 25; + add.s32 %r29057, %r29009, %r28684; + add.s32 %r29058, %r29057, %r28972; + xor.b32 %r29059, %r29058, %r28997; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 16; + add.s32 %r29061, %r29060, %r28984; + xor.b32 %r29062, %r29061, %r28972; + shf.l.wrap.b32 %r29063, %r29062, %r29062, 20; + add.s32 %r29064, %r29058, %r28628; + add.s32 %r29065, %r29064, %r29063; + xor.b32 %r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 24; + add.s32 %r29068, %r29067, %r29061; + xor.b32 %r29069, %r29068, %r29063; + shf.l.wrap.b32 %r29070, %r29069, %r29069, 25; + add.s32 %r29071, %r29023, %r28700; + add.s32 %r29072, %r29071, %r29070; + xor.b32 %r29073, %r29072, %r29039; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 16; + add.s32 %r29075, %r29074, %r29054; + xor.b32 %r29076, %r29075, %r29070; + shf.l.wrap.b32 %r29077, %r29076, %r29076, 20; + add.s32 %r29078, %r29072, %r28676; + add.s32 %r29079, %r29078, %r29077; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 24; + add.s32 %r29082, %r29081, %r29075; + xor.b32 %r29083, %r29082, %r29077; + shf.l.wrap.b32 %r29084, %r29083, %r29083, 25; + add.s32 %r29085, %r29037, %r28716; + add.s32 %r29086, %r29085, %r29028; + xor.b32 %r29087, %r29086, %r29053; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 16; + add.s32 %r29089, %r29088, %r29068; + xor.b32 %r29090, %r29089, %r29028; + shf.l.wrap.b32 %r29091, %r29090, %r29090, 20; + add.s32 %r29092, %r29086, %r28692; + add.s32 %r29093, %r29092, %r29091; + xor.b32 
%r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 24; + add.s32 %r29096, %r29095, %r29089; + xor.b32 %r29097, %r29096, %r29091; + shf.l.wrap.b32 %r29098, %r29097, %r29097, 25; + add.s32 %r29099, %r29051, %r28725; + add.s32 %r29100, %r29099, %r29042; + xor.b32 %r29101, %r29067, %r29100; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 16; + add.s32 %r29103, %r29102, %r29026; + xor.b32 %r29104, %r29103, %r29042; + shf.l.wrap.b32 %r29105, %r29104, %r29104, 20; + add.s32 %r29106, %r29100, %r28644; + add.s32 %r29107, %r29106, %r29105; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 24; + add.s32 %r29110, %r29109, %r29103; + xor.b32 %r29111, %r29110, %r29105; + shf.l.wrap.b32 %r29112, %r29111, %r29111, 25; + add.s32 %r29113, %r29065, %r28724; + add.s32 %r29114, %r29113, %r29056; + xor.b32 %r29115, %r29025, %r29114; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 16; + add.s32 %r29117, %r29116, %r29040; + xor.b32 %r29118, %r29117, %r29056; + shf.l.wrap.b32 %r29119, %r29118, %r29118, 20; + add.s32 %r29120, %r29114, %r28729; + add.s32 %r29121, %r29120, %r29119; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 24; + add.s32 %r29124, %r29123, %r29117; + xor.b32 %r29125, %r29124, %r29119; + shf.l.wrap.b32 %r29126, %r29125, %r29125, 25; + add.s32 %r29127, %r29079, %r28652; + add.s32 %r29128, %r29127, %r29098; + xor.b32 %r29129, %r29128, %r29123; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 16; + add.s32 %r29131, %r29130, %r29110; + xor.b32 %r29132, %r29131, %r29098; + shf.l.wrap.b32 %r29133, %r29132, %r29132, 20; + add.s32 %r29134, %r29128, %r28620; + add.s32 %r29135, %r29134, %r29133; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 24; + add.s32 %r29138, %r29137, %r29131; + xor.b32 %r29139, %r29138, %r29133; + shf.l.wrap.b32 %r29140, %r29139, %r29139, 25; + add.s32 %r29141, %r29112, %r28708; + add.s32 %r29142, %r29141, %r29093; + xor.b32 %r29143, %r29081, %r29142; + shf.l.wrap.b32 %r29144, %r29143, %r29143, 16; + add.s32 %r29145, %r29144, %r29124; + xor.b32 %r29146, %r29145, %r29112; + shf.l.wrap.b32 %r29147, %r29146, %r29146, 20; + add.s32 %r29148, %r29142, %r28636; + add.s32 %r29149, %r29148, %r29147; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 24; + add.s32 %r29152, %r29151, %r29145; + xor.b32 %r29153, %r29152, %r29147; + shf.l.wrap.b32 %r29154, %r29153, %r29153, 25; + add.s32 %r29155, %r29107, %r28660; + add.s32 %r29156, %r29155, %r29126; + xor.b32 %r29157, %r29095, %r29156; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 16; + add.s32 %r29159, %r29158, %r29082; + xor.b32 %r29160, %r29159, %r29126; + shf.l.wrap.b32 %r29161, %r29160, %r29160, 20; + add.s32 %r29162, %r29156, %r28684; + add.s32 %r29163, %r29162, %r29161; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 24; + add.s32 %r29166, %r29165, %r29159; + xor.b32 %r29167, %r29166, %r29161; + shf.l.wrap.b32 %r29168, %r29167, %r29167, 25; + add.s32 %r29169, %r29121, %r28628; + add.s32 %r29170, %r29169, %r29084; + xor.b32 %r29171, %r29170, %r29109; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 16; + add.s32 %r29173, %r29172, %r29096; + xor.b32 %r29174, %r29173, %r29084; + shf.l.wrap.b32 %r29175, %r29174, %r29174, 20; + add.s32 %r29176, %r29170, %r28668; + add.s32 %r29177, %r29176, %r29175; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 24; + add.s32 %r29180, %r29179, %r29173; + xor.b32 %r29181, %r29180, %r29175; + shf.l.wrap.b32 %r29182, 
%r29181, %r29181, 25; + add.s32 %r29183, %r29135, %r28716; + add.s32 %r29184, %r29183, %r29182; + xor.b32 %r29185, %r29184, %r29151; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 16; + add.s32 %r29187, %r29186, %r29166; + xor.b32 %r29188, %r29187, %r29182; + shf.l.wrap.b32 %r29189, %r29188, %r29188, 20; + add.s32 %r29190, %r29184, %r28724; + add.s32 %r29191, %r29190, %r29189; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 24; + add.s32 %r29194, %r29193, %r29187; + xor.b32 %r29195, %r29194, %r29189; + shf.l.wrap.b32 %r29196, %r29195, %r29195, 25; + add.s32 %r29197, %r29149, %r28692; + add.s32 %r29198, %r29197, %r29140; + xor.b32 %r29199, %r29198, %r29165; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 16; + add.s32 %r29201, %r29200, %r29180; + xor.b32 %r29202, %r29201, %r29140; + shf.l.wrap.b32 %r29203, %r29202, %r29202, 20; + add.s32 %r29204, %r29198, %r28708; + add.s32 %r29205, %r29204, %r29203; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 24; + add.s32 %r29208, %r29207, %r29201; + xor.b32 %r29209, %r29208, %r29203; + shf.l.wrap.b32 %r29210, %r29209, %r29209, 25; + add.s32 %r29211, %r29163, %r28729; + add.s32 %r29212, %r29211, %r29154; + xor.b32 %r29213, %r29179, %r29212; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 16; + add.s32 %r29215, %r29214, %r29138; + xor.b32 %r29216, %r29215, %r29154; + shf.l.wrap.b32 %r29217, %r29216, %r29216, 20; + add.s32 %r29218, %r29212, %r28700; + add.s32 %r29219, %r29218, %r29217; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 24; + add.s32 %r29222, %r29221, %r29215; + xor.b32 %r29223, %r29222, %r29217; + shf.l.wrap.b32 %r29224, %r29223, %r29223, 25; + add.s32 %r29225, %r29177, %r28725; + add.s32 %r29226, %r29225, %r29168; + xor.b32 %r29227, %r29137, %r29226; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 16; + add.s32 %r29229, %r29228, %r29152; + xor.b32 %r29230, %r29229, %r29168; + shf.l.wrap.b32 %r29231, %r29230, %r29230, 20; + add.s32 %r29232, %r29226, %r28684; + add.s32 %r29233, %r29232, %r29231; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 24; + add.s32 %r29236, %r29235, %r29229; + xor.b32 %r29237, %r29236, %r29231; + shf.l.wrap.b32 %r29238, %r29237, %r29237, 25; + add.s32 %r29239, %r29191, %r28676; + add.s32 %r29240, %r29239, %r29210; + xor.b32 %r29241, %r29240, %r29235; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 16; + add.s32 %r29243, %r29242, %r29222; + xor.b32 %r29244, %r29243, %r29210; + shf.l.wrap.b32 %r29245, %r29244, %r29244, 20; + add.s32 %r29246, %r29240, %r28636; + add.s32 %r29247, %r29246, %r29245; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 24; + add.s32 %r29250, %r29249, %r29243; + xor.b32 %r29251, %r29250, %r29245; + shf.l.wrap.b32 %r29252, %r29251, %r29251, 25; + add.s32 %r29253, %r29224, %r28660; + add.s32 %r29254, %r29253, %r29205; + xor.b32 %r29255, %r29193, %r29254; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 16; + add.s32 %r29257, %r29256, %r29236; + xor.b32 %r29258, %r29257, %r29224; + shf.l.wrap.b32 %r29259, %r29258, %r29258, 20; + add.s32 %r29260, %r29254, %r28644; + add.s32 %r29261, %r29260, %r29259; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 24; + add.s32 %r29264, %r29263, %r29257; + xor.b32 %r29265, %r29264, %r29259; + shf.l.wrap.b32 %r29266, %r29265, %r29265, 25; + add.s32 %r29267, %r29219, %r28620; + add.s32 %r29268, %r29267, %r29238; + xor.b32 %r29269, %r29207, %r29268; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 16; + 
add.s32 %r29271, %r29270, %r29194; + xor.b32 %r29272, %r29271, %r29238; + shf.l.wrap.b32 %r29273, %r29272, %r29272, 20; + add.s32 %r29274, %r29268, %r28628; + add.s32 %r29275, %r29274, %r29273; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 24; + add.s32 %r29278, %r29277, %r29271; + xor.b32 %r29279, %r29278, %r29273; + shf.l.wrap.b32 %r29280, %r29279, %r29279, 25; + add.s32 %r29281, %r29233, %r28668; + add.s32 %r29282, %r29281, %r29196; + xor.b32 %r29283, %r29282, %r29221; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 16; + add.s32 %r29285, %r29284, %r29208; + xor.b32 %r29286, %r29285, %r29196; + shf.l.wrap.b32 %r29287, %r29286, %r29286, 20; + add.s32 %r29288, %r29282, %r28652; + add.s32 %r29289, %r29288, %r29287; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 24; + add.s32 %r29292, %r29291, %r29285; + xor.b32 %r29293, %r29292, %r29287; + shf.l.wrap.b32 %r29294, %r29293, %r29293, 25; + add.s32 %r29295, %r29247, %r28692; + add.s32 %r29296, %r29295, %r29294; + xor.b32 %r29297, %r29296, %r29263; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 16; + add.s32 %r29299, %r29298, %r29278; + xor.b32 %r29300, %r29299, %r29294; + shf.l.wrap.b32 %r29301, %r29300, %r29300, 20; + add.s32 %r29302, %r29296, %r28725; + add.s32 %r29303, %r29302, %r29301; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 24; + add.s32 %r29306, %r29305, %r29299; + xor.b32 %r29307, %r29306, %r29301; + shf.l.wrap.b32 %r29308, %r29307, %r29307, 25; + add.s32 %r29309, %r29261, %r28708; + add.s32 %r29310, %r29309, %r29252; + xor.b32 %r29311, %r29310, %r29277; + shf.l.wrap.b32 %r29312, %r29311, %r29311, 16; + add.s32 %r29313, %r29312, %r29292; + xor.b32 %r29314, %r29313, %r29252; + shf.l.wrap.b32 %r29315, %r29314, %r29314, 20; + add.s32 %r29316, %r29310, %r28660; + add.s32 %r29317, %r29316, %r29315; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 24; + add.s32 %r29320, %r29319, %r29313; + xor.b32 %r29321, %r29320, %r29315; + shf.l.wrap.b32 %r29322, %r29321, %r29321, 25; + add.s32 %r29323, %r29275, %r28684; + add.s32 %r29324, %r29323, %r29266; + xor.b32 %r29325, %r29291, %r29324; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 16; + add.s32 %r29327, %r29326, %r29250; + xor.b32 %r29328, %r29327, %r29266; + shf.l.wrap.b32 %r29329, %r29328, %r29328, 20; + add.s32 %r29330, %r29324, %r28716; + add.s32 %r29331, %r29330, %r29329; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 24; + add.s32 %r29334, %r29333, %r29327; + xor.b32 %r29335, %r29334, %r29329; + shf.l.wrap.b32 %r29336, %r29335, %r29335, 25; + add.s32 %r29337, %r29289, %r28729; + add.s32 %r29338, %r29337, %r29280; + xor.b32 %r29339, %r29249, %r29338; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 16; + add.s32 %r29341, %r29340, %r29264; + xor.b32 %r29342, %r29341, %r29280; + shf.l.wrap.b32 %r29343, %r29342, %r29342, 20; + add.s32 %r29344, %r29338, %r28628; + add.s32 %r29345, %r29344, %r29343; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 24; + add.s32 %r29348, %r29347, %r29341; + xor.b32 %r29349, %r29348, %r29343; + shf.l.wrap.b32 %r29350, %r29349, %r29349, 25; + add.s32 %r29351, %r29303, %r28724; + add.s32 %r29352, %r29351, %r29322; + xor.b32 %r29353, %r29352, %r29347; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 16; + add.s32 %r29355, %r29354, %r29334; + xor.b32 %r29356, %r29355, %r29322; + shf.l.wrap.b32 %r29357, %r29356, %r29356, 20; + add.s32 %r29358, %r29352, %r28644; + add.s32 %r29359, 
%r29358, %r29357; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 24; + add.s32 %r29362, %r29361, %r29355; + xor.b32 %r29363, %r29362, %r29357; + shf.l.wrap.b32 %r29364, %r29363, %r29363, 25; + add.s32 %r29365, %r29336, %r28620; + add.s32 %r29366, %r29365, %r29317; + xor.b32 %r29367, %r29305, %r29366; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 16; + add.s32 %r29369, %r29368, %r29348; + xor.b32 %r29370, %r29369, %r29336; + shf.l.wrap.b32 %r29371, %r29370, %r29370, 20; + add.s32 %r29372, %r29366, %r28700; + add.s32 %r29373, %r29372, %r29371; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 24; + add.s32 %r29376, %r29375, %r29369; + xor.b32 %r29377, %r29376, %r29371; + shf.l.wrap.b32 %r29378, %r29377, %r29377, 25; + add.s32 %r29379, %r29331, %r28636; + add.s32 %r29380, %r29379, %r29350; + xor.b32 %r29381, %r29319, %r29380; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 16; + add.s32 %r29383, %r29382, %r29306; + xor.b32 %r29384, %r29383, %r29350; + shf.l.wrap.b32 %r29385, %r29384, %r29384, 20; + add.s32 %r29386, %r29380, %r28668; + add.s32 %r29387, %r29386, %r29385; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 24; + add.s32 %r29390, %r29389, %r29383; + xor.b32 %r29391, %r29390, %r29385; + shf.l.wrap.b32 %r29392, %r29391, %r29391, 25; + add.s32 %r29393, %r29345, %r28652; + add.s32 %r29394, %r29393, %r29308; + xor.b32 %r29395, %r29394, %r29333; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 16; + add.s32 %r29397, %r29396, %r29320; + xor.b32 %r29398, %r29397, %r29308; + shf.l.wrap.b32 %r29399, %r29398, %r29398, 20; + add.s32 %r29400, %r29394, %r28676; + add.s32 %r29401, %r29400, %r29399; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 24; + add.s32 %r29404, %r29403, %r29397; + xor.b32 %r29405, %r29404, %r29399; + shf.l.wrap.b32 %r29406, %r29405, %r29405, 25; + add.s32 %r29407, %r29359, %r28708; + add.s32 %r29408, %r29407, %r29406; + xor.b32 %r29409, %r29408, %r29375; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 16; + add.s32 %r29411, %r29410, %r29390; + xor.b32 %r29412, %r29411, %r29406; + shf.l.wrap.b32 %r29413, %r29412, %r29412, 20; + add.s32 %r29414, %r29408, %r28729; + add.s32 %r29415, %r29414, %r29413; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 24; + add.s32 %r29418, %r29417, %r29411; + xor.b32 %r29419, %r29418, %r29413; + shf.l.wrap.b32 %r29420, %r29419, %r29419, 25; + add.s32 %r29421, %r29373, %r28660; + add.s32 %r29422, %r29421, %r29364; + xor.b32 %r29423, %r29422, %r29389; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 16; + add.s32 %r29425, %r29424, %r29404; + xor.b32 %r29426, %r29425, %r29364; + shf.l.wrap.b32 %r29427, %r29426, %r29426, 20; + add.s32 %r29428, %r29422, %r28620; + add.s32 %r29429, %r29428, %r29427; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 24; + add.s32 %r29432, %r29431, %r29425; + xor.b32 %r29433, %r29432, %r29427; + shf.l.wrap.b32 %r29434, %r29433, %r29433, 25; + add.s32 %r29435, %r29387, %r28628; + add.s32 %r29436, %r29435, %r29378; + xor.b32 %r29437, %r29403, %r29436; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 16; + add.s32 %r29439, %r29438, %r29362; + xor.b32 %r29440, %r29439, %r29378; + shf.l.wrap.b32 %r29441, %r29440, %r29440, 20; + add.s32 %r29442, %r29436, %r28692; + add.s32 %r29443, %r29442, %r29441; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 24; + add.s32 %r29446, %r29445, %r29439; + xor.b32 %r29447, %r29446, %r29441; + 
shf.l.wrap.b32 %r29448, %r29447, %r29447, 25; + add.s32 %r29449, %r29401, %r28684; + add.s32 %r29450, %r29449, %r29392; + xor.b32 %r29451, %r29361, %r29450; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 16; + add.s32 %r29453, %r29452, %r29376; + xor.b32 %r29454, %r29453, %r29392; + shf.l.wrap.b32 %r29455, %r29454, %r29454, 20; + add.s32 %r29456, %r29450, %r28668; + add.s32 %r29457, %r29456, %r29455; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 24; + add.s32 %r29460, %r29459, %r29453; + xor.b32 %r29461, %r29460, %r29455; + shf.l.wrap.b32 %r29462, %r29461, %r29461, 25; + add.s32 %r29463, %r29415, %r28725; + add.s32 %r29464, %r29463, %r29434; + xor.b32 %r29465, %r29464, %r29459; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 16; + add.s32 %r29467, %r29466, %r29446; + xor.b32 %r29468, %r29467, %r29434; + shf.l.wrap.b32 %r29469, %r29468, %r29468, 20; + add.s32 %r29470, %r29464, %r28700; + add.s32 %r29471, %r29470, %r29469; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 24; + add.s32 %r29474, %r29473, %r29467; + xor.b32 %r29475, %r29474, %r29469; + shf.l.wrap.b32 %r29476, %r29475, %r29475, 25; + add.s32 %r29477, %r29448, %r28636; + add.s32 %r29478, %r29477, %r29429; + xor.b32 %r29479, %r29417, %r29478; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 16; + add.s32 %r29481, %r29480, %r29460; + xor.b32 %r29482, %r29481, %r29448; + shf.l.wrap.b32 %r29483, %r29482, %r29482, 20; + add.s32 %r29484, %r29478, %r28716; + add.s32 %r29485, %r29484, %r29483; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 24; + add.s32 %r29488, %r29487, %r29481; + xor.b32 %r29489, %r29488, %r29483; + shf.l.wrap.b32 %r29490, %r29489, %r29489, 25; + add.s32 %r29491, %r29443, %r28644; + add.s32 %r29492, %r29491, %r29462; + xor.b32 %r29493, %r29431, %r29492; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 16; + add.s32 %r29495, %r29494, %r29418; + xor.b32 %r29496, %r29495, %r29462; + shf.l.wrap.b32 %r29497, %r29496, %r29496, 20; + add.s32 %r29498, %r29492, %r28652; + add.s32 %r29499, %r29498, %r29497; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 24; + add.s32 %r29502, %r29501, %r29495; + xor.b32 %r29503, %r29502, %r29497; + shf.l.wrap.b32 %r29504, %r29503, %r29503, 25; + add.s32 %r29505, %r29457, %r28676; + add.s32 %r29506, %r29505, %r29420; + xor.b32 %r29507, %r29506, %r29445; + shf.l.wrap.b32 %r29508, %r29507, %r29507, 16; + add.s32 %r29509, %r29508, %r29432; + xor.b32 %r29510, %r29509, %r29420; + shf.l.wrap.b32 %r29511, %r29510, %r29510, 20; + add.s32 %r29512, %r29506, %r28724; + add.s32 %r29513, %r29512, %r29511; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 24; + add.s32 %r29516, %r29515, %r29509; + xor.b32 %r29517, %r29516, %r29511; + shf.l.wrap.b32 %r29518, %r29517, %r29517, 25; + xor.b32 %r29519, %r29471, %r29502; + cvt.u64.u32 %rd1190, %r29519; + xor.b32 %r29520, %r29516, %r29485; + and.b32 %r29521, %r29520, 255; + cvt.u64.u32 %rd1191, %r29521; + cvt.u64.u32 %rd1192, %r29520; + shl.b64 %rd1193, %rd1192, 32; + and.b64 %rd1194, %rd1193, 280375465082880; + and.b64 %rd1195, %rd1193, 71776119061217280; + shr.u32 %r29522, %r29520, 24; + cvt.u64.u32 %rd1196, %r29522; + shl.b64 %rd1197, %rd1196, 56; + bfi.b64 %rd1198, %rd1191, %rd1190, 32, 32; + or.b64 %rd1199, %rd1198, %rd1194; + or.b64 %rd1200, %rd1199, %rd1195; + or.b64 %rd341, %rd1200, %rd1197; + xor.b32 %r29523, %r29474, %r29499; + cvt.u64.u32 %rd1201, %r29523; + xor.b32 %r29524, %r29513, %r29488; + and.b32 
%r29525, %r29524, 255;
+ cvt.u64.u32 %rd1202, %r29525;
+ cvt.u64.u32 %rd1203, %r29524;
+ shl.b64 %rd1204, %rd1203, 32;
+ and.b64 %rd1205, %rd1204, 280375465082880;
+ and.b64 %rd1206, %rd1204, 71776119061217280;
+ shr.u32 %r29526, %r29524, 24;
+ cvt.u64.u32 %rd1207, %r29526;
+ shl.b64 %rd1208, %rd1207, 56;
+ bfi.b64 %rd1209, %rd1202, %rd1201, 32, 32;
+ or.b64 %rd1210, %rd1209, %rd1205;
+ or.b64 %rd1211, %rd1210, %rd1206;
+ or.b64 %rd345, %rd1211, %rd1208;
+ xor.b32 %r29527, %r29518, %r29487;
+ cvt.u64.u32 %rd1212, %r29527;
+ xor.b32 %r29528, %r29476, %r29501;
+ and.b32 %r29529, %r29528, 255;
+ cvt.u64.u32 %rd1213, %r29529;
+ cvt.u64.u32 %rd1214, %r29528;
+ shl.b64 %rd1215, %rd1214, 32;
+ and.b64 %rd1216, %rd1215, 280375465082880;
+ and.b64 %rd1217, %rd1215, 71776119061217280;
+ shr.u32 %r29530, %r29528, 24;
+ cvt.u64.u32 %rd1218, %r29530;
+ shl.b64 %rd1219, %rd1218, 56;
+ bfi.b64 %rd1220, %rd1213, %rd1212, 32, 32;
+ or.b64 %rd1221, %rd1220, %rd1216;
+ or.b64 %rd1222, %rd1221, %rd1217;
+ or.b64 %rd1280, %rd1222, %rd1219;
+ xor.b32 %r29531, %r29515, %r29490;
+ cvt.u64.u32 %rd1223, %r29531;
+ xor.b32 %r29532, %r29473, %r29504;
+ and.b32 %r29533, %r29532, 255;
+ cvt.u64.u32 %rd1224, %r29533;
+ cvt.u64.u32 %rd1225, %r29532;
+ shl.b64 %rd1226, %rd1225, 32;
+ and.b64 %rd1227, %rd1226, 280375465082880;
+ and.b64 %rd1228, %rd1226, 71776119061217280;
+ shr.u32 %r29534, %r29532, 24;
+ cvt.u64.u32 %rd1229, %r29534;
+ shl.b64 %rd1230, %rd1229, 56;
+ bfi.b64 %rd1231, %rd1224, %rd1223, 32, 32;
+ or.b64 %rd1232, %rd1231, %rd1227;
+ or.b64 %rd1233, %rd1232, %rd1228;
+ or.b64 %rd1279, %rd1233, %rd1230;
+ mov.u64 %rd342, %rd341;
+ bra.uni $L__BB2_104;
-$L__BB0_17:
- not.pred %p15, %p16;
- @%p15 bra $L__BB0_19;
+$L__BB2_97:
+ setp.eq.s16 %p55, %rs502, 0;
+ selp.u16 %rs504, 1, 0, %p55;
+ ld.local.u8 %rs667, [%rd3+138];
+ or.b16 %rs505, %rs667, %rs504;
+ or.b16 %rs734, %rs505, 2;
+ ld.local.u64 %rd1276, [%rd3+64];
+ ld.local.v2.u32 {%r30976, %r30975}, [%rd3+32];
+ ld.local.v2.u32 {%r30974, %r30973}, [%rd3+40];
+ ld.local.v2.u32 {%r30972, %r30971}, [%rd3+48];
+ ld.local.v2.u32 {%r30970, %r30969}, [%rd3+56];
+ ld.local.v4.u16 {%rs800, %rs802, %rs804, %rs806}, [%rd3+72];
+ shr.u16 %rs801, %rs800, 8;
+ shr.u16 %rs803, %rs802, 8;
+ shr.u16 %rs805, %rs804, 8;
+ shr.u16 %rs807, %rs806, 8;
+ ld.local.v4.u16 {%rs808, %rs810, %rs812, %rs814}, [%rd3+80];
+ shr.u16 %rs809, %rs808, 8;
+ shr.u16 %rs811, %rs810, 8;
+ shr.u16 %rs813, %rs812, 8;
+ shr.u16 %rs815, %rs814, 8;
+ ld.local.v4.u16 {%rs816, %rs818, %rs820, %rs822}, [%rd3+88];
+ shr.u16 %rs817, %rs816, 8;
+ shr.u16 %rs819, %rs818, 8;
+ shr.u16 %rs821, %rs820, 8;
+ shr.u16 %rs823, %rs822, 8;
+ ld.local.v4.u16 {%rs824, %rs826, %rs828, %rs830}, [%rd3+96];
+ shr.u16 %rs825, %rs824, 8;
+ shr.u16 %rs827, %rs826, 8;
+ shr.u16 %rs829, %rs828, 8;
+ shr.u16 %rs831, %rs830, 8;
+ ld.local.v4.u16 {%rs832, %rs834, %rs836, %rs838}, [%rd3+104];
+ shr.u16 %rs833, %rs832, 8;
+ shr.u16 %rs835, %rs834, 8;
+ shr.u16 %rs837, %rs836, 8;
+ shr.u16 %rs839, %rs838, 8;
+ ld.local.v4.u16 {%rs840, %rs842, %rs844, %rs846}, [%rd3+112];
+ shr.u16 %rs841, %rs840, 8;
+ shr.u16 %rs843, %rs842, 8;
+ shr.u16 %rs845, %rs844, 8;
+ shr.u16 %rs847, %rs846, 8;
+ ld.local.v4.u8 {%rs848, %rs849, %rs850, %rs851}, [%rd3+120];
+ ld.local.v2.u8 {%rs852, %rs853}, [%rd3+124];
+ ld.local.v2.u8 {%rs854, %rs855}, [%rd3+126];
+ ld.local.v4.u8 {%rs856, %rs857, %rs858, %rs859}, [%rd3+128];
+ ld.local.v2.u8 {%rs860, %rs861}, [%rd3+132];
+ ld.local.v2.u8 {%rs862, %rs863}, [%rd3+134];
- ld.param.u64 %rd462, [heavy_hash_param_0];
- ld.param.u64 %rd461, [heavy_hash_param_1];
- and.b64 %rd460, %rd463, %rd462;
- or.b64 %rd459, %rd460, %rd461;
- ld.param.u64 %rd458, [heavy_hash_param_5];
- cvta.to.global.u64 %rd457, %rd458;
- mov.u64 %rd455, 0;
- atom.global.cas.b64 %rd456, [%rd457], %rd455, %rd459;
+$L__BB2_99:
+ setp.eq.s64 %p56, %rd1275, 0;
+ mov.u32 %r30977, %r30976;
+ mov.u32 %r30978, %r30975;
+ mov.u32 %r30979, %r30974;
+ mov.u32 %r30980, %r30973;
+ mov.u32 %r30981, %r30972;
+ mov.u32 %r30982, %r30971;
+ mov.u32 %r30983, %r30970;
+ mov.u32 %r30984, %r30969;
+ mov.u16 %rs865, %rs734;
+ @%p56 bra $L__BB2_102;
-$L__BB0_19:
+ or.b16 %rs865, %rs667, 4;
+ ld.local.v2.u32 {%r30977, %r30978}, [%rd3];
+ ld.local.v2.u32 {%r30979, %r30980}, [%rd3+8];
+ ld.local.v2.u32 {%r30981, %r30982}, [%rd3+16];
+ ld.local.v2.u32 {%r30983, %r30984}, [%rd3+24];
+ mov.u16 %rs768, %rs831;
+ mov.u16 %rs769, %rs830;
+ mov.u16 %rs770, %rs829;
+ mov.u16 %rs771, %rs828;
+ mov.u16 %rs772, %rs827;
+ mov.u16 %rs773, %rs826;
+ mov.u16 %rs774, %rs825;
+ mov.u16 %rs775, %rs824;
+ mov.u16 %rs776, %rs823;
+ mov.u16 %rs777, %rs822;
+ mov.u16 %rs778, %rs821;
+ mov.u16 %rs779, %rs820;
+ mov.u16 %rs780, %rs819;
+ mov.u16 %rs781, %rs818;
+ mov.u16 %rs782, %rs817;
+ mov.u16 %rs783, %rs816;
+ mov.u16 %rs784, %rs815;
+ mov.u16 %rs785, %rs814;
+ mov.u16 %rs786, %rs813;
+ mov.u16 %rs787, %rs812;
+ mov.u16 %rs788, %rs811;
+ mov.u16 %rs789, %rs810;
+ mov.u16 %rs790, %rs809;
+ mov.u16 %rs791, %rs808;
+ mov.u16 %rs792, %rs807;
+ mov.u16 %rs793, %rs806;
+ mov.u16 %rs794, %rs805;
+ mov.u16 %rs795, %rs804;
+ mov.u16 %rs796, %rs803;
+ mov.u16 %rs797, %rs802;
+ mov.u16 %rs798, %rs801;
+ mov.u16 %rs799, %rs800;
+
+$L__BB2_101:
+ add.s64 %rd1275, %rd1275, -1;
+ shl.b64 %rd1144, %rd1275, 5;
+ add.s64 %rd1145, %rd3, %rd1144;
+ ld.local.u8 %rs800, [%rd1145+145];
+ mov.u64 %rd1143, 0;
+ ld.local.u8 %rs801, [%rd1145+146];
+ ld.local.u8 %rs802, [%rd1145+147];
+ ld.local.u8 %rs803, [%rd1145+148];
+ ld.local.u8 %rs804, [%rd1145+149];
+ ld.local.u8 %rs805, [%rd1145+150];
+ ld.local.u8 %rs806, [%rd1145+151];
+ ld.local.u8 %rs807, [%rd1145+152];
+ ld.local.u8 %rs808, [%rd1145+153];
+ ld.local.u8 %rs809, [%rd1145+154];
+ ld.local.u8 %rs810, [%rd1145+155];
+ ld.local.u8 %rs811, [%rd1145+156];
+ ld.local.u8 %rs812, [%rd1145+157];
+ ld.local.u8 %rs813, [%rd1145+158];
+ ld.local.u8 %rs814, [%rd1145+159];
+ ld.local.u8 %rs815, [%rd1145+160];
+ ld.local.u8 %rs816, [%rd1145+161];
+ ld.local.u8 %rs817, [%rd1145+162];
+ ld.local.u8 %rs818, [%rd1145+163];
+ ld.local.u8 %rs819, [%rd1145+164];
+ ld.local.u8 %rs820, [%rd1145+165];
+ ld.local.u8 %rs821, [%rd1145+166];
+ ld.local.u8 %rs822, [%rd1145+167];
+ ld.local.u8 %rs823, [%rd1145+168];
+ ld.local.u8 %rs824, [%rd1145+169];
+ ld.local.u8 %rs825, [%rd1145+170];
+ ld.local.u8 %rs826, [%rd1145+171];
+ ld.local.u8 %rs827, [%rd1145+172];
+ ld.local.u8 %rs828, [%rd1145+173];
+ ld.local.u8 %rs829, [%rd1145+174];
+ ld.local.u8 %rs830, [%rd1145+175];
+ ld.local.u8 %rs831, [%rd1145+176];
+ cvt.u32.u16 %r26604, %rs799;
+ and.b32 %r26605, %r26604, 255;
+ cvt.u32.u16 %r26606, %rs798;
+ prmt.b32 %r26607, %r26606, %r26605, 30212;
+ cvt.u32.u16 %r26608, %rs797;
+ shl.b32 %r26609, %r26608, 16;
+ and.b32 %r26610, %r26609, 16711680;
+ or.b32 %r26611, %r26607, %r26610;
+ cvt.u32.u16 %r26612, %rs796;
+ shl.b32 %r26613, %r26612, 24;
+ or.b32 %r26614, %r26611, %r26613;
+ cvt.u32.u16 %r26615, %rs795;
+ and.b32 %r26616, %r26615, 255;
+ cvt.u32.u16 %r26617, %rs794;
+ prmt.b32 %r26618, %r26617, %r26616, 30212;
+ cvt.u32.u16 %r26619, %rs793;
+ shl.b32
%r26620, %r26619, 16; + and.b32 %r26621, %r26620, 16711680; + or.b32 %r26622, %r26618, %r26621; + cvt.u32.u16 %r26623, %rs792; + shl.b32 %r26624, %r26623, 24; + or.b32 %r26625, %r26622, %r26624; + cvt.u32.u16 %r26626, %rs791; + and.b32 %r26627, %r26626, 255; + cvt.u32.u16 %r26628, %rs790; + prmt.b32 %r26629, %r26628, %r26627, 30212; + cvt.u32.u16 %r26630, %rs789; + shl.b32 %r26631, %r26630, 16; + and.b32 %r26632, %r26631, 16711680; + or.b32 %r26633, %r26629, %r26632; + cvt.u32.u16 %r26634, %rs788; + shl.b32 %r26635, %r26634, 24; + or.b32 %r26636, %r26633, %r26635; + cvt.u32.u16 %r26637, %rs787; + and.b32 %r26638, %r26637, 255; + cvt.u32.u16 %r26639, %rs786; + prmt.b32 %r26640, %r26639, %r26638, 30212; + cvt.u32.u16 %r26641, %rs785; + shl.b32 %r26642, %r26641, 16; + and.b32 %r26643, %r26642, 16711680; + or.b32 %r26644, %r26640, %r26643; + cvt.u32.u16 %r26645, %rs784; + shl.b32 %r26646, %r26645, 24; + or.b32 %r26647, %r26644, %r26646; + cvt.u32.u16 %r26648, %rs783; + and.b32 %r26649, %r26648, 255; + cvt.u32.u16 %r26650, %rs782; + prmt.b32 %r26651, %r26650, %r26649, 30212; + cvt.u32.u16 %r26652, %rs781; + shl.b32 %r26653, %r26652, 16; + and.b32 %r26654, %r26653, 16711680; + or.b32 %r26655, %r26651, %r26654; + cvt.u32.u16 %r26656, %rs780; + shl.b32 %r26657, %r26656, 24; + or.b32 %r26658, %r26655, %r26657; + cvt.u32.u16 %r26659, %rs779; + and.b32 %r26660, %r26659, 255; + cvt.u32.u16 %r26661, %rs778; + prmt.b32 %r26662, %r26661, %r26660, 30212; + cvt.u32.u16 %r26663, %rs777; + shl.b32 %r26664, %r26663, 16; + and.b32 %r26665, %r26664, 16711680; + or.b32 %r26666, %r26662, %r26665; + cvt.u32.u16 %r26667, %rs776; + shl.b32 %r26668, %r26667, 24; + or.b32 %r26669, %r26666, %r26668; + cvt.u32.u16 %r26670, %rs775; + and.b32 %r26671, %r26670, 255; + cvt.u32.u16 %r26672, %rs774; + prmt.b32 %r26673, %r26672, %r26671, 30212; + cvt.u32.u16 %r26674, %rs773; + shl.b32 %r26675, %r26674, 16; + and.b32 %r26676, %r26675, 16711680; + or.b32 %r26677, %r26673, %r26676; + cvt.u32.u16 %r26678, %rs772; + shl.b32 %r26679, %r26678, 24; + or.b32 %r26680, %r26677, %r26679; + cvt.u32.u16 %r26681, %rs771; + and.b32 %r26682, %r26681, 255; + cvt.u32.u16 %r26683, %rs770; + prmt.b32 %r26684, %r26683, %r26682, 30212; + cvt.u32.u16 %r26685, %rs769; + shl.b32 %r26686, %r26685, 16; + and.b32 %r26687, %r26686, 16711680; + or.b32 %r26688, %r26684, %r26687; + cvt.u32.u16 %r26689, %rs768; + shl.b32 %r26690, %r26689, 24; + or.b32 %r26691, %r26688, %r26690; + cvt.u32.u16 %r26692, %rs832; + and.b32 %r26693, %r26692, 255; + cvt.u32.u16 %r26694, %rs833; + prmt.b32 %r26695, %r26694, %r26693, 30212; + cvt.u32.u16 %r26696, %rs834; + shl.b32 %r26697, %r26696, 16; + and.b32 %r26698, %r26697, 16711680; + or.b32 %r26699, %r26695, %r26698; + cvt.u32.u16 %r26700, %rs835; + shl.b32 %r26701, %r26700, 24; + or.b32 %r26702, %r26699, %r26701; + cvt.u32.u16 %r26703, %rs836; + and.b32 %r26704, %r26703, 255; + cvt.u32.u16 %r26705, %rs837; + prmt.b32 %r26706, %r26705, %r26704, 30212; + cvt.u32.u16 %r26707, %rs838; + shl.b32 %r26708, %r26707, 16; + and.b32 %r26709, %r26708, 16711680; + or.b32 %r26710, %r26706, %r26709; + cvt.u32.u16 %r26711, %rs839; + shl.b32 %r26712, %r26711, 24; + or.b32 %r26713, %r26710, %r26712; + cvt.u32.u16 %r26714, %rs840; + and.b32 %r26715, %r26714, 255; + cvt.u32.u16 %r26716, %rs841; + prmt.b32 %r26717, %r26716, %r26715, 30212; + cvt.u32.u16 %r26718, %rs842; + shl.b32 %r26719, %r26718, 16; + and.b32 %r26720, %r26719, 16711680; + or.b32 %r26721, %r26717, %r26720; + cvt.u32.u16 %r26722, %rs843; + shl.b32 %r26723, %r26722, 24; + or.b32 
%r26724, %r26721, %r26723; + cvt.u32.u16 %r26725, %rs844; + and.b32 %r26726, %r26725, 255; + cvt.u32.u16 %r26727, %rs845; + prmt.b32 %r26728, %r26727, %r26726, 30212; + cvt.u32.u16 %r26729, %rs846; + shl.b32 %r26730, %r26729, 16; + and.b32 %r26731, %r26730, 16711680; + or.b32 %r26732, %r26728, %r26731; + cvt.u32.u16 %r26733, %rs847; + shl.b32 %r26734, %r26733, 24; + or.b32 %r26735, %r26732, %r26734; + cvt.u32.u16 %r26736, %rs848; + and.b32 %r26737, %r26736, 255; + cvt.u32.u16 %r26738, %rs849; + prmt.b32 %r26739, %r26738, %r26737, 30212; + cvt.u32.u16 %r26740, %rs850; + shl.b32 %r26741, %r26740, 16; + and.b32 %r26742, %r26741, 16711680; + or.b32 %r26743, %r26739, %r26742; + cvt.u32.u16 %r26744, %rs851; + shl.b32 %r26745, %r26744, 24; + or.b32 %r26746, %r26743, %r26745; + cvt.u32.u16 %r26747, %rs852; + and.b32 %r26748, %r26747, 255; + cvt.u32.u16 %r26749, %rs853; + prmt.b32 %r26750, %r26749, %r26748, 30212; + cvt.u32.u16 %r26751, %rs854; + shl.b32 %r26752, %r26751, 16; + and.b32 %r26753, %r26752, 16711680; + or.b32 %r26754, %r26750, %r26753; + cvt.u32.u16 %r26755, %rs855; + shl.b32 %r26756, %r26755, 24; + or.b32 %r26757, %r26754, %r26756; + cvt.u32.u16 %r26758, %rs856; + and.b32 %r26759, %r26758, 255; + cvt.u32.u16 %r26760, %rs857; + prmt.b32 %r26761, %r26760, %r26759, 30212; + cvt.u32.u16 %r26762, %rs858; + shl.b32 %r26763, %r26762, 16; + and.b32 %r26764, %r26763, 16711680; + or.b32 %r26765, %r26761, %r26764; + cvt.u32.u16 %r26766, %rs859; + shl.b32 %r26767, %r26766, 24; + or.b32 %r26768, %r26765, %r26767; + cvt.u32.u16 %r26769, %rs860; + and.b32 %r26770, %r26769, 255; + cvt.u32.u16 %r26771, %rs861; + prmt.b32 %r26772, %r26771, %r26770, 30212; + cvt.u32.u16 %r26773, %rs862; + shl.b32 %r26774, %r26773, 16; + and.b32 %r26775, %r26774, 16711680; + or.b32 %r26776, %r26772, %r26775; + cvt.u32.u16 %r26777, %rs863; + shl.b32 %r26778, %r26777, 24; + or.b32 %r26779, %r26776, %r26778; + shr.u64 %rd1146, %rd1276, 32; + cvt.u32.u64 %r26780, %rd1146; + add.s32 %r26781, %r30972, %r30976; + add.s32 %r26782, %r26781, %r26614; + cvt.u32.u64 %r26783, %rd1276; + xor.b32 %r26784, %r26782, %r26783; + shf.l.wrap.b32 %r26785, %r26784, %r26784, 16; + add.s32 %r26786, %r26785, 1779033703; + xor.b32 %r26787, %r26786, %r30972; + shf.l.wrap.b32 %r26788, %r26787, %r26787, 20; + add.s32 %r26789, %r26625, %r26782; + add.s32 %r26790, %r26789, %r26788; + xor.b32 %r26791, %r26790, %r26785; + shf.l.wrap.b32 %r26792, %r26791, %r26791, 24; + add.s32 %r26793, %r26792, %r26786; + xor.b32 %r26794, %r26793, %r26788; + shf.l.wrap.b32 %r26795, %r26794, %r26794, 25; + add.s32 %r26796, %r30971, %r30975; + add.s32 %r26797, %r26796, %r26636; + xor.b32 %r26798, %r26797, %r26780; + shf.l.wrap.b32 %r26799, %r26798, %r26798, 16; + add.s32 %r26800, %r26799, -1150833019; + xor.b32 %r26801, %r26800, %r30971; + shf.l.wrap.b32 %r26802, %r26801, %r26801, 20; + add.s32 %r26803, %r26647, %r26797; + add.s32 %r26804, %r26803, %r26802; + xor.b32 %r26805, %r26804, %r26799; + shf.l.wrap.b32 %r26806, %r26805, %r26805, 24; + add.s32 %r26807, %r26806, %r26800; + xor.b32 %r26808, %r26807, %r26802; + shf.l.wrap.b32 %r26809, %r26808, %r26808, 25; + add.s32 %r26810, %r30970, %r30974; + add.s32 %r26811, %r26810, %r26658; + cvt.u32.u16 %r26812, %rs864; + and.b32 %r26813, %r26812, 255; + xor.b32 %r26814, %r26811, %r26813; + shr.u32 %r26815, %r26811, 16; + shl.b32 %r26816, %r26814, 16; + or.b32 %r26817, %r26816, %r26815; + add.s32 %r26818, %r26817, 1013904242; + xor.b32 %r26819, %r26818, %r30970; + shf.l.wrap.b32 %r26820, %r26819, %r26819, 20; + add.s32 %r26821, 
%r26669, %r26811; + add.s32 %r26822, %r26821, %r26820; + xor.b32 %r26823, %r26822, %r26817; + shf.l.wrap.b32 %r26824, %r26823, %r26823, 24; + add.s32 %r26825, %r26824, %r26818; + xor.b32 %r26826, %r26825, %r26820; + shf.l.wrap.b32 %r26827, %r26826, %r26826, 25; + add.s32 %r26828, %r30969, %r30973; + add.s32 %r26829, %r26828, %r26680; + cvt.u32.u16 %r26830, %rs734; + and.b32 %r26831, %r26830, 255; + xor.b32 %r26832, %r26829, %r26831; + shr.u32 %r26833, %r26829, 16; + shl.b32 %r26834, %r26832, 16; + or.b32 %r26835, %r26834, %r26833; + add.s32 %r26836, %r26835, -1521486534; + xor.b32 %r26837, %r26836, %r30969; + shf.l.wrap.b32 %r26838, %r26837, %r26837, 20; + add.s32 %r26839, %r26691, %r26829; + add.s32 %r26840, %r26839, %r26838; + xor.b32 %r26841, %r26840, %r26835; + shf.l.wrap.b32 %r26842, %r26841, %r26841, 24; + add.s32 %r26843, %r26842, %r26836; + xor.b32 %r26844, %r26843, %r26838; + shf.l.wrap.b32 %r26845, %r26844, %r26844, 25; + add.s32 %r26846, %r26809, %r26790; + add.s32 %r26847, %r26846, %r26702; + xor.b32 %r26848, %r26842, %r26847; + shf.l.wrap.b32 %r26849, %r26848, %r26848, 16; + add.s32 %r26850, %r26849, %r26825; + xor.b32 %r26851, %r26850, %r26809; + shf.l.wrap.b32 %r26852, %r26851, %r26851, 20; + add.s32 %r26853, %r26713, %r26847; + add.s32 %r26854, %r26853, %r26852; + xor.b32 %r26855, %r26854, %r26849; + shf.l.wrap.b32 %r26856, %r26855, %r26855, 24; + add.s32 %r26857, %r26856, %r26850; + xor.b32 %r26858, %r26857, %r26852; + shf.l.wrap.b32 %r26859, %r26858, %r26858, 25; + add.s32 %r26860, %r26724, %r26804; + add.s32 %r26861, %r26860, %r26827; + xor.b32 %r26862, %r26861, %r26792; + shf.l.wrap.b32 %r26863, %r26862, %r26862, 16; + add.s32 %r26864, %r26863, %r26843; + xor.b32 %r26865, %r26864, %r26827; + shf.l.wrap.b32 %r26866, %r26865, %r26865, 20; + add.s32 %r26867, %r26861, %r26735; + add.s32 %r26868, %r26867, %r26866; + xor.b32 %r26869, %r26868, %r26863; + shf.l.wrap.b32 %r26870, %r26869, %r26869, 24; + add.s32 %r26871, %r26870, %r26864; + xor.b32 %r26872, %r26871, %r26866; + shf.l.wrap.b32 %r26873, %r26872, %r26872, 25; + add.s32 %r26874, %r26822, %r26746; + add.s32 %r26875, %r26874, %r26845; + xor.b32 %r26876, %r26875, %r26806; + shf.l.wrap.b32 %r26877, %r26876, %r26876, 16; + add.s32 %r26878, %r26877, %r26793; + xor.b32 %r26879, %r26878, %r26845; + shf.l.wrap.b32 %r26880, %r26879, %r26879, 20; + add.s32 %r26881, %r26875, %r26757; + add.s32 %r26882, %r26881, %r26880; + xor.b32 %r26883, %r26882, %r26877; + shf.l.wrap.b32 %r26884, %r26883, %r26883, 24; + add.s32 %r26885, %r26884, %r26878; + xor.b32 %r26886, %r26885, %r26880; + shf.l.wrap.b32 %r26887, %r26886, %r26886, 25; + add.s32 %r26888, %r26768, %r26795; + add.s32 %r26889, %r26888, %r26840; + xor.b32 %r26890, %r26824, %r26889; + shf.l.wrap.b32 %r26891, %r26890, %r26890, 16; + add.s32 %r26892, %r26891, %r26807; + xor.b32 %r26893, %r26892, %r26795; + shf.l.wrap.b32 %r26894, %r26893, %r26893, 20; + add.s32 %r26895, %r26889, %r26779; + add.s32 %r26896, %r26895, %r26894; + xor.b32 %r26897, %r26896, %r26891; + shf.l.wrap.b32 %r26898, %r26897, %r26897, 24; + add.s32 %r26899, %r26898, %r26892; + xor.b32 %r26900, %r26899, %r26894; + shf.l.wrap.b32 %r26901, %r26900, %r26900, 25; + add.s32 %r26902, %r26854, %r26636; + add.s32 %r26903, %r26902, %r26901; + xor.b32 %r26904, %r26870, %r26903; + shf.l.wrap.b32 %r26905, %r26904, %r26904, 16; + add.s32 %r26906, %r26905, %r26885; + xor.b32 %r26907, %r26906, %r26901; + shf.l.wrap.b32 %r26908, %r26907, %r26907, 20; + add.s32 %r26909, %r26903, %r26680; + add.s32 %r26910, %r26909, %r26908; + 
xor.b32 %r26911, %r26910, %r26905; + shf.l.wrap.b32 %r26912, %r26911, %r26911, 24; + add.s32 %r26913, %r26912, %r26906; + xor.b32 %r26914, %r26913, %r26908; + shf.l.wrap.b32 %r26915, %r26914, %r26914, 25; + add.s32 %r26916, %r26868, %r26647; + add.s32 %r26917, %r26916, %r26859; + xor.b32 %r26918, %r26884, %r26917; + shf.l.wrap.b32 %r26919, %r26918, %r26918, 16; + add.s32 %r26920, %r26919, %r26899; + xor.b32 %r26921, %r26920, %r26859; + shf.l.wrap.b32 %r26922, %r26921, %r26921, 20; + add.s32 %r26923, %r26917, %r26724; + add.s32 %r26924, %r26923, %r26922; + xor.b32 %r26925, %r26924, %r26919; + shf.l.wrap.b32 %r26926, %r26925, %r26925, 24; + add.s32 %r26927, %r26926, %r26920; + xor.b32 %r26928, %r26927, %r26922; + shf.l.wrap.b32 %r26929, %r26928, %r26928, 25; + add.s32 %r26930, %r26882, %r26691; + add.s32 %r26931, %r26930, %r26873; + xor.b32 %r26932, %r26931, %r26898; + shf.l.wrap.b32 %r26933, %r26932, %r26932, 16; + add.s32 %r26934, %r26933, %r26857; + xor.b32 %r26935, %r26934, %r26873; + shf.l.wrap.b32 %r26936, %r26935, %r26935, 20; + add.s32 %r26937, %r26931, %r26614; + add.s32 %r26938, %r26937, %r26936; + xor.b32 %r26939, %r26938, %r26933; + shf.l.wrap.b32 %r26940, %r26939, %r26939, 24; + add.s32 %r26941, %r26940, %r26934; + xor.b32 %r26942, %r26941, %r26936; + shf.l.wrap.b32 %r26943, %r26942, %r26942, 25; + add.s32 %r26944, %r26896, %r26658; + add.s32 %r26945, %r26944, %r26887; + xor.b32 %r26946, %r26945, %r26856; + shf.l.wrap.b32 %r26947, %r26946, %r26946, 16; + add.s32 %r26948, %r26947, %r26871; + xor.b32 %r26949, %r26948, %r26887; + shf.l.wrap.b32 %r26950, %r26949, %r26949, 20; + add.s32 %r26951, %r26945, %r26757; + add.s32 %r26952, %r26951, %r26950; + xor.b32 %r26953, %r26952, %r26947; + shf.l.wrap.b32 %r26954, %r26953, %r26953, 24; + add.s32 %r26955, %r26954, %r26948; + xor.b32 %r26956, %r26955, %r26950; + shf.l.wrap.b32 %r26957, %r26956, %r26956, 25; + add.s32 %r26958, %r26910, %r26625; + add.s32 %r26959, %r26958, %r26929; + xor.b32 %r26960, %r26954, %r26959; + shf.l.wrap.b32 %r26961, %r26960, %r26960, 16; + add.s32 %r26962, %r26961, %r26941; + xor.b32 %r26963, %r26962, %r26929; + shf.l.wrap.b32 %r26964, %r26963, %r26963, 20; + add.s32 %r26965, %r26959, %r26735; + add.s32 %r26966, %r26965, %r26964; + xor.b32 %r26967, %r26966, %r26961; + shf.l.wrap.b32 %r26968, %r26967, %r26967, 24; + add.s32 %r26969, %r26968, %r26962; + xor.b32 %r26970, %r26969, %r26964; + shf.l.wrap.b32 %r26971, %r26970, %r26970, 25; + add.s32 %r26972, %r26924, %r26746; + add.s32 %r26973, %r26972, %r26943; + xor.b32 %r26974, %r26973, %r26912; + shf.l.wrap.b32 %r26975, %r26974, %r26974, 16; + add.s32 %r26976, %r26975, %r26955; + xor.b32 %r26977, %r26976, %r26943; + shf.l.wrap.b32 %r26978, %r26977, %r26977, 20; + add.s32 %r26979, %r26973, %r26669; + add.s32 %r26980, %r26979, %r26978; + xor.b32 %r26981, %r26980, %r26975; + shf.l.wrap.b32 %r26982, %r26981, %r26981, 24; + add.s32 %r26983, %r26982, %r26976; + xor.b32 %r26984, %r26983, %r26978; + shf.l.wrap.b32 %r26985, %r26984, %r26984, 25; + add.s32 %r26986, %r26938, %r26713; + add.s32 %r26987, %r26986, %r26957; + xor.b32 %r26988, %r26987, %r26926; + shf.l.wrap.b32 %r26989, %r26988, %r26988, 16; + add.s32 %r26990, %r26989, %r26913; + xor.b32 %r26991, %r26990, %r26957; + shf.l.wrap.b32 %r26992, %r26991, %r26991, 20; + add.s32 %r26993, %r26987, %r26768; + add.s32 %r26994, %r26993, %r26992; + xor.b32 %r26995, %r26994, %r26989; + shf.l.wrap.b32 %r26996, %r26995, %r26995, 24; + add.s32 %r26997, %r26996, %r26990; + xor.b32 %r26998, %r26997, %r26992; + shf.l.wrap.b32 %r26999, 
%r26998, %r26998, 25; + add.s32 %r27000, %r26915, %r26779; + add.s32 %r27001, %r27000, %r26952; + xor.b32 %r27002, %r26940, %r27001; + shf.l.wrap.b32 %r27003, %r27002, %r27002, 16; + add.s32 %r27004, %r27003, %r26927; + xor.b32 %r27005, %r27004, %r26915; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 20; + add.s32 %r27007, %r27001, %r26702; + add.s32 %r27008, %r27007, %r27006; + xor.b32 %r27009, %r27008, %r27003; + shf.l.wrap.b32 %r27010, %r27009, %r27009, 24; + add.s32 %r27011, %r27010, %r27004; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 25; + add.s32 %r27014, %r26966, %r26647; + add.s32 %r27015, %r27014, %r27013; + xor.b32 %r27016, %r26982, %r27015; + shf.l.wrap.b32 %r27017, %r27016, %r27016, 16; + add.s32 %r27018, %r27017, %r26997; + xor.b32 %r27019, %r27018, %r27013; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 20; + add.s32 %r27021, %r27015, %r26658; + add.s32 %r27022, %r27021, %r27020; + xor.b32 %r27023, %r27022, %r27017; + shf.l.wrap.b32 %r27024, %r27023, %r27023, 24; + add.s32 %r27025, %r27024, %r27018; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 25; + add.s32 %r27028, %r26980, %r26724; + add.s32 %r27029, %r27028, %r26971; + xor.b32 %r27030, %r26996, %r27029; + shf.l.wrap.b32 %r27031, %r27030, %r27030, 16; + add.s32 %r27032, %r27031, %r27011; + xor.b32 %r27033, %r27032, %r26971; + shf.l.wrap.b32 %r27034, %r27033, %r27033, 20; + add.s32 %r27035, %r27029, %r26746; + add.s32 %r27036, %r27035, %r27034; + xor.b32 %r27037, %r27036, %r27031; + shf.l.wrap.b32 %r27038, %r27037, %r27037, 24; + add.s32 %r27039, %r27038, %r27032; + xor.b32 %r27040, %r27039, %r27034; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 25; + add.s32 %r27042, %r26994, %r26757; + add.s32 %r27043, %r27042, %r26985; + xor.b32 %r27044, %r27043, %r27010; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 16; + add.s32 %r27046, %r27045, %r26969; + xor.b32 %r27047, %r27046, %r26985; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 20; + add.s32 %r27049, %r27043, %r26636; + add.s32 %r27050, %r27049, %r27048; + xor.b32 %r27051, %r27050, %r27045; + shf.l.wrap.b32 %r27052, %r27051, %r27051, 24; + add.s32 %r27053, %r27052, %r27046; + xor.b32 %r27054, %r27053, %r27048; + shf.l.wrap.b32 %r27055, %r27054, %r27054, 25; + add.s32 %r27056, %r27008, %r26691; + add.s32 %r27057, %r27056, %r26999; + xor.b32 %r27058, %r27057, %r26968; + shf.l.wrap.b32 %r27059, %r27058, %r27058, 16; + add.s32 %r27060, %r27059, %r26983; + xor.b32 %r27061, %r27060, %r26999; + shf.l.wrap.b32 %r27062, %r27061, %r27061, 20; + add.s32 %r27063, %r27057, %r26768; + add.s32 %r27064, %r27063, %r27062; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 24; + add.s32 %r27067, %r27066, %r27060; + xor.b32 %r27068, %r27067, %r27062; + shf.l.wrap.b32 %r27069, %r27068, %r27068, 25; + add.s32 %r27070, %r27022, %r26680; + add.s32 %r27071, %r27070, %r27041; + xor.b32 %r27072, %r27066, %r27071; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 16; + add.s32 %r27074, %r27073, %r27053; + xor.b32 %r27075, %r27074, %r27041; + shf.l.wrap.b32 %r27076, %r27075, %r27075, 20; + add.s32 %r27077, %r27071, %r26669; + add.s32 %r27078, %r27077, %r27076; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 24; + add.s32 %r27081, %r27080, %r27074; + xor.b32 %r27082, %r27081, %r27076; + shf.l.wrap.b32 %r27083, %r27082, %r27082, 25; + add.s32 %r27084, %r27036, %r26713; + add.s32 %r27085, %r27084, %r27055; + xor.b32 %r27086, %r27085, %r27024; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 16; + 
add.s32 %r27088, %r27087, %r27067; + xor.b32 %r27089, %r27088, %r27055; + shf.l.wrap.b32 %r27090, %r27089, %r27089, 20; + add.s32 %r27091, %r27085, %r26614; + add.s32 %r27092, %r27091, %r27090; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 24; + add.s32 %r27095, %r27094, %r27088; + xor.b32 %r27096, %r27095, %r27090; + shf.l.wrap.b32 %r27097, %r27096, %r27096, 25; + add.s32 %r27098, %r27050, %r26735; + add.s32 %r27099, %r27098, %r27069; + xor.b32 %r27100, %r27099, %r27038; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 16; + add.s32 %r27102, %r27101, %r27025; + xor.b32 %r27103, %r27102, %r27069; + shf.l.wrap.b32 %r27104, %r27103, %r27103, 20; + add.s32 %r27105, %r27099, %r26779; + add.s32 %r27106, %r27105, %r27104; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 24; + add.s32 %r27109, %r27108, %r27102; + xor.b32 %r27110, %r27109, %r27104; + shf.l.wrap.b32 %r27111, %r27110, %r27110, 25; + add.s32 %r27112, %r27027, %r26702; + add.s32 %r27113, %r27112, %r27064; + xor.b32 %r27114, %r27052, %r27113; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 16; + add.s32 %r27116, %r27115, %r27039; + xor.b32 %r27117, %r27116, %r27027; + shf.l.wrap.b32 %r27118, %r27117, %r27117, 20; + add.s32 %r27119, %r27113, %r26625; + add.s32 %r27120, %r27119, %r27118; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 24; + add.s32 %r27123, %r27122, %r27116; + xor.b32 %r27124, %r27123, %r27118; + shf.l.wrap.b32 %r27125, %r27124, %r27124, 25; + add.s32 %r27126, %r27078, %r26724; + add.s32 %r27127, %r27126, %r27125; + xor.b32 %r27128, %r27094, %r27127; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 16; + add.s32 %r27130, %r27129, %r27109; + xor.b32 %r27131, %r27130, %r27125; + shf.l.wrap.b32 %r27132, %r27131, %r27131, 20; + add.s32 %r27133, %r27127, %r26691; + add.s32 %r27134, %r27133, %r27132; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 24; + add.s32 %r27137, %r27136, %r27130; + xor.b32 %r27138, %r27137, %r27132; + shf.l.wrap.b32 %r27139, %r27138, %r27138, 25; + add.s32 %r27140, %r27092, %r26746; + add.s32 %r27141, %r27140, %r27083; + xor.b32 %r27142, %r27108, %r27141; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 16; + add.s32 %r27144, %r27143, %r27123; + xor.b32 %r27145, %r27144, %r27083; + shf.l.wrap.b32 %r27146, %r27145, %r27145, 20; + add.s32 %r27147, %r27141, %r26713; + add.s32 %r27148, %r27147, %r27146; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 24; + add.s32 %r27151, %r27150, %r27144; + xor.b32 %r27152, %r27151, %r27146; + shf.l.wrap.b32 %r27153, %r27152, %r27152, 25; + add.s32 %r27154, %r27106, %r26768; + add.s32 %r27155, %r27154, %r27097; + xor.b32 %r27156, %r27155, %r27122; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 16; + add.s32 %r27158, %r27157, %r27081; + xor.b32 %r27159, %r27158, %r27097; + shf.l.wrap.b32 %r27160, %r27159, %r27159, 20; + add.s32 %r27161, %r27155, %r26647; + add.s32 %r27162, %r27161, %r27160; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 24; + add.s32 %r27165, %r27164, %r27158; + xor.b32 %r27166, %r27165, %r27160; + shf.l.wrap.b32 %r27167, %r27166, %r27166, 25; + add.s32 %r27168, %r27120, %r26757; + add.s32 %r27169, %r27168, %r27111; + xor.b32 %r27170, %r27169, %r27080; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 16; + add.s32 %r27172, %r27171, %r27095; + xor.b32 %r27173, %r27172, %r27111; + shf.l.wrap.b32 %r27174, %r27173, %r27173, 20; + add.s32 %r27175, %r27169, %r26779; + add.s32 %r27176, 
%r27175, %r27174; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 24; + add.s32 %r27179, %r27178, %r27172; + xor.b32 %r27180, %r27179, %r27174; + shf.l.wrap.b32 %r27181, %r27180, %r27180, 25; + add.s32 %r27182, %r27134, %r26658; + add.s32 %r27183, %r27182, %r27153; + xor.b32 %r27184, %r27178, %r27183; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 16; + add.s32 %r27186, %r27185, %r27165; + xor.b32 %r27187, %r27186, %r27153; + shf.l.wrap.b32 %r27188, %r27187, %r27187, 20; + add.s32 %r27189, %r27183, %r26614; + add.s32 %r27190, %r27189, %r27188; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 24; + add.s32 %r27193, %r27192, %r27186; + xor.b32 %r27194, %r27193, %r27188; + shf.l.wrap.b32 %r27195, %r27194, %r27194, 25; + add.s32 %r27196, %r27148, %r26735; + add.s32 %r27197, %r27196, %r27167; + xor.b32 %r27198, %r27197, %r27136; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 16; + add.s32 %r27200, %r27199, %r27179; + xor.b32 %r27201, %r27200, %r27167; + shf.l.wrap.b32 %r27202, %r27201, %r27201, 20; + add.s32 %r27203, %r27197, %r26636; + add.s32 %r27204, %r27203, %r27202; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 24; + add.s32 %r27207, %r27206, %r27200; + xor.b32 %r27208, %r27207, %r27202; + shf.l.wrap.b32 %r27209, %r27208, %r27208, 25; + add.s32 %r27210, %r27162, %r26669; + add.s32 %r27211, %r27210, %r27181; + xor.b32 %r27212, %r27211, %r27150; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 16; + add.s32 %r27214, %r27213, %r27137; + xor.b32 %r27215, %r27214, %r27181; + shf.l.wrap.b32 %r27216, %r27215, %r27215, 20; + add.s32 %r27217, %r27211, %r26702; + add.s32 %r27218, %r27217, %r27216; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 24; + add.s32 %r27221, %r27220, %r27214; + xor.b32 %r27222, %r27221, %r27216; + shf.l.wrap.b32 %r27223, %r27222, %r27222, 25; + add.s32 %r27224, %r27139, %r26625; + add.s32 %r27225, %r27224, %r27176; + xor.b32 %r27226, %r27164, %r27225; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 16; + add.s32 %r27228, %r27227, %r27151; + xor.b32 %r27229, %r27228, %r27139; + shf.l.wrap.b32 %r27230, %r27229, %r27229, 20; + add.s32 %r27231, %r27225, %r26680; + add.s32 %r27232, %r27231, %r27230; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 24; + add.s32 %r27235, %r27234, %r27228; + xor.b32 %r27236, %r27235, %r27230; + shf.l.wrap.b32 %r27237, %r27236, %r27236, 25; + add.s32 %r27238, %r27190, %r26746; + add.s32 %r27239, %r27238, %r27237; + xor.b32 %r27240, %r27206, %r27239; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 16; + add.s32 %r27242, %r27241, %r27221; + xor.b32 %r27243, %r27242, %r27237; + shf.l.wrap.b32 %r27244, %r27243, %r27243, 20; + add.s32 %r27245, %r27239, %r26757; + add.s32 %r27246, %r27245, %r27244; + xor.b32 %r27247, %r27246, %r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 24; + add.s32 %r27249, %r27248, %r27242; + xor.b32 %r27250, %r27249, %r27244; + shf.l.wrap.b32 %r27251, %r27250, %r27250, 25; + add.s32 %r27252, %r27204, %r26713; + add.s32 %r27253, %r27252, %r27195; + xor.b32 %r27254, %r27220, %r27253; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 16; + add.s32 %r27256, %r27255, %r27235; + xor.b32 %r27257, %r27256, %r27195; + shf.l.wrap.b32 %r27258, %r27257, %r27257, 20; + add.s32 %r27259, %r27253, %r26735; + add.s32 %r27260, %r27259, %r27258; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 24; + add.s32 %r27263, %r27262, %r27256; + xor.b32 %r27264, %r27263, %r27258; + 
shf.l.wrap.b32 %r27265, %r27264, %r27264, 25; + add.s32 %r27266, %r27218, %r26779; + add.s32 %r27267, %r27266, %r27209; + xor.b32 %r27268, %r27267, %r27234; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 16; + add.s32 %r27270, %r27269, %r27193; + xor.b32 %r27271, %r27270, %r27209; + shf.l.wrap.b32 %r27272, %r27271, %r27271, 20; + add.s32 %r27273, %r27267, %r26724; + add.s32 %r27274, %r27273, %r27272; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 24; + add.s32 %r27277, %r27276, %r27270; + xor.b32 %r27278, %r27277, %r27272; + shf.l.wrap.b32 %r27279, %r27278, %r27278, 25; + add.s32 %r27280, %r27232, %r26768; + add.s32 %r27281, %r27280, %r27223; + xor.b32 %r27282, %r27281, %r27192; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 16; + add.s32 %r27284, %r27283, %r27207; + xor.b32 %r27285, %r27284, %r27223; + shf.l.wrap.b32 %r27286, %r27285, %r27285, 20; + add.s32 %r27287, %r27281, %r26702; + add.s32 %r27288, %r27287, %r27286; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 24; + add.s32 %r27291, %r27290, %r27284; + xor.b32 %r27292, %r27291, %r27286; + shf.l.wrap.b32 %r27293, %r27292, %r27292, 25; + add.s32 %r27294, %r27246, %r26691; + add.s32 %r27295, %r27294, %r27265; + xor.b32 %r27296, %r27290, %r27295; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 16; + add.s32 %r27298, %r27297, %r27277; + xor.b32 %r27299, %r27298, %r27265; + shf.l.wrap.b32 %r27300, %r27299, %r27299, 20; + add.s32 %r27301, %r27295, %r26636; + add.s32 %r27302, %r27301, %r27300; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 24; + add.s32 %r27305, %r27304, %r27298; + xor.b32 %r27306, %r27305, %r27300; + shf.l.wrap.b32 %r27307, %r27306, %r27306, 25; + add.s32 %r27308, %r27260, %r26669; + add.s32 %r27309, %r27308, %r27279; + xor.b32 %r27310, %r27309, %r27248; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 16; + add.s32 %r27312, %r27311, %r27291; + xor.b32 %r27313, %r27312, %r27279; + shf.l.wrap.b32 %r27314, %r27313, %r27313, 20; + add.s32 %r27315, %r27309, %r26647; + add.s32 %r27316, %r27315, %r27314; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 24; + add.s32 %r27319, %r27318, %r27312; + xor.b32 %r27320, %r27319, %r27314; + shf.l.wrap.b32 %r27321, %r27320, %r27320, 25; + add.s32 %r27322, %r27274, %r26614; + add.s32 %r27323, %r27322, %r27293; + xor.b32 %r27324, %r27323, %r27262; + shf.l.wrap.b32 %r27325, %r27324, %r27324, 16; + add.s32 %r27326, %r27325, %r27249; + xor.b32 %r27327, %r27326, %r27293; + shf.l.wrap.b32 %r27328, %r27327, %r27327, 20; + add.s32 %r27329, %r27323, %r26625; + add.s32 %r27330, %r27329, %r27328; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 24; + add.s32 %r27333, %r27332, %r27326; + xor.b32 %r27334, %r27333, %r27328; + shf.l.wrap.b32 %r27335, %r27334, %r27334, 25; + add.s32 %r27336, %r27251, %r26680; + add.s32 %r27337, %r27336, %r27288; + xor.b32 %r27338, %r27276, %r27337; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 16; + add.s32 %r27340, %r27339, %r27263; + xor.b32 %r27341, %r27340, %r27251; + shf.l.wrap.b32 %r27342, %r27341, %r27341, 20; + add.s32 %r27343, %r27337, %r26658; + add.s32 %r27344, %r27343, %r27342; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 24; + add.s32 %r27347, %r27346, %r27340; + xor.b32 %r27348, %r27347, %r27342; + shf.l.wrap.b32 %r27349, %r27348, %r27348, 25; + add.s32 %r27350, %r27302, %r26713; + add.s32 %r27351, %r27350, %r27349; + xor.b32 %r27352, %r27318, %r27351; + shf.l.wrap.b32 %r27353, 
%r27352, %r27352, 16; + add.s32 %r27354, %r27353, %r27333; + xor.b32 %r27355, %r27354, %r27349; + shf.l.wrap.b32 %r27356, %r27355, %r27355, 20; + add.s32 %r27357, %r27351, %r26768; + add.s32 %r27358, %r27357, %r27356; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 24; + add.s32 %r27361, %r27360, %r27354; + xor.b32 %r27362, %r27361, %r27356; + shf.l.wrap.b32 %r27363, %r27362, %r27362, 25; + add.s32 %r27364, %r27316, %r26735; + add.s32 %r27365, %r27364, %r27307; + xor.b32 %r27366, %r27332, %r27365; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 16; + add.s32 %r27368, %r27367, %r27347; + xor.b32 %r27369, %r27368, %r27307; + shf.l.wrap.b32 %r27370, %r27369, %r27369, 20; + add.s32 %r27371, %r27365, %r26669; + add.s32 %r27372, %r27371, %r27370; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 24; + add.s32 %r27375, %r27374, %r27368; + xor.b32 %r27376, %r27375, %r27370; + shf.l.wrap.b32 %r27377, %r27376, %r27376, 25; + add.s32 %r27378, %r27330, %r26702; + add.s32 %r27379, %r27378, %r27321; + xor.b32 %r27380, %r27379, %r27346; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 16; + add.s32 %r27382, %r27381, %r27305; + xor.b32 %r27383, %r27382, %r27321; + shf.l.wrap.b32 %r27384, %r27383, %r27383, 20; + add.s32 %r27385, %r27379, %r26746; + add.s32 %r27386, %r27385, %r27384; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 24; + add.s32 %r27389, %r27388, %r27382; + xor.b32 %r27390, %r27389, %r27384; + shf.l.wrap.b32 %r27391, %r27390, %r27390, 25; + add.s32 %r27392, %r27344, %r26779; + add.s32 %r27393, %r27392, %r27335; + xor.b32 %r27394, %r27393, %r27304; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 16; + add.s32 %r27396, %r27395, %r27319; + xor.b32 %r27397, %r27396, %r27335; + shf.l.wrap.b32 %r27398, %r27397, %r27397, 20; + add.s32 %r27399, %r27393, %r26625; + add.s32 %r27400, %r27399, %r27398; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 24; + add.s32 %r27403, %r27402, %r27396; + xor.b32 %r27404, %r27403, %r27398; + shf.l.wrap.b32 %r27405, %r27404, %r27404, 25; + add.s32 %r27406, %r27358, %r26757; + add.s32 %r27407, %r27406, %r27377; + xor.b32 %r27408, %r27402, %r27407; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 16; + add.s32 %r27410, %r27409, %r27389; + xor.b32 %r27411, %r27410, %r27377; + shf.l.wrap.b32 %r27412, %r27411, %r27411, 20; + add.s32 %r27413, %r27407, %r26647; + add.s32 %r27414, %r27413, %r27412; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 24; + add.s32 %r27417, %r27416, %r27410; + xor.b32 %r27418, %r27417, %r27412; + shf.l.wrap.b32 %r27419, %r27418, %r27418, 25; + add.s32 %r27420, %r27372, %r26614; + add.s32 %r27421, %r27420, %r27391; + xor.b32 %r27422, %r27421, %r27360; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 16; + add.s32 %r27424, %r27423, %r27403; + xor.b32 %r27425, %r27424, %r27391; + shf.l.wrap.b32 %r27426, %r27425, %r27425, 20; + add.s32 %r27427, %r27421, %r26724; + add.s32 %r27428, %r27427, %r27426; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 24; + add.s32 %r27431, %r27430, %r27424; + xor.b32 %r27432, %r27431, %r27426; + shf.l.wrap.b32 %r27433, %r27432, %r27432, 25; + add.s32 %r27434, %r27386, %r26636; + add.s32 %r27435, %r27434, %r27405; + xor.b32 %r27436, %r27435, %r27374; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 16; + add.s32 %r27438, %r27437, %r27361; + xor.b32 %r27439, %r27438, %r27405; + shf.l.wrap.b32 %r27440, %r27439, %r27439, 20; + add.s32 %r27441, %r27435, %r26680; + 
add.s32 %r27442, %r27441, %r27440; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 24; + add.s32 %r27445, %r27444, %r27438; + xor.b32 %r27446, %r27445, %r27440; + shf.l.wrap.b32 %r27447, %r27446, %r27446, 25; + add.s32 %r27448, %r27363, %r26658; + add.s32 %r27449, %r27448, %r27400; + xor.b32 %r27450, %r27388, %r27449; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 16; + add.s32 %r27452, %r27451, %r27375; + xor.b32 %r27453, %r27452, %r27363; + shf.l.wrap.b32 %r27454, %r27453, %r27453, 20; + add.s32 %r27455, %r27449, %r26691; + add.s32 %r27456, %r27455, %r27454; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 24; + add.s32 %r27459, %r27458, %r27452; + xor.b32 %r27460, %r27459, %r27454; + shf.l.wrap.b32 %r27461, %r27460, %r27460, 25; + add.s32 %r27462, %r27414, %r26735; + add.s32 %r27463, %r27462, %r27461; + xor.b32 %r27464, %r27430, %r27463; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 16; + add.s32 %r27466, %r27465, %r27445; + xor.b32 %r27467, %r27466, %r27461; + shf.l.wrap.b32 %r27468, %r27467, %r27467, 20; + add.s32 %r27469, %r27463, %r26779; + add.s32 %r27470, %r27469, %r27468; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 24; + add.s32 %r27473, %r27472, %r27466; + xor.b32 %r27474, %r27473, %r27468; + shf.l.wrap.b32 %r27475, %r27474, %r27474, 25; + add.s32 %r27476, %r27428, %r26669; + add.s32 %r27477, %r27476, %r27419; + xor.b32 %r27478, %r27444, %r27477; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 16; + add.s32 %r27480, %r27479, %r27459; + xor.b32 %r27481, %r27480, %r27419; + shf.l.wrap.b32 %r27482, %r27481, %r27481, 20; + add.s32 %r27483, %r27477, %r26614; + add.s32 %r27484, %r27483, %r27482; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 24; + add.s32 %r27487, %r27486, %r27480; + xor.b32 %r27488, %r27487, %r27482; + shf.l.wrap.b32 %r27489, %r27488, %r27488, 25; + add.s32 %r27490, %r27442, %r26625; + add.s32 %r27491, %r27490, %r27433; + xor.b32 %r27492, %r27491, %r27458; + shf.l.wrap.b32 %r27493, %r27492, %r27492, 16; + add.s32 %r27494, %r27493, %r27417; + xor.b32 %r27495, %r27494, %r27433; + shf.l.wrap.b32 %r27496, %r27495, %r27495, 20; + add.s32 %r27497, %r27491, %r26713; + add.s32 %r27498, %r27497, %r27496; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 24; + add.s32 %r27501, %r27500, %r27494; + xor.b32 %r27502, %r27501, %r27496; + shf.l.wrap.b32 %r27503, %r27502, %r27502, 25; + add.s32 %r27504, %r27456, %r26702; + add.s32 %r27505, %r27504, %r27447; + xor.b32 %r27506, %r27505, %r27416; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 16; + add.s32 %r27508, %r27507, %r27431; + xor.b32 %r27509, %r27508, %r27447; + shf.l.wrap.b32 %r27510, %r27509, %r27509, 20; + add.s32 %r27511, %r27505, %r26680; + add.s32 %r27512, %r27511, %r27510; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 24; + add.s32 %r27515, %r27514, %r27508; + xor.b32 %r27516, %r27515, %r27510; + shf.l.wrap.b32 %r27517, %r27516, %r27516, 25; + add.s32 %r27518, %r27470, %r26768; + add.s32 %r27519, %r27518, %r27489; + xor.b32 %r27520, %r27514, %r27519; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 16; + add.s32 %r27522, %r27521, %r27501; + xor.b32 %r27523, %r27522, %r27489; + shf.l.wrap.b32 %r27524, %r27523, %r27523, 20; + add.s32 %r27525, %r27519, %r26724; + add.s32 %r27526, %r27525, %r27524; + xor.b32 %r27527, %r27526, %r27521; + shr.u32 %r27528, %r27527, 8; + shf.l.wrap.b32 %r27529, %r27527, %r27527, 24; + add.s32 %r27530, %r27529, 
%r27522; + xor.b32 %r27531, %r27530, %r27524; + shr.u32 %r27532, %r27531, 7; + shf.l.wrap.b32 %r27533, %r27531, %r27531, 25; + add.s32 %r27534, %r27484, %r26636; + add.s32 %r27535, %r27534, %r27503; + xor.b32 %r27536, %r27535, %r27472; + shf.l.wrap.b32 %r27537, %r27536, %r27536, 16; + add.s32 %r27538, %r27537, %r27515; + xor.b32 %r27539, %r27538, %r27503; + shf.l.wrap.b32 %r27540, %r27539, %r27539, 20; + add.s32 %r27541, %r27535, %r26746; + add.s32 %r27542, %r27541, %r27540; + xor.b32 %r27543, %r27542, %r27537; + shr.u32 %r27544, %r27543, 8; + shf.l.wrap.b32 %r27545, %r27543, %r27543, 24; + add.s32 %r27546, %r27545, %r27538; + xor.b32 %r27547, %r27546, %r27540; + shr.u32 %r27548, %r27547, 7; + shf.l.wrap.b32 %r27549, %r27547, %r27547, 25; + add.s32 %r27550, %r27498, %r26647; + add.s32 %r27551, %r27550, %r27517; + xor.b32 %r27552, %r27551, %r27486; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 16; + add.s32 %r27554, %r27553, %r27473; + xor.b32 %r27555, %r27554, %r27517; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 20; + add.s32 %r27557, %r27551, %r26658; + add.s32 %r27558, %r27557, %r27556; + xor.b32 %r27559, %r27558, %r27553; + shr.u32 %r27560, %r27559, 8; + shf.l.wrap.b32 %r27561, %r27559, %r27559, 24; + add.s32 %r27562, %r27561, %r27554; + xor.b32 %r27563, %r27562, %r27556; + shr.u32 %r27564, %r27563, 7; + shf.l.wrap.b32 %r27565, %r27563, %r27563, 25; + add.s32 %r27566, %r27475, %r26691; + add.s32 %r27567, %r27566, %r27512; + xor.b32 %r27568, %r27500, %r27567; + shf.l.wrap.b32 %r27569, %r27568, %r27568, 16; + add.s32 %r27570, %r27569, %r27487; + xor.b32 %r27571, %r27570, %r27475; + shf.l.wrap.b32 %r27572, %r27571, %r27571, 20; + add.s32 %r27573, %r27567, %r26757; + add.s32 %r27574, %r27573, %r27572; + xor.b32 %r27575, %r27574, %r27569; + shr.u32 %r27576, %r27575, 8; + shf.l.wrap.b32 %r27577, %r27575, %r27575, 24; + add.s32 %r27578, %r27577, %r27570; + xor.b32 %r27579, %r27578, %r27572; + shr.u32 %r27580, %r27579, 7; + shf.l.wrap.b32 %r27581, %r27579, %r27579, 25; + xor.b32 %r27582, %r27562, %r27526; + xor.b32 %r27583, %r27542, %r27578; + xor.b32 %r27584, %r27558, %r27530; + xor.b32 %r27585, %r27546, %r27574; + xor.b32 %r27586, %r27545, %r27581; + xor.b32 %r27587, %r27561, %r27533; + xor.b32 %r27588, %r27549, %r27577; + xor.b32 %r27589, %r27565, %r27529; + cvt.u16.u32 %rs553, %r27526; + cvt.u16.u32 %rs554, %r27562; + xor.b16 %rs832, %rs554, %rs553; + shr.u32 %r27590, %r27582, 8; + cvt.u16.u32 %rs833, %r27590; + shr.u32 %r27591, %r27582, 16; + cvt.u16.u32 %rs834, %r27591; + shr.u32 %r27592, %r27582, 24; + cvt.u16.u32 %rs835, %r27592; + cvt.u16.u32 %rs555, %r27578; + cvt.u16.u32 %rs556, %r27542; + xor.b16 %rs836, %rs556, %rs555; + shr.u32 %r27593, %r27583, 8; + cvt.u16.u32 %rs837, %r27593; + shr.u32 %r27594, %r27583, 16; + cvt.u16.u32 %rs838, %r27594; + shr.u32 %r27595, %r27583, 24; + cvt.u16.u32 %rs839, %r27595; + cvt.u16.u32 %rs557, %r27530; + cvt.u16.u32 %rs558, %r27558; + xor.b16 %rs840, %rs558, %rs557; + shr.u32 %r27596, %r27584, 8; + cvt.u16.u32 %rs841, %r27596; + shr.u32 %r27597, %r27584, 16; + cvt.u16.u32 %rs842, %r27597; + shr.u32 %r27598, %r27584, 24; + cvt.u16.u32 %rs843, %r27598; + cvt.u16.u32 %rs559, %r27546; + cvt.u16.u32 %rs560, %r27574; + xor.b16 %rs844, %rs559, %rs560; + shr.u32 %r27599, %r27585, 8; + cvt.u16.u32 %rs845, %r27599; + shr.u32 %r27600, %r27585, 16; + cvt.u16.u32 %rs846, %r27600; + shr.u32 %r27601, %r27585, 24; + cvt.u16.u32 %rs847, %r27601; + cvt.u16.u32 %rs561, %r27580; + cvt.u16.u32 %rs562, %r27544; + xor.b16 %rs848, %rs562, %rs561; + shr.u32 %r27602, %r27586, 
8; + cvt.u16.u32 %rs849, %r27602; + shr.u32 %r27603, %r27586, 16; + cvt.u16.u32 %rs850, %r27603; + shr.u32 %r27604, %r27586, 24; + cvt.u16.u32 %rs851, %r27604; + cvt.u16.u32 %rs563, %r27532; + cvt.u16.u32 %rs564, %r27560; + xor.b16 %rs852, %rs564, %rs563; + shr.u32 %r27605, %r27587, 8; + cvt.u16.u32 %rs853, %r27605; + shr.u32 %r27606, %r27587, 16; + cvt.u16.u32 %rs854, %r27606; + shr.u32 %r27607, %r27587, 24; + cvt.u16.u32 %rs855, %r27607; + cvt.u16.u32 %rs565, %r27548; + cvt.u16.u32 %rs566, %r27576; + xor.b16 %rs856, %rs565, %rs566; + shr.u32 %r27608, %r27588, 8; + cvt.u16.u32 %rs857, %r27608; + shr.u32 %r27609, %r27588, 16; + cvt.u16.u32 %rs858, %r27609; + shr.u32 %r27610, %r27588, 24; + cvt.u16.u32 %rs859, %r27610; + cvt.u16.u32 %rs567, %r27528; + cvt.u16.u32 %rs568, %r27564; + xor.b16 %rs860, %rs568, %rs567; + shr.u32 %r27611, %r27589, 8; + cvt.u16.u32 %rs861, %r27611; + shr.u32 %r27612, %r27589, 16; + cvt.u16.u32 %rs862, %r27612; + shr.u32 %r27613, %r27589, 24; + cvt.u16.u32 %rs863, %r27613; + setp.ne.s64 %p57, %rd1275, 0; + mov.u16 %rs864, 64; + mov.u16 %rs734, %rs865; + mov.u16 %rs768, %rs831; + mov.u16 %rs769, %rs830; + mov.u16 %rs770, %rs829; + mov.u16 %rs771, %rs828; + mov.u16 %rs772, %rs827; + mov.u16 %rs773, %rs826; + mov.u16 %rs774, %rs825; + mov.u16 %rs775, %rs824; + mov.u16 %rs776, %rs823; + mov.u16 %rs777, %rs822; + mov.u16 %rs778, %rs821; + mov.u16 %rs779, %rs820; + mov.u16 %rs780, %rs819; + mov.u16 %rs781, %rs818; + mov.u16 %rs782, %rs817; + mov.u16 %rs783, %rs816; + mov.u16 %rs784, %rs815; + mov.u16 %rs785, %rs814; + mov.u16 %rs786, %rs813; + mov.u16 %rs787, %rs812; + mov.u16 %rs788, %rs811; + mov.u16 %rs789, %rs810; + mov.u16 %rs790, %rs809; + mov.u16 %rs791, %rs808; + mov.u16 %rs792, %rs807; + mov.u16 %rs793, %rs806; + mov.u16 %rs794, %rs805; + mov.u16 %rs795, %rs804; + mov.u16 %rs796, %rs803; + mov.u16 %rs797, %rs802; + mov.u16 %rs798, %rs801; + mov.u16 %rs799, %rs800; + mov.u64 %rd1276, %rd1143; + mov.u32 %r30969, %r30984; + mov.u32 %r30970, %r30983; + mov.u32 %r30971, %r30982; + mov.u32 %r30972, %r30981; + mov.u32 %r30973, %r30980; + mov.u32 %r30974, %r30979; + mov.u32 %r30975, %r30978; + mov.u32 %r30976, %r30977; + @%p57 bra $L__BB2_101; + +$L__BB2_102: + cvt.u32.u16 %r27614, %rs800; + and.b32 %r27615, %r27614, 255; + cvt.u32.u16 %r27616, %rs801; + prmt.b32 %r27617, %r27616, %r27615, 30212; + cvt.u32.u16 %r27618, %rs802; + shl.b32 %r27619, %r27618, 16; + and.b32 %r27620, %r27619, 16711680; + or.b32 %r27621, %r27617, %r27620; + cvt.u32.u16 %r27622, %rs803; + shl.b32 %r27623, %r27622, 24; + or.b32 %r27624, %r27621, %r27623; + cvt.u32.u16 %r27625, %rs804; + and.b32 %r27626, %r27625, 255; + cvt.u32.u16 %r27627, %rs805; + prmt.b32 %r27628, %r27627, %r27626, 30212; + cvt.u32.u16 %r27629, %rs806; + shl.b32 %r27630, %r27629, 16; + and.b32 %r27631, %r27630, 16711680; + or.b32 %r27632, %r27628, %r27631; + cvt.u32.u16 %r27633, %rs807; + shl.b32 %r27634, %r27633, 24; + or.b32 %r27635, %r27632, %r27634; + cvt.u32.u16 %r27636, %rs808; + and.b32 %r27637, %r27636, 255; + cvt.u32.u16 %r27638, %rs809; + prmt.b32 %r27639, %r27638, %r27637, 30212; + cvt.u32.u16 %r27640, %rs810; + shl.b32 %r27641, %r27640, 16; + and.b32 %r27642, %r27641, 16711680; + or.b32 %r27643, %r27639, %r27642; + cvt.u32.u16 %r27644, %rs811; + shl.b32 %r27645, %r27644, 24; + or.b32 %r27646, %r27643, %r27645; + cvt.u32.u16 %r27647, %rs812; + and.b32 %r27648, %r27647, 255; + cvt.u32.u16 %r27649, %rs813; + prmt.b32 %r27650, %r27649, %r27648, 30212; + cvt.u32.u16 %r27651, %rs814; + shl.b32 %r27652, %r27651, 16; + 
and.b32 %r27653, %r27652, 16711680; + or.b32 %r27654, %r27650, %r27653; + cvt.u32.u16 %r27655, %rs815; + shl.b32 %r27656, %r27655, 24; + or.b32 %r27657, %r27654, %r27656; + cvt.u32.u16 %r27658, %rs816; + and.b32 %r27659, %r27658, 255; + cvt.u32.u16 %r27660, %rs817; + prmt.b32 %r27661, %r27660, %r27659, 30212; + cvt.u32.u16 %r27662, %rs818; + shl.b32 %r27663, %r27662, 16; + and.b32 %r27664, %r27663, 16711680; + or.b32 %r27665, %r27661, %r27664; + cvt.u32.u16 %r27666, %rs819; + shl.b32 %r27667, %r27666, 24; + or.b32 %r27668, %r27665, %r27667; + cvt.u32.u16 %r27669, %rs820; + and.b32 %r27670, %r27669, 255; + cvt.u32.u16 %r27671, %rs821; + prmt.b32 %r27672, %r27671, %r27670, 30212; + cvt.u32.u16 %r27673, %rs822; + shl.b32 %r27674, %r27673, 16; + and.b32 %r27675, %r27674, 16711680; + or.b32 %r27676, %r27672, %r27675; + cvt.u32.u16 %r27677, %rs823; + shl.b32 %r27678, %r27677, 24; + or.b32 %r27679, %r27676, %r27678; + cvt.u32.u16 %r27680, %rs824; + and.b32 %r27681, %r27680, 255; + cvt.u32.u16 %r27682, %rs825; + prmt.b32 %r27683, %r27682, %r27681, 30212; + cvt.u32.u16 %r27684, %rs826; + shl.b32 %r27685, %r27684, 16; + and.b32 %r27686, %r27685, 16711680; + or.b32 %r27687, %r27683, %r27686; + cvt.u32.u16 %r27688, %rs827; + shl.b32 %r27689, %r27688, 24; + or.b32 %r27690, %r27687, %r27689; + cvt.u32.u16 %r27691, %rs828; + and.b32 %r27692, %r27691, 255; + cvt.u32.u16 %r27693, %rs829; + prmt.b32 %r27694, %r27693, %r27692, 30212; + cvt.u32.u16 %r27695, %rs830; + shl.b32 %r27696, %r27695, 16; + and.b32 %r27697, %r27696, 16711680; + or.b32 %r27698, %r27694, %r27697; + cvt.u32.u16 %r27699, %rs831; + shl.b32 %r27700, %r27699, 24; + or.b32 %r27701, %r27698, %r27700; + cvt.u32.u16 %r27702, %rs832; + and.b32 %r27703, %r27702, 255; + cvt.u32.u16 %r27704, %rs833; + prmt.b32 %r27705, %r27704, %r27703, 30212; + cvt.u32.u16 %r27706, %rs834; + shl.b32 %r27707, %r27706, 16; + and.b32 %r27708, %r27707, 16711680; + or.b32 %r27709, %r27705, %r27708; + cvt.u32.u16 %r27710, %rs835; + shl.b32 %r27711, %r27710, 24; + or.b32 %r27712, %r27709, %r27711; + cvt.u32.u16 %r27713, %rs836; + and.b32 %r27714, %r27713, 255; + cvt.u32.u16 %r27715, %rs837; + prmt.b32 %r27716, %r27715, %r27714, 30212; + cvt.u32.u16 %r27717, %rs838; + shl.b32 %r27718, %r27717, 16; + and.b32 %r27719, %r27718, 16711680; + or.b32 %r27720, %r27716, %r27719; + cvt.u32.u16 %r27721, %rs839; + shl.b32 %r27722, %r27721, 24; + or.b32 %r27723, %r27720, %r27722; + cvt.u32.u16 %r27724, %rs840; + and.b32 %r27725, %r27724, 255; + cvt.u32.u16 %r27726, %rs841; + prmt.b32 %r27727, %r27726, %r27725, 30212; + cvt.u32.u16 %r27728, %rs842; + shl.b32 %r27729, %r27728, 16; + and.b32 %r27730, %r27729, 16711680; + or.b32 %r27731, %r27727, %r27730; + cvt.u32.u16 %r27732, %rs843; + shl.b32 %r27733, %r27732, 24; + or.b32 %r27734, %r27731, %r27733; + cvt.u32.u16 %r27735, %rs844; + and.b32 %r27736, %r27735, 255; + cvt.u32.u16 %r27737, %rs845; + prmt.b32 %r27738, %r27737, %r27736, 30212; + cvt.u32.u16 %r27739, %rs846; + shl.b32 %r27740, %r27739, 16; + and.b32 %r27741, %r27740, 16711680; + or.b32 %r27742, %r27738, %r27741; + cvt.u32.u16 %r27743, %rs847; + shl.b32 %r27744, %r27743, 24; + or.b32 %r27745, %r27742, %r27744; + cvt.u32.u16 %r27746, %rs848; + and.b32 %r27747, %r27746, 255; + cvt.u32.u16 %r27748, %rs849; + prmt.b32 %r27749, %r27748, %r27747, 30212; + cvt.u32.u16 %r27750, %rs850; + shl.b32 %r27751, %r27750, 16; + and.b32 %r27752, %r27751, 16711680; + or.b32 %r27753, %r27749, %r27752; + cvt.u32.u16 %r27754, %rs851; + shl.b32 %r27755, %r27754, 24; + or.b32 %r27756, %r27753, %r27755; 
+ cvt.u32.u16 %r27757, %rs852; + and.b32 %r27758, %r27757, 255; + cvt.u32.u16 %r27759, %rs853; + prmt.b32 %r27760, %r27759, %r27758, 30212; + cvt.u32.u16 %r27761, %rs854; + shl.b32 %r27762, %r27761, 16; + and.b32 %r27763, %r27762, 16711680; + or.b32 %r27764, %r27760, %r27763; + cvt.u32.u16 %r27765, %rs855; + shl.b32 %r27766, %r27765, 24; + or.b32 %r27767, %r27764, %r27766; + cvt.u32.u16 %r27768, %rs856; + and.b32 %r27769, %r27768, 255; + cvt.u32.u16 %r27770, %rs857; + prmt.b32 %r27771, %r27770, %r27769, 30212; + cvt.u32.u16 %r27772, %rs858; + shl.b32 %r27773, %r27772, 16; + and.b32 %r27774, %r27773, 16711680; + or.b32 %r27775, %r27771, %r27774; + cvt.u32.u16 %r27776, %rs859; + shl.b32 %r27777, %r27776, 24; + or.b32 %r27778, %r27775, %r27777; + cvt.u32.u16 %r27779, %rs860; + and.b32 %r27780, %r27779, 255; + cvt.u32.u16 %r27781, %rs861; + prmt.b32 %r27782, %r27781, %r27780, 30212; + cvt.u32.u16 %r27783, %rs862; + shl.b32 %r27784, %r27783, 16; + and.b32 %r27785, %r27784, 16711680; + or.b32 %r27786, %r27782, %r27785; + cvt.u32.u16 %r27787, %rs863; + shl.b32 %r27788, %r27787, 24; + or.b32 %r27789, %r27786, %r27788; + or.b16 %rs569, %rs865, 8; + cvt.u32.u16 %r27790, %rs569; + and.b32 %r27791, %r27790, 255; + add.s32 %r27792, %r30981, %r30977; + add.s32 %r27793, %r27792, %r27624; + add.s32 %r27794, %r27635, %r27793; + add.s32 %r27795, %r30982, %r30978; + add.s32 %r27796, %r27795, %r27646; + add.s32 %r27797, %r27657, %r27796; + add.s32 %r27798, %r30983, %r30979; + add.s32 %r27799, %r27798, %r27668; + cvt.u32.u16 %r27800, %rs864; + and.b32 %r27801, %r27800, 255; + xor.b32 %r27802, %r27799, %r27801; + shr.u32 %r27803, %r27799, 16; + shl.b32 %r27804, %r27802, 16; + or.b32 %r27805, %r27804, %r27803; + add.s32 %r27806, %r27805, 1013904242; + xor.b32 %r27807, %r27806, %r30983; + shf.l.wrap.b32 %r27808, %r27807, %r27807, 20; + add.s32 %r27809, %r27679, %r27799; + add.s32 %r27810, %r27809, %r27808; + xor.b32 %r27811, %r27810, %r27805; + shf.l.wrap.b32 %r27812, %r27811, %r27811, 24; + add.s32 %r27813, %r27812, %r27806; + xor.b32 %r27814, %r27813, %r27808; + shf.l.wrap.b32 %r27815, %r27814, %r27814, 25; + add.s32 %r27816, %r30984, %r30980; + add.s32 %r27817, %r27816, %r27690; + xor.b32 %r27818, %r27817, %r27791; + shr.u32 %r27819, %r27817, 16; + shl.b32 %r27820, %r27818, 16; + or.b32 %r27821, %r27820, %r27819; + add.s32 %r27822, %r27821, -1521486534; + xor.b32 %r27823, %r27822, %r30984; + shf.l.wrap.b32 %r27824, %r27823, %r27823, 20; + add.s32 %r27825, %r27701, %r27817; + add.s32 %r27826, %r27825, %r27824; + xor.b32 %r27827, %r27826, %r27821; + shf.l.wrap.b32 %r27828, %r27827, %r27827, 24; + add.s32 %r27829, %r27828, %r27822; + xor.b32 %r27830, %r27829, %r27824; + shf.l.wrap.b32 %r27831, %r27830, %r27830, 25; + add.s32 %r27832, %r27815, %r27734; + add.s32 %r27833, %r27810, %r27756; + add.s32 %r27834, %r27833, %r27831; + add.s32 %r27835, %r27834, %r27767; + add.s32 %r27836, %r27826, %r27778; + shf.l.wrap.b32 %r27837, %r27793, %r27793, 16; + add.s32 %r27838, %r27837, 1779033703; + xor.b32 %r27839, %r27838, %r30981; + shf.l.wrap.b32 %r27840, %r27839, %r27839, 20; + add.s32 %r27841, %r27794, %r27840; + xor.b32 %r27842, %r27841, %r27837; + shf.l.wrap.b32 %r27843, %r27842, %r27842, 24; + add.s32 %r27844, %r27843, %r27838; + xor.b32 %r27845, %r27844, %r27840; + shf.l.wrap.b32 %r27846, %r27845, %r27845, 25; + shf.l.wrap.b32 %r27847, %r27796, %r27796, 16; + add.s32 %r27848, %r27847, -1150833019; + xor.b32 %r27849, %r27848, %r30982; + shf.l.wrap.b32 %r27850, %r27849, %r27849, 20; + add.s32 %r27851, %r27797, %r27850; 
+ xor.b32 %r27852, %r27851, %r27847; + shf.l.wrap.b32 %r27853, %r27852, %r27852, 24; + add.s32 %r27854, %r27853, %r27848; + xor.b32 %r27855, %r27854, %r27850; + shf.l.wrap.b32 %r27856, %r27855, %r27855, 25; + add.s32 %r27857, %r27841, %r27712; + add.s32 %r27858, %r27857, %r27856; + xor.b32 %r27859, %r27858, %r27828; + shf.l.wrap.b32 %r27860, %r27859, %r27859, 16; + add.s32 %r27861, %r27860, %r27813; + xor.b32 %r27862, %r27861, %r27856; + shf.l.wrap.b32 %r27863, %r27862, %r27862, 20; + add.s32 %r27864, %r27858, %r27723; + add.s32 %r27865, %r27864, %r27863; + xor.b32 %r27866, %r27865, %r27860; + shf.l.wrap.b32 %r27867, %r27866, %r27866, 24; + add.s32 %r27868, %r27867, %r27861; + xor.b32 %r27869, %r27868, %r27863; + shf.l.wrap.b32 %r27870, %r27869, %r27869, 25; + add.s32 %r27871, %r27832, %r27851; + xor.b32 %r27872, %r27843, %r27871; + shf.l.wrap.b32 %r27873, %r27872, %r27872, 16; + add.s32 %r27874, %r27873, %r27829; + xor.b32 %r27875, %r27874, %r27815; + shf.l.wrap.b32 %r27876, %r27875, %r27875, 20; + add.s32 %r27877, %r27871, %r27745; + add.s32 %r27878, %r27877, %r27876; + xor.b32 %r27879, %r27878, %r27873; + shf.l.wrap.b32 %r27880, %r27879, %r27879, 24; + add.s32 %r27881, %r27880, %r27874; + xor.b32 %r27882, %r27881, %r27876; + shf.l.wrap.b32 %r27883, %r27882, %r27882, 25; + xor.b32 %r27884, %r27853, %r27834; + shf.l.wrap.b32 %r27885, %r27884, %r27884, 16; + add.s32 %r27886, %r27885, %r27844; + xor.b32 %r27887, %r27886, %r27831; + shf.l.wrap.b32 %r27888, %r27887, %r27887, 20; + add.s32 %r27889, %r27835, %r27888; + xor.b32 %r27890, %r27889, %r27885; + shf.l.wrap.b32 %r27891, %r27890, %r27890, 24; + add.s32 %r27892, %r27891, %r27886; + xor.b32 %r27893, %r27892, %r27888; + shf.l.wrap.b32 %r27894, %r27893, %r27893, 25; + add.s32 %r27895, %r27836, %r27846; + xor.b32 %r27896, %r27895, %r27812; + shf.l.wrap.b32 %r27897, %r27896, %r27896, 16; + add.s32 %r27898, %r27897, %r27854; + xor.b32 %r27899, %r27898, %r27846; + shf.l.wrap.b32 %r27900, %r27899, %r27899, 20; + add.s32 %r27901, %r27895, %r27789; + add.s32 %r27902, %r27901, %r27900; + xor.b32 %r27903, %r27902, %r27897; + shf.l.wrap.b32 %r27904, %r27903, %r27903, 24; + add.s32 %r27905, %r27904, %r27898; + xor.b32 %r27906, %r27905, %r27900; + shf.l.wrap.b32 %r27907, %r27906, %r27906, 25; + add.s32 %r27908, %r27865, %r27646; + add.s32 %r27909, %r27908, %r27907; + xor.b32 %r27910, %r27909, %r27880; + shf.l.wrap.b32 %r27911, %r27910, %r27910, 16; + add.s32 %r27912, %r27911, %r27892; + xor.b32 %r27913, %r27912, %r27907; + shf.l.wrap.b32 %r27914, %r27913, %r27913, 20; + add.s32 %r27915, %r27909, %r27690; + add.s32 %r27916, %r27915, %r27914; + xor.b32 %r27917, %r27916, %r27911; + shf.l.wrap.b32 %r27918, %r27917, %r27917, 24; + add.s32 %r27919, %r27918, %r27912; + xor.b32 %r27920, %r27919, %r27914; + shf.l.wrap.b32 %r27921, %r27920, %r27920, 25; + add.s32 %r27922, %r27878, %r27657; + add.s32 %r27923, %r27922, %r27870; + xor.b32 %r27924, %r27923, %r27891; + shf.l.wrap.b32 %r27925, %r27924, %r27924, 16; + add.s32 %r27926, %r27925, %r27905; + xor.b32 %r27927, %r27926, %r27870; + shf.l.wrap.b32 %r27928, %r27927, %r27927, 20; + add.s32 %r27929, %r27923, %r27734; + add.s32 %r27930, %r27929, %r27928; + xor.b32 %r27931, %r27930, %r27925; + shf.l.wrap.b32 %r27932, %r27931, %r27931, 24; + add.s32 %r27933, %r27932, %r27926; + xor.b32 %r27934, %r27933, %r27928; + shf.l.wrap.b32 %r27935, %r27934, %r27934, 25; + add.s32 %r27936, %r27889, %r27701; + add.s32 %r27937, %r27936, %r27883; + xor.b32 %r27938, %r27904, %r27937; + shf.l.wrap.b32 %r27939, %r27938, %r27938, 16; + 
add.s32 %r27940, %r27939, %r27868; + xor.b32 %r27941, %r27940, %r27883; + shf.l.wrap.b32 %r27942, %r27941, %r27941, 20; + add.s32 %r27943, %r27937, %r27624; + add.s32 %r27944, %r27943, %r27942; + xor.b32 %r27945, %r27944, %r27939; + shf.l.wrap.b32 %r27946, %r27945, %r27945, 24; + add.s32 %r27947, %r27946, %r27940; + xor.b32 %r27948, %r27947, %r27942; + shf.l.wrap.b32 %r27949, %r27948, %r27948, 25; + add.s32 %r27950, %r27902, %r27668; + add.s32 %r27951, %r27950, %r27894; + xor.b32 %r27952, %r27867, %r27951; + shf.l.wrap.b32 %r27953, %r27952, %r27952, 16; + add.s32 %r27954, %r27953, %r27881; + xor.b32 %r27955, %r27954, %r27894; + shf.l.wrap.b32 %r27956, %r27955, %r27955, 20; + add.s32 %r27957, %r27951, %r27767; + add.s32 %r27958, %r27957, %r27956; + xor.b32 %r27959, %r27958, %r27953; + shf.l.wrap.b32 %r27960, %r27959, %r27959, 24; + add.s32 %r27961, %r27960, %r27954; + xor.b32 %r27962, %r27961, %r27956; + shf.l.wrap.b32 %r27963, %r27962, %r27962, 25; + add.s32 %r27964, %r27916, %r27635; + add.s32 %r27965, %r27964, %r27935; + xor.b32 %r27966, %r27965, %r27960; + shf.l.wrap.b32 %r27967, %r27966, %r27966, 16; + add.s32 %r27968, %r27967, %r27947; + xor.b32 %r27969, %r27968, %r27935; + shf.l.wrap.b32 %r27970, %r27969, %r27969, 20; + add.s32 %r27971, %r27965, %r27745; + add.s32 %r27972, %r27971, %r27970; + xor.b32 %r27973, %r27972, %r27967; + shf.l.wrap.b32 %r27974, %r27973, %r27973, 24; + add.s32 %r27975, %r27974, %r27968; + xor.b32 %r27976, %r27975, %r27970; + shf.l.wrap.b32 %r27977, %r27976, %r27976, 25; + add.s32 %r27978, %r27949, %r27756; + add.s32 %r27979, %r27978, %r27930; + xor.b32 %r27980, %r27918, %r27979; + shf.l.wrap.b32 %r27981, %r27980, %r27980, 16; + add.s32 %r27982, %r27981, %r27961; + xor.b32 %r27983, %r27982, %r27949; + shf.l.wrap.b32 %r27984, %r27983, %r27983, 20; + add.s32 %r27985, %r27979, %r27679; + add.s32 %r27986, %r27985, %r27984; + xor.b32 %r27987, %r27986, %r27981; + shf.l.wrap.b32 %r27988, %r27987, %r27987, 24; + add.s32 %r27989, %r27988, %r27982; + xor.b32 %r27990, %r27989, %r27984; + shf.l.wrap.b32 %r27991, %r27990, %r27990, 25; + add.s32 %r27992, %r27944, %r27723; + add.s32 %r27993, %r27992, %r27963; + xor.b32 %r27994, %r27932, %r27993; + shf.l.wrap.b32 %r27995, %r27994, %r27994, 16; + add.s32 %r27996, %r27995, %r27919; + xor.b32 %r27997, %r27996, %r27963; + shf.l.wrap.b32 %r27998, %r27997, %r27997, 20; + add.s32 %r27999, %r27993, %r27778; + add.s32 %r28000, %r27999, %r27998; + xor.b32 %r28001, %r28000, %r27995; + shf.l.wrap.b32 %r28002, %r28001, %r28001, 24; + add.s32 %r28003, %r28002, %r27996; + xor.b32 %r28004, %r28003, %r27998; + shf.l.wrap.b32 %r28005, %r28004, %r28004, 25; + add.s32 %r28006, %r27958, %r27789; + add.s32 %r28007, %r28006, %r27921; + xor.b32 %r28008, %r28007, %r27946; + shf.l.wrap.b32 %r28009, %r28008, %r28008, 16; + add.s32 %r28010, %r28009, %r27933; + xor.b32 %r28011, %r28010, %r27921; + shf.l.wrap.b32 %r28012, %r28011, %r28011, 20; + add.s32 %r28013, %r28007, %r27712; + add.s32 %r28014, %r28013, %r28012; + xor.b32 %r28015, %r28014, %r28009; + shf.l.wrap.b32 %r28016, %r28015, %r28015, 24; + add.s32 %r28017, %r28016, %r28010; + xor.b32 %r28018, %r28017, %r28012; + shf.l.wrap.b32 %r28019, %r28018, %r28018, 25; + add.s32 %r28020, %r27972, %r27657; + add.s32 %r28021, %r28020, %r28019; + xor.b32 %r28022, %r28021, %r27988; + shf.l.wrap.b32 %r28023, %r28022, %r28022, 16; + add.s32 %r28024, %r28023, %r28003; + xor.b32 %r28025, %r28024, %r28019; + shf.l.wrap.b32 %r28026, %r28025, %r28025, 20; + add.s32 %r28027, %r28021, %r27668; + add.s32 %r28028, 
%r28027, %r28026; + xor.b32 %r28029, %r28028, %r28023; + shf.l.wrap.b32 %r28030, %r28029, %r28029, 24; + add.s32 %r28031, %r28030, %r28024; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 25; + add.s32 %r28034, %r27986, %r27734; + add.s32 %r28035, %r28034, %r27977; + xor.b32 %r28036, %r28035, %r28002; + shf.l.wrap.b32 %r28037, %r28036, %r28036, 16; + add.s32 %r28038, %r28037, %r28017; + xor.b32 %r28039, %r28038, %r27977; + shf.l.wrap.b32 %r28040, %r28039, %r28039, 20; + add.s32 %r28041, %r28035, %r27756; + add.s32 %r28042, %r28041, %r28040; + xor.b32 %r28043, %r28042, %r28037; + shf.l.wrap.b32 %r28044, %r28043, %r28043, 24; + add.s32 %r28045, %r28044, %r28038; + xor.b32 %r28046, %r28045, %r28040; + shf.l.wrap.b32 %r28047, %r28046, %r28046, 25; + add.s32 %r28048, %r28000, %r27767; + add.s32 %r28049, %r28048, %r27991; + xor.b32 %r28050, %r28016, %r28049; + shf.l.wrap.b32 %r28051, %r28050, %r28050, 16; + add.s32 %r28052, %r28051, %r27975; + xor.b32 %r28053, %r28052, %r27991; + shf.l.wrap.b32 %r28054, %r28053, %r28053, 20; + add.s32 %r28055, %r28049, %r27646; + add.s32 %r28056, %r28055, %r28054; + xor.b32 %r28057, %r28056, %r28051; + shf.l.wrap.b32 %r28058, %r28057, %r28057, 24; + add.s32 %r28059, %r28058, %r28052; + xor.b32 %r28060, %r28059, %r28054; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 25; + add.s32 %r28062, %r28014, %r27701; + add.s32 %r28063, %r28062, %r28005; + xor.b32 %r28064, %r27974, %r28063; + shf.l.wrap.b32 %r28065, %r28064, %r28064, 16; + add.s32 %r28066, %r28065, %r27989; + xor.b32 %r28067, %r28066, %r28005; + shf.l.wrap.b32 %r28068, %r28067, %r28067, 20; + add.s32 %r28069, %r28063, %r27778; + add.s32 %r28070, %r28069, %r28068; + xor.b32 %r28071, %r28070, %r28065; + shf.l.wrap.b32 %r28072, %r28071, %r28071, 24; + add.s32 %r28073, %r28072, %r28066; + xor.b32 %r28074, %r28073, %r28068; + shf.l.wrap.b32 %r28075, %r28074, %r28074, 25; + add.s32 %r28076, %r28028, %r27690; + add.s32 %r28077, %r28076, %r28047; + xor.b32 %r28078, %r28077, %r28072; + shf.l.wrap.b32 %r28079, %r28078, %r28078, 16; + add.s32 %r28080, %r28079, %r28059; + xor.b32 %r28081, %r28080, %r28047; + shf.l.wrap.b32 %r28082, %r28081, %r28081, 20; + add.s32 %r28083, %r28077, %r27679; + add.s32 %r28084, %r28083, %r28082; + xor.b32 %r28085, %r28084, %r28079; + shf.l.wrap.b32 %r28086, %r28085, %r28085, 24; + add.s32 %r28087, %r28086, %r28080; + xor.b32 %r28088, %r28087, %r28082; + shf.l.wrap.b32 %r28089, %r28088, %r28088, 25; + add.s32 %r28090, %r28061, %r27723; + add.s32 %r28091, %r28090, %r28042; + xor.b32 %r28092, %r28030, %r28091; + shf.l.wrap.b32 %r28093, %r28092, %r28092, 16; + add.s32 %r28094, %r28093, %r28073; + xor.b32 %r28095, %r28094, %r28061; + shf.l.wrap.b32 %r28096, %r28095, %r28095, 20; + add.s32 %r28097, %r28091, %r27624; + add.s32 %r28098, %r28097, %r28096; + xor.b32 %r28099, %r28098, %r28093; + shf.l.wrap.b32 %r28100, %r28099, %r28099, 24; + add.s32 %r28101, %r28100, %r28094; + xor.b32 %r28102, %r28101, %r28096; + shf.l.wrap.b32 %r28103, %r28102, %r28102, 25; + add.s32 %r28104, %r28056, %r27745; + add.s32 %r28105, %r28104, %r28075; + xor.b32 %r28106, %r28044, %r28105; + shf.l.wrap.b32 %r28107, %r28106, %r28106, 16; + add.s32 %r28108, %r28107, %r28031; + xor.b32 %r28109, %r28108, %r28075; + shf.l.wrap.b32 %r28110, %r28109, %r28109, 20; + add.s32 %r28111, %r28105, %r27789; + add.s32 %r28112, %r28111, %r28110; + xor.b32 %r28113, %r28112, %r28107; + shf.l.wrap.b32 %r28114, %r28113, %r28113, 24; + add.s32 %r28115, %r28114, %r28108; + xor.b32 %r28116, %r28115, %r28110; + 
shf.l.wrap.b32 %r28117, %r28116, %r28116, 25; + add.s32 %r28118, %r28070, %r27712; + add.s32 %r28119, %r28118, %r28033; + xor.b32 %r28120, %r28119, %r28058; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 16; + add.s32 %r28122, %r28121, %r28045; + xor.b32 %r28123, %r28122, %r28033; + shf.l.wrap.b32 %r28124, %r28123, %r28123, 20; + add.s32 %r28125, %r28119, %r27635; + add.s32 %r28126, %r28125, %r28124; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 24; + add.s32 %r28129, %r28128, %r28122; + xor.b32 %r28130, %r28129, %r28124; + shf.l.wrap.b32 %r28131, %r28130, %r28130, 25; + add.s32 %r28132, %r28084, %r27734; + add.s32 %r28133, %r28132, %r28131; + xor.b32 %r28134, %r28133, %r28100; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 16; + add.s32 %r28136, %r28135, %r28115; + xor.b32 %r28137, %r28136, %r28131; + shf.l.wrap.b32 %r28138, %r28137, %r28137, 20; + add.s32 %r28139, %r28133, %r27701; + add.s32 %r28140, %r28139, %r28138; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 24; + add.s32 %r28143, %r28142, %r28136; + xor.b32 %r28144, %r28143, %r28138; + shf.l.wrap.b32 %r28145, %r28144, %r28144, 25; + add.s32 %r28146, %r28098, %r27756; + add.s32 %r28147, %r28146, %r28089; + xor.b32 %r28148, %r28147, %r28114; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 16; + add.s32 %r28150, %r28149, %r28129; + xor.b32 %r28151, %r28150, %r28089; + shf.l.wrap.b32 %r28152, %r28151, %r28151, 20; + add.s32 %r28153, %r28147, %r27723; + add.s32 %r28154, %r28153, %r28152; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 24; + add.s32 %r28157, %r28156, %r28150; + xor.b32 %r28158, %r28157, %r28152; + shf.l.wrap.b32 %r28159, %r28158, %r28158, 25; + add.s32 %r28160, %r28112, %r27778; + add.s32 %r28161, %r28160, %r28103; + xor.b32 %r28162, %r28128, %r28161; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 16; + add.s32 %r28164, %r28163, %r28087; + xor.b32 %r28165, %r28164, %r28103; + shf.l.wrap.b32 %r28166, %r28165, %r28165, 20; + add.s32 %r28167, %r28161, %r27657; + add.s32 %r28168, %r28167, %r28166; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 24; + add.s32 %r28171, %r28170, %r28164; + xor.b32 %r28172, %r28171, %r28166; + shf.l.wrap.b32 %r28173, %r28172, %r28172, 25; + add.s32 %r28174, %r28126, %r27767; + add.s32 %r28175, %r28174, %r28117; + xor.b32 %r28176, %r28086, %r28175; + shf.l.wrap.b32 %r28177, %r28176, %r28176, 16; + add.s32 %r28178, %r28177, %r28101; + xor.b32 %r28179, %r28178, %r28117; + shf.l.wrap.b32 %r28180, %r28179, %r28179, 20; + add.s32 %r28181, %r28175, %r27789; + add.s32 %r28182, %r28181, %r28180; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 24; + add.s32 %r28185, %r28184, %r28178; + xor.b32 %r28186, %r28185, %r28180; + shf.l.wrap.b32 %r28187, %r28186, %r28186, 25; + add.s32 %r28188, %r28140, %r27668; + add.s32 %r28189, %r28188, %r28159; + xor.b32 %r28190, %r28189, %r28184; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 16; + add.s32 %r28192, %r28191, %r28171; + xor.b32 %r28193, %r28192, %r28159; + shf.l.wrap.b32 %r28194, %r28193, %r28193, 20; + add.s32 %r28195, %r28189, %r27624; + add.s32 %r28196, %r28195, %r28194; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 24; + add.s32 %r28199, %r28198, %r28192; + xor.b32 %r28200, %r28199, %r28194; + shf.l.wrap.b32 %r28201, %r28200, %r28200, 25; + add.s32 %r28202, %r28173, %r27745; + add.s32 %r28203, %r28202, %r28154; + xor.b32 %r28204, %r28142, %r28203; + shf.l.wrap.b32 %r28205, 
%r28204, %r28204, 16; + add.s32 %r28206, %r28205, %r28185; + xor.b32 %r28207, %r28206, %r28173; + shf.l.wrap.b32 %r28208, %r28207, %r28207, 20; + add.s32 %r28209, %r28203, %r27646; + add.s32 %r28210, %r28209, %r28208; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 24; + add.s32 %r28213, %r28212, %r28206; + xor.b32 %r28214, %r28213, %r28208; + shf.l.wrap.b32 %r28215, %r28214, %r28214, 25; + add.s32 %r28216, %r28168, %r27679; + add.s32 %r28217, %r28216, %r28187; + xor.b32 %r28218, %r28156, %r28217; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 16; + add.s32 %r28220, %r28219, %r28143; + xor.b32 %r28221, %r28220, %r28187; + shf.l.wrap.b32 %r28222, %r28221, %r28221, 20; + add.s32 %r28223, %r28217, %r27712; + add.s32 %r28224, %r28223, %r28222; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 24; + add.s32 %r28227, %r28226, %r28220; + xor.b32 %r28228, %r28227, %r28222; + shf.l.wrap.b32 %r28229, %r28228, %r28228, 25; + add.s32 %r28230, %r28182, %r27635; + add.s32 %r28231, %r28230, %r28145; + xor.b32 %r28232, %r28231, %r28170; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 16; + add.s32 %r28234, %r28233, %r28157; + xor.b32 %r28235, %r28234, %r28145; + shf.l.wrap.b32 %r28236, %r28235, %r28235, 20; + add.s32 %r28237, %r28231, %r27690; + add.s32 %r28238, %r28237, %r28236; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 24; + add.s32 %r28241, %r28240, %r28234; + xor.b32 %r28242, %r28241, %r28236; + shf.l.wrap.b32 %r28243, %r28242, %r28242, 25; + add.s32 %r28244, %r28196, %r27756; + add.s32 %r28245, %r28244, %r28243; + xor.b32 %r28246, %r28245, %r28212; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 16; + add.s32 %r28248, %r28247, %r28227; + xor.b32 %r28249, %r28248, %r28243; + shf.l.wrap.b32 %r28250, %r28249, %r28249, 20; + add.s32 %r28251, %r28245, %r27767; + add.s32 %r28252, %r28251, %r28250; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 24; + add.s32 %r28255, %r28254, %r28248; + xor.b32 %r28256, %r28255, %r28250; + shf.l.wrap.b32 %r28257, %r28256, %r28256, 25; + add.s32 %r28258, %r28210, %r27723; + add.s32 %r28259, %r28258, %r28201; + xor.b32 %r28260, %r28259, %r28226; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 16; + add.s32 %r28262, %r28261, %r28241; + xor.b32 %r28263, %r28262, %r28201; + shf.l.wrap.b32 %r28264, %r28263, %r28263, 20; + add.s32 %r28265, %r28259, %r27745; + add.s32 %r28266, %r28265, %r28264; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 24; + add.s32 %r28269, %r28268, %r28262; + xor.b32 %r28270, %r28269, %r28264; + shf.l.wrap.b32 %r28271, %r28270, %r28270, 25; + add.s32 %r28272, %r28224, %r27789; + add.s32 %r28273, %r28272, %r28215; + xor.b32 %r28274, %r28240, %r28273; + shf.l.wrap.b32 %r28275, %r28274, %r28274, 16; + add.s32 %r28276, %r28275, %r28199; + xor.b32 %r28277, %r28276, %r28215; + shf.l.wrap.b32 %r28278, %r28277, %r28277, 20; + add.s32 %r28279, %r28273, %r27734; + add.s32 %r28280, %r28279, %r28278; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 24; + add.s32 %r28283, %r28282, %r28276; + xor.b32 %r28284, %r28283, %r28278; + shf.l.wrap.b32 %r28285, %r28284, %r28284, 25; + add.s32 %r28286, %r28238, %r27778; + add.s32 %r28287, %r28286, %r28229; + xor.b32 %r28288, %r28198, %r28287; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 16; + add.s32 %r28290, %r28289, %r28213; + xor.b32 %r28291, %r28290, %r28229; + shf.l.wrap.b32 %r28292, %r28291, %r28291, 20; + add.s32 %r28293, %r28287, %r27712; + 
add.s32 %r28294, %r28293, %r28292; + xor.b32 %r28295, %r28294, %r28289; + shf.l.wrap.b32 %r28296, %r28295, %r28295, 24; + add.s32 %r28297, %r28296, %r28290; + xor.b32 %r28298, %r28297, %r28292; + shf.l.wrap.b32 %r28299, %r28298, %r28298, 25; + add.s32 %r28300, %r28252, %r27701; + add.s32 %r28301, %r28300, %r28271; + xor.b32 %r28302, %r28301, %r28296; + shf.l.wrap.b32 %r28303, %r28302, %r28302, 16; + add.s32 %r28304, %r28303, %r28283; + xor.b32 %r28305, %r28304, %r28271; + shf.l.wrap.b32 %r28306, %r28305, %r28305, 20; + add.s32 %r28307, %r28301, %r27646; + add.s32 %r28308, %r28307, %r28306; + xor.b32 %r28309, %r28308, %r28303; + shf.l.wrap.b32 %r28310, %r28309, %r28309, 24; + add.s32 %r28311, %r28310, %r28304; + xor.b32 %r28312, %r28311, %r28306; + shf.l.wrap.b32 %r28313, %r28312, %r28312, 25; + add.s32 %r28314, %r28285, %r27679; + add.s32 %r28315, %r28314, %r28266; + xor.b32 %r28316, %r28254, %r28315; + shf.l.wrap.b32 %r28317, %r28316, %r28316, 16; + add.s32 %r28318, %r28317, %r28297; + xor.b32 %r28319, %r28318, %r28285; + shf.l.wrap.b32 %r28320, %r28319, %r28319, 20; + add.s32 %r28321, %r28315, %r27657; + add.s32 %r28322, %r28321, %r28320; + xor.b32 %r28323, %r28322, %r28317; + shf.l.wrap.b32 %r28324, %r28323, %r28323, 24; + add.s32 %r28325, %r28324, %r28318; + xor.b32 %r28326, %r28325, %r28320; + shf.l.wrap.b32 %r28327, %r28326, %r28326, 25; + add.s32 %r28328, %r28280, %r27624; + add.s32 %r28329, %r28328, %r28299; + xor.b32 %r28330, %r28268, %r28329; + shf.l.wrap.b32 %r28331, %r28330, %r28330, 16; + add.s32 %r28332, %r28331, %r28255; + xor.b32 %r28333, %r28332, %r28299; + shf.l.wrap.b32 %r28334, %r28333, %r28333, 20; + add.s32 %r28335, %r28329, %r27635; + add.s32 %r28336, %r28335, %r28334; + xor.b32 %r28337, %r28336, %r28331; + shf.l.wrap.b32 %r28338, %r28337, %r28337, 24; + add.s32 %r28339, %r28338, %r28332; + xor.b32 %r28340, %r28339, %r28334; + shf.l.wrap.b32 %r28341, %r28340, %r28340, 25; + add.s32 %r28342, %r28294, %r27690; + add.s32 %r28343, %r28342, %r28257; + xor.b32 %r28344, %r28343, %r28282; + shf.l.wrap.b32 %r28345, %r28344, %r28344, 16; + add.s32 %r28346, %r28345, %r28269; + xor.b32 %r28347, %r28346, %r28257; + shf.l.wrap.b32 %r28348, %r28347, %r28347, 20; + add.s32 %r28349, %r28343, %r27668; + add.s32 %r28350, %r28349, %r28348; + xor.b32 %r28351, %r28350, %r28345; + shf.l.wrap.b32 %r28352, %r28351, %r28351, 24; + add.s32 %r28353, %r28352, %r28346; + xor.b32 %r28354, %r28353, %r28348; + shf.l.wrap.b32 %r28355, %r28354, %r28354, 25; + add.s32 %r28356, %r28308, %r27723; + add.s32 %r28357, %r28356, %r28355; + xor.b32 %r28358, %r28357, %r28324; + shf.l.wrap.b32 %r28359, %r28358, %r28358, 16; + add.s32 %r28360, %r28359, %r28339; + xor.b32 %r28361, %r28360, %r28355; + shf.l.wrap.b32 %r28362, %r28361, %r28361, 20; + add.s32 %r28363, %r28357, %r27778; + add.s32 %r28364, %r28363, %r28362; + xor.b32 %r28365, %r28364, %r28359; + shf.l.wrap.b32 %r28366, %r28365, %r28365, 24; + add.s32 %r28367, %r28366, %r28360; + xor.b32 %r28368, %r28367, %r28362; + shf.l.wrap.b32 %r28369, %r28368, %r28368, 25; + add.s32 %r28370, %r28322, %r27745; + add.s32 %r28371, %r28370, %r28313; + xor.b32 %r28372, %r28371, %r28338; + shf.l.wrap.b32 %r28373, %r28372, %r28372, 16; + add.s32 %r28374, %r28373, %r28353; + xor.b32 %r28375, %r28374, %r28313; + shf.l.wrap.b32 %r28376, %r28375, %r28375, 20; + add.s32 %r28377, %r28371, %r27679; + add.s32 %r28378, %r28377, %r28376; + xor.b32 %r28379, %r28378, %r28373; + shf.l.wrap.b32 %r28380, %r28379, %r28379, 24; + add.s32 %r28381, %r28380, %r28374; + xor.b32 %r28382, 
%r28381, %r28376; + shf.l.wrap.b32 %r28383, %r28382, %r28382, 25; + add.s32 %r28384, %r28336, %r27712; + add.s32 %r28385, %r28384, %r28327; + xor.b32 %r28386, %r28352, %r28385; + shf.l.wrap.b32 %r28387, %r28386, %r28386, 16; + add.s32 %r28388, %r28387, %r28311; + xor.b32 %r28389, %r28388, %r28327; + shf.l.wrap.b32 %r28390, %r28389, %r28389, 20; + add.s32 %r28391, %r28385, %r27756; + add.s32 %r28392, %r28391, %r28390; + xor.b32 %r28393, %r28392, %r28387; + shf.l.wrap.b32 %r28394, %r28393, %r28393, 24; + add.s32 %r28395, %r28394, %r28388; + xor.b32 %r28396, %r28395, %r28390; + shf.l.wrap.b32 %r28397, %r28396, %r28396, 25; + add.s32 %r28398, %r28350, %r27789; + add.s32 %r28399, %r28398, %r28341; + xor.b32 %r28400, %r28310, %r28399; + shf.l.wrap.b32 %r28401, %r28400, %r28400, 16; + add.s32 %r28402, %r28401, %r28325; + xor.b32 %r28403, %r28402, %r28341; + shf.l.wrap.b32 %r28404, %r28403, %r28403, 20; + add.s32 %r28405, %r28399, %r27635; + add.s32 %r28406, %r28405, %r28404; + xor.b32 %r28407, %r28406, %r28401; + shf.l.wrap.b32 %r28408, %r28407, %r28407, 24; + add.s32 %r28409, %r28408, %r28402; + xor.b32 %r28410, %r28409, %r28404; + shf.l.wrap.b32 %r28411, %r28410, %r28410, 25; + add.s32 %r28412, %r28364, %r27767; + add.s32 %r28413, %r28412, %r28383; + xor.b32 %r28414, %r28413, %r28408; + shf.l.wrap.b32 %r28415, %r28414, %r28414, 16; + add.s32 %r28416, %r28415, %r28395; + xor.b32 %r28417, %r28416, %r28383; + shf.l.wrap.b32 %r28418, %r28417, %r28417, 20; + add.s32 %r28419, %r28413, %r27657; + add.s32 %r28420, %r28419, %r28418; + xor.b32 %r28421, %r28420, %r28415; + shf.l.wrap.b32 %r28422, %r28421, %r28421, 24; + add.s32 %r28423, %r28422, %r28416; + xor.b32 %r28424, %r28423, %r28418; + shf.l.wrap.b32 %r28425, %r28424, %r28424, 25; + add.s32 %r28426, %r28397, %r27624; + add.s32 %r28427, %r28426, %r28378; + xor.b32 %r28428, %r28366, %r28427; + shf.l.wrap.b32 %r28429, %r28428, %r28428, 16; + add.s32 %r28430, %r28429, %r28409; + xor.b32 %r28431, %r28430, %r28397; + shf.l.wrap.b32 %r28432, %r28431, %r28431, 20; + add.s32 %r28433, %r28427, %r27734; + add.s32 %r28434, %r28433, %r28432; + xor.b32 %r28435, %r28434, %r28429; + shf.l.wrap.b32 %r28436, %r28435, %r28435, 24; + add.s32 %r28437, %r28436, %r28430; + xor.b32 %r28438, %r28437, %r28432; + shf.l.wrap.b32 %r28439, %r28438, %r28438, 25; + add.s32 %r28440, %r28392, %r27646; + add.s32 %r28441, %r28440, %r28411; + xor.b32 %r28442, %r28380, %r28441; + shf.l.wrap.b32 %r28443, %r28442, %r28442, 16; + add.s32 %r28444, %r28443, %r28367; + xor.b32 %r28445, %r28444, %r28411; + shf.l.wrap.b32 %r28446, %r28445, %r28445, 20; + add.s32 %r28447, %r28441, %r27690; + add.s32 %r28448, %r28447, %r28446; + xor.b32 %r28449, %r28448, %r28443; + shf.l.wrap.b32 %r28450, %r28449, %r28449, 24; + add.s32 %r28451, %r28450, %r28444; + xor.b32 %r28452, %r28451, %r28446; + shf.l.wrap.b32 %r28453, %r28452, %r28452, 25; + add.s32 %r28454, %r28406, %r27668; + add.s32 %r28455, %r28454, %r28369; + xor.b32 %r28456, %r28455, %r28394; + shf.l.wrap.b32 %r28457, %r28456, %r28456, 16; + add.s32 %r28458, %r28457, %r28381; + xor.b32 %r28459, %r28458, %r28369; + shf.l.wrap.b32 %r28460, %r28459, %r28459, 20; + add.s32 %r28461, %r28455, %r27701; + add.s32 %r28462, %r28461, %r28460; + xor.b32 %r28463, %r28462, %r28457; + shf.l.wrap.b32 %r28464, %r28463, %r28463, 24; + add.s32 %r28465, %r28464, %r28458; + xor.b32 %r28466, %r28465, %r28460; + shf.l.wrap.b32 %r28467, %r28466, %r28466, 25; + add.s32 %r28468, %r28420, %r27745; + add.s32 %r28469, %r28468, %r28467; + xor.b32 %r28470, %r28469, %r28436; + 
shf.l.wrap.b32 %r28471, %r28470, %r28470, 16; + add.s32 %r28472, %r28471, %r28451; + xor.b32 %r28473, %r28472, %r28467; + shf.l.wrap.b32 %r28474, %r28473, %r28473, 20; + add.s32 %r28475, %r28469, %r27789; + add.s32 %r28476, %r28475, %r28474; + xor.b32 %r28477, %r28476, %r28471; + shf.l.wrap.b32 %r28478, %r28477, %r28477, 24; + add.s32 %r28479, %r28478, %r28472; + xor.b32 %r28480, %r28479, %r28474; + shf.l.wrap.b32 %r28481, %r28480, %r28480, 25; + add.s32 %r28482, %r28434, %r27679; + add.s32 %r28483, %r28482, %r28425; + xor.b32 %r28484, %r28483, %r28450; + shf.l.wrap.b32 %r28485, %r28484, %r28484, 16; + add.s32 %r28486, %r28485, %r28465; + xor.b32 %r28487, %r28486, %r28425; + shf.l.wrap.b32 %r28488, %r28487, %r28487, 20; + add.s32 %r28489, %r28483, %r27624; + add.s32 %r28490, %r28489, %r28488; + xor.b32 %r28491, %r28490, %r28485; + shf.l.wrap.b32 %r28492, %r28491, %r28491, 24; + add.s32 %r28493, %r28492, %r28486; + xor.b32 %r28494, %r28493, %r28488; + shf.l.wrap.b32 %r28495, %r28494, %r28494, 25; + add.s32 %r28496, %r28448, %r27635; + add.s32 %r28497, %r28496, %r28439; + xor.b32 %r28498, %r28464, %r28497; + shf.l.wrap.b32 %r28499, %r28498, %r28498, 16; + add.s32 %r28500, %r28499, %r28423; + xor.b32 %r28501, %r28500, %r28439; + shf.l.wrap.b32 %r28502, %r28501, %r28501, 20; + add.s32 %r28503, %r28497, %r27723; + add.s32 %r28504, %r28503, %r28502; + xor.b32 %r28505, %r28504, %r28499; + shf.l.wrap.b32 %r28506, %r28505, %r28505, 24; + add.s32 %r28507, %r28506, %r28500; + xor.b32 %r28508, %r28507, %r28502; + shf.l.wrap.b32 %r28509, %r28508, %r28508, 25; + add.s32 %r28510, %r28462, %r27712; + add.s32 %r28511, %r28510, %r28453; + xor.b32 %r28512, %r28422, %r28511; + shf.l.wrap.b32 %r28513, %r28512, %r28512, 16; + add.s32 %r28514, %r28513, %r28437; + xor.b32 %r28515, %r28514, %r28453; + shf.l.wrap.b32 %r28516, %r28515, %r28515, 20; + add.s32 %r28517, %r28511, %r27690; + add.s32 %r28518, %r28517, %r28516; + xor.b32 %r28519, %r28518, %r28513; + shf.l.wrap.b32 %r28520, %r28519, %r28519, 24; + add.s32 %r28521, %r28520, %r28514; + xor.b32 %r28522, %r28521, %r28516; + shf.l.wrap.b32 %r28523, %r28522, %r28522, 25; + add.s32 %r28524, %r28476, %r27778; + add.s32 %r28525, %r28524, %r28495; + xor.b32 %r28526, %r28525, %r28520; + shf.l.wrap.b32 %r28527, %r28526, %r28526, 16; + add.s32 %r28528, %r28527, %r28507; + xor.b32 %r28529, %r28528, %r28495; + shf.l.wrap.b32 %r28530, %r28529, %r28529, 20; + add.s32 %r28531, %r28525, %r27734; + add.s32 %r28532, %r28531, %r28530; + xor.b32 %r28533, %r28532, %r28527; + shf.l.wrap.b32 %r28534, %r28533, %r28533, 24; + add.s32 %r28535, %r28534, %r28528; + xor.b32 %r28536, %r28535, %r28530; + shf.l.wrap.b32 %r28537, %r28536, %r28536, 25; + add.s32 %r28538, %r28509, %r27646; + add.s32 %r28539, %r28538, %r28490; + xor.b32 %r28540, %r28478, %r28539; + shf.l.wrap.b32 %r28541, %r28540, %r28540, 16; + add.s32 %r28542, %r28541, %r28521; + xor.b32 %r28543, %r28542, %r28509; + shf.l.wrap.b32 %r28544, %r28543, %r28543, 20; + add.s32 %r28545, %r28539, %r27756; + add.s32 %r28546, %r28545, %r28544; + xor.b32 %r28547, %r28546, %r28541; + shf.l.wrap.b32 %r28548, %r28547, %r28547, 24; + add.s32 %r28549, %r28548, %r28542; + xor.b32 %r28550, %r28549, %r28544; + shf.l.wrap.b32 %r28551, %r28550, %r28550, 25; + add.s32 %r28552, %r28504, %r27657; + add.s32 %r28553, %r28552, %r28523; + xor.b32 %r28554, %r28492, %r28553; + shf.l.wrap.b32 %r28555, %r28554, %r28554, 16; + add.s32 %r28556, %r28555, %r28479; + xor.b32 %r28557, %r28556, %r28523; + shf.l.wrap.b32 %r28558, %r28557, %r28557, 20; + add.s32 
%r28559, %r28553, %r27668; + add.s32 %r28560, %r28559, %r28558; + xor.b32 %r28561, %r28560, %r28555; + shf.l.wrap.b32 %r28562, %r28561, %r28561, 24; + add.s32 %r28563, %r28562, %r28556; + xor.b32 %r28564, %r28563, %r28558; + shf.l.wrap.b32 %r28565, %r28564, %r28564, 25; + add.s32 %r28566, %r28518, %r27701; + add.s32 %r28567, %r28566, %r28481; + xor.b32 %r28568, %r28567, %r28506; + shf.l.wrap.b32 %r28569, %r28568, %r28568, 16; + add.s32 %r28570, %r28569, %r28493; + xor.b32 %r28571, %r28570, %r28481; + shf.l.wrap.b32 %r28572, %r28571, %r28571, 20; + add.s32 %r28573, %r28567, %r27767; + add.s32 %r28574, %r28573, %r28572; + xor.b32 %r28575, %r28574, %r28569; + shf.l.wrap.b32 %r28576, %r28575, %r28575, 24; + add.s32 %r28577, %r28576, %r28570; + xor.b32 %r28578, %r28577, %r28572; + shf.l.wrap.b32 %r28579, %r28578, %r28578, 25; + xor.b32 %r28580, %r28532, %r28563; + cvt.u64.u32 %rd342, %r28580; + xor.b32 %r28581, %r28577, %r28546; + and.b32 %r28582, %r28581, 255; + cvt.u64.u32 %rd1147, %r28582; + cvt.u64.u32 %rd1148, %r28581; + shl.b64 %rd1149, %rd1148, 32; + and.b64 %rd1150, %rd1149, 280375465082880; + and.b64 %rd1151, %rd1149, 71776119061217280; + shr.u32 %r28583, %r28581, 24; + cvt.u64.u32 %rd1152, %r28583; + shl.b64 %rd1153, %rd1152, 56; + bfi.b64 %rd1154, %rd1147, %rd342, 32, 32; + or.b64 %rd1155, %rd1154, %rd1150; + or.b64 %rd1156, %rd1155, %rd1151; + or.b64 %rd341, %rd1156, %rd1153; + xor.b32 %r28584, %r28535, %r28560; + cvt.u64.u32 %rd1157, %r28584; + xor.b32 %r28585, %r28574, %r28549; + and.b32 %r28586, %r28585, 255; + cvt.u64.u32 %rd1158, %r28586; + cvt.u64.u32 %rd1159, %r28585; + shl.b64 %rd1160, %rd1159, 32; + and.b64 %rd1161, %rd1160, 280375465082880; + and.b64 %rd1162, %rd1160, 71776119061217280; + shr.u32 %r28587, %r28585, 24; + cvt.u64.u32 %rd1163, %r28587; + shl.b64 %rd1164, %rd1163, 56; + bfi.b64 %rd1165, %rd1158, %rd1157, 32, 32; + or.b64 %rd1166, %rd1165, %rd1161; + or.b64 %rd1167, %rd1166, %rd1162; + or.b64 %rd345, %rd1167, %rd1164; + xor.b32 %r28588, %r28579, %r28548; + cvt.u64.u32 %rd1168, %r28588; + xor.b32 %r28589, %r28537, %r28562; + and.b32 %r28590, %r28589, 255; + cvt.u64.u32 %rd1169, %r28590; + cvt.u64.u32 %rd1170, %r28589; + shl.b64 %rd1171, %rd1170, 32; + and.b64 %rd1172, %rd1171, 280375465082880; + and.b64 %rd1173, %rd1171, 71776119061217280; + shr.u32 %r28591, %r28589, 24; + cvt.u64.u32 %rd1174, %r28591; + shl.b64 %rd1175, %rd1174, 56; + bfi.b64 %rd1176, %rd1169, %rd1168, 32, 32; + or.b64 %rd1177, %rd1176, %rd1172; + or.b64 %rd1178, %rd1177, %rd1173; + or.b64 %rd1280, %rd1178, %rd1175; + xor.b32 %r28592, %r28576, %r28551; + cvt.u64.u32 %rd1179, %r28592; + xor.b32 %r28593, %r28534, %r28565; + and.b32 %r28594, %r28593, 255; + cvt.u64.u32 %rd1180, %r28594; + cvt.u64.u32 %rd1181, %r28593; + shl.b64 %rd1182, %rd1181, 32; + and.b64 %rd1183, %rd1182, 280375465082880; + and.b64 %rd1184, %rd1182, 71776119061217280; + shr.u32 %r28595, %r28593, 24; + cvt.u64.u32 %rd1185, %r28595; + shl.b64 %rd1186, %rd1185, 56; + bfi.b64 %rd1187, %rd1180, %rd1179, 32, 32; + or.b64 %rd1188, %rd1187, %rd1183; + or.b64 %rd1189, %rd1188, %rd1184; + or.b64 %rd1279, %rd1189, %rd1186; + +$L__BB2_104: + ld.const.u64 %rd346, [target+24]; + setp.eq.s64 %p59, %rd1279, %rd346; + @%p59 bra $L__BB2_106; + bra.uni $L__BB2_105; + +$L__BB2_106: + ld.const.u64 %rd347, [target+16]; + setp.eq.s64 %p60, %rd1280, %rd347; + @%p60 bra $L__BB2_108; + bra.uni $L__BB2_107; + +$L__BB2_108: + ld.const.u64 %rd348, [target+8]; + setp.eq.s64 %p61, %rd345, %rd348; + @%p61 bra $L__BB2_110; + bra.uni $L__BB2_109; + 
+$L__BB2_110: + and.b64 %rd1234, %rd342, 255; + and.b64 %rd1235, %rd341, -256; + or.b64 %rd1236, %rd1235, %rd1234; + ld.const.u64 %rd1237, [target]; + setp.lt.u64 %p63, %rd1236, %rd1237; + bra.uni $L__BB2_111; + +$L__BB2_105: + setp.lt.u64 %p63, %rd1279, %rd346; + bra.uni $L__BB2_111; + +$L__BB2_107: + setp.lt.u64 %p63, %rd1280, %rd347; + bra.uni $L__BB2_111; + +$L__BB2_109: + setp.lt.u64 %p63, %rd345, %rd348; + +$L__BB2_111: + not.pred %p62, %p63; + @%p62 bra $L__BB2_113; + + ld.param.u64 %rd1247, [heavy_hash_param_0]; + ld.param.u64 %rd1246, [heavy_hash_param_1]; + and.b64 %rd1245, %rd1255, %rd1247; + or.b64 %rd1244, %rd1245, %rd1246; + ld.param.u64 %rd1243, [heavy_hash_param_5]; + cvta.to.global.u64 %rd1238, %rd1243; + mov.u64 %rd1239, 0; + atom.global.cas.b64 %rd1240, [%rd1238], %rd1239, %rd1244; + +$L__BB2_113: ret; } diff --git a/plugins/cuda/resources/kaspa-cuda-sm86.ptx b/plugins/cuda/resources/kaspa-cuda-sm86.ptx index b1f2fa5..6b95b47 100644 --- a/plugins/cuda/resources/kaspa-cuda-sm86.ptx +++ b/plugins/cuda/resources/kaspa-cuda-sm86.ptx @@ -10,7 +10,12 @@ .target sm_86 .address_size 64 - // .globl heavy_hash +.extern .func (.param .b32 func_retval0) vprintf +( + .param .b64 vprintf_param_0, + .param .b64 vprintf_param_1 +) +; .global .align 4 .b8 IV[32] = {103, 230, 9, 106, 133, 174, 103, 187, 114, 243, 110, 60, 58, 245, 79, 165, 127, 82, 14, 81, 140, 104, 5, 155, 171, 217, 131, 31, 25, 205, 224, 91}; .global .align 1 .b8 MSG_SCHEDULE[112] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8, 3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1, 10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6, 12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4, 9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7, 11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}; .global .align 1 .b8 rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44}; @@ -18,7063 +23,41900 @@ .global .align 8 .b8 RC[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; .global .align 8 .b8 _ZZ15xoshiro256_jumpP10ulonglong4E4JUMP[32] = {186, 10, 253, 60, 211, 198, 14, 24, 44, 57, 201, 240, 102, 18, 166, 213, 170, 201, 63, 224, 24, 38, 88, 169, 28, 102, 177, 41, 69, 220, 171, 57}; .global .align 8 .b8 _ZZ20xoshiro256_long_jumpP10ulonglong4E9LONG_JUMP[32] = {191, 203, 253, 254, 62, 93, 225, 118, 179, 47, 82, 28, 68, 78, 0, 197, 65, 226, 78, 133, 105, 0, 113, 119, 53, 230, 203, 42, 176, 155, 16, 57}; -.const .align 4 .b8 matrix[4096]; -.const .align 8 .b8 hash_header[72]; +.const .align 8 .b8 keccak_round_constants[192] = {1, 0, 0, 0, 0, 0, 0, 0, 130, 128, 0, 0, 0, 0, 0, 0, 138, 128, 0, 0, 0, 0, 0, 128, 0, 128, 0, 128, 0, 0, 0, 128, 139, 128, 0, 0, 0, 0, 0, 0, 1, 0, 0, 128, 0, 0, 0, 0, 129, 128, 0, 128, 0, 0, 0, 128, 9, 128, 0, 0, 0, 0, 0, 128, 138, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 0, 0, 0, 
0, 9, 128, 0, 128, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 0, 139, 128, 0, 128, 0, 0, 0, 0, 139, 0, 0, 0, 0, 0, 0, 128, 137, 128, 0, 0, 0, 0, 0, 128, 3, 128, 0, 0, 0, 0, 0, 128, 2, 128, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 128, 10, 128, 0, 0, 0, 0, 0, 0, 10, 0, 0, 128, 0, 0, 0, 128, 129, 128, 0, 128, 0, 0, 0, 128, 128, 128, 0, 0, 0, 0, 0, 128, 1, 0, 0, 128, 0, 0, 0, 0, 8, 128, 0, 128, 0, 0, 0, 128}; +.global .align 8 .u64 _ZZN10item_state6updateEjE9num_words = 16; +.global .align 8 .u64 _ZZ15fishhash_kernelRK16fishhash_contextRK7hash512E9num_words = 32; +.const .align 1 .b8 matrix[4096]; +.const .align 1 .b8 hash_header[72]; .const .align 8 .b8 target[32]; .const .align 1 .b8 powP[200] = {61, 216, 246, 161, 13, 255, 60, 17, 60, 126, 2, 183, 85, 136, 191, 41, 210, 68, 251, 14, 114, 46, 95, 30, 160, 105, 152, 245, 163, 164, 165, 27, 101, 45, 94, 135, 202, 175, 47, 123, 70, 226, 220, 41, 214, 97, 239, 74, 16, 91, 65, 173, 30, 152, 58, 24, 156, 194, 155, 120, 12, 246, 107, 119, 64, 49, 102, 136, 51, 241, 235, 248, 240, 95, 40, 67, 60, 28, 101, 46, 10, 74, 241, 64, 5, 7, 150, 15, 82, 145, 41, 91, 135, 103, 227, 68, 21, 55, 177, 37, 164, 241, 112, 236, 137, 218, 233, 130, 143, 93, 200, 230, 35, 178, 180, 133, 31, 96, 26, 178, 70, 106, 163, 100, 144, 84, 133, 52, 26, 133, 47, 122, 28, 221, 6, 15, 66, 177, 59, 86, 29, 2, 162, 193, 228, 104, 22, 69, 228, 229, 29, 186, 141, 95, 9, 5, 65, 87, 2, 209, 74, 207, 206, 155, 132, 78, 202, 137, 219, 46, 116, 168, 39, 148, 176, 72, 114, 82, 139, 231, 156, 206, 252, 177, 188, 165, 175, 130, 207, 41, 17, 93, 131, 67, 130, 111, 120, 124, 185, 2}; .const .align 1 .b8 heavyP[200] = {9, 133, 36, 178, 82, 76, 215, 58, 22, 66, 159, 47, 14, 155, 98, 121, 238, 248, 199, 22, 72, 255, 20, 122, 152, 100, 5, 128, 76, 95, 167, 17, 218, 206, 238, 68, 223, 224, 32, 231, 105, 64, 243, 20, 46, 216, 199, 114, 186, 53, 137, 147, 42, 255, 0, 193, 98, 196, 15, 37, 64, 144, 33, 94, 72, 106, 207, 13, 166, 249, 57, 128, 12, 61, 42, 121, 159, 170, 188, 160, 38, 162, 169, 208, 93, 192, 49, 244, 63, 140, 193, 84, 195, 76, 31, 211, 61, 204, 105, 167, 1, 125, 107, 108, 228, 147, 36, 86, 211, 91, 198, 46, 68, 176, 205, 153, 58, 75, 247, 78, 176, 242, 52, 84, 131, 134, 76, 119, 22, 148, 188, 54, 176, 97, 233, 7, 7, 204, 101, 119, 177, 29, 143, 126, 57, 109, 196, 186, 128, 219, 143, 234, 88, 202, 52, 123, 211, 242, 146, 185, 87, 185, 129, 132, 4, 197, 118, 199, 46, 194, 18, 81, 103, 159, 195, 71, 10, 12, 41, 181, 157, 57, 187, 146, 21, 198, 159, 47, 49, 224, 154, 84, 53, 218, 185, 16, 125, 50, 25, 22}; +.global .align 1 .b8 $str[5] = {37, 48, 50, 120, 0}; +.global .align 1 .b8 $str$1[2] = {10, 0}; +.global .align 1 .b8 $str$2[32] = {104, 101, 97, 118, 121, 95, 104, 97, 115, 104, 32, 84, 104, 114, 101, 97, 100, 32, 37, 100, 44, 32, 66, 108, 111, 99, 107, 32, 37, 100, 10, 0}; +.global .align 1 .b8 $str$3[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 48, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$4[22] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 52, 50, 93, 32, 105, 115, 32, 58, 32, 0}; +.global .align 1 .b8 $str$5[25] = {84, 104, 101, 32, 100, 97, 116, 97, 115, 101, 116, 91, 49, 50, 51, 52, 53, 93, 32, 105, 115, 32, 58, 32, 0}; -.visible .entry heavy_hash( - .param .u64 heavy_hash_param_0, - .param .u64 heavy_hash_param_1, - .param .u64 heavy_hash_param_2, - .param .u8 heavy_hash_param_3, - .param .u64 heavy_hash_param_4, - .param .u64 heavy_hash_param_5 +.func (.param .b64 func_retval0) 
_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh( + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3, + .param .b32 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4, + .param .b64 _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5 ) { - .local .align 8 .b8 __local_depot0[1912]; + .local .align 16 .b8 __local_depot0[224]; .reg .b64 %SP; .reg .b64 %SPL; - .reg .pred %p<17>; - .reg .b16 %rs<113>; - .reg .b32 %r<6245>; - .reg .b64 %rd<490>; + .reg .pred %p<28>; + .reg .b16 %rs<233>; + .reg .b32 %r<3965>; + .reg .b64 %rd<175>; mov.u64 %SPL, __local_depot0; - ld.param.u8 %rs11, [heavy_hash_param_3]; - ld.param.u64 %rd78, [heavy_hash_param_0]; - ld.param.u64 %rd79, [heavy_hash_param_1]; - ld.param.u64 %rd80, [heavy_hash_param_2]; - ld.param.u64 %rd81, [heavy_hash_param_4]; - ld.param.u64 %rd82, [heavy_hash_param_5]; - cvta.to.global.u64 %rd1, %rd81; - cvta.to.global.u64 %rd2, %rd82; - add.u64 %rd3, %SPL, 0; - mov.u32 %r17, %ntid.x; - mov.u32 %r18, %ctaid.x; - mov.u32 %r19, %tid.x; - mad.lo.s32 %r20, %r18, %r17, %r19; - cvt.s64.s32 %rd4, %r20; - setp.ge.u64 %p6, %rd4, %rd80; - @%p6 bra $L__BB0_19; - - cvt.u32.u64 %r21, %rd4; - setp.ne.s32 %p7, %r21, 0; - @%p7 bra $L__BB0_3; - - mov.u64 %rd84, 0; - st.global.u64 [%rd2], %rd84; - -$L__BB0_3: - setp.eq.s16 %p8, %rs11, 0; - @%p8 bra $L__BB0_5; - - shl.b64 %rd85, %rd4, 5; - add.s64 %rd86, %rd1, %rd85; - ld.global.v2.u64 {%rd87, %rd88}, [%rd86]; - mul.lo.s64 %rd91, %rd88, 5; - { - .reg .b64 %lhs; - .reg .b64 %rhs; - shl.b64 %lhs, %rd91, 7; - shr.b64 %rhs, %rd91, 57; - add.u64 %rd92, %lhs, %rhs; - } - mul.lo.s64 %rd463, %rd92, 9; - shl.b64 %rd93, %rd88, 17; - ld.global.v2.u64 {%rd94, %rd95}, [%rd86+16]; - xor.b64 %rd98, %rd94, %rd87; - xor.b64 %rd99, %rd95, %rd88; - xor.b64 %rd100, %rd88, %rd98; - xor.b64 %rd101, %rd87, %rd99; - st.global.v2.u64 [%rd86], {%rd101, %rd100}; - { - .reg .b32 %dummy; - mov.b64 {%r22,%dummy}, %rd99; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r23}, %rd99; - } - shf.r.wrap.b32 %r24, %r23, %r22, 19; - shf.r.wrap.b32 %r25, %r22, %r23, 19; - mov.b64 %rd102, {%r25, %r24}; - xor.b64 %rd103, %rd98, %rd93; - st.global.v2.u64 [%rd86+16], {%rd103, %rd102}; - bra.uni $L__BB0_6; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs75, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_4]; + ld.param.u64 %rd69, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd171, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + ld.param.u64 %rd71, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_2]; + ld.param.u64 %rd165, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + ld.param.u64 %rd73, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd155, %rd73; + 
cvta.to.local.u64 %rd2, %rd71; + add.u64 %rd153, %SPL, 16; + add.u64 %rd149, %SP, 96; + cvta.to.local.u64 %rd4, %rd149; + setp.lt.u64 %p1, %rd171, 1025; + @%p1 bra $L__BB0_14; + bra.uni $L__BB0_1; + +$L__BB0_14: + add.u64 %rd162, %SPL, 0; + setp.ne.s64 %p16, %rd171, 1024; + mov.u64 %rd159, 0; + mov.u64 %rd151, %rd159; + @%p16 bra $L__BB0_16; + + mov.u64 %rd171, 0; + st.local.u64 [%rd162], %rd69; + mov.u64 %rd151, 1; + mov.u64 %rd159, 1024; + +$L__BB0_16: + setp.eq.s64 %p17, %rd151, 0; + @%p17 bra $L__BB0_21; + + or.b16 %rs1, %rs75, 1; + mov.u64 %rd163, %rd151; + +$L__BB0_18: + ld.local.u64 %rd166, [%rd162]; + ld.local.u8 %r1060, [%rd2]; + ld.local.u8 %r1061, [%rd2+1]; + prmt.b32 %r1062, %r1061, %r1060, 30212; + ld.local.u8 %r1063, [%rd2+2]; + ld.local.u8 %r1064, [%rd2+3]; + prmt.b32 %r1065, %r1064, %r1063, 30212; + prmt.b32 %r3948, %r1065, %r1062, 4180; + ld.local.u8 %r1066, [%rd2+4]; + ld.local.u8 %r1067, [%rd2+5]; + prmt.b32 %r1068, %r1067, %r1066, 30212; + ld.local.u8 %r1069, [%rd2+6]; + ld.local.u8 %r1070, [%rd2+7]; + prmt.b32 %r1071, %r1070, %r1069, 30212; + prmt.b32 %r3947, %r1071, %r1068, 4180; + ld.local.u8 %r1072, [%rd2+8]; + ld.local.u8 %r1073, [%rd2+9]; + prmt.b32 %r1074, %r1073, %r1072, 30212; + ld.local.u8 %r1075, [%rd2+10]; + ld.local.u8 %r1076, [%rd2+11]; + prmt.b32 %r1077, %r1076, %r1075, 30212; + prmt.b32 %r3946, %r1077, %r1074, 4180; + ld.local.u8 %r1078, [%rd2+12]; + ld.local.u8 %r1079, [%rd2+13]; + prmt.b32 %r1080, %r1079, %r1078, 30212; + ld.local.u8 %r1081, [%rd2+14]; + ld.local.u8 %r1082, [%rd2+15]; + prmt.b32 %r1083, %r1082, %r1081, 30212; + prmt.b32 %r3945, %r1083, %r1080, 4180; + mov.u64 %rd167, 16; + ld.local.u8 %r1084, [%rd2+16]; + ld.local.u8 %r1085, [%rd2+17]; + prmt.b32 %r1086, %r1085, %r1084, 30212; + ld.local.u8 %r1087, [%rd2+18]; + ld.local.u8 %r1088, [%rd2+19]; + prmt.b32 %r1089, %r1088, %r1087, 30212; + prmt.b32 %r3944, %r1089, %r1086, 4180; + ld.local.u8 %r1090, [%rd2+20]; + ld.local.u8 %r1091, [%rd2+21]; + prmt.b32 %r1092, %r1091, %r1090, 30212; + ld.local.u8 %r1093, [%rd2+22]; + ld.local.u8 %r1094, [%rd2+23]; + prmt.b32 %r1095, %r1094, %r1093, 30212; + prmt.b32 %r3943, %r1095, %r1092, 4180; + ld.local.u8 %r1096, [%rd2+24]; + ld.local.u8 %r1097, [%rd2+25]; + prmt.b32 %r1098, %r1097, %r1096, 30212; + ld.local.u8 %r1099, [%rd2+26]; + ld.local.u8 %r1100, [%rd2+27]; + prmt.b32 %r1101, %r1100, %r1099, 30212; + prmt.b32 %r3942, %r1101, %r1098, 4180; + ld.local.u8 %r1102, [%rd2+28]; + ld.local.u8 %r1103, [%rd2+29]; + prmt.b32 %r1104, %r1103, %r1102, 30212; + ld.local.u8 %r1105, [%rd2+30]; + ld.local.u8 %r1106, [%rd2+31]; + prmt.b32 %r1107, %r1106, %r1105, 30212; + prmt.b32 %r3941, %r1107, %r1104, 4180; + mov.u16 %rs197, %rs1; + +$L__BB0_19: + shr.u64 %rd143, %rd165, 32; + cvt.u32.u64 %r3940, %rd143; + cvt.u32.u64 %r3939, %rd165; + setp.eq.s64 %p18, %rd167, 1; + selp.b16 %rs79, 2, 0, %p18; + or.b16 %rs80, %rs79, %rs197; + ld.u8 %r1108, [%rd166]; + ld.u8 %r1109, [%rd166+1]; + prmt.b32 %r1110, %r1109, %r1108, 30212; + ld.u8 %r1111, [%rd166+2]; + prmt.b32 %r1112, %r1111, %r1110, 28756; + ld.u8 %r1113, [%rd166+3]; + prmt.b32 %r1114, %r1113, %r1112, 1620; + ld.u8 %r1115, [%rd166+4]; + ld.u8 %r1116, [%rd166+5]; + prmt.b32 %r1117, %r1116, %r1115, 30212; + ld.u8 %r1118, [%rd166+6]; + prmt.b32 %r1119, %r1118, %r1117, 28756; + ld.u8 %r1120, [%rd166+7]; + prmt.b32 %r1121, %r1120, %r1119, 1620; + ld.u8 %r1122, [%rd166+8]; + ld.u8 %r1123, [%rd166+9]; + prmt.b32 %r1124, %r1123, %r1122, 30212; + ld.u8 %r1125, [%rd166+10]; + prmt.b32 %r1126, %r1125, %r1124, 28756; + ld.u8 
%r1127, [%rd166+11]; + prmt.b32 %r1128, %r1127, %r1126, 1620; + ld.u8 %r1129, [%rd166+12]; + ld.u8 %r1130, [%rd166+13]; + prmt.b32 %r1131, %r1130, %r1129, 30212; + ld.u8 %r1132, [%rd166+14]; + prmt.b32 %r1133, %r1132, %r1131, 28756; + ld.u8 %r1134, [%rd166+15]; + prmt.b32 %r1135, %r1134, %r1133, 1620; + ld.u8 %r1136, [%rd166+16]; + ld.u8 %r1137, [%rd166+17]; + prmt.b32 %r1138, %r1137, %r1136, 30212; + ld.u8 %r1139, [%rd166+18]; + prmt.b32 %r1140, %r1139, %r1138, 28756; + ld.u8 %r1141, [%rd166+19]; + prmt.b32 %r1142, %r1141, %r1140, 1620; + ld.u8 %r1143, [%rd166+20]; + ld.u8 %r1144, [%rd166+21]; + prmt.b32 %r1145, %r1144, %r1143, 30212; + ld.u8 %r1146, [%rd166+22]; + prmt.b32 %r1147, %r1146, %r1145, 28756; + ld.u8 %r1148, [%rd166+23]; + prmt.b32 %r1149, %r1148, %r1147, 1620; + ld.u8 %r1150, [%rd166+24]; + ld.u8 %r1151, [%rd166+25]; + prmt.b32 %r1152, %r1151, %r1150, 30212; + ld.u8 %r1153, [%rd166+26]; + prmt.b32 %r1154, %r1153, %r1152, 28756; + ld.u8 %r1155, [%rd166+27]; + prmt.b32 %r1156, %r1155, %r1154, 1620; + ld.u8 %r1157, [%rd166+28]; + ld.u8 %r1158, [%rd166+29]; + prmt.b32 %r1159, %r1158, %r1157, 30212; + ld.u8 %r1160, [%rd166+30]; + prmt.b32 %r1161, %r1160, %r1159, 28756; + ld.u8 %r1162, [%rd166+31]; + prmt.b32 %r1163, %r1162, %r1161, 1620; + ld.u8 %r1164, [%rd166+32]; + ld.u8 %r1165, [%rd166+33]; + prmt.b32 %r1166, %r1165, %r1164, 30212; + ld.u8 %r1167, [%rd166+34]; + prmt.b32 %r1168, %r1167, %r1166, 28756; + ld.u8 %r1169, [%rd166+35]; + prmt.b32 %r1170, %r1169, %r1168, 1620; + ld.u8 %r1171, [%rd166+36]; + ld.u8 %r1172, [%rd166+37]; + prmt.b32 %r1173, %r1172, %r1171, 30212; + ld.u8 %r1174, [%rd166+38]; + prmt.b32 %r1175, %r1174, %r1173, 28756; + ld.u8 %r1176, [%rd166+39]; + prmt.b32 %r1177, %r1176, %r1175, 1620; + ld.u8 %r1178, [%rd166+40]; + ld.u8 %r1179, [%rd166+41]; + prmt.b32 %r1180, %r1179, %r1178, 30212; + ld.u8 %r1181, [%rd166+42]; + prmt.b32 %r1182, %r1181, %r1180, 28756; + ld.u8 %r1183, [%rd166+43]; + prmt.b32 %r1184, %r1183, %r1182, 1620; + ld.u8 %r1185, [%rd166+44]; + ld.u8 %r1186, [%rd166+45]; + prmt.b32 %r1187, %r1186, %r1185, 30212; + ld.u8 %r1188, [%rd166+46]; + prmt.b32 %r1189, %r1188, %r1187, 28756; + ld.u8 %r1190, [%rd166+47]; + prmt.b32 %r1191, %r1190, %r1189, 1620; + ld.u8 %r1192, [%rd166+48]; + ld.u8 %r1193, [%rd166+49]; + prmt.b32 %r1194, %r1193, %r1192, 30212; + ld.u8 %r1195, [%rd166+50]; + prmt.b32 %r1196, %r1195, %r1194, 28756; + ld.u8 %r1197, [%rd166+51]; + prmt.b32 %r1198, %r1197, %r1196, 1620; + ld.u8 %r1199, [%rd166+52]; + ld.u8 %r1200, [%rd166+53]; + prmt.b32 %r1201, %r1200, %r1199, 30212; + ld.u8 %r1202, [%rd166+54]; + prmt.b32 %r1203, %r1202, %r1201, 28756; + ld.u8 %r1204, [%rd166+55]; + prmt.b32 %r1205, %r1204, %r1203, 1620; + ld.u8 %r1206, [%rd166+56]; + ld.u8 %r1207, [%rd166+57]; + prmt.b32 %r1208, %r1207, %r1206, 30212; + ld.u8 %r1209, [%rd166+58]; + prmt.b32 %r1210, %r1209, %r1208, 28756; + ld.u8 %r1211, [%rd166+59]; + prmt.b32 %r1212, %r1211, %r1210, 1620; + ld.u8 %r1213, [%rd166+60]; + ld.u8 %r1214, [%rd166+61]; + prmt.b32 %r1215, %r1214, %r1213, 30212; + ld.u8 %r1216, [%rd166+62]; + prmt.b32 %r1217, %r1216, %r1215, 28756; + ld.u8 %r1218, [%rd166+63]; + prmt.b32 %r1219, %r1218, %r1217, 1620; + cvt.u32.u16 %r1220, %rs80; + and.b32 %r1221, %r1220, 255; + add.s32 %r1222, %r3944, %r3948; + add.s32 %r1223, %r1222, %r1114; + xor.b32 %r1224, %r1223, %r3939; + shf.l.wrap.b32 %r1225, %r1224, %r1224, 16; + add.s32 %r1226, %r1225, 1779033703; + xor.b32 %r1227, %r1226, %r3944; + shf.l.wrap.b32 %r1228, %r1227, %r1227, 20; + add.s32 %r1229, %r1121, %r1223; + 
add.s32 %r1230, %r1229, %r1228; + xor.b32 %r1231, %r1230, %r1225; + shf.l.wrap.b32 %r1232, %r1231, %r1231, 24; + add.s32 %r1233, %r1232, %r1226; + xor.b32 %r1234, %r1233, %r1228; + shf.l.wrap.b32 %r1235, %r1234, %r1234, 25; + add.s32 %r1236, %r3943, %r3947; + add.s32 %r1237, %r1236, %r1128; + xor.b32 %r1238, %r1237, %r3940; + shf.l.wrap.b32 %r1239, %r1238, %r1238, 16; + add.s32 %r1240, %r1239, -1150833019; + xor.b32 %r1241, %r1240, %r3943; + shf.l.wrap.b32 %r1242, %r1241, %r1241, 20; + add.s32 %r1243, %r1135, %r1237; + add.s32 %r1244, %r1243, %r1242; + xor.b32 %r1245, %r1244, %r1239; + shf.l.wrap.b32 %r1246, %r1245, %r1245, 24; + add.s32 %r1247, %r1246, %r1240; + xor.b32 %r1248, %r1247, %r1242; + shf.l.wrap.b32 %r1249, %r1248, %r1248, 25; + add.s32 %r1250, %r3942, %r3946; + add.s32 %r1251, %r1250, %r1142; + shr.u32 %r1252, %r1251, 16; + shl.b32 %r1253, %r1251, 16; + xor.b32 %r1254, %r1253, 4194304; + or.b32 %r1255, %r1254, %r1252; + add.s32 %r1256, %r1255, 1013904242; + xor.b32 %r1257, %r1256, %r3942; + shf.l.wrap.b32 %r1258, %r1257, %r1257, 20; + add.s32 %r1259, %r1149, %r1251; + add.s32 %r1260, %r1259, %r1258; + xor.b32 %r1261, %r1260, %r1255; + shf.l.wrap.b32 %r1262, %r1261, %r1261, 24; + add.s32 %r1263, %r1262, %r1256; + xor.b32 %r1264, %r1263, %r1258; + shf.l.wrap.b32 %r1265, %r1264, %r1264, 25; + add.s32 %r1266, %r3941, %r3945; + add.s32 %r1267, %r1266, %r1156; + xor.b32 %r1268, %r1267, %r1221; + shr.u32 %r1269, %r1267, 16; + shl.b32 %r1270, %r1268, 16; + or.b32 %r1271, %r1270, %r1269; + add.s32 %r1272, %r1271, -1521486534; + xor.b32 %r1273, %r1272, %r3941; + shf.l.wrap.b32 %r1274, %r1273, %r1273, 20; + add.s32 %r1275, %r1163, %r1267; + add.s32 %r1276, %r1275, %r1274; + xor.b32 %r1277, %r1276, %r1271; + shf.l.wrap.b32 %r1278, %r1277, %r1277, 24; + add.s32 %r1279, %r1278, %r1272; + xor.b32 %r1280, %r1279, %r1274; + shf.l.wrap.b32 %r1281, %r1280, %r1280, 25; + add.s32 %r1282, %r1249, %r1230; + add.s32 %r1283, %r1282, %r1170; + xor.b32 %r1284, %r1278, %r1283; + shf.l.wrap.b32 %r1285, %r1284, %r1284, 16; + add.s32 %r1286, %r1285, %r1263; + xor.b32 %r1287, %r1286, %r1249; + shf.l.wrap.b32 %r1288, %r1287, %r1287, 20; + add.s32 %r1289, %r1177, %r1283; + add.s32 %r1290, %r1289, %r1288; + xor.b32 %r1291, %r1290, %r1285; + shf.l.wrap.b32 %r1292, %r1291, %r1291, 24; + add.s32 %r1293, %r1292, %r1286; + xor.b32 %r1294, %r1293, %r1288; + shf.l.wrap.b32 %r1295, %r1294, %r1294, 25; + add.s32 %r1296, %r1265, %r1244; + add.s32 %r1297, %r1296, %r1184; + xor.b32 %r1298, %r1297, %r1232; + shf.l.wrap.b32 %r1299, %r1298, %r1298, 16; + add.s32 %r1300, %r1299, %r1279; + xor.b32 %r1301, %r1300, %r1265; + shf.l.wrap.b32 %r1302, %r1301, %r1301, 20; + add.s32 %r1303, %r1191, %r1297; + add.s32 %r1304, %r1303, %r1302; + xor.b32 %r1305, %r1304, %r1299; + shf.l.wrap.b32 %r1306, %r1305, %r1305, 24; + add.s32 %r1307, %r1306, %r1300; + xor.b32 %r1308, %r1307, %r1302; + shf.l.wrap.b32 %r1309, %r1308, %r1308, 25; + add.s32 %r1310, %r1281, %r1260; + add.s32 %r1311, %r1310, %r1198; + xor.b32 %r1312, %r1311, %r1246; + shf.l.wrap.b32 %r1313, %r1312, %r1312, 16; + add.s32 %r1314, %r1313, %r1233; + xor.b32 %r1315, %r1314, %r1281; + shf.l.wrap.b32 %r1316, %r1315, %r1315, 20; + add.s32 %r1317, %r1205, %r1311; + add.s32 %r1318, %r1317, %r1316; + xor.b32 %r1319, %r1318, %r1313; + shf.l.wrap.b32 %r1320, %r1319, %r1319, 24; + add.s32 %r1321, %r1320, %r1314; + xor.b32 %r1322, %r1321, %r1316; + shf.l.wrap.b32 %r1323, %r1322, %r1322, 25; + add.s32 %r1324, %r1276, %r1235; + add.s32 %r1325, %r1324, %r1212; + xor.b32 %r1326, %r1325, 
%r1262; + shf.l.wrap.b32 %r1327, %r1326, %r1326, 16; + add.s32 %r1328, %r1327, %r1247; + xor.b32 %r1329, %r1328, %r1235; + shf.l.wrap.b32 %r1330, %r1329, %r1329, 20; + add.s32 %r1331, %r1219, %r1325; + add.s32 %r1332, %r1331, %r1330; + xor.b32 %r1333, %r1332, %r1327; + shf.l.wrap.b32 %r1334, %r1333, %r1333, 24; + add.s32 %r1335, %r1334, %r1328; + xor.b32 %r1336, %r1335, %r1330; + shf.l.wrap.b32 %r1337, %r1336, %r1336, 25; + add.s32 %r1338, %r1290, %r1128; + add.s32 %r1339, %r1338, %r1337; + xor.b32 %r1340, %r1339, %r1306; + shf.l.wrap.b32 %r1341, %r1340, %r1340, 16; + add.s32 %r1342, %r1341, %r1321; + xor.b32 %r1343, %r1342, %r1337; + shf.l.wrap.b32 %r1344, %r1343, %r1343, 20; + add.s32 %r1345, %r1339, %r1156; + add.s32 %r1346, %r1345, %r1344; + xor.b32 %r1347, %r1346, %r1341; + shf.l.wrap.b32 %r1348, %r1347, %r1347, 24; + add.s32 %r1349, %r1348, %r1342; + xor.b32 %r1350, %r1349, %r1344; + shf.l.wrap.b32 %r1351, %r1350, %r1350, 25; + add.s32 %r1352, %r1304, %r1135; + add.s32 %r1353, %r1352, %r1295; + xor.b32 %r1354, %r1320, %r1353; + shf.l.wrap.b32 %r1355, %r1354, %r1354, 16; + add.s32 %r1356, %r1335, %r1355; + xor.b32 %r1357, %r1356, %r1295; + shf.l.wrap.b32 %r1358, %r1357, %r1357, 20; + add.s32 %r1359, %r1353, %r1184; + add.s32 %r1360, %r1359, %r1358; + xor.b32 %r1361, %r1360, %r1355; + shf.l.wrap.b32 %r1362, %r1361, %r1361, 24; + add.s32 %r1363, %r1362, %r1356; + xor.b32 %r1364, %r1363, %r1358; + shf.l.wrap.b32 %r1365, %r1364, %r1364, 25; + add.s32 %r1366, %r1309, %r1163; + add.s32 %r1367, %r1366, %r1318; + xor.b32 %r1368, %r1334, %r1367; + shf.l.wrap.b32 %r1369, %r1368, %r1368, 16; + add.s32 %r1370, %r1369, %r1293; + xor.b32 %r1371, %r1370, %r1309; + shf.l.wrap.b32 %r1372, %r1371, %r1371, 20; + add.s32 %r1373, %r1367, %r1114; + add.s32 %r1374, %r1373, %r1372; + xor.b32 %r1375, %r1374, %r1369; + shf.l.wrap.b32 %r1376, %r1375, %r1375, 24; + add.s32 %r1377, %r1376, %r1370; + xor.b32 %r1378, %r1377, %r1372; + shf.l.wrap.b32 %r1379, %r1378, %r1378, 25; + add.s32 %r1380, %r1323, %r1142; + add.s32 %r1381, %r1380, %r1332; + xor.b32 %r1382, %r1381, %r1292; + shf.l.wrap.b32 %r1383, %r1382, %r1382, 16; + add.s32 %r1384, %r1383, %r1307; + xor.b32 %r1385, %r1384, %r1323; + shf.l.wrap.b32 %r1386, %r1385, %r1385, 20; + add.s32 %r1387, %r1381, %r1205; + add.s32 %r1388, %r1387, %r1386; + xor.b32 %r1389, %r1388, %r1383; + shf.l.wrap.b32 %r1390, %r1389, %r1389, 24; + add.s32 %r1391, %r1390, %r1384; + xor.b32 %r1392, %r1391, %r1386; + shf.l.wrap.b32 %r1393, %r1392, %r1392, 25; + add.s32 %r1394, %r1365, %r1121; + add.s32 %r1395, %r1394, %r1346; + xor.b32 %r1396, %r1395, %r1390; + shf.l.wrap.b32 %r1397, %r1396, %r1396, 16; + add.s32 %r1398, %r1397, %r1377; + xor.b32 %r1399, %r1398, %r1365; + shf.l.wrap.b32 %r1400, %r1399, %r1399, 20; + add.s32 %r1401, %r1395, %r1191; + add.s32 %r1402, %r1401, %r1400; + xor.b32 %r1403, %r1402, %r1397; + shf.l.wrap.b32 %r1404, %r1403, %r1403, 24; + add.s32 %r1405, %r1404, %r1398; + xor.b32 %r1406, %r1405, %r1400; + shf.l.wrap.b32 %r1407, %r1406, %r1406, 25; + add.s32 %r1408, %r1360, %r1198; + add.s32 %r1409, %r1408, %r1379; + xor.b32 %r1410, %r1348, %r1409; + shf.l.wrap.b32 %r1411, %r1410, %r1410, 16; + add.s32 %r1412, %r1411, %r1391; + xor.b32 %r1413, %r1412, %r1379; + shf.l.wrap.b32 %r1414, %r1413, %r1413, 20; + add.s32 %r1415, %r1409, %r1149; + add.s32 %r1416, %r1415, %r1414; + xor.b32 %r1417, %r1416, %r1411; + shf.l.wrap.b32 %r1418, %r1417, %r1417, 24; + add.s32 %r1419, %r1418, %r1412; + xor.b32 %r1420, %r1419, %r1414; + shf.l.wrap.b32 %r1421, %r1420, %r1420, 25; + 
add.s32 %r1422, %r1374, %r1177; + add.s32 %r1423, %r1422, %r1393; + xor.b32 %r1424, %r1423, %r1362; + shf.l.wrap.b32 %r1425, %r1424, %r1424, 16; + add.s32 %r1426, %r1425, %r1349; + xor.b32 %r1427, %r1426, %r1393; + shf.l.wrap.b32 %r1428, %r1427, %r1427, 20; + add.s32 %r1429, %r1423, %r1212; + add.s32 %r1430, %r1429, %r1428; + xor.b32 %r1431, %r1430, %r1425; + shf.l.wrap.b32 %r1432, %r1431, %r1431, 24; + add.s32 %r1433, %r1432, %r1426; + xor.b32 %r1434, %r1433, %r1428; + shf.l.wrap.b32 %r1435, %r1434, %r1434, 25; + add.s32 %r1436, %r1388, %r1219; + add.s32 %r1437, %r1436, %r1351; + xor.b32 %r1438, %r1437, %r1376; + shf.l.wrap.b32 %r1439, %r1438, %r1438, 16; + add.s32 %r1440, %r1439, %r1363; + xor.b32 %r1441, %r1440, %r1351; + shf.l.wrap.b32 %r1442, %r1441, %r1441, 20; + add.s32 %r1443, %r1437, %r1170; + add.s32 %r1444, %r1443, %r1442; + xor.b32 %r1445, %r1444, %r1439; + shf.l.wrap.b32 %r1446, %r1445, %r1445, 24; + add.s32 %r1447, %r1446, %r1440; + xor.b32 %r1448, %r1447, %r1442; + shf.l.wrap.b32 %r1449, %r1448, %r1448, 25; + add.s32 %r1450, %r1402, %r1135; + add.s32 %r1451, %r1450, %r1449; + xor.b32 %r1452, %r1451, %r1418; + shf.l.wrap.b32 %r1453, %r1452, %r1452, 16; + add.s32 %r1454, %r1453, %r1433; + xor.b32 %r1455, %r1454, %r1449; + shf.l.wrap.b32 %r1456, %r1455, %r1455, 20; + add.s32 %r1457, %r1451, %r1142; + add.s32 %r1458, %r1457, %r1456; + xor.b32 %r1459, %r1458, %r1453; + shf.l.wrap.b32 %r1460, %r1459, %r1459, 24; + add.s32 %r1461, %r1460, %r1454; + xor.b32 %r1462, %r1461, %r1456; + shf.l.wrap.b32 %r1463, %r1462, %r1462, 25; + add.s32 %r1464, %r1416, %r1184; + add.s32 %r1465, %r1464, %r1407; + xor.b32 %r1466, %r1465, %r1432; + shf.l.wrap.b32 %r1467, %r1466, %r1466, 16; + add.s32 %r1468, %r1467, %r1447; + xor.b32 %r1469, %r1468, %r1407; + shf.l.wrap.b32 %r1470, %r1469, %r1469, 20; + add.s32 %r1471, %r1465, %r1198; + add.s32 %r1472, %r1471, %r1470; + xor.b32 %r1473, %r1472, %r1467; + shf.l.wrap.b32 %r1474, %r1473, %r1473, 24; + add.s32 %r1475, %r1474, %r1468; + xor.b32 %r1476, %r1475, %r1470; + shf.l.wrap.b32 %r1477, %r1476, %r1476, 25; + add.s32 %r1478, %r1430, %r1205; + add.s32 %r1479, %r1478, %r1421; + xor.b32 %r1480, %r1446, %r1479; + shf.l.wrap.b32 %r1481, %r1480, %r1480, 16; + add.s32 %r1482, %r1481, %r1405; + xor.b32 %r1483, %r1482, %r1421; + shf.l.wrap.b32 %r1484, %r1483, %r1483, 20; + add.s32 %r1485, %r1479, %r1128; + add.s32 %r1486, %r1485, %r1484; + xor.b32 %r1487, %r1486, %r1481; + shf.l.wrap.b32 %r1488, %r1487, %r1487, 24; + add.s32 %r1489, %r1488, %r1482; + xor.b32 %r1490, %r1489, %r1484; + shf.l.wrap.b32 %r1491, %r1490, %r1490, 25; + add.s32 %r1492, %r1435, %r1163; + add.s32 %r1493, %r1492, %r1444; + xor.b32 %r1494, %r1493, %r1404; + shf.l.wrap.b32 %r1495, %r1494, %r1494, 16; + add.s32 %r1496, %r1495, %r1419; + xor.b32 %r1497, %r1496, %r1435; + shf.l.wrap.b32 %r1498, %r1497, %r1497, 20; + add.s32 %r1499, %r1493, %r1212; + add.s32 %r1500, %r1499, %r1498; + xor.b32 %r1501, %r1500, %r1495; + shf.l.wrap.b32 %r1502, %r1501, %r1501, 24; + add.s32 %r1503, %r1502, %r1496; + xor.b32 %r1504, %r1503, %r1498; + shf.l.wrap.b32 %r1505, %r1504, %r1504, 25; + add.s32 %r1506, %r1477, %r1156; + add.s32 %r1507, %r1506, %r1458; + xor.b32 %r1508, %r1507, %r1502; + shf.l.wrap.b32 %r1509, %r1508, %r1508, 16; + add.s32 %r1510, %r1509, %r1489; + xor.b32 %r1511, %r1510, %r1477; + shf.l.wrap.b32 %r1512, %r1511, %r1511, 20; + add.s32 %r1513, %r1507, %r1149; + add.s32 %r1514, %r1513, %r1512; + xor.b32 %r1515, %r1514, %r1509; + shf.l.wrap.b32 %r1516, %r1515, %r1515, 24; + add.s32 %r1517, %r1516, 
%r1510; + xor.b32 %r1518, %r1517, %r1512; + shf.l.wrap.b32 %r1519, %r1518, %r1518, 25; + add.s32 %r1520, %r1472, %r1177; + add.s32 %r1521, %r1520, %r1491; + xor.b32 %r1522, %r1460, %r1521; + shf.l.wrap.b32 %r1523, %r1522, %r1522, 16; + add.s32 %r1524, %r1523, %r1503; + xor.b32 %r1525, %r1524, %r1491; + shf.l.wrap.b32 %r1526, %r1525, %r1525, 20; + add.s32 %r1527, %r1521, %r1114; + add.s32 %r1528, %r1527, %r1526; + xor.b32 %r1529, %r1528, %r1523; + shf.l.wrap.b32 %r1530, %r1529, %r1529, 24; + add.s32 %r1531, %r1530, %r1524; + xor.b32 %r1532, %r1531, %r1526; + shf.l.wrap.b32 %r1533, %r1532, %r1532, 25; + add.s32 %r1534, %r1486, %r1191; + add.s32 %r1535, %r1534, %r1505; + xor.b32 %r1536, %r1535, %r1474; + shf.l.wrap.b32 %r1537, %r1536, %r1536, 16; + add.s32 %r1538, %r1537, %r1461; + xor.b32 %r1539, %r1538, %r1505; + shf.l.wrap.b32 %r1540, %r1539, %r1539, 20; + add.s32 %r1541, %r1535, %r1219; + add.s32 %r1542, %r1541, %r1540; + xor.b32 %r1543, %r1542, %r1537; + shf.l.wrap.b32 %r1544, %r1543, %r1543, 24; + add.s32 %r1545, %r1544, %r1538; + xor.b32 %r1546, %r1545, %r1540; + shf.l.wrap.b32 %r1547, %r1546, %r1546, 25; + add.s32 %r1548, %r1500, %r1170; + add.s32 %r1549, %r1548, %r1463; + xor.b32 %r1550, %r1549, %r1488; + shf.l.wrap.b32 %r1551, %r1550, %r1550, 16; + add.s32 %r1552, %r1551, %r1475; + xor.b32 %r1553, %r1552, %r1463; + shf.l.wrap.b32 %r1554, %r1553, %r1553, 20; + add.s32 %r1555, %r1549, %r1121; + add.s32 %r1556, %r1555, %r1554; + xor.b32 %r1557, %r1556, %r1551; + shf.l.wrap.b32 %r1558, %r1557, %r1557, 24; + add.s32 %r1559, %r1558, %r1552; + xor.b32 %r1560, %r1559, %r1554; + shf.l.wrap.b32 %r1561, %r1560, %r1560, 25; + add.s32 %r1562, %r1514, %r1184; + add.s32 %r1563, %r1562, %r1561; + xor.b32 %r1564, %r1563, %r1530; + shf.l.wrap.b32 %r1565, %r1564, %r1564, 16; + add.s32 %r1566, %r1565, %r1545; + xor.b32 %r1567, %r1566, %r1561; + shf.l.wrap.b32 %r1568, %r1567, %r1567, 20; + add.s32 %r1569, %r1563, %r1163; + add.s32 %r1570, %r1569, %r1568; + xor.b32 %r1571, %r1570, %r1565; + shf.l.wrap.b32 %r1572, %r1571, %r1571, 24; + add.s32 %r1573, %r1572, %r1566; + xor.b32 %r1574, %r1573, %r1568; + shf.l.wrap.b32 %r1575, %r1574, %r1574, 25; + add.s32 %r1576, %r1528, %r1198; + add.s32 %r1577, %r1576, %r1519; + xor.b32 %r1578, %r1577, %r1544; + shf.l.wrap.b32 %r1579, %r1578, %r1578, 16; + add.s32 %r1580, %r1579, %r1559; + xor.b32 %r1581, %r1580, %r1519; + shf.l.wrap.b32 %r1582, %r1581, %r1581, 20; + add.s32 %r1583, %r1577, %r1177; + add.s32 %r1584, %r1583, %r1582; + xor.b32 %r1585, %r1584, %r1579; + shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; + add.s32 %r1587, %r1586, %r1580; + xor.b32 %r1588, %r1587, %r1582; + shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; + add.s32 %r1590, %r1542, %r1212; + add.s32 %r1591, %r1590, %r1533; + xor.b32 %r1592, %r1558, %r1591; + shf.l.wrap.b32 %r1593, %r1592, %r1592, 16; + add.s32 %r1594, %r1593, %r1517; + xor.b32 %r1595, %r1594, %r1533; + shf.l.wrap.b32 %r1596, %r1595, %r1595, 20; + add.s32 %r1597, %r1591, %r1135; + add.s32 %r1598, %r1597, %r1596; + xor.b32 %r1599, %r1598, %r1593; + shf.l.wrap.b32 %r1600, %r1599, %r1599, 24; + add.s32 %r1601, %r1600, %r1594; + xor.b32 %r1602, %r1601, %r1596; + shf.l.wrap.b32 %r1603, %r1602, %r1602, 25; + add.s32 %r1604, %r1547, %r1205; + add.s32 %r1605, %r1604, %r1556; + xor.b32 %r1606, %r1605, %r1516; + shf.l.wrap.b32 %r1607, %r1606, %r1606, 16; + add.s32 %r1608, %r1607, %r1531; + xor.b32 %r1609, %r1608, %r1547; + shf.l.wrap.b32 %r1610, %r1609, %r1609, 20; + add.s32 %r1611, %r1605, %r1219; + add.s32 %r1612, %r1611, %r1610; + xor.b32 %r1613, 
%r1612, %r1607; + shf.l.wrap.b32 %r1614, %r1613, %r1613, 24; + add.s32 %r1615, %r1614, %r1608; + xor.b32 %r1616, %r1615, %r1610; + shf.l.wrap.b32 %r1617, %r1616, %r1616, 25; + add.s32 %r1618, %r1589, %r1142; + add.s32 %r1619, %r1618, %r1570; + xor.b32 %r1620, %r1619, %r1614; + shf.l.wrap.b32 %r1621, %r1620, %r1620, 16; + add.s32 %r1622, %r1621, %r1601; + xor.b32 %r1623, %r1622, %r1589; + shf.l.wrap.b32 %r1624, %r1623, %r1623, 20; + add.s32 %r1625, %r1619, %r1114; + add.s32 %r1626, %r1625, %r1624; + xor.b32 %r1627, %r1626, %r1621; + shf.l.wrap.b32 %r1628, %r1627, %r1627, 24; + add.s32 %r1629, %r1628, %r1622; + xor.b32 %r1630, %r1629, %r1624; + shf.l.wrap.b32 %r1631, %r1630, %r1630, 25; + add.s32 %r1632, %r1584, %r1191; + add.s32 %r1633, %r1632, %r1603; + xor.b32 %r1634, %r1572, %r1633; + shf.l.wrap.b32 %r1635, %r1634, %r1634, 16; + add.s32 %r1636, %r1635, %r1615; + xor.b32 %r1637, %r1636, %r1603; + shf.l.wrap.b32 %r1638, %r1637, %r1637, 20; + add.s32 %r1639, %r1633, %r1128; + add.s32 %r1640, %r1639, %r1638; + xor.b32 %r1641, %r1640, %r1635; + shf.l.wrap.b32 %r1642, %r1641, %r1641, 24; + add.s32 %r1643, %r1642, %r1636; + xor.b32 %r1644, %r1643, %r1638; + shf.l.wrap.b32 %r1645, %r1644, %r1644, 25; + add.s32 %r1646, %r1598, %r1149; + add.s32 %r1647, %r1646, %r1617; + xor.b32 %r1648, %r1647, %r1586; + shf.l.wrap.b32 %r1649, %r1648, %r1648, 16; + add.s32 %r1650, %r1649, %r1573; + xor.b32 %r1651, %r1650, %r1617; + shf.l.wrap.b32 %r1652, %r1651, %r1651, 20; + add.s32 %r1653, %r1647, %r1170; + add.s32 %r1654, %r1653, %r1652; + xor.b32 %r1655, %r1654, %r1649; + shf.l.wrap.b32 %r1656, %r1655, %r1655, 24; + add.s32 %r1657, %r1656, %r1650; + xor.b32 %r1658, %r1657, %r1652; + shf.l.wrap.b32 %r1659, %r1658, %r1658, 25; + add.s32 %r1660, %r1612, %r1121; + add.s32 %r1661, %r1660, %r1575; + xor.b32 %r1662, %r1661, %r1600; + shf.l.wrap.b32 %r1663, %r1662, %r1662, 16; + add.s32 %r1664, %r1663, %r1587; + xor.b32 %r1665, %r1664, %r1575; + shf.l.wrap.b32 %r1666, %r1665, %r1665, 20; + add.s32 %r1667, %r1661, %r1156; + add.s32 %r1668, %r1667, %r1666; + xor.b32 %r1669, %r1668, %r1663; + shf.l.wrap.b32 %r1670, %r1669, %r1669, 24; + add.s32 %r1671, %r1670, %r1664; + xor.b32 %r1672, %r1671, %r1666; + shf.l.wrap.b32 %r1673, %r1672, %r1672, 25; + add.s32 %r1674, %r1626, %r1198; + add.s32 %r1675, %r1674, %r1673; + xor.b32 %r1676, %r1675, %r1642; + shf.l.wrap.b32 %r1677, %r1676, %r1676, 16; + add.s32 %r1678, %r1677, %r1657; + xor.b32 %r1679, %r1678, %r1673; + shf.l.wrap.b32 %r1680, %r1679, %r1679, 20; + add.s32 %r1681, %r1675, %r1205; + add.s32 %r1682, %r1681, %r1680; + xor.b32 %r1683, %r1682, %r1677; + shf.l.wrap.b32 %r1684, %r1683, %r1683, 24; + add.s32 %r1685, %r1684, %r1678; + xor.b32 %r1686, %r1685, %r1680; + shf.l.wrap.b32 %r1687, %r1686, %r1686, 25; + add.s32 %r1688, %r1640, %r1177; + add.s32 %r1689, %r1688, %r1631; + xor.b32 %r1690, %r1689, %r1656; + shf.l.wrap.b32 %r1691, %r1690, %r1690, 16; + add.s32 %r1692, %r1691, %r1671; + xor.b32 %r1693, %r1692, %r1631; + shf.l.wrap.b32 %r1694, %r1693, %r1693, 20; + add.s32 %r1695, %r1689, %r1191; + add.s32 %r1696, %r1695, %r1694; + xor.b32 %r1697, %r1696, %r1691; + shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; + add.s32 %r1699, %r1698, %r1692; + xor.b32 %r1700, %r1699, %r1694; + shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; + add.s32 %r1702, %r1654, %r1219; + add.s32 %r1703, %r1702, %r1645; + xor.b32 %r1704, %r1670, %r1703; + shf.l.wrap.b32 %r1705, %r1704, %r1704, 16; + add.s32 %r1706, %r1705, %r1629; + xor.b32 %r1707, %r1706, %r1645; + shf.l.wrap.b32 %r1708, %r1707, %r1707, 20; 
+ add.s32 %r1709, %r1703, %r1184; + add.s32 %r1710, %r1709, %r1708; + xor.b32 %r1711, %r1710, %r1705; + shf.l.wrap.b32 %r1712, %r1711, %r1711, 24; + add.s32 %r1713, %r1712, %r1706; + xor.b32 %r1714, %r1713, %r1708; + shf.l.wrap.b32 %r1715, %r1714, %r1714, 25; + add.s32 %r1716, %r1659, %r1212; + add.s32 %r1717, %r1716, %r1668; + xor.b32 %r1718, %r1717, %r1628; + shf.l.wrap.b32 %r1719, %r1718, %r1718, 16; + add.s32 %r1720, %r1719, %r1643; + xor.b32 %r1721, %r1720, %r1659; + shf.l.wrap.b32 %r1722, %r1721, %r1721, 20; + add.s32 %r1723, %r1717, %r1170; + add.s32 %r1724, %r1723, %r1722; + xor.b32 %r1725, %r1724, %r1719; + shf.l.wrap.b32 %r1726, %r1725, %r1725, 24; + add.s32 %r1727, %r1726, %r1720; + xor.b32 %r1728, %r1727, %r1722; + shf.l.wrap.b32 %r1729, %r1728, %r1728, 25; + add.s32 %r1730, %r1701, %r1163; + add.s32 %r1731, %r1730, %r1682; + xor.b32 %r1732, %r1731, %r1726; + shf.l.wrap.b32 %r1733, %r1732, %r1732, 16; + add.s32 %r1734, %r1733, %r1713; + xor.b32 %r1735, %r1734, %r1701; + shf.l.wrap.b32 %r1736, %r1735, %r1735, 20; + add.s32 %r1737, %r1731, %r1128; + add.s32 %r1738, %r1737, %r1736; + xor.b32 %r1739, %r1738, %r1733; + shf.l.wrap.b32 %r1740, %r1739, %r1739, 24; + add.s32 %r1741, %r1740, %r1734; + xor.b32 %r1742, %r1741, %r1736; + shf.l.wrap.b32 %r1743, %r1742, %r1742, 25; + add.s32 %r1744, %r1696, %r1149; + add.s32 %r1745, %r1744, %r1715; + xor.b32 %r1746, %r1684, %r1745; + shf.l.wrap.b32 %r1747, %r1746, %r1746, 16; + add.s32 %r1748, %r1747, %r1727; + xor.b32 %r1749, %r1748, %r1715; + shf.l.wrap.b32 %r1750, %r1749, %r1749, 20; + add.s32 %r1751, %r1745, %r1135; + add.s32 %r1752, %r1751, %r1750; + xor.b32 %r1753, %r1752, %r1747; + shf.l.wrap.b32 %r1754, %r1753, %r1753, 24; + add.s32 %r1755, %r1754, %r1748; + xor.b32 %r1756, %r1755, %r1750; + shf.l.wrap.b32 %r1757, %r1756, %r1756, 25; + add.s32 %r1758, %r1710, %r1114; + add.s32 %r1759, %r1758, %r1729; + xor.b32 %r1760, %r1759, %r1698; + shf.l.wrap.b32 %r1761, %r1760, %r1760, 16; + add.s32 %r1762, %r1761, %r1685; + xor.b32 %r1763, %r1762, %r1729; + shf.l.wrap.b32 %r1764, %r1763, %r1763, 20; + add.s32 %r1765, %r1759, %r1121; + add.s32 %r1766, %r1765, %r1764; + xor.b32 %r1767, %r1766, %r1761; + shf.l.wrap.b32 %r1768, %r1767, %r1767, 24; + add.s32 %r1769, %r1768, %r1762; + xor.b32 %r1770, %r1769, %r1764; + shf.l.wrap.b32 %r1771, %r1770, %r1770, 25; + add.s32 %r1772, %r1724, %r1156; + add.s32 %r1773, %r1772, %r1687; + xor.b32 %r1774, %r1773, %r1712; + shf.l.wrap.b32 %r1775, %r1774, %r1774, 16; + add.s32 %r1776, %r1775, %r1699; + xor.b32 %r1777, %r1776, %r1687; + shf.l.wrap.b32 %r1778, %r1777, %r1777, 20; + add.s32 %r1779, %r1773, %r1142; + add.s32 %r1780, %r1779, %r1778; + xor.b32 %r1781, %r1780, %r1775; + shf.l.wrap.b32 %r1782, %r1781, %r1781, 24; + add.s32 %r1783, %r1782, %r1776; + xor.b32 %r1784, %r1783, %r1778; + shf.l.wrap.b32 %r1785, %r1784, %r1784, 25; + add.s32 %r1786, %r1738, %r1177; + add.s32 %r1787, %r1786, %r1785; + xor.b32 %r1788, %r1787, %r1754; + shf.l.wrap.b32 %r1789, %r1788, %r1788, 16; + add.s32 %r1790, %r1789, %r1769; + xor.b32 %r1791, %r1790, %r1785; + shf.l.wrap.b32 %r1792, %r1791, %r1791, 20; + add.s32 %r1793, %r1787, %r1212; + add.s32 %r1794, %r1793, %r1792; + xor.b32 %r1795, %r1794, %r1789; + shf.l.wrap.b32 %r1796, %r1795, %r1795, 24; + add.s32 %r1797, %r1796, %r1790; + xor.b32 %r1798, %r1797, %r1792; + shf.l.wrap.b32 %r1799, %r1798, %r1798, 25; + add.s32 %r1800, %r1752, %r1191; + add.s32 %r1801, %r1800, %r1743; + xor.b32 %r1802, %r1801, %r1768; + shf.l.wrap.b32 %r1803, %r1802, %r1802, 16; + add.s32 %r1804, %r1803, 
%r1783; + xor.b32 %r1805, %r1804, %r1743; + shf.l.wrap.b32 %r1806, %r1805, %r1805, 20; + add.s32 %r1807, %r1801, %r1149; + add.s32 %r1808, %r1807, %r1806; + xor.b32 %r1809, %r1808, %r1803; + shf.l.wrap.b32 %r1810, %r1809, %r1809, 24; + add.s32 %r1811, %r1810, %r1804; + xor.b32 %r1812, %r1811, %r1806; + shf.l.wrap.b32 %r1813, %r1812, %r1812, 25; + add.s32 %r1814, %r1766, %r1170; + add.s32 %r1815, %r1814, %r1757; + xor.b32 %r1816, %r1782, %r1815; + shf.l.wrap.b32 %r1817, %r1816, %r1816, 16; + add.s32 %r1818, %r1817, %r1741; + xor.b32 %r1819, %r1818, %r1757; + shf.l.wrap.b32 %r1820, %r1819, %r1819, 20; + add.s32 %r1821, %r1815, %r1198; + add.s32 %r1822, %r1821, %r1820; + xor.b32 %r1823, %r1822, %r1817; + shf.l.wrap.b32 %r1824, %r1823, %r1823, 24; + add.s32 %r1825, %r1824, %r1818; + xor.b32 %r1826, %r1825, %r1820; + shf.l.wrap.b32 %r1827, %r1826, %r1826, 25; + add.s32 %r1828, %r1771, %r1219; + add.s32 %r1829, %r1828, %r1780; + xor.b32 %r1830, %r1829, %r1740; + shf.l.wrap.b32 %r1831, %r1830, %r1830, 16; + add.s32 %r1832, %r1831, %r1755; + xor.b32 %r1833, %r1832, %r1771; + shf.l.wrap.b32 %r1834, %r1833, %r1833, 20; + add.s32 %r1835, %r1829, %r1121; + add.s32 %r1836, %r1835, %r1834; + xor.b32 %r1837, %r1836, %r1831; + shf.l.wrap.b32 %r1838, %r1837, %r1837, 24; + add.s32 %r1839, %r1838, %r1832; + xor.b32 %r1840, %r1839, %r1834; + shf.l.wrap.b32 %r1841, %r1840, %r1840, 25; + add.s32 %r1842, %r1813, %r1205; + add.s32 %r1843, %r1842, %r1794; + xor.b32 %r1844, %r1843, %r1838; + shf.l.wrap.b32 %r1845, %r1844, %r1844, 16; + add.s32 %r1846, %r1845, %r1825; + xor.b32 %r1847, %r1846, %r1813; + shf.l.wrap.b32 %r1848, %r1847, %r1847, 20; + add.s32 %r1849, %r1843, %r1135; + add.s32 %r1850, %r1849, %r1848; + xor.b32 %r1851, %r1850, %r1845; + shf.l.wrap.b32 %r1852, %r1851, %r1851, 24; + add.s32 %r1853, %r1852, %r1846; + xor.b32 %r1854, %r1853, %r1848; + shf.l.wrap.b32 %r1855, %r1854, %r1854, 25; + add.s32 %r1856, %r1808, %r1114; + add.s32 %r1857, %r1856, %r1827; + xor.b32 %r1858, %r1796, %r1857; + shf.l.wrap.b32 %r1859, %r1858, %r1858, 16; + add.s32 %r1860, %r1859, %r1839; + xor.b32 %r1861, %r1860, %r1827; + shf.l.wrap.b32 %r1862, %r1861, %r1861, 20; + add.s32 %r1863, %r1857, %r1184; + add.s32 %r1864, %r1863, %r1862; + xor.b32 %r1865, %r1864, %r1859; + shf.l.wrap.b32 %r1866, %r1865, %r1865, 24; + add.s32 %r1867, %r1866, %r1860; + xor.b32 %r1868, %r1867, %r1862; + shf.l.wrap.b32 %r1869, %r1868, %r1868, 25; + add.s32 %r1870, %r1822, %r1128; + add.s32 %r1871, %r1870, %r1841; + xor.b32 %r1872, %r1871, %r1810; + shf.l.wrap.b32 %r1873, %r1872, %r1872, 16; + add.s32 %r1874, %r1873, %r1797; + xor.b32 %r1875, %r1874, %r1841; + shf.l.wrap.b32 %r1876, %r1875, %r1875, 20; + add.s32 %r1877, %r1871, %r1156; + add.s32 %r1878, %r1877, %r1876; + xor.b32 %r1879, %r1878, %r1873; + shf.l.wrap.b32 %r1880, %r1879, %r1879, 24; + add.s32 %r1881, %r1880, %r1874; + xor.b32 %r1882, %r1881, %r1876; + shf.l.wrap.b32 %r1883, %r1882, %r1882, 25; + add.s32 %r1884, %r1836, %r1142; + add.s32 %r1885, %r1884, %r1799; + xor.b32 %r1886, %r1885, %r1824; + shf.l.wrap.b32 %r1887, %r1886, %r1886, 16; + add.s32 %r1888, %r1887, %r1811; + xor.b32 %r1889, %r1888, %r1799; + shf.l.wrap.b32 %r1890, %r1889, %r1889, 20; + add.s32 %r1891, %r1885, %r1163; + add.s32 %r1892, %r1891, %r1890; + xor.b32 %r1893, %r1892, %r1887; + shf.l.wrap.b32 %r1894, %r1893, %r1893, 24; + add.s32 %r1895, %r1894, %r1888; + xor.b32 %r1896, %r1895, %r1890; + shf.l.wrap.b32 %r1897, %r1896, %r1896, 25; + add.s32 %r1898, %r1850, %r1191; + add.s32 %r1899, %r1898, %r1897; + xor.b32 %r1900, 
%r1899, %r1866; + shf.l.wrap.b32 %r1901, %r1900, %r1900, 16; + add.s32 %r1902, %r1901, %r1881; + xor.b32 %r1903, %r1902, %r1897; + shf.l.wrap.b32 %r1904, %r1903, %r1903, 20; + add.s32 %r1905, %r1899, %r1219; + add.s32 %r1906, %r1905, %r1904; + xor.b32 %r1907, %r1906, %r1901; + shf.l.wrap.b32 %r1908, %r1907, %r1907, 24; + add.s32 %r1909, %r1908, %r1902; + xor.b32 %r1910, %r1909, %r1904; + shf.l.wrap.b32 %r1911, %r1910, %r1910, 25; + add.s32 %r1912, %r1864, %r1149; + add.s32 %r1913, %r1912, %r1855; + xor.b32 %r1914, %r1913, %r1880; + shf.l.wrap.b32 %r1915, %r1914, %r1914, 16; + add.s32 %r1916, %r1915, %r1895; + xor.b32 %r1917, %r1916, %r1855; + shf.l.wrap.b32 %r1918, %r1917, %r1917, 20; + add.s32 %r1919, %r1913, %r1114; + add.s32 %r1920, %r1919, %r1918; + xor.b32 %r1921, %r1920, %r1915; + shf.l.wrap.b32 %r1922, %r1921, %r1921, 24; + add.s32 %r1923, %r1922, %r1916; + xor.b32 %r1924, %r1923, %r1918; + shf.l.wrap.b32 %r1925, %r1924, %r1924, 25; + add.s32 %r1926, %r1878, %r1121; + add.s32 %r1927, %r1926, %r1869; + xor.b32 %r1928, %r1894, %r1927; + shf.l.wrap.b32 %r1929, %r1928, %r1928, 16; + add.s32 %r1930, %r1929, %r1853; + xor.b32 %r1931, %r1930, %r1869; + shf.l.wrap.b32 %r1932, %r1931, %r1931, 20; + add.s32 %r1933, %r1927, %r1177; + add.s32 %r1934, %r1933, %r1932; + xor.b32 %r1935, %r1934, %r1929; + shf.l.wrap.b32 %r1936, %r1935, %r1935, 24; + add.s32 %r1937, %r1936, %r1930; + xor.b32 %r1938, %r1937, %r1932; + shf.l.wrap.b32 %r1939, %r1938, %r1938, 25; + add.s32 %r1940, %r1883, %r1170; + add.s32 %r1941, %r1940, %r1892; + xor.b32 %r1942, %r1941, %r1852; + shf.l.wrap.b32 %r1943, %r1942, %r1942, 16; + add.s32 %r1944, %r1943, %r1867; + xor.b32 %r1945, %r1944, %r1883; + shf.l.wrap.b32 %r1946, %r1945, %r1945, 20; + add.s32 %r1947, %r1941, %r1156; + add.s32 %r1948, %r1947, %r1946; + xor.b32 %r1949, %r1948, %r1943; + shf.l.wrap.b32 %r1950, %r1949, %r1949, 24; + add.s32 %r1951, %r1950, %r1944; + xor.b32 %r1952, %r1951, %r1946; + shf.l.wrap.b32 %r1953, %r1952, %r1952, 25; + add.s32 %r1954, %r1925, %r1212; + add.s32 %r1955, %r1954, %r1906; + xor.b32 %r1956, %r1955, %r1950; + shf.l.wrap.b32 %r1957, %r1956, %r1956, 16; + add.s32 %r1958, %r1957, %r1937; + xor.b32 %r1959, %r1958, %r1925; + shf.l.wrap.b32 %r1960, %r1959, %r1959, 20; + add.s32 %r1961, %r1955, %r1184; + add.s32 %r1962, %r1961, %r1960; + xor.b32 %r1963, %r1962, %r1957; + shf.l.wrap.b32 %r1964, %r1963, %r1963, 24; + add.s32 %r1965, %r1964, %r1958; + xor.b32 %r1966, %r1965, %r1960; + shf.l.wrap.b32 %r1967, %r1966, %r1966, 25; + add.s32 %r1968, %r1920, %r1128; + add.s32 %r1969, %r1968, %r1939; + xor.b32 %r1970, %r1908, %r1969; + shf.l.wrap.b32 %r1971, %r1970, %r1970, 16; + add.s32 %r1972, %r1971, %r1951; + xor.b32 %r1973, %r1972, %r1939; + shf.l.wrap.b32 %r1974, %r1973, %r1973, 20; + add.s32 %r1975, %r1969, %r1198; + add.s32 %r1976, %r1975, %r1974; + xor.b32 %r1977, %r1976, %r1971; + shf.l.wrap.b32 %r1978, %r1977, %r1977, 24; + add.s32 %r1979, %r1978, %r1972; + xor.b32 %r1980, %r1979, %r1974; + shf.l.wrap.b32 %r1981, %r1980, %r1980, 25; + add.s32 %r1982, %r1934, %r1135; + add.s32 %r1983, %r1982, %r1953; + xor.b32 %r1984, %r1983, %r1922; + shf.l.wrap.b32 %r1985, %r1984, %r1984, 16; + add.s32 %r1986, %r1985, %r1909; + xor.b32 %r1987, %r1986, %r1953; + shf.l.wrap.b32 %r1988, %r1987, %r1987, 20; + add.s32 %r1989, %r1983, %r1142; + add.s32 %r1990, %r1989, %r1988; + xor.b32 %r1991, %r1990, %r1985; + shf.l.wrap.b32 %r1992, %r1991, %r1991, 24; + add.s32 %r1993, %r1992, %r1986; + xor.b32 %r1994, %r1993, %r1988; + shf.l.wrap.b32 %r1995, %r1994, %r1994, 25; 
+ add.s32 %r1996, %r1948, %r1163; + add.s32 %r1997, %r1996, %r1911; + xor.b32 %r1998, %r1997, %r1936; + shf.l.wrap.b32 %r1999, %r1998, %r1998, 16; + add.s32 %r2000, %r1999, %r1923; + xor.b32 %r2001, %r2000, %r1911; + shf.l.wrap.b32 %r2002, %r2001, %r2001, 20; + add.s32 %r2003, %r1997, %r1205; + add.s32 %r2004, %r2003, %r2002; + xor.b32 %r2005, %r2004, %r1999; + shf.l.wrap.b32 %r2006, %r2005, %r2005, 24; + add.s32 %r2007, %r2006, %r2000; + xor.b32 %r2008, %r2007, %r2002; + shf.l.wrap.b32 %r2009, %r2008, %r2008, 25; + xor.b32 %r3948, %r1993, %r1962; + xor.b32 %r3947, %r2007, %r1976; + xor.b32 %r3946, %r1965, %r1990; + xor.b32 %r3945, %r2004, %r1979; + xor.b32 %r3944, %r2009, %r1978; + xor.b32 %r3943, %r1967, %r1992; + xor.b32 %r3942, %r2006, %r1981; + xor.b32 %r3941, %r1995, %r1964; + add.s64 %rd166, %rd166, 64; + add.s64 %rd167, %rd167, -1; + setp.ne.s64 %p19, %rd167, 0; + mov.u16 %rs197, %rs75; + @%p19 bra $L__BB0_19; + + st.local.u8 [%rd155], %r3948; + shr.u32 %r2010, %r3948, 8; + st.local.u8 [%rd155+1], %r2010; + shr.u32 %r2011, %r3948, 16; + st.local.u8 [%rd155+2], %r2011; + shr.u32 %r2012, %r3948, 24; + st.local.u8 [%rd155+3], %r2012; + st.local.u8 [%rd155+4], %r3947; + shr.u32 %r2013, %r3947, 8; + st.local.u8 [%rd155+5], %r2013; + shr.u32 %r2014, %r3947, 16; + st.local.u8 [%rd155+6], %r2014; + shr.u32 %r2015, %r3947, 24; + st.local.u8 [%rd155+7], %r2015; + st.local.u8 [%rd155+8], %r3946; + shr.u32 %r2016, %r3946, 8; + st.local.u8 [%rd155+9], %r2016; + shr.u32 %r2017, %r3946, 16; + st.local.u8 [%rd155+10], %r2017; + shr.u32 %r2018, %r3946, 24; + st.local.u8 [%rd155+11], %r2018; + st.local.u8 [%rd155+12], %r3945; + shr.u32 %r2019, %r3945, 8; + st.local.u8 [%rd155+13], %r2019; + shr.u32 %r2020, %r3945, 16; + st.local.u8 [%rd155+14], %r2020; + shr.u32 %r2021, %r3945, 24; + st.local.u8 [%rd155+15], %r2021; + st.local.u8 [%rd155+16], %r3944; + shr.u32 %r2022, %r3944, 8; + st.local.u8 [%rd155+17], %r2022; + shr.u32 %r2023, %r3944, 16; + st.local.u8 [%rd155+18], %r2023; + shr.u32 %r2024, %r3944, 24; + st.local.u8 [%rd155+19], %r2024; + st.local.u8 [%rd155+20], %r3943; + shr.u32 %r2025, %r3943, 8; + st.local.u8 [%rd155+21], %r2025; + shr.u32 %r2026, %r3943, 16; + st.local.u8 [%rd155+22], %r2026; + shr.u32 %r2027, %r3943, 24; + st.local.u8 [%rd155+23], %r2027; + st.local.u8 [%rd155+24], %r3942; + shr.u32 %r2028, %r3942, 8; + st.local.u8 [%rd155+25], %r2028; + shr.u32 %r2029, %r3942, 16; + st.local.u8 [%rd155+26], %r2029; + shr.u32 %r2030, %r3942, 24; + st.local.u8 [%rd155+27], %r2030; + st.local.u8 [%rd155+28], %r3941; + shr.u32 %r2031, %r3941, 8; + st.local.u8 [%rd155+29], %r2031; + shr.u32 %r2032, %r3941, 16; + st.local.u8 [%rd155+30], %r2032; + shr.u32 %r2033, %r3941, 24; + st.local.u8 [%rd155+31], %r2033; + add.s64 %rd165, %rd165, 1; + add.s64 %rd162, %rd162, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd163, %rd163, -1; + setp.ne.s64 %p20, %rd163, 0; + @%p20 bra $L__BB0_18; + +$L__BB0_21: + ld.param.u64 %rd139, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_1]; + setp.ge.u64 %p21, %rd159, %rd139; + @%p21 bra $L__BB0_30; + + ld.param.u64 %rd140, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_0]; + ld.param.u64 %rd135, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_3]; + add.s64 %rd127, %rd151, %rd135; + ld.local.u8 %r2034, [%rd2]; + ld.local.u8 %r2035, [%rd2+1]; + prmt.b32 %r2036, %r2035, %r2034, 30212; + ld.local.u8 %r2037, [%rd2+2]; + 
ld.local.u8 %r2038, [%rd2+3]; + prmt.b32 %r2039, %r2038, %r2037, 30212; + prmt.b32 %r3964, %r2039, %r2036, 4180; + ld.local.u8 %r2040, [%rd2+4]; + ld.local.u8 %r2041, [%rd2+5]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd2+6]; + ld.local.u8 %r2044, [%rd2+7]; + prmt.b32 %r2045, %r2044, %r2043, 30212; + prmt.b32 %r3963, %r2045, %r2042, 4180; + ld.local.u8 %r2046, [%rd2+8]; + ld.local.u8 %r2047, [%rd2+9]; + prmt.b32 %r2048, %r2047, %r2046, 30212; + ld.local.u8 %r2049, [%rd2+10]; + ld.local.u8 %r2050, [%rd2+11]; + prmt.b32 %r2051, %r2050, %r2049, 30212; + prmt.b32 %r3962, %r2051, %r2048, 4180; + ld.local.u8 %r2052, [%rd2+12]; + ld.local.u8 %r2053, [%rd2+13]; + prmt.b32 %r2054, %r2053, %r2052, 30212; + ld.local.u8 %r2055, [%rd2+14]; + ld.local.u8 %r2056, [%rd2+15]; + prmt.b32 %r2057, %r2056, %r2055, 30212; + prmt.b32 %r3961, %r2057, %r2054, 4180; + ld.local.u8 %r2058, [%rd2+16]; + ld.local.u8 %r2059, [%rd2+17]; + prmt.b32 %r2060, %r2059, %r2058, 30212; + ld.local.u8 %r2061, [%rd2+18]; + ld.local.u8 %r2062, [%rd2+19]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + prmt.b32 %r3960, %r2063, %r2060, 4180; + ld.local.u8 %r2064, [%rd2+20]; + ld.local.u8 %r2065, [%rd2+21]; + prmt.b32 %r2066, %r2065, %r2064, 30212; + ld.local.u8 %r2067, [%rd2+22]; + ld.local.u8 %r2068, [%rd2+23]; + prmt.b32 %r2069, %r2068, %r2067, 30212; + prmt.b32 %r3959, %r2069, %r2066, 4180; + ld.local.u8 %r2070, [%rd2+24]; + ld.local.u8 %r2071, [%rd2+25]; + prmt.b32 %r2072, %r2071, %r2070, 30212; + ld.local.u8 %r2073, [%rd2+26]; + ld.local.u8 %r2074, [%rd2+27]; + prmt.b32 %r2075, %r2074, %r2073, 30212; + prmt.b32 %r3958, %r2075, %r2072, 4180; + ld.local.u8 %r2076, [%rd2+28]; + ld.local.u8 %r2077, [%rd2+29]; + prmt.b32 %r2078, %r2077, %r2076, 30212; + ld.local.u8 %r2079, [%rd2+30]; + ld.local.u8 %r2080, [%rd2+31]; + prmt.b32 %r2081, %r2080, %r2079, 30212; + prmt.b32 %r3957, %r2081, %r2078, 4180; + add.u64 %rd53, %SPL, 16; + mov.u32 %r2082, 0; + st.local.v2.u32 [%rd53], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+8], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+16], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+24], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+32], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+40], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+48], {%r2082, %r2082}; + st.local.v2.u32 [%rd53+56], {%r2082, %r2082}; + mov.u16 %rs199, 0; + st.local.v2.u8 [%rd53+64], {%rs199, %rs199}; + st.local.u8 [%rd53+66], %rs75; + add.s64 %rd170, %rd140, %rd159; + cvt.u32.u64 %r36, %rd127; + shr.u64 %rd129, %rd127, 32; + cvt.u32.u64 %r37, %rd129; + setp.lt.u64 %p22, %rd171, 65; + @%p22 bra $L__BB0_25; + + add.s64 %rd56, %rd53, 64; + mov.u16 %rs198, 0; + +$L__BB0_24: + and.b16 %rs83, %rs198, 255; + setp.eq.s16 %p23, %rs83, 0; + selp.u16 %rs84, 1, 0, %p23; + or.b16 %rs85, %rs84, %rs75; + ld.u8 %r2083, [%rd170]; + ld.u8 %r2084, [%rd170+1]; + prmt.b32 %r2085, %r2084, %r2083, 30212; + ld.u8 %r2086, [%rd170+2]; + prmt.b32 %r2087, %r2086, %r2085, 28756; + ld.u8 %r2088, [%rd170+3]; + prmt.b32 %r2089, %r2088, %r2087, 1620; + ld.u8 %r2090, [%rd170+4]; + ld.u8 %r2091, [%rd170+5]; + prmt.b32 %r2092, %r2091, %r2090, 30212; + ld.u8 %r2093, [%rd170+6]; + prmt.b32 %r2094, %r2093, %r2092, 28756; + ld.u8 %r2095, [%rd170+7]; + prmt.b32 %r2096, %r2095, %r2094, 1620; + ld.u8 %r2097, [%rd170+8]; + ld.u8 %r2098, [%rd170+9]; + prmt.b32 %r2099, %r2098, %r2097, 30212; + ld.u8 %r2100, [%rd170+10]; + prmt.b32 %r2101, %r2100, %r2099, 28756; + ld.u8 %r2102, [%rd170+11]; + prmt.b32 %r2103, %r2102, %r2101, 1620; + ld.u8 %r2104, [%rd170+12]; + ld.u8 %r2105, 
[%rd170+13]; + prmt.b32 %r2106, %r2105, %r2104, 30212; + ld.u8 %r2107, [%rd170+14]; + prmt.b32 %r2108, %r2107, %r2106, 28756; + ld.u8 %r2109, [%rd170+15]; + prmt.b32 %r2110, %r2109, %r2108, 1620; + ld.u8 %r2111, [%rd170+16]; + ld.u8 %r2112, [%rd170+17]; + prmt.b32 %r2113, %r2112, %r2111, 30212; + ld.u8 %r2114, [%rd170+18]; + prmt.b32 %r2115, %r2114, %r2113, 28756; + ld.u8 %r2116, [%rd170+19]; + prmt.b32 %r2117, %r2116, %r2115, 1620; + ld.u8 %r2118, [%rd170+20]; + ld.u8 %r2119, [%rd170+21]; + prmt.b32 %r2120, %r2119, %r2118, 30212; + ld.u8 %r2121, [%rd170+22]; + prmt.b32 %r2122, %r2121, %r2120, 28756; + ld.u8 %r2123, [%rd170+23]; + prmt.b32 %r2124, %r2123, %r2122, 1620; + ld.u8 %r2125, [%rd170+24]; + ld.u8 %r2126, [%rd170+25]; + prmt.b32 %r2127, %r2126, %r2125, 30212; + ld.u8 %r2128, [%rd170+26]; + prmt.b32 %r2129, %r2128, %r2127, 28756; + ld.u8 %r2130, [%rd170+27]; + prmt.b32 %r2131, %r2130, %r2129, 1620; + ld.u8 %r2132, [%rd170+28]; + ld.u8 %r2133, [%rd170+29]; + prmt.b32 %r2134, %r2133, %r2132, 30212; + ld.u8 %r2135, [%rd170+30]; + prmt.b32 %r2136, %r2135, %r2134, 28756; + ld.u8 %r2137, [%rd170+31]; + prmt.b32 %r2138, %r2137, %r2136, 1620; + ld.u8 %r2139, [%rd170+32]; + ld.u8 %r2140, [%rd170+33]; + prmt.b32 %r2141, %r2140, %r2139, 30212; + ld.u8 %r2142, [%rd170+34]; + prmt.b32 %r2143, %r2142, %r2141, 28756; + ld.u8 %r2144, [%rd170+35]; + prmt.b32 %r2145, %r2144, %r2143, 1620; + ld.u8 %r2146, [%rd170+36]; + ld.u8 %r2147, [%rd170+37]; + prmt.b32 %r2148, %r2147, %r2146, 30212; + ld.u8 %r2149, [%rd170+38]; + prmt.b32 %r2150, %r2149, %r2148, 28756; + ld.u8 %r2151, [%rd170+39]; + prmt.b32 %r2152, %r2151, %r2150, 1620; + ld.u8 %r2153, [%rd170+40]; + ld.u8 %r2154, [%rd170+41]; + prmt.b32 %r2155, %r2154, %r2153, 30212; + ld.u8 %r2156, [%rd170+42]; + prmt.b32 %r2157, %r2156, %r2155, 28756; + ld.u8 %r2158, [%rd170+43]; + prmt.b32 %r2159, %r2158, %r2157, 1620; + ld.u8 %r2160, [%rd170+44]; + ld.u8 %r2161, [%rd170+45]; + prmt.b32 %r2162, %r2161, %r2160, 30212; + ld.u8 %r2163, [%rd170+46]; + prmt.b32 %r2164, %r2163, %r2162, 28756; + ld.u8 %r2165, [%rd170+47]; + prmt.b32 %r2166, %r2165, %r2164, 1620; + ld.u8 %r2167, [%rd170+48]; + ld.u8 %r2168, [%rd170+49]; + prmt.b32 %r2169, %r2168, %r2167, 30212; + ld.u8 %r2170, [%rd170+50]; + prmt.b32 %r2171, %r2170, %r2169, 28756; + ld.u8 %r2172, [%rd170+51]; + prmt.b32 %r2173, %r2172, %r2171, 1620; + ld.u8 %r2174, [%rd170+52]; + ld.u8 %r2175, [%rd170+53]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.u8 %r2177, [%rd170+54]; + prmt.b32 %r2178, %r2177, %r2176, 28756; + ld.u8 %r2179, [%rd170+55]; + prmt.b32 %r2180, %r2179, %r2178, 1620; + ld.u8 %r2181, [%rd170+56]; + ld.u8 %r2182, [%rd170+57]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.u8 %r2184, [%rd170+58]; + prmt.b32 %r2185, %r2184, %r2183, 28756; + ld.u8 %r2186, [%rd170+59]; + prmt.b32 %r2187, %r2186, %r2185, 1620; + ld.u8 %r2188, [%rd170+60]; + ld.u8 %r2189, [%rd170+61]; + prmt.b32 %r2190, %r2189, %r2188, 30212; + ld.u8 %r2191, [%rd170+62]; + prmt.b32 %r2192, %r2191, %r2190, 28756; + ld.u8 %r2193, [%rd170+63]; + prmt.b32 %r2194, %r2193, %r2192, 1620; + cvt.u32.u16 %r2195, %rs85; + add.s32 %r2196, %r3964, %r2089; + add.s32 %r2197, %r2196, %r3960; + xor.b32 %r2198, %r2197, %r36; + shf.l.wrap.b32 %r2199, %r2198, %r2198, 16; + add.s32 %r2200, %r2199, 1779033703; + xor.b32 %r2201, %r2200, %r3960; + shf.l.wrap.b32 %r2202, %r2201, %r2201, 20; + add.s32 %r2203, %r2197, %r2096; + add.s32 %r2204, %r2203, %r2202; + xor.b32 %r2205, %r2204, %r2199; + shf.l.wrap.b32 %r2206, %r2205, %r2205, 24; + add.s32 %r2207, %r2206, 
%r2200; + xor.b32 %r2208, %r2207, %r2202; + shf.l.wrap.b32 %r2209, %r2208, %r2208, 25; + add.s32 %r2210, %r3963, %r2103; + add.s32 %r2211, %r2210, %r3959; + xor.b32 %r2212, %r2211, %r37; + shf.l.wrap.b32 %r2213, %r2212, %r2212, 16; + add.s32 %r2214, %r2213, -1150833019; + xor.b32 %r2215, %r2214, %r3959; + shf.l.wrap.b32 %r2216, %r2215, %r2215, 20; + add.s32 %r2217, %r2211, %r2110; + add.s32 %r2218, %r2217, %r2216; + xor.b32 %r2219, %r2218, %r2213; + shf.l.wrap.b32 %r2220, %r2219, %r2219, 24; + add.s32 %r2221, %r2220, %r2214; + xor.b32 %r2222, %r2221, %r2216; + shf.l.wrap.b32 %r2223, %r2222, %r2222, 25; + add.s32 %r2224, %r3962, %r2117; + add.s32 %r2225, %r2224, %r3958; + shr.u32 %r2226, %r2225, 16; + shl.b32 %r2227, %r2225, 16; + xor.b32 %r2228, %r2227, 4194304; + or.b32 %r2229, %r2228, %r2226; + add.s32 %r2230, %r2229, 1013904242; + xor.b32 %r2231, %r2230, %r3958; + shf.l.wrap.b32 %r2232, %r2231, %r2231, 20; + add.s32 %r2233, %r2225, %r2124; + add.s32 %r2234, %r2233, %r2232; + xor.b32 %r2235, %r2234, %r2229; + shf.l.wrap.b32 %r2236, %r2235, %r2235, 24; + add.s32 %r2237, %r2236, %r2230; + xor.b32 %r2238, %r2237, %r2232; + shf.l.wrap.b32 %r2239, %r2238, %r2238, 25; + add.s32 %r2240, %r3961, %r2131; + add.s32 %r2241, %r2240, %r3957; + xor.b32 %r2242, %r2241, %r2195; + shr.u32 %r2243, %r2241, 16; + shl.b32 %r2244, %r2242, 16; + or.b32 %r2245, %r2244, %r2243; + add.s32 %r2246, %r2245, -1521486534; + xor.b32 %r2247, %r2246, %r3957; + shf.l.wrap.b32 %r2248, %r2247, %r2247, 20; + add.s32 %r2249, %r2241, %r2138; + add.s32 %r2250, %r2249, %r2248; + xor.b32 %r2251, %r2250, %r2245; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 24; + add.s32 %r2253, %r2252, %r2246; + xor.b32 %r2254, %r2253, %r2248; + shf.l.wrap.b32 %r2255, %r2254, %r2254, 25; + add.s32 %r2256, %r2204, %r2145; + add.s32 %r2257, %r2256, %r2223; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 16; + add.s32 %r2260, %r2259, %r2237; + xor.b32 %r2261, %r2260, %r2223; + shf.l.wrap.b32 %r2262, %r2261, %r2261, 20; + add.s32 %r2263, %r2257, %r2152; + add.s32 %r2264, %r2263, %r2262; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 24; + add.s32 %r2267, %r2266, %r2260; + xor.b32 %r2268, %r2267, %r2262; + shf.l.wrap.b32 %r2269, %r2268, %r2268, 25; + add.s32 %r2270, %r2218, %r2159; + add.s32 %r2271, %r2270, %r2239; + xor.b32 %r2272, %r2271, %r2206; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 16; + add.s32 %r2274, %r2273, %r2253; + xor.b32 %r2275, %r2274, %r2239; + shf.l.wrap.b32 %r2276, %r2275, %r2275, 20; + add.s32 %r2277, %r2271, %r2166; + add.s32 %r2278, %r2277, %r2276; + xor.b32 %r2279, %r2278, %r2273; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 24; + add.s32 %r2281, %r2280, %r2274; + xor.b32 %r2282, %r2281, %r2276; + shf.l.wrap.b32 %r2283, %r2282, %r2282, 25; + add.s32 %r2284, %r2234, %r2173; + add.s32 %r2285, %r2284, %r2255; + xor.b32 %r2286, %r2285, %r2220; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 16; + add.s32 %r2288, %r2287, %r2207; + xor.b32 %r2289, %r2288, %r2255; + shf.l.wrap.b32 %r2290, %r2289, %r2289, 20; + add.s32 %r2291, %r2285, %r2180; + add.s32 %r2292, %r2291, %r2290; + xor.b32 %r2293, %r2292, %r2287; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 24; + add.s32 %r2295, %r2294, %r2288; + xor.b32 %r2296, %r2295, %r2290; + shf.l.wrap.b32 %r2297, %r2296, %r2296, 25; + add.s32 %r2298, %r2250, %r2187; + add.s32 %r2299, %r2298, %r2209; + xor.b32 %r2300, %r2299, %r2236; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 16; + add.s32 %r2302, %r2301, %r2221; + xor.b32 %r2303, %r2302, %r2209; + shf.l.wrap.b32 
%r2304, %r2303, %r2303, 20; + add.s32 %r2305, %r2299, %r2194; + add.s32 %r2306, %r2305, %r2304; + xor.b32 %r2307, %r2306, %r2301; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 24; + add.s32 %r2309, %r2308, %r2302; + xor.b32 %r2310, %r2309, %r2304; + shf.l.wrap.b32 %r2311, %r2310, %r2310, 25; + add.s32 %r2312, %r2264, %r2103; + add.s32 %r2313, %r2312, %r2311; + xor.b32 %r2314, %r2313, %r2280; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 16; + add.s32 %r2316, %r2315, %r2295; + xor.b32 %r2317, %r2316, %r2311; + shf.l.wrap.b32 %r2318, %r2317, %r2317, 20; + add.s32 %r2319, %r2313, %r2131; + add.s32 %r2320, %r2319, %r2318; + xor.b32 %r2321, %r2320, %r2315; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 24; + add.s32 %r2323, %r2322, %r2316; + xor.b32 %r2324, %r2323, %r2318; + shf.l.wrap.b32 %r2325, %r2324, %r2324, 25; + add.s32 %r2326, %r2278, %r2110; + add.s32 %r2327, %r2326, %r2269; + xor.b32 %r2328, %r2327, %r2294; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 16; + add.s32 %r2330, %r2329, %r2309; + xor.b32 %r2331, %r2330, %r2269; + shf.l.wrap.b32 %r2332, %r2331, %r2331, 20; + add.s32 %r2333, %r2327, %r2159; + add.s32 %r2334, %r2333, %r2332; + xor.b32 %r2335, %r2334, %r2329; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 24; + add.s32 %r2337, %r2336, %r2330; + xor.b32 %r2338, %r2337, %r2332; + shf.l.wrap.b32 %r2339, %r2338, %r2338, 25; + add.s32 %r2340, %r2292, %r2138; + add.s32 %r2341, %r2340, %r2283; + xor.b32 %r2342, %r2341, %r2308; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 16; + add.s32 %r2344, %r2343, %r2267; + xor.b32 %r2345, %r2344, %r2283; + shf.l.wrap.b32 %r2346, %r2345, %r2345, 20; + add.s32 %r2347, %r2341, %r2089; + add.s32 %r2348, %r2347, %r2346; + xor.b32 %r2349, %r2348, %r2343; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 24; + add.s32 %r2351, %r2350, %r2344; + xor.b32 %r2352, %r2351, %r2346; + shf.l.wrap.b32 %r2353, %r2352, %r2352, 25; + add.s32 %r2354, %r2306, %r2117; + add.s32 %r2355, %r2354, %r2297; + xor.b32 %r2356, %r2355, %r2266; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 16; + add.s32 %r2358, %r2357, %r2281; + xor.b32 %r2359, %r2358, %r2297; + shf.l.wrap.b32 %r2360, %r2359, %r2359, 20; + add.s32 %r2361, %r2355, %r2180; + add.s32 %r2362, %r2361, %r2360; + xor.b32 %r2363, %r2362, %r2357; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 24; + add.s32 %r2365, %r2364, %r2358; + xor.b32 %r2366, %r2365, %r2360; + shf.l.wrap.b32 %r2367, %r2366, %r2366, 25; + add.s32 %r2368, %r2320, %r2096; + add.s32 %r2369, %r2368, %r2339; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 16; + add.s32 %r2372, %r2371, %r2351; + xor.b32 %r2373, %r2372, %r2339; + shf.l.wrap.b32 %r2374, %r2373, %r2373, 20; + add.s32 %r2375, %r2369, %r2166; + add.s32 %r2376, %r2375, %r2374; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 24; + add.s32 %r2379, %r2378, %r2372; + xor.b32 %r2380, %r2379, %r2374; + shf.l.wrap.b32 %r2381, %r2380, %r2380, 25; + add.s32 %r2382, %r2334, %r2173; + add.s32 %r2383, %r2382, %r2353; + xor.b32 %r2384, %r2383, %r2322; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 16; + add.s32 %r2386, %r2385, %r2365; + xor.b32 %r2387, %r2386, %r2353; + shf.l.wrap.b32 %r2388, %r2387, %r2387, 20; + add.s32 %r2389, %r2383, %r2124; + add.s32 %r2390, %r2389, %r2388; + xor.b32 %r2391, %r2390, %r2385; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 24; + add.s32 %r2393, %r2392, %r2386; + xor.b32 %r2394, %r2393, %r2388; + shf.l.wrap.b32 %r2395, %r2394, %r2394, 25; + add.s32 %r2396, %r2348, %r2152; + add.s32 %r2397, %r2396, %r2367; + xor.b32 %r2398, %r2397, %r2336; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 16; 
+ add.s32 %r2400, %r2399, %r2323; + xor.b32 %r2401, %r2400, %r2367; + shf.l.wrap.b32 %r2402, %r2401, %r2401, 20; + add.s32 %r2403, %r2397, %r2187; + add.s32 %r2404, %r2403, %r2402; + xor.b32 %r2405, %r2404, %r2399; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 24; + add.s32 %r2407, %r2406, %r2400; + xor.b32 %r2408, %r2407, %r2402; + shf.l.wrap.b32 %r2409, %r2408, %r2408, 25; + add.s32 %r2410, %r2362, %r2194; + add.s32 %r2411, %r2410, %r2325; + xor.b32 %r2412, %r2411, %r2350; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 16; + add.s32 %r2414, %r2413, %r2337; + xor.b32 %r2415, %r2414, %r2325; + shf.l.wrap.b32 %r2416, %r2415, %r2415, 20; + add.s32 %r2417, %r2411, %r2145; + add.s32 %r2418, %r2417, %r2416; + xor.b32 %r2419, %r2418, %r2413; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 24; + add.s32 %r2421, %r2420, %r2414; + xor.b32 %r2422, %r2421, %r2416; + shf.l.wrap.b32 %r2423, %r2422, %r2422, 25; + add.s32 %r2424, %r2376, %r2110; + add.s32 %r2425, %r2424, %r2423; + xor.b32 %r2426, %r2425, %r2392; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 16; + add.s32 %r2428, %r2427, %r2407; + xor.b32 %r2429, %r2428, %r2423; + shf.l.wrap.b32 %r2430, %r2429, %r2429, 20; + add.s32 %r2431, %r2425, %r2117; + add.s32 %r2432, %r2431, %r2430; + xor.b32 %r2433, %r2432, %r2427; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 24; + add.s32 %r2435, %r2434, %r2428; + xor.b32 %r2436, %r2435, %r2430; + shf.l.wrap.b32 %r2437, %r2436, %r2436, 25; + add.s32 %r2438, %r2390, %r2159; + add.s32 %r2439, %r2438, %r2381; + xor.b32 %r2440, %r2439, %r2406; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 16; + add.s32 %r2442, %r2441, %r2421; + xor.b32 %r2443, %r2442, %r2381; + shf.l.wrap.b32 %r2444, %r2443, %r2443, 20; + add.s32 %r2445, %r2439, %r2173; + add.s32 %r2446, %r2445, %r2444; + xor.b32 %r2447, %r2446, %r2441; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 24; + add.s32 %r2449, %r2448, %r2442; + xor.b32 %r2450, %r2449, %r2444; + shf.l.wrap.b32 %r2451, %r2450, %r2450, 25; + add.s32 %r2452, %r2404, %r2180; + add.s32 %r2453, %r2452, %r2395; + xor.b32 %r2454, %r2453, %r2420; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 16; + add.s32 %r2456, %r2455, %r2379; + xor.b32 %r2457, %r2456, %r2395; + shf.l.wrap.b32 %r2458, %r2457, %r2457, 20; + add.s32 %r2459, %r2453, %r2103; + add.s32 %r2460, %r2459, %r2458; + xor.b32 %r2461, %r2460, %r2455; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 24; + add.s32 %r2463, %r2462, %r2456; + xor.b32 %r2464, %r2463, %r2458; + shf.l.wrap.b32 %r2465, %r2464, %r2464, 25; + add.s32 %r2466, %r2418, %r2138; + add.s32 %r2467, %r2466, %r2409; + xor.b32 %r2468, %r2467, %r2378; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 16; + add.s32 %r2470, %r2469, %r2393; + xor.b32 %r2471, %r2470, %r2409; + shf.l.wrap.b32 %r2472, %r2471, %r2471, 20; + add.s32 %r2473, %r2467, %r2187; + add.s32 %r2474, %r2473, %r2472; + xor.b32 %r2475, %r2474, %r2469; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 24; + add.s32 %r2477, %r2476, %r2470; + xor.b32 %r2478, %r2477, %r2472; + shf.l.wrap.b32 %r2479, %r2478, %r2478, 25; + add.s32 %r2480, %r2432, %r2131; + add.s32 %r2481, %r2480, %r2451; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 16; + add.s32 %r2484, %r2483, %r2463; + xor.b32 %r2485, %r2484, %r2451; + shf.l.wrap.b32 %r2486, %r2485, %r2485, 20; + add.s32 %r2487, %r2481, %r2124; + add.s32 %r2488, %r2487, %r2486; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 24; + add.s32 %r2491, %r2490, %r2484; + xor.b32 %r2492, %r2491, %r2486; + shf.l.wrap.b32 %r2493, %r2492, %r2492, 25; + add.s32 %r2494, %r2446, %r2152; + add.s32 %r2495, %r2494, 
%r2465; + xor.b32 %r2496, %r2495, %r2434; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 16; + add.s32 %r2498, %r2497, %r2477; + xor.b32 %r2499, %r2498, %r2465; + shf.l.wrap.b32 %r2500, %r2499, %r2499, 20; + add.s32 %r2501, %r2495, %r2089; + add.s32 %r2502, %r2501, %r2500; + xor.b32 %r2503, %r2502, %r2497; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 24; + add.s32 %r2505, %r2504, %r2498; + xor.b32 %r2506, %r2505, %r2500; + shf.l.wrap.b32 %r2507, %r2506, %r2506, 25; + add.s32 %r2508, %r2460, %r2166; + add.s32 %r2509, %r2508, %r2479; + xor.b32 %r2510, %r2509, %r2448; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 16; + add.s32 %r2512, %r2511, %r2435; + xor.b32 %r2513, %r2512, %r2479; + shf.l.wrap.b32 %r2514, %r2513, %r2513, 20; + add.s32 %r2515, %r2509, %r2194; + add.s32 %r2516, %r2515, %r2514; + xor.b32 %r2517, %r2516, %r2511; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 24; + add.s32 %r2519, %r2518, %r2512; + xor.b32 %r2520, %r2519, %r2514; + shf.l.wrap.b32 %r2521, %r2520, %r2520, 25; + add.s32 %r2522, %r2474, %r2145; + add.s32 %r2523, %r2522, %r2437; + xor.b32 %r2524, %r2523, %r2462; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 16; + add.s32 %r2526, %r2525, %r2449; + xor.b32 %r2527, %r2526, %r2437; + shf.l.wrap.b32 %r2528, %r2527, %r2527, 20; + add.s32 %r2529, %r2523, %r2096; + add.s32 %r2530, %r2529, %r2528; + xor.b32 %r2531, %r2530, %r2525; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 24; + add.s32 %r2533, %r2532, %r2526; + xor.b32 %r2534, %r2533, %r2528; + shf.l.wrap.b32 %r2535, %r2534, %r2534, 25; + add.s32 %r2536, %r2488, %r2159; + add.s32 %r2537, %r2536, %r2535; + xor.b32 %r2538, %r2537, %r2504; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 16; + add.s32 %r2540, %r2539, %r2519; + xor.b32 %r2541, %r2540, %r2535; + shf.l.wrap.b32 %r2542, %r2541, %r2541, 20; + add.s32 %r2543, %r2537, %r2138; + add.s32 %r2544, %r2543, %r2542; + xor.b32 %r2545, %r2544, %r2539; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 24; + add.s32 %r2547, %r2546, %r2540; + xor.b32 %r2548, %r2547, %r2542; + shf.l.wrap.b32 %r2549, %r2548, %r2548, 25; + add.s32 %r2550, %r2502, %r2173; + add.s32 %r2551, %r2550, %r2493; + xor.b32 %r2552, %r2551, %r2518; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 16; + add.s32 %r2554, %r2553, %r2533; + xor.b32 %r2555, %r2554, %r2493; + shf.l.wrap.b32 %r2556, %r2555, %r2555, 20; + add.s32 %r2557, %r2551, %r2152; + add.s32 %r2558, %r2557, %r2556; + xor.b32 %r2559, %r2558, %r2553; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 24; + add.s32 %r2561, %r2560, %r2554; + xor.b32 %r2562, %r2561, %r2556; + shf.l.wrap.b32 %r2563, %r2562, %r2562, 25; + add.s32 %r2564, %r2516, %r2187; + add.s32 %r2565, %r2564, %r2507; + xor.b32 %r2566, %r2565, %r2532; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 16; + add.s32 %r2568, %r2567, %r2491; + xor.b32 %r2569, %r2568, %r2507; + shf.l.wrap.b32 %r2570, %r2569, %r2569, 20; + add.s32 %r2571, %r2565, %r2110; + add.s32 %r2572, %r2571, %r2570; + xor.b32 %r2573, %r2572, %r2567; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 24; + add.s32 %r2575, %r2574, %r2568; + xor.b32 %r2576, %r2575, %r2570; + shf.l.wrap.b32 %r2577, %r2576, %r2576, 25; + add.s32 %r2578, %r2530, %r2180; + add.s32 %r2579, %r2578, %r2521; + xor.b32 %r2580, %r2579, %r2490; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 16; + add.s32 %r2582, %r2581, %r2505; + xor.b32 %r2583, %r2582, %r2521; + shf.l.wrap.b32 %r2584, %r2583, %r2583, 20; + add.s32 %r2585, %r2579, %r2194; + add.s32 %r2586, %r2585, %r2584; + xor.b32 %r2587, %r2586, %r2581; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 24; + add.s32 %r2589, %r2588, %r2582; + xor.b32 %r2590, %r2589, %r2584; + shf.l.wrap.b32 
%r2591, %r2590, %r2590, 25; + add.s32 %r2592, %r2544, %r2117; + add.s32 %r2593, %r2592, %r2563; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 16; + add.s32 %r2596, %r2595, %r2575; + xor.b32 %r2597, %r2596, %r2563; + shf.l.wrap.b32 %r2598, %r2597, %r2597, 20; + add.s32 %r2599, %r2593, %r2089; + add.s32 %r2600, %r2599, %r2598; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 24; + add.s32 %r2603, %r2602, %r2596; + xor.b32 %r2604, %r2603, %r2598; + shf.l.wrap.b32 %r2605, %r2604, %r2604, 25; + add.s32 %r2606, %r2558, %r2166; + add.s32 %r2607, %r2606, %r2577; + xor.b32 %r2608, %r2607, %r2546; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 16; + add.s32 %r2610, %r2609, %r2589; + xor.b32 %r2611, %r2610, %r2577; + shf.l.wrap.b32 %r2612, %r2611, %r2611, 20; + add.s32 %r2613, %r2607, %r2103; + add.s32 %r2614, %r2613, %r2612; + xor.b32 %r2615, %r2614, %r2609; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 24; + add.s32 %r2617, %r2616, %r2610; + xor.b32 %r2618, %r2617, %r2612; + shf.l.wrap.b32 %r2619, %r2618, %r2618, 25; + add.s32 %r2620, %r2572, %r2124; + add.s32 %r2621, %r2620, %r2591; + xor.b32 %r2622, %r2621, %r2560; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 16; + add.s32 %r2624, %r2623, %r2547; + xor.b32 %r2625, %r2624, %r2591; + shf.l.wrap.b32 %r2626, %r2625, %r2625, 20; + add.s32 %r2627, %r2621, %r2145; + add.s32 %r2628, %r2627, %r2626; + xor.b32 %r2629, %r2628, %r2623; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 24; + add.s32 %r2631, %r2630, %r2624; + xor.b32 %r2632, %r2631, %r2626; + shf.l.wrap.b32 %r2633, %r2632, %r2632, 25; + add.s32 %r2634, %r2586, %r2096; + add.s32 %r2635, %r2634, %r2549; + xor.b32 %r2636, %r2635, %r2574; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 16; + add.s32 %r2638, %r2637, %r2561; + xor.b32 %r2639, %r2638, %r2549; + shf.l.wrap.b32 %r2640, %r2639, %r2639, 20; + add.s32 %r2641, %r2635, %r2131; + add.s32 %r2642, %r2641, %r2640; + xor.b32 %r2643, %r2642, %r2637; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 24; + add.s32 %r2645, %r2644, %r2638; + xor.b32 %r2646, %r2645, %r2640; + shf.l.wrap.b32 %r2647, %r2646, %r2646, 25; + add.s32 %r2648, %r2600, %r2173; + add.s32 %r2649, %r2648, %r2647; + xor.b32 %r2650, %r2649, %r2616; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 16; + add.s32 %r2652, %r2651, %r2631; + xor.b32 %r2653, %r2652, %r2647; + shf.l.wrap.b32 %r2654, %r2653, %r2653, 20; + add.s32 %r2655, %r2649, %r2180; + add.s32 %r2656, %r2655, %r2654; + xor.b32 %r2657, %r2656, %r2651; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 24; + add.s32 %r2659, %r2658, %r2652; + xor.b32 %r2660, %r2659, %r2654; + shf.l.wrap.b32 %r2661, %r2660, %r2660, 25; + add.s32 %r2662, %r2614, %r2152; + add.s32 %r2663, %r2662, %r2605; + xor.b32 %r2664, %r2663, %r2630; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 16; + add.s32 %r2666, %r2665, %r2645; + xor.b32 %r2667, %r2666, %r2605; + shf.l.wrap.b32 %r2668, %r2667, %r2667, 20; + add.s32 %r2669, %r2663, %r2166; + add.s32 %r2670, %r2669, %r2668; + xor.b32 %r2671, %r2670, %r2665; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 24; + add.s32 %r2673, %r2672, %r2666; + xor.b32 %r2674, %r2673, %r2668; + shf.l.wrap.b32 %r2675, %r2674, %r2674, 25; + add.s32 %r2676, %r2628, %r2194; + add.s32 %r2677, %r2676, %r2619; + xor.b32 %r2678, %r2677, %r2644; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 16; + add.s32 %r2680, %r2679, %r2603; + xor.b32 %r2681, %r2680, %r2619; + shf.l.wrap.b32 %r2682, %r2681, %r2681, 20; + add.s32 %r2683, %r2677, %r2159; + add.s32 %r2684, %r2683, %r2682; + xor.b32 %r2685, %r2684, %r2679; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 24; 
+ add.s32 %r2687, %r2686, %r2680; + xor.b32 %r2688, %r2687, %r2682; + shf.l.wrap.b32 %r2689, %r2688, %r2688, 25; + add.s32 %r2690, %r2642, %r2187; + add.s32 %r2691, %r2690, %r2633; + xor.b32 %r2692, %r2691, %r2602; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 16; + add.s32 %r2694, %r2693, %r2617; + xor.b32 %r2695, %r2694, %r2633; + shf.l.wrap.b32 %r2696, %r2695, %r2695, 20; + add.s32 %r2697, %r2691, %r2145; + add.s32 %r2698, %r2697, %r2696; + xor.b32 %r2699, %r2698, %r2693; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 24; + add.s32 %r2701, %r2700, %r2694; + xor.b32 %r2702, %r2701, %r2696; + shf.l.wrap.b32 %r2703, %r2702, %r2702, 25; + add.s32 %r2704, %r2656, %r2138; + add.s32 %r2705, %r2704, %r2675; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 16; + add.s32 %r2708, %r2707, %r2687; + xor.b32 %r2709, %r2708, %r2675; + shf.l.wrap.b32 %r2710, %r2709, %r2709, 20; + add.s32 %r2711, %r2705, %r2103; + add.s32 %r2712, %r2711, %r2710; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 24; + add.s32 %r2715, %r2714, %r2708; + xor.b32 %r2716, %r2715, %r2710; + shf.l.wrap.b32 %r2717, %r2716, %r2716, 25; + add.s32 %r2718, %r2670, %r2124; + add.s32 %r2719, %r2718, %r2689; + xor.b32 %r2720, %r2719, %r2658; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 16; + add.s32 %r2722, %r2721, %r2701; + xor.b32 %r2723, %r2722, %r2689; + shf.l.wrap.b32 %r2724, %r2723, %r2723, 20; + add.s32 %r2725, %r2719, %r2110; + add.s32 %r2726, %r2725, %r2724; + xor.b32 %r2727, %r2726, %r2721; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 24; + add.s32 %r2729, %r2728, %r2722; + xor.b32 %r2730, %r2729, %r2724; + shf.l.wrap.b32 %r2731, %r2730, %r2730, 25; + add.s32 %r2732, %r2684, %r2089; + add.s32 %r2733, %r2732, %r2703; + xor.b32 %r2734, %r2733, %r2672; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 16; + add.s32 %r2736, %r2735, %r2659; + xor.b32 %r2737, %r2736, %r2703; + shf.l.wrap.b32 %r2738, %r2737, %r2737, 20; + add.s32 %r2739, %r2733, %r2096; + add.s32 %r2740, %r2739, %r2738; + xor.b32 %r2741, %r2740, %r2735; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 24; + add.s32 %r2743, %r2742, %r2736; + xor.b32 %r2744, %r2743, %r2738; + shf.l.wrap.b32 %r2745, %r2744, %r2744, 25; + add.s32 %r2746, %r2698, %r2131; + add.s32 %r2747, %r2746, %r2661; + xor.b32 %r2748, %r2747, %r2686; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 16; + add.s32 %r2750, %r2749, %r2673; + xor.b32 %r2751, %r2750, %r2661; + shf.l.wrap.b32 %r2752, %r2751, %r2751, 20; + add.s32 %r2753, %r2747, %r2117; + add.s32 %r2754, %r2753, %r2752; + xor.b32 %r2755, %r2754, %r2749; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 24; + add.s32 %r2757, %r2756, %r2750; + xor.b32 %r2758, %r2757, %r2752; + shf.l.wrap.b32 %r2759, %r2758, %r2758, 25; + add.s32 %r2760, %r2712, %r2152; + add.s32 %r2761, %r2760, %r2759; + xor.b32 %r2762, %r2761, %r2728; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 16; + add.s32 %r2764, %r2763, %r2743; + xor.b32 %r2765, %r2764, %r2759; + shf.l.wrap.b32 %r2766, %r2765, %r2765, 20; + add.s32 %r2767, %r2761, %r2187; + add.s32 %r2768, %r2767, %r2766; + xor.b32 %r2769, %r2768, %r2763; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 24; + add.s32 %r2771, %r2770, %r2764; + xor.b32 %r2772, %r2771, %r2766; + shf.l.wrap.b32 %r2773, %r2772, %r2772, 25; + add.s32 %r2774, %r2726, %r2166; + add.s32 %r2775, %r2774, %r2717; + xor.b32 %r2776, %r2775, %r2742; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 16; + add.s32 %r2778, %r2777, %r2757; + xor.b32 %r2779, %r2778, %r2717; + shf.l.wrap.b32 %r2780, %r2779, %r2779, 20; + add.s32 %r2781, %r2775, %r2124; + add.s32 %r2782, %r2781, 
%r2780; + xor.b32 %r2783, %r2782, %r2777; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 24; + add.s32 %r2785, %r2784, %r2778; + xor.b32 %r2786, %r2785, %r2780; + shf.l.wrap.b32 %r2787, %r2786, %r2786, 25; + add.s32 %r2788, %r2740, %r2145; + add.s32 %r2789, %r2788, %r2731; + xor.b32 %r2790, %r2789, %r2756; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 16; + add.s32 %r2792, %r2791, %r2715; + xor.b32 %r2793, %r2792, %r2731; + shf.l.wrap.b32 %r2794, %r2793, %r2793, 20; + add.s32 %r2795, %r2789, %r2173; + add.s32 %r2796, %r2795, %r2794; + xor.b32 %r2797, %r2796, %r2791; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 24; + add.s32 %r2799, %r2798, %r2792; + xor.b32 %r2800, %r2799, %r2794; + shf.l.wrap.b32 %r2801, %r2800, %r2800, 25; + add.s32 %r2802, %r2754, %r2194; + add.s32 %r2803, %r2802, %r2745; + xor.b32 %r2804, %r2803, %r2714; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 16; + add.s32 %r2806, %r2805, %r2729; + xor.b32 %r2807, %r2806, %r2745; + shf.l.wrap.b32 %r2808, %r2807, %r2807, 20; + add.s32 %r2809, %r2803, %r2096; + add.s32 %r2810, %r2809, %r2808; + xor.b32 %r2811, %r2810, %r2805; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 24; + add.s32 %r2813, %r2812, %r2806; + xor.b32 %r2814, %r2813, %r2808; + shf.l.wrap.b32 %r2815, %r2814, %r2814, 25; + add.s32 %r2816, %r2768, %r2180; + add.s32 %r2817, %r2816, %r2787; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 16; + add.s32 %r2820, %r2819, %r2799; + xor.b32 %r2821, %r2820, %r2787; + shf.l.wrap.b32 %r2822, %r2821, %r2821, 20; + add.s32 %r2823, %r2817, %r2110; + add.s32 %r2824, %r2823, %r2822; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 24; + add.s32 %r2827, %r2826, %r2820; + xor.b32 %r2828, %r2827, %r2822; + shf.l.wrap.b32 %r2829, %r2828, %r2828, 25; + add.s32 %r2830, %r2782, %r2089; + add.s32 %r2831, %r2830, %r2801; + xor.b32 %r2832, %r2831, %r2770; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 16; + add.s32 %r2834, %r2833, %r2813; + xor.b32 %r2835, %r2834, %r2801; + shf.l.wrap.b32 %r2836, %r2835, %r2835, 20; + add.s32 %r2837, %r2831, %r2159; + add.s32 %r2838, %r2837, %r2836; + xor.b32 %r2839, %r2838, %r2833; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 24; + add.s32 %r2841, %r2840, %r2834; + xor.b32 %r2842, %r2841, %r2836; + shf.l.wrap.b32 %r2843, %r2842, %r2842, 25; + add.s32 %r2844, %r2796, %r2103; + add.s32 %r2845, %r2844, %r2815; + xor.b32 %r2846, %r2845, %r2784; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 16; + add.s32 %r2848, %r2847, %r2771; + xor.b32 %r2849, %r2848, %r2815; + shf.l.wrap.b32 %r2850, %r2849, %r2849, 20; + add.s32 %r2851, %r2845, %r2131; + add.s32 %r2852, %r2851, %r2850; + xor.b32 %r2853, %r2852, %r2847; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 24; + add.s32 %r2855, %r2854, %r2848; + xor.b32 %r2856, %r2855, %r2850; + shf.l.wrap.b32 %r2857, %r2856, %r2856, 25; + add.s32 %r2858, %r2810, %r2117; + add.s32 %r2859, %r2858, %r2773; + xor.b32 %r2860, %r2859, %r2798; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 16; + add.s32 %r2862, %r2861, %r2785; + xor.b32 %r2863, %r2862, %r2773; + shf.l.wrap.b32 %r2864, %r2863, %r2863, 20; + add.s32 %r2865, %r2859, %r2138; + add.s32 %r2866, %r2865, %r2864; + xor.b32 %r2867, %r2866, %r2861; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 24; + add.s32 %r2869, %r2868, %r2862; + xor.b32 %r2870, %r2869, %r2864; + shf.l.wrap.b32 %r2871, %r2870, %r2870, 25; + add.s32 %r2872, %r2824, %r2166; + add.s32 %r2873, %r2872, %r2871; + xor.b32 %r2874, %r2873, %r2840; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 16; + add.s32 %r2876, %r2875, %r2855; + xor.b32 %r2877, %r2876, %r2871; + shf.l.wrap.b32 
%r2878, %r2877, %r2877, 20; + add.s32 %r2879, %r2873, %r2194; + add.s32 %r2880, %r2879, %r2878; + xor.b32 %r2881, %r2880, %r2875; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 24; + add.s32 %r2883, %r2882, %r2876; + xor.b32 %r2884, %r2883, %r2878; + shf.l.wrap.b32 %r2885, %r2884, %r2884, 25; + add.s32 %r2886, %r2838, %r2124; + add.s32 %r2887, %r2886, %r2829; + xor.b32 %r2888, %r2887, %r2854; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 16; + add.s32 %r2890, %r2889, %r2869; + xor.b32 %r2891, %r2890, %r2829; + shf.l.wrap.b32 %r2892, %r2891, %r2891, 20; + add.s32 %r2893, %r2887, %r2089; + add.s32 %r2894, %r2893, %r2892; + xor.b32 %r2895, %r2894, %r2889; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 24; + add.s32 %r2897, %r2896, %r2890; + xor.b32 %r2898, %r2897, %r2892; + shf.l.wrap.b32 %r2899, %r2898, %r2898, 25; + add.s32 %r2900, %r2852, %r2096; + add.s32 %r2901, %r2900, %r2843; + xor.b32 %r2902, %r2901, %r2868; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 16; + add.s32 %r2904, %r2903, %r2827; + xor.b32 %r2905, %r2904, %r2843; + shf.l.wrap.b32 %r2906, %r2905, %r2905, 20; + add.s32 %r2907, %r2901, %r2152; + add.s32 %r2908, %r2907, %r2906; + xor.b32 %r2909, %r2908, %r2903; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 24; + add.s32 %r2911, %r2910, %r2904; + xor.b32 %r2912, %r2911, %r2906; + shf.l.wrap.b32 %r2913, %r2912, %r2912, 25; + add.s32 %r2914, %r2866, %r2145; + add.s32 %r2915, %r2914, %r2857; + xor.b32 %r2916, %r2915, %r2826; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 16; + add.s32 %r2918, %r2917, %r2841; + xor.b32 %r2919, %r2918, %r2857; + shf.l.wrap.b32 %r2920, %r2919, %r2919, 20; + add.s32 %r2921, %r2915, %r2131; + add.s32 %r2922, %r2921, %r2920; + xor.b32 %r2923, %r2922, %r2917; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 24; + add.s32 %r2925, %r2924, %r2918; + xor.b32 %r2926, %r2925, %r2920; + shf.l.wrap.b32 %r2927, %r2926, %r2926, 25; + add.s32 %r2928, %r2880, %r2187; + add.s32 %r2929, %r2928, %r2899; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 16; + add.s32 %r2932, %r2931, %r2911; + xor.b32 %r2933, %r2932, %r2899; + shf.l.wrap.b32 %r2934, %r2933, %r2933, 20; + add.s32 %r2935, %r2929, %r2159; + add.s32 %r2936, %r2935, %r2934; + xor.b32 %r2937, %r2936, %r2931; + shf.l.wrap.b32 %r2938, %r2937, %r2937, 24; + add.s32 %r2939, %r2938, %r2932; + xor.b32 %r2940, %r2939, %r2934; + shf.l.wrap.b32 %r2941, %r2940, %r2940, 25; + add.s32 %r2942, %r2894, %r2103; + add.s32 %r2943, %r2942, %r2913; + xor.b32 %r2944, %r2943, %r2882; + shf.l.wrap.b32 %r2945, %r2944, %r2944, 16; + add.s32 %r2946, %r2945, %r2925; + xor.b32 %r2947, %r2946, %r2913; + shf.l.wrap.b32 %r2948, %r2947, %r2947, 20; + add.s32 %r2949, %r2943, %r2173; + add.s32 %r2950, %r2949, %r2948; + xor.b32 %r2951, %r2950, %r2945; + shf.l.wrap.b32 %r2952, %r2951, %r2951, 24; + add.s32 %r2953, %r2952, %r2946; + xor.b32 %r2954, %r2953, %r2948; + shf.l.wrap.b32 %r2955, %r2954, %r2954, 25; + add.s32 %r2956, %r2908, %r2110; + add.s32 %r2957, %r2956, %r2927; + xor.b32 %r2958, %r2957, %r2896; + shf.l.wrap.b32 %r2959, %r2958, %r2958, 16; + add.s32 %r2960, %r2959, %r2883; + xor.b32 %r2961, %r2960, %r2927; + shf.l.wrap.b32 %r2962, %r2961, %r2961, 20; + add.s32 %r2963, %r2957, %r2117; + add.s32 %r2964, %r2963, %r2962; + xor.b32 %r2965, %r2964, %r2959; + shf.l.wrap.b32 %r2966, %r2965, %r2965, 24; + add.s32 %r2967, %r2966, %r2960; + xor.b32 %r2968, %r2967, %r2962; + shf.l.wrap.b32 %r2969, %r2968, %r2968, 25; + add.s32 %r2970, %r2922, %r2138; + add.s32 %r2971, %r2970, %r2885; + xor.b32 %r2972, %r2971, %r2910; + shf.l.wrap.b32 %r2973, %r2972, %r2972, 16; 
+ add.s32 %r2974, %r2973, %r2897; + xor.b32 %r2975, %r2974, %r2885; + shf.l.wrap.b32 %r2976, %r2975, %r2975, 20; + add.s32 %r2977, %r2971, %r2180; + add.s32 %r2978, %r2977, %r2976; + xor.b32 %r2979, %r2978, %r2973; + shf.l.wrap.b32 %r2980, %r2979, %r2979, 24; + add.s32 %r2981, %r2980, %r2974; + xor.b32 %r2982, %r2981, %r2976; + shf.l.wrap.b32 %r2983, %r2982, %r2982, 25; + xor.b32 %r3964, %r2967, %r2936; + xor.b32 %r3963, %r2981, %r2950; + xor.b32 %r3962, %r2939, %r2964; + xor.b32 %r3961, %r2953, %r2978; + xor.b32 %r3960, %r2983, %r2952; + xor.b32 %r3959, %r2941, %r2966; + xor.b32 %r3958, %r2955, %r2980; + xor.b32 %r3957, %r2969, %r2938; + add.s16 %rs198, %rs198, 1; + st.local.u8 [%rd56+1], %rs198; + add.s64 %rd170, %rd170, 64; + add.s64 %rd171, %rd171, -64; + setp.gt.u64 %p24, %rd171, 64; + @%p24 bra $L__BB0_24; + +$L__BB0_25: + min.u64 %rd63, %rd171, 64; + setp.eq.s64 %p25, %rd63, 0; + mov.u16 %rs200, %rs199; + mov.u16 %rs201, %rs199; + mov.u16 %rs202, %rs199; + mov.u16 %rs203, %rs199; + mov.u16 %rs204, %rs199; + mov.u16 %rs205, %rs199; + mov.u16 %rs206, %rs199; + mov.u16 %rs207, %rs199; + mov.u16 %rs208, %rs199; + mov.u16 %rs209, %rs199; + mov.u16 %rs210, %rs199; + mov.u16 %rs211, %rs199; + mov.u16 %rs212, %rs199; + mov.u16 %rs213, %rs199; + mov.u16 %rs214, %rs199; + mov.u16 %rs215, %rs199; + mov.u16 %rs216, %rs199; + mov.u16 %rs217, %rs199; + mov.u16 %rs218, %rs199; + mov.u16 %rs219, %rs199; + mov.u16 %rs220, %rs199; + mov.u16 %rs221, %rs199; + mov.u16 %rs222, %rs199; + mov.u16 %rs223, %rs199; + mov.u16 %rs224, %rs199; + mov.u16 %rs225, %rs199; + mov.u16 %rs226, %rs199; + mov.u16 %rs227, %rs199; + mov.u16 %rs228, %rs199; + mov.u16 %rs229, %rs199; + mov.u16 %rs230, %rs199; + mov.u16 %rs231, %rs199; + mov.u16 %rs232, %rs199; + @%p25 bra $L__BB0_29; + + mov.u64 %rd172, 0; + +$L__BB0_27: + add.s64 %rd131, %rd170, %rd172; + ld.u8 %rs121, [%rd131]; + add.s64 %rd132, %rd53, %rd172; + st.local.u8 [%rd132], %rs121; + add.s64 %rd172, %rd172, 1; + setp.lt.u64 %p26, %rd172, %rd63; + @%p26 bra $L__BB0_27; + + ld.local.v4.u16 {%rs229, %rs230, %rs231, %rs232}, [%rd53]; + ld.local.v4.u16 {%rs225, %rs226, %rs227, %rs228}, [%rd53+8]; + ld.local.v4.u16 {%rs221, %rs222, %rs223, %rs224}, [%rd53+16]; + ld.local.v4.u16 {%rs217, %rs218, %rs219, %rs220}, [%rd53+24]; + ld.local.v4.u16 {%rs213, %rs214, %rs215, %rs216}, [%rd53+32]; + ld.local.v4.u16 {%rs209, %rs210, %rs211, %rs212}, [%rd53+40]; + ld.local.v4.u16 {%rs205, %rs206, %rs207, %rs208}, [%rd53+48]; + ld.local.v4.u16 {%rs202, %rs203, %rs204, %rs153}, [%rd53+56]; + ld.local.u8 %rs201, [%rd53+61]; + ld.local.v2.u8 {%rs199, %rs200}, [%rd53+62]; + +$L__BB0_29: + ld.param.u64 %rd138, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd137, %rd138; + ld.local.v4.u8 {%rs156, %rs157, %rs158, %rs159}, [%rd53+64]; + cvt.u16.u64 %rs162, %rd63; + add.s16 %rs163, %rs156, %rs162; + st.local.u8 [%rd53+64], %rs163; + setp.eq.s16 %p27, %rs157, 0; + selp.u16 %rs164, 1, 0, %p27; + or.b16 %rs165, %rs158, %rs164; + or.b16 %rs166, %rs165, 2; + shr.u16 %rs167, %rs229, 8; + shr.u16 %rs168, %rs230, 8; + shr.u16 %rs169, %rs231, 8; + shr.u16 %rs170, %rs232, 8; + shr.u16 %rs171, %rs225, 8; + shr.u16 %rs172, %rs226, 8; + shr.u16 %rs173, %rs227, 8; + shr.u16 %rs174, %rs228, 8; + shr.u16 %rs175, %rs221, 8; + shr.u16 %rs176, %rs222, 8; + shr.u16 %rs177, %rs223, 8; + shr.u16 %rs178, %rs224, 8; + shr.u16 %rs179, %rs217, 8; + shr.u16 %rs180, %rs218, 8; + shr.u16 %rs181, %rs219, 8; + shr.u16 %rs182, %rs220, 8; + 
shr.u16 %rs183, %rs213, 8; + shr.u16 %rs184, %rs214, 8; + shr.u16 %rs185, %rs215, 8; + shr.u16 %rs186, %rs216, 8; + shr.u16 %rs187, %rs209, 8; + shr.u16 %rs188, %rs210, 8; + shr.u16 %rs189, %rs211, 8; + shr.u16 %rs190, %rs212, 8; + shr.u16 %rs191, %rs205, 8; + shr.u16 %rs192, %rs206, 8; + shr.u16 %rs193, %rs207, 8; + shr.u16 %rs194, %rs208, 8; + shr.u16 %rs195, %rs202, 8; + shr.u16 %rs196, %rs203, 8; + shl.b64 %rd133, %rd151, 5; + add.s64 %rd134, %rd137, %rd133; + cvt.u32.u16 %r2984, %rs229; + and.b32 %r2985, %r2984, 255; + cvt.u32.u16 %r2986, %rs167; + prmt.b32 %r2987, %r2986, %r2985, 30212; + cvt.u32.u16 %r2988, %rs230; + prmt.b32 %r2989, %r2988, %r2987, 28756; + cvt.u32.u16 %r2990, %rs168; + prmt.b32 %r2991, %r2990, %r2989, 1620; + cvt.u32.u16 %r2992, %rs231; + and.b32 %r2993, %r2992, 255; + cvt.u32.u16 %r2994, %rs169; + prmt.b32 %r2995, %r2994, %r2993, 30212; + cvt.u32.u16 %r2996, %rs232; + prmt.b32 %r2997, %r2996, %r2995, 28756; + cvt.u32.u16 %r2998, %rs170; + prmt.b32 %r2999, %r2998, %r2997, 1620; + cvt.u32.u16 %r3000, %rs225; + and.b32 %r3001, %r3000, 255; + cvt.u32.u16 %r3002, %rs171; + prmt.b32 %r3003, %r3002, %r3001, 30212; + cvt.u32.u16 %r3004, %rs226; + prmt.b32 %r3005, %r3004, %r3003, 28756; + cvt.u32.u16 %r3006, %rs172; + prmt.b32 %r3007, %r3006, %r3005, 1620; + cvt.u32.u16 %r3008, %rs227; + and.b32 %r3009, %r3008, 255; + cvt.u32.u16 %r3010, %rs173; + prmt.b32 %r3011, %r3010, %r3009, 30212; + cvt.u32.u16 %r3012, %rs228; + prmt.b32 %r3013, %r3012, %r3011, 28756; + cvt.u32.u16 %r3014, %rs174; + prmt.b32 %r3015, %r3014, %r3013, 1620; + cvt.u32.u16 %r3016, %rs221; + and.b32 %r3017, %r3016, 255; + cvt.u32.u16 %r3018, %rs175; + prmt.b32 %r3019, %r3018, %r3017, 30212; + cvt.u32.u16 %r3020, %rs222; + prmt.b32 %r3021, %r3020, %r3019, 28756; + cvt.u32.u16 %r3022, %rs176; + prmt.b32 %r3023, %r3022, %r3021, 1620; + cvt.u32.u16 %r3024, %rs223; + and.b32 %r3025, %r3024, 255; + cvt.u32.u16 %r3026, %rs177; + prmt.b32 %r3027, %r3026, %r3025, 30212; + cvt.u32.u16 %r3028, %rs224; + prmt.b32 %r3029, %r3028, %r3027, 28756; + cvt.u32.u16 %r3030, %rs178; + prmt.b32 %r3031, %r3030, %r3029, 1620; + cvt.u32.u16 %r3032, %rs217; + and.b32 %r3033, %r3032, 255; + cvt.u32.u16 %r3034, %rs179; + prmt.b32 %r3035, %r3034, %r3033, 30212; + cvt.u32.u16 %r3036, %rs218; + prmt.b32 %r3037, %r3036, %r3035, 28756; + cvt.u32.u16 %r3038, %rs180; + prmt.b32 %r3039, %r3038, %r3037, 1620; + cvt.u32.u16 %r3040, %rs219; + and.b32 %r3041, %r3040, 255; + cvt.u32.u16 %r3042, %rs181; + prmt.b32 %r3043, %r3042, %r3041, 30212; + cvt.u32.u16 %r3044, %rs220; + prmt.b32 %r3045, %r3044, %r3043, 28756; + cvt.u32.u16 %r3046, %rs182; + prmt.b32 %r3047, %r3046, %r3045, 1620; + cvt.u32.u16 %r3048, %rs213; + and.b32 %r3049, %r3048, 255; + cvt.u32.u16 %r3050, %rs183; + prmt.b32 %r3051, %r3050, %r3049, 30212; + cvt.u32.u16 %r3052, %rs214; + prmt.b32 %r3053, %r3052, %r3051, 28756; + cvt.u32.u16 %r3054, %rs184; + prmt.b32 %r3055, %r3054, %r3053, 1620; + cvt.u32.u16 %r3056, %rs215; + and.b32 %r3057, %r3056, 255; + cvt.u32.u16 %r3058, %rs185; + prmt.b32 %r3059, %r3058, %r3057, 30212; + cvt.u32.u16 %r3060, %rs216; + prmt.b32 %r3061, %r3060, %r3059, 28756; + cvt.u32.u16 %r3062, %rs186; + prmt.b32 %r3063, %r3062, %r3061, 1620; + cvt.u32.u16 %r3064, %rs209; + and.b32 %r3065, %r3064, 255; + cvt.u32.u16 %r3066, %rs187; + prmt.b32 %r3067, %r3066, %r3065, 30212; + cvt.u32.u16 %r3068, %rs210; + prmt.b32 %r3069, %r3068, %r3067, 28756; + cvt.u32.u16 %r3070, %rs188; + prmt.b32 %r3071, %r3070, %r3069, 1620; + cvt.u32.u16 %r3072, %rs211; + and.b32 %r3073, 
%r3072, 255; + cvt.u32.u16 %r3074, %rs189; + prmt.b32 %r3075, %r3074, %r3073, 30212; + cvt.u32.u16 %r3076, %rs212; + prmt.b32 %r3077, %r3076, %r3075, 28756; + cvt.u32.u16 %r3078, %rs190; + prmt.b32 %r3079, %r3078, %r3077, 1620; + cvt.u32.u16 %r3080, %rs205; + and.b32 %r3081, %r3080, 255; + cvt.u32.u16 %r3082, %rs191; + prmt.b32 %r3083, %r3082, %r3081, 30212; + cvt.u32.u16 %r3084, %rs206; + prmt.b32 %r3085, %r3084, %r3083, 28756; + cvt.u32.u16 %r3086, %rs192; + prmt.b32 %r3087, %r3086, %r3085, 1620; + cvt.u32.u16 %r3088, %rs207; + and.b32 %r3089, %r3088, 255; + cvt.u32.u16 %r3090, %rs193; + prmt.b32 %r3091, %r3090, %r3089, 30212; + cvt.u32.u16 %r3092, %rs208; + prmt.b32 %r3093, %r3092, %r3091, 28756; + cvt.u32.u16 %r3094, %rs194; + prmt.b32 %r3095, %r3094, %r3093, 1620; + cvt.u32.u16 %r3096, %rs202; + and.b32 %r3097, %r3096, 255; + cvt.u32.u16 %r3098, %rs195; + prmt.b32 %r3099, %r3098, %r3097, 30212; + cvt.u32.u16 %r3100, %rs203; + prmt.b32 %r3101, %r3100, %r3099, 28756; + cvt.u32.u16 %r3102, %rs196; + prmt.b32 %r3103, %r3102, %r3101, 1620; + cvt.u32.u16 %r3104, %rs204; + and.b32 %r3105, %r3104, 255; + cvt.u32.u16 %r3106, %rs201; + prmt.b32 %r3107, %r3106, %r3105, 30212; + cvt.u32.u16 %r3108, %rs199; + shl.b32 %r3109, %r3108, 16; + and.b32 %r3110, %r3109, 16711680; + or.b32 %r3111, %r3107, %r3110; + cvt.u32.u16 %r3112, %rs200; + shl.b32 %r3113, %r3112, 24; + or.b32 %r3114, %r3111, %r3113; + cvt.u32.u16 %r3115, %rs163; + and.b32 %r3116, %r3115, 255; + cvt.u32.u16 %r3117, %rs166; + and.b32 %r3118, %r3117, 255; + add.s32 %r3119, %r3960, %r3964; + add.s32 %r3120, %r3119, %r2991; + xor.b32 %r3121, %r3120, %r36; + shf.l.wrap.b32 %r3122, %r3121, %r3121, 16; + add.s32 %r3123, %r3122, 1779033703; + xor.b32 %r3124, %r3123, %r3960; + shf.l.wrap.b32 %r3125, %r3124, %r3124, 20; + add.s32 %r3126, %r2999, %r3120; + add.s32 %r3127, %r3126, %r3125; + xor.b32 %r3128, %r3127, %r3122; + shf.l.wrap.b32 %r3129, %r3128, %r3128, 24; + add.s32 %r3130, %r3129, %r3123; + xor.b32 %r3131, %r3130, %r3125; + shf.l.wrap.b32 %r3132, %r3131, %r3131, 25; + add.s32 %r3133, %r3959, %r3963; + add.s32 %r3134, %r3133, %r3007; + xor.b32 %r3135, %r3134, %r37; + shf.l.wrap.b32 %r3136, %r3135, %r3135, 16; + add.s32 %r3137, %r3136, -1150833019; + xor.b32 %r3138, %r3137, %r3959; + shf.l.wrap.b32 %r3139, %r3138, %r3138, 20; + add.s32 %r3140, %r3015, %r3134; + add.s32 %r3141, %r3140, %r3139; + xor.b32 %r3142, %r3141, %r3136; + shf.l.wrap.b32 %r3143, %r3142, %r3142, 24; + add.s32 %r3144, %r3143, %r3137; + xor.b32 %r3145, %r3144, %r3139; + shf.l.wrap.b32 %r3146, %r3145, %r3145, 25; + add.s32 %r3147, %r3958, %r3962; + add.s32 %r3148, %r3147, %r3023; + xor.b32 %r3149, %r3148, %r3116; + shr.u32 %r3150, %r3148, 16; + shl.b32 %r3151, %r3149, 16; + or.b32 %r3152, %r3151, %r3150; + add.s32 %r3153, %r3152, 1013904242; + xor.b32 %r3154, %r3153, %r3958; + shf.l.wrap.b32 %r3155, %r3154, %r3154, 20; + add.s32 %r3156, %r3031, %r3148; + add.s32 %r3157, %r3156, %r3155; + xor.b32 %r3158, %r3157, %r3152; + shf.l.wrap.b32 %r3159, %r3158, %r3158, 24; + add.s32 %r3160, %r3159, %r3153; + xor.b32 %r3161, %r3160, %r3155; + shf.l.wrap.b32 %r3162, %r3161, %r3161, 25; + add.s32 %r3163, %r3957, %r3961; + add.s32 %r3164, %r3163, %r3039; + xor.b32 %r3165, %r3164, %r3118; + shr.u32 %r3166, %r3164, 16; + shl.b32 %r3167, %r3165, 16; + or.b32 %r3168, %r3167, %r3166; + add.s32 %r3169, %r3168, -1521486534; + xor.b32 %r3170, %r3169, %r3957; + shf.l.wrap.b32 %r3171, %r3170, %r3170, 20; + add.s32 %r3172, %r3047, %r3164; + add.s32 %r3173, %r3172, %r3171; + xor.b32 %r3174, 
%r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 24; + add.s32 %r3176, %r3175, %r3169; + xor.b32 %r3177, %r3176, %r3171; + shf.l.wrap.b32 %r3178, %r3177, %r3177, 25; + add.s32 %r3179, %r3146, %r3127; + add.s32 %r3180, %r3179, %r3055; + xor.b32 %r3181, %r3175, %r3180; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 16; + add.s32 %r3183, %r3182, %r3160; + xor.b32 %r3184, %r3183, %r3146; + shf.l.wrap.b32 %r3185, %r3184, %r3184, 20; + add.s32 %r3186, %r3063, %r3180; + add.s32 %r3187, %r3186, %r3185; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 24; + add.s32 %r3190, %r3189, %r3183; + xor.b32 %r3191, %r3190, %r3185; + shf.l.wrap.b32 %r3192, %r3191, %r3191, 25; + add.s32 %r3193, %r3162, %r3141; + add.s32 %r3194, %r3193, %r3071; + xor.b32 %r3195, %r3194, %r3129; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 16; + add.s32 %r3197, %r3196, %r3176; + xor.b32 %r3198, %r3197, %r3162; + shf.l.wrap.b32 %r3199, %r3198, %r3198, 20; + add.s32 %r3200, %r3079, %r3194; + add.s32 %r3201, %r3200, %r3199; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 24; + add.s32 %r3204, %r3203, %r3197; + xor.b32 %r3205, %r3204, %r3199; + shf.l.wrap.b32 %r3206, %r3205, %r3205, 25; + add.s32 %r3207, %r3178, %r3157; + add.s32 %r3208, %r3207, %r3087; + xor.b32 %r3209, %r3208, %r3143; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 16; + add.s32 %r3211, %r3210, %r3130; + xor.b32 %r3212, %r3211, %r3178; + shf.l.wrap.b32 %r3213, %r3212, %r3212, 20; + add.s32 %r3214, %r3095, %r3208; + add.s32 %r3215, %r3214, %r3213; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 24; + add.s32 %r3218, %r3217, %r3211; + xor.b32 %r3219, %r3218, %r3213; + shf.l.wrap.b32 %r3220, %r3219, %r3219, 25; + add.s32 %r3221, %r3173, %r3132; + add.s32 %r3222, %r3221, %r3103; + xor.b32 %r3223, %r3222, %r3159; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 16; + add.s32 %r3225, %r3224, %r3144; + xor.b32 %r3226, %r3225, %r3132; + shf.l.wrap.b32 %r3227, %r3226, %r3226, 20; + add.s32 %r3228, %r3114, %r3222; + add.s32 %r3229, %r3228, %r3227; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 24; + add.s32 %r3232, %r3231, %r3225; + xor.b32 %r3233, %r3232, %r3227; + shf.l.wrap.b32 %r3234, %r3233, %r3233, 25; + add.s32 %r3235, %r3187, %r3007; + add.s32 %r3236, %r3235, %r3234; + xor.b32 %r3237, %r3236, %r3203; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 16; + add.s32 %r3239, %r3238, %r3218; + xor.b32 %r3240, %r3239, %r3234; + shf.l.wrap.b32 %r3241, %r3240, %r3240, 20; + add.s32 %r3242, %r3236, %r3039; + add.s32 %r3243, %r3242, %r3241; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 24; + add.s32 %r3246, %r3245, %r3239; + xor.b32 %r3247, %r3246, %r3241; + shf.l.wrap.b32 %r3248, %r3247, %r3247, 25; + add.s32 %r3249, %r3201, %r3015; + add.s32 %r3250, %r3249, %r3192; + xor.b32 %r3251, %r3217, %r3250; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 16; + add.s32 %r3253, %r3232, %r3252; + xor.b32 %r3254, %r3253, %r3192; + shf.l.wrap.b32 %r3255, %r3254, %r3254, 20; + add.s32 %r3256, %r3250, %r3071; + add.s32 %r3257, %r3256, %r3255; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 24; + add.s32 %r3260, %r3259, %r3253; + xor.b32 %r3261, %r3260, %r3255; + shf.l.wrap.b32 %r3262, %r3261, %r3261, 25; + add.s32 %r3263, %r3206, %r3047; + add.s32 %r3264, %r3263, %r3215; + xor.b32 %r3265, %r3231, %r3264; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 16; + add.s32 %r3267, %r3266, %r3190; + xor.b32 %r3268, %r3267, %r3206; + shf.l.wrap.b32 %r3269, %r3268, %r3268, 20; 
+ add.s32 %r3270, %r3264, %r2991; + add.s32 %r3271, %r3270, %r3269; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 24; + add.s32 %r3274, %r3273, %r3267; + xor.b32 %r3275, %r3274, %r3269; + shf.l.wrap.b32 %r3276, %r3275, %r3275, 25; + add.s32 %r3277, %r3220, %r3023; + add.s32 %r3278, %r3277, %r3229; + xor.b32 %r3279, %r3278, %r3189; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 16; + add.s32 %r3281, %r3280, %r3204; + xor.b32 %r3282, %r3281, %r3220; + shf.l.wrap.b32 %r3283, %r3282, %r3282, 20; + add.s32 %r3284, %r3278, %r3095; + add.s32 %r3285, %r3284, %r3283; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 24; + add.s32 %r3288, %r3287, %r3281; + xor.b32 %r3289, %r3288, %r3283; + shf.l.wrap.b32 %r3290, %r3289, %r3289, 25; + add.s32 %r3291, %r3262, %r2999; + add.s32 %r3292, %r3291, %r3243; + xor.b32 %r3293, %r3292, %r3287; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 16; + add.s32 %r3295, %r3294, %r3274; + xor.b32 %r3296, %r3295, %r3262; + shf.l.wrap.b32 %r3297, %r3296, %r3296, 20; + add.s32 %r3298, %r3292, %r3079; + add.s32 %r3299, %r3298, %r3297; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 24; + add.s32 %r3302, %r3301, %r3295; + xor.b32 %r3303, %r3302, %r3297; + shf.l.wrap.b32 %r3304, %r3303, %r3303, 25; + add.s32 %r3305, %r3257, %r3087; + add.s32 %r3306, %r3305, %r3276; + xor.b32 %r3307, %r3245, %r3306; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 16; + add.s32 %r3309, %r3308, %r3288; + xor.b32 %r3310, %r3309, %r3276; + shf.l.wrap.b32 %r3311, %r3310, %r3310, 20; + add.s32 %r3312, %r3306, %r3031; + add.s32 %r3313, %r3312, %r3311; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 24; + add.s32 %r3316, %r3315, %r3309; + xor.b32 %r3317, %r3316, %r3311; + shf.l.wrap.b32 %r3318, %r3317, %r3317, 25; + add.s32 %r3319, %r3271, %r3063; + add.s32 %r3320, %r3319, %r3290; + xor.b32 %r3321, %r3320, %r3259; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 16; + add.s32 %r3323, %r3322, %r3246; + xor.b32 %r3324, %r3323, %r3290; + shf.l.wrap.b32 %r3325, %r3324, %r3324, 20; + add.s32 %r3326, %r3320, %r3103; + add.s32 %r3327, %r3326, %r3325; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 24; + add.s32 %r3330, %r3329, %r3323; + xor.b32 %r3331, %r3330, %r3325; + shf.l.wrap.b32 %r3332, %r3331, %r3331, 25; + add.s32 %r3333, %r3285, %r3114; + add.s32 %r3334, %r3333, %r3248; + xor.b32 %r3335, %r3334, %r3273; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 16; + add.s32 %r3337, %r3336, %r3260; + xor.b32 %r3338, %r3337, %r3248; + shf.l.wrap.b32 %r3339, %r3338, %r3338, 20; + add.s32 %r3340, %r3334, %r3055; + add.s32 %r3341, %r3340, %r3339; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 24; + add.s32 %r3344, %r3343, %r3337; + xor.b32 %r3345, %r3344, %r3339; + shf.l.wrap.b32 %r3346, %r3345, %r3345, 25; + add.s32 %r3347, %r3299, %r3015; + add.s32 %r3348, %r3347, %r3346; + xor.b32 %r3349, %r3348, %r3315; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 16; + add.s32 %r3351, %r3350, %r3330; + xor.b32 %r3352, %r3351, %r3346; + shf.l.wrap.b32 %r3353, %r3352, %r3352, 20; + add.s32 %r3354, %r3348, %r3023; + add.s32 %r3355, %r3354, %r3353; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 24; + add.s32 %r3358, %r3357, %r3351; + xor.b32 %r3359, %r3358, %r3353; + shf.l.wrap.b32 %r3360, %r3359, %r3359, 25; + add.s32 %r3361, %r3313, %r3071; + add.s32 %r3362, %r3361, %r3304; + xor.b32 %r3363, %r3362, %r3329; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 16; + add.s32 %r3365, %r3364, 
%r3344; + xor.b32 %r3366, %r3365, %r3304; + shf.l.wrap.b32 %r3367, %r3366, %r3366, 20; + add.s32 %r3368, %r3362, %r3087; + add.s32 %r3369, %r3368, %r3367; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 24; + add.s32 %r3372, %r3371, %r3365; + xor.b32 %r3373, %r3372, %r3367; + shf.l.wrap.b32 %r3374, %r3373, %r3373, 25; + add.s32 %r3375, %r3327, %r3095; + add.s32 %r3376, %r3375, %r3318; + xor.b32 %r3377, %r3343, %r3376; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 16; + add.s32 %r3379, %r3378, %r3302; + xor.b32 %r3380, %r3379, %r3318; + shf.l.wrap.b32 %r3381, %r3380, %r3380, 20; + add.s32 %r3382, %r3376, %r3007; + add.s32 %r3383, %r3382, %r3381; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 24; + add.s32 %r3386, %r3385, %r3379; + xor.b32 %r3387, %r3386, %r3381; + shf.l.wrap.b32 %r3388, %r3387, %r3387, 25; + add.s32 %r3389, %r3332, %r3047; + add.s32 %r3390, %r3389, %r3341; + xor.b32 %r3391, %r3390, %r3301; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 16; + add.s32 %r3393, %r3392, %r3316; + xor.b32 %r3394, %r3393, %r3332; + shf.l.wrap.b32 %r3395, %r3394, %r3394, 20; + add.s32 %r3396, %r3390, %r3103; + add.s32 %r3397, %r3396, %r3395; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 24; + add.s32 %r3400, %r3399, %r3393; + xor.b32 %r3401, %r3400, %r3395; + shf.l.wrap.b32 %r3402, %r3401, %r3401, 25; + add.s32 %r3403, %r3374, %r3039; + add.s32 %r3404, %r3403, %r3355; + xor.b32 %r3405, %r3404, %r3399; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 16; + add.s32 %r3407, %r3406, %r3386; + xor.b32 %r3408, %r3407, %r3374; + shf.l.wrap.b32 %r3409, %r3408, %r3408, 20; + add.s32 %r3410, %r3404, %r3031; + add.s32 %r3411, %r3410, %r3409; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 24; + add.s32 %r3414, %r3413, %r3407; + xor.b32 %r3415, %r3414, %r3409; + shf.l.wrap.b32 %r3416, %r3415, %r3415, 25; + add.s32 %r3417, %r3369, %r3063; + add.s32 %r3418, %r3417, %r3388; + xor.b32 %r3419, %r3357, %r3418; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 16; + add.s32 %r3421, %r3420, %r3400; + xor.b32 %r3422, %r3421, %r3388; + shf.l.wrap.b32 %r3423, %r3422, %r3422, 20; + add.s32 %r3424, %r3418, %r2991; + add.s32 %r3425, %r3424, %r3423; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 24; + add.s32 %r3428, %r3427, %r3421; + xor.b32 %r3429, %r3428, %r3423; + shf.l.wrap.b32 %r3430, %r3429, %r3429, 25; + add.s32 %r3431, %r3383, %r3079; + add.s32 %r3432, %r3431, %r3402; + xor.b32 %r3433, %r3432, %r3371; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 16; + add.s32 %r3435, %r3434, %r3358; + xor.b32 %r3436, %r3435, %r3402; + shf.l.wrap.b32 %r3437, %r3436, %r3436, 20; + add.s32 %r3438, %r3432, %r3114; + add.s32 %r3439, %r3438, %r3437; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 24; + add.s32 %r3442, %r3441, %r3435; + xor.b32 %r3443, %r3442, %r3437; + shf.l.wrap.b32 %r3444, %r3443, %r3443, 25; + add.s32 %r3445, %r3397, %r3055; + add.s32 %r3446, %r3445, %r3360; + xor.b32 %r3447, %r3446, %r3385; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 16; + add.s32 %r3449, %r3448, %r3372; + xor.b32 %r3450, %r3449, %r3360; + shf.l.wrap.b32 %r3451, %r3450, %r3450, 20; + add.s32 %r3452, %r3446, %r2999; + add.s32 %r3453, %r3452, %r3451; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 24; + add.s32 %r3456, %r3455, %r3449; + xor.b32 %r3457, %r3456, %r3451; + shf.l.wrap.b32 %r3458, %r3457, %r3457, 25; + add.s32 %r3459, %r3411, %r3071; + add.s32 %r3460, %r3459, %r3458; + xor.b32 %r3461, 
%r3460, %r3427; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 16; + add.s32 %r3463, %r3462, %r3442; + xor.b32 %r3464, %r3463, %r3458; + shf.l.wrap.b32 %r3465, %r3464, %r3464, 20; + add.s32 %r3466, %r3460, %r3047; + add.s32 %r3467, %r3466, %r3465; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 24; + add.s32 %r3470, %r3469, %r3463; + xor.b32 %r3471, %r3470, %r3465; + shf.l.wrap.b32 %r3472, %r3471, %r3471, 25; + add.s32 %r3473, %r3425, %r3087; + add.s32 %r3474, %r3473, %r3416; + xor.b32 %r3475, %r3474, %r3441; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 16; + add.s32 %r3477, %r3476, %r3456; + xor.b32 %r3478, %r3477, %r3416; + shf.l.wrap.b32 %r3479, %r3478, %r3478, 20; + add.s32 %r3480, %r3474, %r3063; + add.s32 %r3481, %r3480, %r3479; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 24; + add.s32 %r3484, %r3483, %r3477; + xor.b32 %r3485, %r3484, %r3479; + shf.l.wrap.b32 %r3486, %r3485, %r3485, 25; + add.s32 %r3487, %r3439, %r3103; + add.s32 %r3488, %r3487, %r3430; + xor.b32 %r3489, %r3455, %r3488; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 16; + add.s32 %r3491, %r3490, %r3414; + xor.b32 %r3492, %r3491, %r3430; + shf.l.wrap.b32 %r3493, %r3492, %r3492, 20; + add.s32 %r3494, %r3488, %r3015; + add.s32 %r3495, %r3494, %r3493; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 24; + add.s32 %r3498, %r3497, %r3491; + xor.b32 %r3499, %r3498, %r3493; + shf.l.wrap.b32 %r3500, %r3499, %r3499, 25; + add.s32 %r3501, %r3444, %r3095; + add.s32 %r3502, %r3501, %r3453; + xor.b32 %r3503, %r3502, %r3413; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 16; + add.s32 %r3505, %r3504, %r3428; + xor.b32 %r3506, %r3505, %r3444; + shf.l.wrap.b32 %r3507, %r3506, %r3506, 20; + add.s32 %r3508, %r3502, %r3114; + add.s32 %r3509, %r3508, %r3507; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 24; + add.s32 %r3512, %r3511, %r3505; + xor.b32 %r3513, %r3512, %r3507; + shf.l.wrap.b32 %r3514, %r3513, %r3513, 25; + add.s32 %r3515, %r3486, %r3023; + add.s32 %r3516, %r3515, %r3467; + xor.b32 %r3517, %r3516, %r3511; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 16; + add.s32 %r3519, %r3518, %r3498; + xor.b32 %r3520, %r3519, %r3486; + shf.l.wrap.b32 %r3521, %r3520, %r3520, 20; + add.s32 %r3522, %r3516, %r2991; + add.s32 %r3523, %r3522, %r3521; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 24; + add.s32 %r3526, %r3525, %r3519; + xor.b32 %r3527, %r3526, %r3521; + shf.l.wrap.b32 %r3528, %r3527, %r3527, 25; + add.s32 %r3529, %r3481, %r3079; + add.s32 %r3530, %r3529, %r3500; + xor.b32 %r3531, %r3469, %r3530; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 16; + add.s32 %r3533, %r3532, %r3512; + xor.b32 %r3534, %r3533, %r3500; + shf.l.wrap.b32 %r3535, %r3534, %r3534, 20; + add.s32 %r3536, %r3530, %r3007; + add.s32 %r3537, %r3536, %r3535; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 24; + add.s32 %r3540, %r3539, %r3533; + xor.b32 %r3541, %r3540, %r3535; + shf.l.wrap.b32 %r3542, %r3541, %r3541, 25; + add.s32 %r3543, %r3495, %r3031; + add.s32 %r3544, %r3543, %r3514; + xor.b32 %r3545, %r3544, %r3483; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 16; + add.s32 %r3547, %r3546, %r3470; + xor.b32 %r3548, %r3547, %r3514; + shf.l.wrap.b32 %r3549, %r3548, %r3548, 20; + add.s32 %r3550, %r3544, %r3055; + add.s32 %r3551, %r3550, %r3549; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 24; + add.s32 %r3554, %r3553, %r3547; + xor.b32 %r3555, %r3554, %r3549; + shf.l.wrap.b32 %r3556, %r3555, %r3555, 25; 
+ add.s32 %r3557, %r3509, %r2999; + add.s32 %r3558, %r3557, %r3472; + xor.b32 %r3559, %r3558, %r3497; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 16; + add.s32 %r3561, %r3560, %r3484; + xor.b32 %r3562, %r3561, %r3472; + shf.l.wrap.b32 %r3563, %r3562, %r3562, 20; + add.s32 %r3564, %r3558, %r3039; + add.s32 %r3565, %r3564, %r3563; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 24; + add.s32 %r3568, %r3567, %r3561; + xor.b32 %r3569, %r3568, %r3563; + shf.l.wrap.b32 %r3570, %r3569, %r3569, 25; + add.s32 %r3571, %r3523, %r3087; + add.s32 %r3572, %r3571, %r3570; + xor.b32 %r3573, %r3572, %r3539; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 16; + add.s32 %r3575, %r3574, %r3554; + xor.b32 %r3576, %r3575, %r3570; + shf.l.wrap.b32 %r3577, %r3576, %r3576, 20; + add.s32 %r3578, %r3572, %r3095; + add.s32 %r3579, %r3578, %r3577; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 24; + add.s32 %r3582, %r3581, %r3575; + xor.b32 %r3583, %r3582, %r3577; + shf.l.wrap.b32 %r3584, %r3583, %r3583, 25; + add.s32 %r3585, %r3537, %r3063; + add.s32 %r3586, %r3585, %r3528; + xor.b32 %r3587, %r3586, %r3553; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 16; + add.s32 %r3589, %r3588, %r3568; + xor.b32 %r3590, %r3589, %r3528; + shf.l.wrap.b32 %r3591, %r3590, %r3590, 20; + add.s32 %r3592, %r3586, %r3079; + add.s32 %r3593, %r3592, %r3591; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 24; + add.s32 %r3596, %r3595, %r3589; + xor.b32 %r3597, %r3596, %r3591; + shf.l.wrap.b32 %r3598, %r3597, %r3597, 25; + add.s32 %r3599, %r3551, %r3114; + add.s32 %r3600, %r3599, %r3542; + xor.b32 %r3601, %r3567, %r3600; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 16; + add.s32 %r3603, %r3602, %r3526; + xor.b32 %r3604, %r3603, %r3542; + shf.l.wrap.b32 %r3605, %r3604, %r3604, 20; + add.s32 %r3606, %r3600, %r3071; + add.s32 %r3607, %r3606, %r3605; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 24; + add.s32 %r3610, %r3609, %r3603; + xor.b32 %r3611, %r3610, %r3605; + shf.l.wrap.b32 %r3612, %r3611, %r3611, 25; + add.s32 %r3613, %r3556, %r3103; + add.s32 %r3614, %r3613, %r3565; + xor.b32 %r3615, %r3614, %r3525; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 16; + add.s32 %r3617, %r3616, %r3540; + xor.b32 %r3618, %r3617, %r3556; + shf.l.wrap.b32 %r3619, %r3618, %r3618, 20; + add.s32 %r3620, %r3614, %r3055; + add.s32 %r3621, %r3620, %r3619; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 24; + add.s32 %r3624, %r3623, %r3617; + xor.b32 %r3625, %r3624, %r3619; + shf.l.wrap.b32 %r3626, %r3625, %r3625, 25; + add.s32 %r3627, %r3598, %r3047; + add.s32 %r3628, %r3627, %r3579; + xor.b32 %r3629, %r3628, %r3623; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 16; + add.s32 %r3631, %r3630, %r3610; + xor.b32 %r3632, %r3631, %r3598; + shf.l.wrap.b32 %r3633, %r3632, %r3632, 20; + add.s32 %r3634, %r3628, %r3007; + add.s32 %r3635, %r3634, %r3633; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 24; + add.s32 %r3638, %r3637, %r3631; + xor.b32 %r3639, %r3638, %r3633; + shf.l.wrap.b32 %r3640, %r3639, %r3639, 25; + add.s32 %r3641, %r3593, %r3031; + add.s32 %r3642, %r3641, %r3612; + xor.b32 %r3643, %r3581, %r3642; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 16; + add.s32 %r3645, %r3644, %r3624; + xor.b32 %r3646, %r3645, %r3612; + shf.l.wrap.b32 %r3647, %r3646, %r3646, 20; + add.s32 %r3648, %r3642, %r3015; + add.s32 %r3649, %r3648, %r3647; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 24; + add.s32 %r3652, %r3651, 
%r3645; + xor.b32 %r3653, %r3652, %r3647; + shf.l.wrap.b32 %r3654, %r3653, %r3653, 25; + add.s32 %r3655, %r3607, %r2991; + add.s32 %r3656, %r3655, %r3626; + xor.b32 %r3657, %r3656, %r3595; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 16; + add.s32 %r3659, %r3658, %r3582; + xor.b32 %r3660, %r3659, %r3626; + shf.l.wrap.b32 %r3661, %r3660, %r3660, 20; + add.s32 %r3662, %r3656, %r2999; + add.s32 %r3663, %r3662, %r3661; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 24; + add.s32 %r3666, %r3665, %r3659; + xor.b32 %r3667, %r3666, %r3661; + shf.l.wrap.b32 %r3668, %r3667, %r3667, 25; + add.s32 %r3669, %r3621, %r3039; + add.s32 %r3670, %r3669, %r3584; + xor.b32 %r3671, %r3670, %r3609; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 16; + add.s32 %r3673, %r3672, %r3596; + xor.b32 %r3674, %r3673, %r3584; + shf.l.wrap.b32 %r3675, %r3674, %r3674, 20; + add.s32 %r3676, %r3670, %r3023; + add.s32 %r3677, %r3676, %r3675; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 24; + add.s32 %r3680, %r3679, %r3673; + xor.b32 %r3681, %r3680, %r3675; + shf.l.wrap.b32 %r3682, %r3681, %r3681, 25; + add.s32 %r3683, %r3635, %r3063; + add.s32 %r3684, %r3683, %r3682; + xor.b32 %r3685, %r3684, %r3651; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 16; + add.s32 %r3687, %r3686, %r3666; + xor.b32 %r3688, %r3687, %r3682; + shf.l.wrap.b32 %r3689, %r3688, %r3688, 20; + add.s32 %r3690, %r3684, %r3103; + add.s32 %r3691, %r3690, %r3689; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 24; + add.s32 %r3694, %r3693, %r3687; + xor.b32 %r3695, %r3694, %r3689; + shf.l.wrap.b32 %r3696, %r3695, %r3695, 25; + add.s32 %r3697, %r3649, %r3079; + add.s32 %r3698, %r3697, %r3640; + xor.b32 %r3699, %r3698, %r3665; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 16; + add.s32 %r3701, %r3700, %r3680; + xor.b32 %r3702, %r3701, %r3640; + shf.l.wrap.b32 %r3703, %r3702, %r3702, 20; + add.s32 %r3704, %r3698, %r3031; + add.s32 %r3705, %r3704, %r3703; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 24; + add.s32 %r3708, %r3707, %r3701; + xor.b32 %r3709, %r3708, %r3703; + shf.l.wrap.b32 %r3710, %r3709, %r3709, 25; + add.s32 %r3711, %r3663, %r3055; + add.s32 %r3712, %r3711, %r3654; + xor.b32 %r3713, %r3679, %r3712; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 16; + add.s32 %r3715, %r3714, %r3638; + xor.b32 %r3716, %r3715, %r3654; + shf.l.wrap.b32 %r3717, %r3716, %r3716, 20; + add.s32 %r3718, %r3712, %r3087; + add.s32 %r3719, %r3718, %r3717; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 24; + add.s32 %r3722, %r3721, %r3715; + xor.b32 %r3723, %r3722, %r3717; + shf.l.wrap.b32 %r3724, %r3723, %r3723, 25; + add.s32 %r3725, %r3668, %r3114; + add.s32 %r3726, %r3725, %r3677; + xor.b32 %r3727, %r3726, %r3637; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 16; + add.s32 %r3729, %r3728, %r3652; + xor.b32 %r3730, %r3729, %r3668; + shf.l.wrap.b32 %r3731, %r3730, %r3730, 20; + add.s32 %r3732, %r3726, %r2999; + add.s32 %r3733, %r3732, %r3731; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 24; + add.s32 %r3736, %r3735, %r3729; + xor.b32 %r3737, %r3736, %r3731; + shf.l.wrap.b32 %r3738, %r3737, %r3737, 25; + add.s32 %r3739, %r3710, %r3095; + add.s32 %r3740, %r3739, %r3691; + xor.b32 %r3741, %r3740, %r3735; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 16; + add.s32 %r3743, %r3742, %r3722; + xor.b32 %r3744, %r3743, %r3710; + shf.l.wrap.b32 %r3745, %r3744, %r3744, 20; + add.s32 %r3746, %r3740, %r3015; + add.s32 %r3747, %r3746, %r3745; + xor.b32 %r3748, 
%r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 24; + add.s32 %r3750, %r3749, %r3743; + xor.b32 %r3751, %r3750, %r3745; + shf.l.wrap.b32 %r3752, %r3751, %r3751, 25; + add.s32 %r3753, %r3705, %r2991; + add.s32 %r3754, %r3753, %r3724; + xor.b32 %r3755, %r3693, %r3754; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 16; + add.s32 %r3757, %r3756, %r3736; + xor.b32 %r3758, %r3757, %r3724; + shf.l.wrap.b32 %r3759, %r3758, %r3758, 20; + add.s32 %r3760, %r3754, %r3071; + add.s32 %r3761, %r3760, %r3759; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 24; + add.s32 %r3764, %r3763, %r3757; + xor.b32 %r3765, %r3764, %r3759; + shf.l.wrap.b32 %r3766, %r3765, %r3765, 25; + add.s32 %r3767, %r3719, %r3007; + add.s32 %r3768, %r3767, %r3738; + xor.b32 %r3769, %r3768, %r3707; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 16; + add.s32 %r3771, %r3770, %r3694; + xor.b32 %r3772, %r3771, %r3738; + shf.l.wrap.b32 %r3773, %r3772, %r3772, 20; + add.s32 %r3774, %r3768, %r3039; + add.s32 %r3775, %r3774, %r3773; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 24; + add.s32 %r3778, %r3777, %r3771; + xor.b32 %r3779, %r3778, %r3773; + shf.l.wrap.b32 %r3780, %r3779, %r3779, 25; + add.s32 %r3781, %r3733, %r3023; + add.s32 %r3782, %r3781, %r3696; + xor.b32 %r3783, %r3782, %r3721; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 16; + add.s32 %r3785, %r3784, %r3708; + xor.b32 %r3786, %r3785, %r3696; + shf.l.wrap.b32 %r3787, %r3786, %r3786, 20; + add.s32 %r3788, %r3782, %r3047; + add.s32 %r3789, %r3788, %r3787; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 24; + add.s32 %r3792, %r3791, %r3785; + xor.b32 %r3793, %r3792, %r3787; + shf.l.wrap.b32 %r3794, %r3793, %r3793, 25; + add.s32 %r3795, %r3747, %r3079; + add.s32 %r3796, %r3795, %r3794; + xor.b32 %r3797, %r3796, %r3763; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 16; + add.s32 %r3799, %r3798, %r3778; + xor.b32 %r3800, %r3799, %r3794; + shf.l.wrap.b32 %r3801, %r3800, %r3800, 20; + add.s32 %r3802, %r3796, %r3114; + add.s32 %r3803, %r3802, %r3801; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 24; + add.s32 %r3806, %r3805, %r3799; + xor.b32 %r3807, %r3806, %r3801; + shf.l.wrap.b32 %r3808, %r3807, %r3807, 25; + add.s32 %r3809, %r3761, %r3031; + add.s32 %r3810, %r3809, %r3752; + xor.b32 %r3811, %r3810, %r3777; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 16; + add.s32 %r3813, %r3812, %r3792; + xor.b32 %r3814, %r3813, %r3752; + shf.l.wrap.b32 %r3815, %r3814, %r3814, 20; + add.s32 %r3816, %r3810, %r2991; + add.s32 %r3817, %r3816, %r3815; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 24; + add.s32 %r3820, %r3819, %r3813; + xor.b32 %r3821, %r3820, %r3815; + shf.l.wrap.b32 %r3822, %r3821, %r3821, 25; + add.s32 %r3823, %r3775, %r2999; + add.s32 %r3824, %r3823, %r3766; + xor.b32 %r3825, %r3791, %r3824; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 16; + add.s32 %r3827, %r3826, %r3750; + xor.b32 %r3828, %r3827, %r3766; + shf.l.wrap.b32 %r3829, %r3828, %r3828, 20; + add.s32 %r3830, %r3824, %r3063; + add.s32 %r3831, %r3830, %r3829; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 24; + add.s32 %r3834, %r3833, %r3827; + xor.b32 %r3835, %r3834, %r3829; + shf.l.wrap.b32 %r3836, %r3835, %r3835, 25; + add.s32 %r3837, %r3780, %r3055; + add.s32 %r3838, %r3837, %r3789; + xor.b32 %r3839, %r3838, %r3749; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 16; + add.s32 %r3841, %r3840, %r3764; + xor.b32 %r3842, %r3841, %r3780; + shf.l.wrap.b32 %r3843, %r3842, %r3842, 20; 
+ add.s32 %r3844, %r3838, %r3039; + add.s32 %r3845, %r3844, %r3843; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 24; + add.s32 %r3848, %r3847, %r3841; + xor.b32 %r3849, %r3848, %r3843; + shf.l.wrap.b32 %r3850, %r3849, %r3849, 25; + add.s32 %r3851, %r3822, %r3103; + add.s32 %r3852, %r3851, %r3803; + xor.b32 %r3853, %r3852, %r3847; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 16; + add.s32 %r3855, %r3854, %r3834; + xor.b32 %r3856, %r3855, %r3822; + shf.l.wrap.b32 %r3857, %r3856, %r3856, 20; + add.s32 %r3858, %r3852, %r3071; + add.s32 %r3859, %r3858, %r3857; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 24; + add.s32 %r3862, %r3861, %r3855; + xor.b32 %r3863, %r3862, %r3857; + shf.l.wrap.b32 %r3864, %r3863, %r3863, 25; + add.s32 %r3865, %r3817, %r3007; + add.s32 %r3866, %r3865, %r3836; + xor.b32 %r3867, %r3805, %r3866; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 16; + add.s32 %r3869, %r3868, %r3848; + xor.b32 %r3870, %r3869, %r3836; + shf.l.wrap.b32 %r3871, %r3870, %r3870, 20; + add.s32 %r3872, %r3866, %r3087; + add.s32 %r3873, %r3872, %r3871; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 24; + add.s32 %r3876, %r3875, %r3869; + xor.b32 %r3877, %r3876, %r3871; + shf.l.wrap.b32 %r3878, %r3877, %r3877, 25; + add.s32 %r3879, %r3831, %r3015; + add.s32 %r3880, %r3879, %r3850; + xor.b32 %r3881, %r3880, %r3819; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 16; + add.s32 %r3883, %r3882, %r3806; + xor.b32 %r3884, %r3883, %r3850; + shf.l.wrap.b32 %r3885, %r3884, %r3884, 20; + add.s32 %r3886, %r3880, %r3023; + add.s32 %r3887, %r3886, %r3885; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 24; + add.s32 %r3890, %r3889, %r3883; + xor.b32 %r3891, %r3890, %r3885; + shf.l.wrap.b32 %r3892, %r3891, %r3891, 25; + add.s32 %r3893, %r3845, %r3047; + add.s32 %r3894, %r3893, %r3808; + xor.b32 %r3895, %r3894, %r3833; + shf.l.wrap.b32 %r3896, %r3895, %r3895, 16; + add.s32 %r3897, %r3896, %r3820; + xor.b32 %r3898, %r3897, %r3808; + shf.l.wrap.b32 %r3899, %r3898, %r3898, 20; + add.s32 %r3900, %r3894, %r3095; + add.s32 %r3901, %r3900, %r3899; + xor.b32 %r3902, %r3901, %r3896; + shf.l.wrap.b32 %r3903, %r3902, %r3902, 24; + add.s32 %r3904, %r3903, %r3897; + xor.b32 %r3905, %r3904, %r3899; + shf.l.wrap.b32 %r3906, %r3905, %r3905, 25; + xor.b32 %r3907, %r3890, %r3859; + xor.b32 %r3908, %r3904, %r3873; + xor.b32 %r3909, %r3862, %r3887; + xor.b32 %r3910, %r3901, %r3876; + xor.b32 %r3911, %r3906, %r3875; + xor.b32 %r3912, %r3864, %r3889; + xor.b32 %r3913, %r3903, %r3878; + xor.b32 %r3914, %r3892, %r3861; + st.local.u8 [%rd134], %r3907; + shr.u32 %r3915, %r3907, 8; + st.local.u8 [%rd134+1], %r3915; + shr.u32 %r3916, %r3907, 16; + st.local.u8 [%rd134+2], %r3916; + shr.u32 %r3917, %r3907, 24; + st.local.u8 [%rd134+3], %r3917; + st.local.u8 [%rd134+4], %r3908; + shr.u32 %r3918, %r3908, 8; + st.local.u8 [%rd134+5], %r3918; + shr.u32 %r3919, %r3908, 16; + st.local.u8 [%rd134+6], %r3919; + shr.u32 %r3920, %r3908, 24; + st.local.u8 [%rd134+7], %r3920; + st.local.u8 [%rd134+8], %r3909; + shr.u32 %r3921, %r3909, 8; + st.local.u8 [%rd134+9], %r3921; + shr.u32 %r3922, %r3909, 16; + st.local.u8 [%rd134+10], %r3922; + shr.u32 %r3923, %r3909, 24; + st.local.u8 [%rd134+11], %r3923; + st.local.u8 [%rd134+12], %r3910; + shr.u32 %r3924, %r3910, 8; + st.local.u8 [%rd134+13], %r3924; + shr.u32 %r3925, %r3910, 16; + st.local.u8 [%rd134+14], %r3925; + shr.u32 %r3926, %r3910, 24; + st.local.u8 [%rd134+15], %r3926; + st.local.u8 [%rd134+16], %r3911; + 
shr.u32 %r3927, %r3911, 8; + st.local.u8 [%rd134+17], %r3927; + shr.u32 %r3928, %r3911, 16; + st.local.u8 [%rd134+18], %r3928; + shr.u32 %r3929, %r3911, 24; + st.local.u8 [%rd134+19], %r3929; + st.local.u8 [%rd134+20], %r3912; + shr.u32 %r3930, %r3912, 8; + st.local.u8 [%rd134+21], %r3930; + shr.u32 %r3931, %r3912, 16; + st.local.u8 [%rd134+22], %r3931; + shr.u32 %r3932, %r3912, 24; + st.local.u8 [%rd134+23], %r3932; + st.local.u8 [%rd134+24], %r3913; + shr.u32 %r3933, %r3913, 8; + st.local.u8 [%rd134+25], %r3933; + shr.u32 %r3934, %r3913, 16; + st.local.u8 [%rd134+26], %r3934; + shr.u32 %r3935, %r3913, 24; + st.local.u8 [%rd134+27], %r3935; + st.local.u8 [%rd134+28], %r3914; + shr.u32 %r3936, %r3914, 8; + st.local.u8 [%rd134+29], %r3936; + shr.u32 %r3937, %r3914, 16; + st.local.u8 [%rd134+30], %r3937; + shr.u32 %r3938, %r3914, 24; + st.local.u8 [%rd134+31], %r3938; + add.s64 %rd151, %rd151, 1; + bra.uni $L__BB0_30; + +$L__BB0_1: + add.s64 %rd76, %rd171, -1; + shr.u64 %rd77, %rd76, 10; + or.b64 %rd78, %rd77, 1; + setp.gt.u64 %p2, %rd78, 4294967295; + shr.u64 %rd79, %rd76, 42; + selp.b64 %rd80, %rd79, %rd78, %p2; + selp.b32 %r62, 32, 0, %p2; + and.b64 %rd81, %rd80, 4294901760; + setp.ne.s64 %p3, %rd81, 0; + shr.u64 %rd82, %rd80, 16; + or.b32 %r63, %r62, 16; + selp.b64 %rd83, %rd82, %rd80, %p3; + selp.b32 %r64, %r63, %r62, %p3; + and.b64 %rd84, %rd83, 65280; + setp.ne.s64 %p4, %rd84, 0; + shr.u64 %rd85, %rd83, 8; + or.b32 %r65, %r64, 8; + selp.b64 %rd86, %rd85, %rd83, %p4; + selp.b32 %r66, %r65, %r64, %p4; + and.b64 %rd87, %rd86, 240; + setp.ne.s64 %p5, %rd87, 0; + shr.u64 %rd88, %rd86, 4; + or.b32 %r67, %r66, 4; + selp.b64 %rd89, %rd88, %rd86, %p5; + selp.b32 %r68, %r67, %r66, %p5; + and.b64 %rd90, %rd89, 12; + setp.ne.s64 %p6, %rd90, 0; + shr.u64 %rd91, %rd89, 2; + add.s32 %r69, %r68, 2; + selp.b64 %rd92, %rd91, %rd89, %p6; + selp.b32 %r70, %r69, %r68, %p6; + and.b64 %rd93, %rd92, 2; + shr.u64 %rd94, %rd93, 1; + cvt.u32.u64 %r71, %rd94; + add.s32 %r72, %r70, %r71; + mov.u64 %rd95, 1024; + shl.b64 %rd96, %rd95, %r72; + sub.s64 %rd97, %rd171, %rd96; + add.s64 %rd98, %rd69, %rd96; + shr.u64 %rd99, %rd96, 10; + add.s64 %rd100, %rd99, %rd165; + setp.gt.u64 %p7, %rd96, 1024; + selp.b64 %rd101, 64, 32, %p7; + add.s64 %rd103, %rd149, %rd101; + cvt.u32.u16 %r73, %rs75; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd69; + .param .b64 param1; + st.param.b64 [param1+0], %rd96; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd165; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd149; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd5, [retval0+0]; + } // callseq 0 + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd98; + .param .b64 param1; + st.param.b64 [param1+0], %rd97; + .param .b64 param2; + st.param.b64 [param2+0], %rd71; + .param .b64 param3; + st.param.b64 [param3+0], %rd100; + .param .b32 param4; + st.param.b32 [param4+0], %r73; + .param .b64 param5; + st.param.b64 [param5+0], %rd103; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 
%rd6, [retval0+0]; + } // callseq 1 + setp.eq.s64 %p8, %rd5, 1; + @%p8 bra $L__BB0_12; + bra.uni $L__BB0_2; + +$L__BB0_12: + mov.u64 %rd158, 0; + +$L__BB0_13: + add.s64 %rd117, %rd4, %rd158; + ld.local.u8 %rs78, [%rd117]; + add.s64 %rd118, %rd155, %rd158; + st.local.u8 [%rd118], %rs78; + add.s64 %rd158, %rd158, 1; + setp.lt.u64 %p15, %rd158, 64; + mov.u64 %rd151, 2; + @%p15 bra $L__BB0_13; + bra.uni $L__BB0_30; + +$L__BB0_2: + add.s64 %rd7, %rd6, %rd5; + setp.lt.u64 %p9, %rd7, 2; + mov.u64 %rd151, 0; + mov.u64 %rd152, %rd151; + @%p9 bra $L__BB0_5; + + mov.u64 %rd146, %rd153; + mov.u64 %rd147, %rd7; + +$L__BB0_4: + st.local.u64 [%rd146], %rd149; + add.s64 %rd151, %rd151, 1; + add.s64 %rd149, %rd149, 64; + add.s64 %rd152, %rd152, 2; + add.s64 %rd146, %rd146, 8; + add.s64 %rd147, %rd147, -2; + setp.gt.u64 %p10, %rd147, 1; + @%p10 bra $L__BB0_4; $L__BB0_5: - ld.global.u64 %rd104, [%rd1]; - xor.b64 %rd463, %rd104, %rd4; - -$L__BB0_6: - and.b64 %rd105, %rd463, %rd78; - or.b64 %rd8, %rd105, %rd79; - mov.b64 {%r26, %r27}, %rd8; - mov.u64 %rd106, 0; - ld.const.u64 %rd107, [hash_header]; - cvt.u32.u64 %r28, %rd107; - shr.u64 %rd108, %rd107, 8; - cvt.u32.u64 %r29, %rd108; - shr.u64 %rd109, %rd107, 16; - cvt.u32.u64 %r30, %rd109; - shr.u64 %rd110, %rd107, 32; - cvt.u32.u64 %r31, %rd110; - shr.u64 %rd111, %rd107, 40; - cvt.u32.u64 %r32, %rd111; - shr.u64 %rd112, %rd107, 48; - cvt.u32.u64 %r33, %rd112; - ld.const.u64 %rd113, [hash_header+8]; - cvt.u32.u64 %r34, %rd113; - shr.u64 %rd114, %rd113, 8; - cvt.u32.u64 %r35, %rd114; - shr.u64 %rd115, %rd113, 16; - cvt.u32.u64 %r36, %rd115; - shr.u64 %rd116, %rd113, 32; - cvt.u32.u64 %r37, %rd116; - shr.u64 %rd117, %rd113, 40; - cvt.u32.u64 %r38, %rd117; - shr.u64 %rd118, %rd113, 48; - cvt.u32.u64 %r39, %rd118; - ld.const.u64 %rd119, [hash_header+16]; - cvt.u32.u64 %r40, %rd119; - shr.u64 %rd120, %rd119, 8; - cvt.u32.u64 %r41, %rd120; - shr.u64 %rd121, %rd119, 16; - cvt.u32.u64 %r42, %rd121; - shr.u64 %rd122, %rd119, 32; - cvt.u32.u64 %r43, %rd122; - shr.u64 %rd123, %rd119, 40; - cvt.u32.u64 %r44, %rd123; - shr.u64 %rd124, %rd119, 48; - cvt.u32.u64 %r45, %rd124; - ld.const.u64 %rd125, [hash_header+24]; - cvt.u32.u64 %r46, %rd125; - shr.u64 %rd126, %rd125, 8; - cvt.u32.u64 %r47, %rd126; - shr.u64 %rd127, %rd125, 16; - cvt.u32.u64 %r48, %rd127; - shr.u64 %rd128, %rd125, 32; - cvt.u32.u64 %r49, %rd128; - shr.u64 %rd129, %rd125, 40; - cvt.u32.u64 %r50, %rd129; - shr.u64 %rd130, %rd125, 48; - cvt.u32.u64 %r51, %rd130; - ld.const.v4.u16 {%rs12, %rs13, %rs14, %rs15}, [hash_header+32]; - shr.u16 %rs17, %rs12, 8; - shr.u16 %rs19, %rs13, 8; - shr.u16 %rs21, %rs14, 8; - shr.u16 %rs23, %rs15, 8; - ld.const.v4.u16 {%rs24, %rs25, %rs26, %rs27}, [hash_header+40]; - shr.u16 %rs29, %rs24, 8; - shr.u16 %rs31, %rs25, 8; - shr.u16 %rs33, %rs26, 8; - shr.u16 %rs35, %rs27, 8; - ld.const.v4.u16 {%rs36, %rs37, %rs38, %rs39}, [hash_header+48]; - shr.u16 %rs41, %rs36, 8; - shr.u16 %rs43, %rs37, 8; - shr.u16 %rs45, %rs38, 8; - shr.u16 %rs47, %rs39, 8; - ld.const.v4.u16 {%rs48, %rs49, %rs50, %rs51}, [hash_header+56]; - shr.u16 %rs53, %rs48, 8; - shr.u16 %rs55, %rs49, 8; - shr.u16 %rs57, %rs50, 8; - shr.u16 %rs59, %rs51, 8; - ld.const.u64 %rd131, [hash_header+64]; - mov.b64 {%r52, %r53}, %rd131; - mov.u32 %r54, -1150833019; - mov.u32 %r55, 1779033703; - st.local.v2.u32 [%rd3], {%r55, %r54}; - mov.u32 %r56, -1521486534; - mov.u32 %r57, 1013904242; - st.local.v2.u32 [%rd3+8], {%r57, %r56}; - mov.u32 %r58, -1694144372; - mov.u32 %r59, 1359893119; - st.local.v2.u32 [%rd3+16], {%r59, %r58}; - 
mov.u32 %r60, 1541459225; - mov.u32 %r61, 528734635; - st.local.v2.u32 [%rd3+24], {%r61, %r60}; - st.local.u64 [%rd3+64], %rd106; - mov.u32 %r62, 0; - st.local.v2.u32 [%rd3+88], {%r62, %r62}; - st.local.v2.u32 [%rd3+96], {%r62, %r62}; - st.local.v2.u32 [%rd3+104], {%r62, %r62}; - st.local.v2.u32 [%rd3+112], {%r62, %r62}; - st.local.v2.u32 [%rd3+120], {%r62, %r62}; - st.local.v2.u32 [%rd3+128], {%r62, %r62}; - mov.u16 %rs60, 0; - st.local.v2.u8 [%rd3+136], {%rs60, %rs60}; - st.local.u8 [%rd3+138], %rs60; - st.local.v2.u32 [%rd3+32], {%r55, %r54}; - st.local.v2.u32 [%rd3+40], {%r57, %r56}; - st.local.v2.u32 [%rd3+48], {%r59, %r58}; - st.local.v2.u32 [%rd3+56], {%r61, %r60}; - st.local.v2.u32 [%rd3+72], {%r62, %r62}; - st.local.v2.u32 [%rd3+80], {%r62, %r62}; - st.local.u8 [%rd3+144], %rs60; - ld.local.v4.u8 {%rs61, %rs62, %rs63, %rs64}, [%rd3+136]; - setp.eq.s16 %p9, %rs62, 0; - selp.u16 %rs68, 1, 0, %p9; - or.b16 %rs69, %rs63, %rs68; - shr.u32 %r63, %r28, 24; - mov.u32 %r64, 64; - prmt.b32 %r65, %r28, %r29, %r64; - mov.u32 %r66, 1040; - prmt.b32 %r67, %r65, %r30, %r66; - mov.u32 %r68, 16912; - prmt.b32 %r69, %r67, %r63, %r68; - and.b32 %r70, %r31, 255; - and.b32 %r71, %r32, 255; - prmt.b32 %r72, %r71, %r70, 30212; - shl.b32 %r73, %r33, 16; - and.b32 %r74, %r73, 16711680; - or.b32 %r75, %r72, %r74; - and.b32 %r76, %r31, -16777216; - or.b32 %r77, %r75, %r76; - shr.u32 %r78, %r34, 24; - prmt.b32 %r79, %r34, %r35, %r64; - prmt.b32 %r80, %r79, %r36, %r66; - prmt.b32 %r81, %r80, %r78, %r68; - and.b32 %r82, %r37, 255; - and.b32 %r83, %r38, 255; - prmt.b32 %r84, %r83, %r82, 30212; - shl.b32 %r85, %r39, 16; - and.b32 %r86, %r85, 16711680; - or.b32 %r87, %r84, %r86; - and.b32 %r88, %r37, -16777216; - or.b32 %r89, %r87, %r88; - shr.u32 %r90, %r40, 24; - prmt.b32 %r91, %r40, %r41, %r64; - prmt.b32 %r92, %r91, %r42, %r66; - prmt.b32 %r93, %r92, %r90, %r68; - and.b32 %r94, %r43, 255; - and.b32 %r95, %r44, 255; - prmt.b32 %r96, %r95, %r94, 30212; - shl.b32 %r97, %r45, 16; - and.b32 %r98, %r97, 16711680; - or.b32 %r99, %r96, %r98; - and.b32 %r100, %r43, -16777216; - or.b32 %r101, %r99, %r100; - shr.u32 %r102, %r46, 24; - prmt.b32 %r103, %r46, %r47, %r64; - prmt.b32 %r104, %r103, %r48, %r66; - prmt.b32 %r105, %r104, %r102, %r68; - and.b32 %r106, %r49, 255; - and.b32 %r107, %r50, 255; - prmt.b32 %r108, %r107, %r106, 30212; - shl.b32 %r109, %r51, 16; - and.b32 %r110, %r109, 16711680; - or.b32 %r111, %r108, %r110; - and.b32 %r112, %r49, -16777216; - or.b32 %r113, %r111, %r112; - cvt.u32.u16 %r114, %rs12; - and.b32 %r115, %r114, 255; - cvt.u32.u16 %r116, %rs17; - prmt.b32 %r117, %r116, %r115, 30212; - cvt.u32.u16 %r118, %rs13; - prmt.b32 %r119, %r118, %r117, 28756; - cvt.u32.u16 %r120, %rs19; - prmt.b32 %r121, %r120, %r119, 1620; - cvt.u32.u16 %r122, %rs14; - and.b32 %r123, %r122, 255; - cvt.u32.u16 %r124, %rs21; + setp.eq.s64 %p11, %rd151, 0; + @%p11 bra $L__BB0_8; + + or.b16 %rs76, %rs75, 4; + cvt.u32.u16 %r1, %rs76; + mov.u64 %rd154, %rd151; + +$L__BB0_7: + ld.local.u64 %rd109, [%rd153]; + ld.u8 %r74, [%rd109]; + ld.u8 %r75, [%rd109+1]; + prmt.b32 %r76, %r75, %r74, 30212; + ld.u8 %r77, [%rd109+2]; + prmt.b32 %r78, %r77, %r76, 28756; + ld.u8 %r79, [%rd109+3]; + prmt.b32 %r80, %r79, %r78, 1620; + ld.u8 %r81, [%rd109+4]; + ld.u8 %r82, [%rd109+5]; + prmt.b32 %r83, %r82, %r81, 30212; + ld.u8 %r84, [%rd109+6]; + prmt.b32 %r85, %r84, %r83, 28756; + ld.u8 %r86, [%rd109+7]; + prmt.b32 %r87, %r86, %r85, 1620; + ld.u8 %r88, [%rd109+8]; + ld.u8 %r89, [%rd109+9]; + prmt.b32 %r90, %r89, %r88, 30212; + ld.u8 %r91, 
[%rd109+10]; + prmt.b32 %r92, %r91, %r90, 28756; + ld.u8 %r93, [%rd109+11]; + prmt.b32 %r94, %r93, %r92, 1620; + ld.u8 %r95, [%rd109+12]; + ld.u8 %r96, [%rd109+13]; + prmt.b32 %r97, %r96, %r95, 30212; + ld.u8 %r98, [%rd109+14]; + prmt.b32 %r99, %r98, %r97, 28756; + ld.u8 %r100, [%rd109+15]; + prmt.b32 %r101, %r100, %r99, 1620; + ld.u8 %r102, [%rd109+16]; + ld.u8 %r103, [%rd109+17]; + prmt.b32 %r104, %r103, %r102, 30212; + ld.u8 %r105, [%rd109+18]; + prmt.b32 %r106, %r105, %r104, 28756; + ld.u8 %r107, [%rd109+19]; + prmt.b32 %r108, %r107, %r106, 1620; + ld.u8 %r109, [%rd109+20]; + ld.u8 %r110, [%rd109+21]; + prmt.b32 %r111, %r110, %r109, 30212; + ld.u8 %r112, [%rd109+22]; + prmt.b32 %r113, %r112, %r111, 28756; + ld.u8 %r114, [%rd109+23]; + prmt.b32 %r115, %r114, %r113, 1620; + ld.u8 %r116, [%rd109+24]; + ld.u8 %r117, [%rd109+25]; + prmt.b32 %r118, %r117, %r116, 30212; + ld.u8 %r119, [%rd109+26]; + prmt.b32 %r120, %r119, %r118, 28756; + ld.u8 %r121, [%rd109+27]; + prmt.b32 %r122, %r121, %r120, 1620; + ld.u8 %r123, [%rd109+28]; + ld.u8 %r124, [%rd109+29]; prmt.b32 %r125, %r124, %r123, 30212; - cvt.u32.u16 %r126, %rs15; + ld.u8 %r126, [%rd109+30]; prmt.b32 %r127, %r126, %r125, 28756; - cvt.u32.u16 %r128, %rs23; + ld.u8 %r128, [%rd109+31]; prmt.b32 %r129, %r128, %r127, 1620; - cvt.u32.u16 %r130, %rs24; - and.b32 %r131, %r130, 255; - cvt.u32.u16 %r132, %rs29; - prmt.b32 %r133, %r132, %r131, 30212; - cvt.u32.u16 %r134, %rs25; - prmt.b32 %r135, %r134, %r133, 28756; - cvt.u32.u16 %r136, %rs31; - prmt.b32 %r137, %r136, %r135, 1620; - cvt.u32.u16 %r138, %rs26; - and.b32 %r139, %r138, 255; - cvt.u32.u16 %r140, %rs33; - prmt.b32 %r141, %r140, %r139, 30212; - cvt.u32.u16 %r142, %rs27; - prmt.b32 %r143, %r142, %r141, 28756; - cvt.u32.u16 %r144, %rs35; - prmt.b32 %r145, %r144, %r143, 1620; - cvt.u32.u16 %r146, %rs36; - and.b32 %r147, %r146, 255; - cvt.u32.u16 %r148, %rs41; - prmt.b32 %r149, %r148, %r147, 30212; - cvt.u32.u16 %r150, %rs37; - prmt.b32 %r151, %r150, %r149, 28756; - cvt.u32.u16 %r152, %rs43; - prmt.b32 %r153, %r152, %r151, 1620; - cvt.u32.u16 %r154, %rs38; - and.b32 %r155, %r154, 255; - cvt.u32.u16 %r156, %rs45; + ld.u8 %r130, [%rd109+32]; + ld.u8 %r131, [%rd109+33]; + prmt.b32 %r132, %r131, %r130, 30212; + ld.u8 %r133, [%rd109+34]; + prmt.b32 %r134, %r133, %r132, 28756; + ld.u8 %r135, [%rd109+35]; + prmt.b32 %r136, %r135, %r134, 1620; + ld.u8 %r137, [%rd109+36]; + ld.u8 %r138, [%rd109+37]; + prmt.b32 %r139, %r138, %r137, 30212; + ld.u8 %r140, [%rd109+38]; + prmt.b32 %r141, %r140, %r139, 28756; + ld.u8 %r142, [%rd109+39]; + prmt.b32 %r143, %r142, %r141, 1620; + ld.u8 %r144, [%rd109+40]; + ld.u8 %r145, [%rd109+41]; + prmt.b32 %r146, %r145, %r144, 30212; + ld.u8 %r147, [%rd109+42]; + prmt.b32 %r148, %r147, %r146, 28756; + ld.u8 %r149, [%rd109+43]; + prmt.b32 %r150, %r149, %r148, 1620; + ld.u8 %r151, [%rd109+44]; + ld.u8 %r152, [%rd109+45]; + prmt.b32 %r153, %r152, %r151, 30212; + ld.u8 %r154, [%rd109+46]; + prmt.b32 %r155, %r154, %r153, 28756; + ld.u8 %r156, [%rd109+47]; + prmt.b32 %r157, %r156, %r155, 1620; + ld.u8 %r158, [%rd109+48]; + ld.u8 %r159, [%rd109+49]; + prmt.b32 %r160, %r159, %r158, 30212; + ld.u8 %r161, [%rd109+50]; + prmt.b32 %r162, %r161, %r160, 28756; + ld.u8 %r163, [%rd109+51]; + prmt.b32 %r164, %r163, %r162, 1620; + ld.u8 %r165, [%rd109+52]; + ld.u8 %r166, [%rd109+53]; + prmt.b32 %r167, %r166, %r165, 30212; + ld.u8 %r168, [%rd109+54]; + prmt.b32 %r169, %r168, %r167, 28756; + ld.u8 %r170, [%rd109+55]; + prmt.b32 %r171, %r170, %r169, 1620; + ld.u8 %r172, [%rd109+56]; + ld.u8 
%r173, [%rd109+57]; + prmt.b32 %r174, %r173, %r172, 30212; + ld.u8 %r175, [%rd109+58]; + prmt.b32 %r176, %r175, %r174, 28756; + ld.u8 %r177, [%rd109+59]; + prmt.b32 %r178, %r177, %r176, 1620; + ld.u8 %r179, [%rd109+60]; + ld.u8 %r180, [%rd109+61]; + prmt.b32 %r181, %r180, %r179, 30212; + ld.u8 %r182, [%rd109+62]; + prmt.b32 %r183, %r182, %r181, 28756; + ld.u8 %r184, [%rd109+63]; + prmt.b32 %r185, %r184, %r183, 1620; + ld.local.u8 %r186, [%rd2+16]; + ld.local.u8 %r187, [%rd2+17]; + prmt.b32 %r188, %r187, %r186, 30212; + ld.local.u8 %r189, [%rd2+18]; + ld.local.u8 %r190, [%rd2+19]; + prmt.b32 %r191, %r190, %r189, 30212; + prmt.b32 %r192, %r191, %r188, 4180; + ld.local.u8 %r193, [%rd2]; + ld.local.u8 %r194, [%rd2+1]; + prmt.b32 %r195, %r194, %r193, 30212; + ld.local.u8 %r196, [%rd2+2]; + ld.local.u8 %r197, [%rd2+3]; + prmt.b32 %r198, %r197, %r196, 30212; + prmt.b32 %r199, %r198, %r195, 4180; + add.s32 %r200, %r192, %r199; + add.s32 %r201, %r200, %r80; + shf.l.wrap.b32 %r202, %r201, %r201, 16; + add.s32 %r203, %r202, 1779033703; + xor.b32 %r204, %r203, %r192; + shf.l.wrap.b32 %r205, %r204, %r204, 20; + add.s32 %r206, %r87, %r201; + add.s32 %r207, %r206, %r205; + xor.b32 %r208, %r207, %r202; + shf.l.wrap.b32 %r209, %r208, %r208, 24; + add.s32 %r210, %r209, %r203; + xor.b32 %r211, %r210, %r205; + shf.l.wrap.b32 %r212, %r211, %r211, 25; + ld.local.u8 %r213, [%rd2+20]; + ld.local.u8 %r214, [%rd2+21]; + prmt.b32 %r215, %r214, %r213, 30212; + ld.local.u8 %r216, [%rd2+22]; + ld.local.u8 %r217, [%rd2+23]; + prmt.b32 %r218, %r217, %r216, 30212; + prmt.b32 %r219, %r218, %r215, 4180; + ld.local.u8 %r220, [%rd2+4]; + ld.local.u8 %r221, [%rd2+5]; + prmt.b32 %r222, %r221, %r220, 30212; + ld.local.u8 %r223, [%rd2+6]; + ld.local.u8 %r224, [%rd2+7]; + prmt.b32 %r225, %r224, %r223, 30212; + prmt.b32 %r226, %r225, %r222, 4180; + add.s32 %r227, %r219, %r226; + add.s32 %r228, %r227, %r94; + shf.l.wrap.b32 %r229, %r228, %r228, 16; + add.s32 %r230, %r229, -1150833019; + xor.b32 %r231, %r230, %r219; + shf.l.wrap.b32 %r232, %r231, %r231, 20; + add.s32 %r233, %r101, %r228; + add.s32 %r234, %r233, %r232; + xor.b32 %r235, %r234, %r229; + shf.l.wrap.b32 %r236, %r235, %r235, 24; + add.s32 %r237, %r236, %r230; + xor.b32 %r238, %r237, %r232; + shf.l.wrap.b32 %r239, %r238, %r238, 25; + ld.local.u8 %r240, [%rd2+24]; + ld.local.u8 %r241, [%rd2+25]; + prmt.b32 %r242, %r241, %r240, 30212; + ld.local.u8 %r243, [%rd2+26]; + ld.local.u8 %r244, [%rd2+27]; + prmt.b32 %r245, %r244, %r243, 30212; + prmt.b32 %r246, %r245, %r242, 4180; + ld.local.u8 %r247, [%rd2+8]; + ld.local.u8 %r248, [%rd2+9]; + prmt.b32 %r249, %r248, %r247, 30212; + ld.local.u8 %r250, [%rd2+10]; + ld.local.u8 %r251, [%rd2+11]; + prmt.b32 %r252, %r251, %r250, 30212; + prmt.b32 %r253, %r252, %r249, 4180; + add.s32 %r254, %r246, %r253; + add.s32 %r255, %r254, %r108; + shr.u32 %r256, %r255, 16; + shl.b32 %r257, %r255, 16; + xor.b32 %r258, %r257, 4194304; + or.b32 %r259, %r258, %r256; + add.s32 %r260, %r259, 1013904242; + xor.b32 %r261, %r260, %r246; + shf.l.wrap.b32 %r262, %r261, %r261, 20; + add.s32 %r263, %r115, %r255; + add.s32 %r264, %r263, %r262; + xor.b32 %r265, %r264, %r259; + shf.l.wrap.b32 %r266, %r265, %r265, 24; + add.s32 %r267, %r266, %r260; + xor.b32 %r268, %r267, %r262; + shf.l.wrap.b32 %r269, %r268, %r268, 25; + ld.local.u8 %r270, [%rd2+28]; + ld.local.u8 %r271, [%rd2+29]; + prmt.b32 %r272, %r271, %r270, 30212; + ld.local.u8 %r273, [%rd2+30]; + ld.local.u8 %r274, [%rd2+31]; + prmt.b32 %r275, %r274, %r273, 30212; + prmt.b32 %r276, %r275, %r272, 4180; + 
ld.local.u8 %r277, [%rd2+12]; + ld.local.u8 %r278, [%rd2+13]; + prmt.b32 %r279, %r278, %r277, 30212; + ld.local.u8 %r280, [%rd2+14]; + ld.local.u8 %r281, [%rd2+15]; + prmt.b32 %r282, %r281, %r280, 30212; + prmt.b32 %r283, %r282, %r279, 4180; + add.s32 %r284, %r276, %r283; + add.s32 %r285, %r284, %r122; + xor.b32 %r286, %r285, %r1; + shr.u32 %r287, %r285, 16; + shl.b32 %r288, %r286, 16; + or.b32 %r289, %r288, %r287; + add.s32 %r290, %r289, -1521486534; + xor.b32 %r291, %r290, %r276; + shf.l.wrap.b32 %r292, %r291, %r291, 20; + add.s32 %r293, %r129, %r285; + add.s32 %r294, %r293, %r292; + xor.b32 %r295, %r294, %r289; + shf.l.wrap.b32 %r296, %r295, %r295, 24; + add.s32 %r297, %r296, %r290; + xor.b32 %r298, %r297, %r292; + shf.l.wrap.b32 %r299, %r298, %r298, 25; + add.s32 %r300, %r239, %r207; + add.s32 %r301, %r300, %r136; + xor.b32 %r302, %r296, %r301; + shf.l.wrap.b32 %r303, %r302, %r302, 16; + add.s32 %r304, %r303, %r267; + xor.b32 %r305, %r304, %r239; + shf.l.wrap.b32 %r306, %r305, %r305, 20; + add.s32 %r307, %r143, %r301; + add.s32 %r308, %r307, %r306; + xor.b32 %r309, %r308, %r303; + shf.l.wrap.b32 %r310, %r309, %r309, 24; + add.s32 %r311, %r310, %r304; + xor.b32 %r312, %r311, %r306; + shf.l.wrap.b32 %r313, %r312, %r312, 25; + add.s32 %r314, %r269, %r234; + add.s32 %r315, %r314, %r150; + xor.b32 %r316, %r315, %r209; + shf.l.wrap.b32 %r317, %r316, %r316, 16; + add.s32 %r318, %r317, %r297; + xor.b32 %r319, %r318, %r269; + shf.l.wrap.b32 %r320, %r319, %r319, 20; + add.s32 %r321, %r157, %r315; + add.s32 %r322, %r321, %r320; + xor.b32 %r323, %r322, %r317; + shf.l.wrap.b32 %r324, %r323, %r323, 24; + add.s32 %r325, %r324, %r318; + xor.b32 %r326, %r325, %r320; + shf.l.wrap.b32 %r327, %r326, %r326, 25; + add.s32 %r328, %r299, %r264; + add.s32 %r329, %r328, %r164; + xor.b32 %r330, %r329, %r236; + shf.l.wrap.b32 %r331, %r330, %r330, 16; + add.s32 %r332, %r331, %r210; + xor.b32 %r333, %r332, %r299; + shf.l.wrap.b32 %r334, %r333, %r333, 20; + add.s32 %r335, %r171, %r329; + add.s32 %r336, %r335, %r334; + xor.b32 %r337, %r336, %r331; + shf.l.wrap.b32 %r338, %r337, %r337, 24; + add.s32 %r339, %r338, %r332; + xor.b32 %r340, %r339, %r334; + shf.l.wrap.b32 %r341, %r340, %r340, 25; + add.s32 %r342, %r294, %r212; + add.s32 %r343, %r342, %r178; + xor.b32 %r344, %r343, %r266; + shf.l.wrap.b32 %r345, %r344, %r344, 16; + add.s32 %r346, %r345, %r237; + xor.b32 %r347, %r346, %r212; + shf.l.wrap.b32 %r348, %r347, %r347, 20; + add.s32 %r349, %r185, %r343; + add.s32 %r350, %r349, %r348; + xor.b32 %r351, %r350, %r345; + shf.l.wrap.b32 %r352, %r351, %r351, 24; + add.s32 %r353, %r352, %r346; + xor.b32 %r354, %r353, %r348; + shf.l.wrap.b32 %r355, %r354, %r354, 25; + add.s32 %r356, %r308, %r94; + add.s32 %r357, %r356, %r355; + xor.b32 %r358, %r357, %r324; + shf.l.wrap.b32 %r359, %r358, %r358, 16; + add.s32 %r360, %r359, %r339; + xor.b32 %r361, %r360, %r355; + shf.l.wrap.b32 %r362, %r361, %r361, 20; + add.s32 %r363, %r357, %r122; + add.s32 %r364, %r363, %r362; + xor.b32 %r365, %r364, %r359; + shf.l.wrap.b32 %r366, %r365, %r365, 24; + add.s32 %r367, %r366, %r360; + xor.b32 %r368, %r367, %r362; + shf.l.wrap.b32 %r369, %r368, %r368, 25; + add.s32 %r370, %r322, %r101; + add.s32 %r371, %r370, %r313; + xor.b32 %r372, %r338, %r371; + shf.l.wrap.b32 %r373, %r372, %r372, 16; + add.s32 %r374, %r353, %r373; + xor.b32 %r375, %r374, %r313; + shf.l.wrap.b32 %r376, %r375, %r375, 20; + add.s32 %r377, %r371, %r150; + add.s32 %r378, %r377, %r376; + xor.b32 %r379, %r378, %r373; + shf.l.wrap.b32 %r380, %r379, %r379, 24; + add.s32 %r381, %r380, 
%r374; + xor.b32 %r382, %r381, %r376; + shf.l.wrap.b32 %r383, %r382, %r382, 25; + add.s32 %r384, %r327, %r129; + add.s32 %r385, %r384, %r336; + xor.b32 %r386, %r352, %r385; + shf.l.wrap.b32 %r387, %r386, %r386, 16; + add.s32 %r388, %r387, %r311; + xor.b32 %r389, %r388, %r327; + shf.l.wrap.b32 %r390, %r389, %r389, 20; + add.s32 %r391, %r385, %r80; + add.s32 %r392, %r391, %r390; + xor.b32 %r393, %r392, %r387; + shf.l.wrap.b32 %r394, %r393, %r393, 24; + add.s32 %r395, %r394, %r388; + xor.b32 %r396, %r395, %r390; + shf.l.wrap.b32 %r397, %r396, %r396, 25; + add.s32 %r398, %r341, %r108; + add.s32 %r399, %r398, %r350; + xor.b32 %r400, %r399, %r310; + shf.l.wrap.b32 %r401, %r400, %r400, 16; + add.s32 %r402, %r401, %r325; + xor.b32 %r403, %r402, %r341; + shf.l.wrap.b32 %r404, %r403, %r403, 20; + add.s32 %r405, %r399, %r171; + add.s32 %r406, %r405, %r404; + xor.b32 %r407, %r406, %r401; + shf.l.wrap.b32 %r408, %r407, %r407, 24; + add.s32 %r409, %r408, %r402; + xor.b32 %r410, %r409, %r404; + shf.l.wrap.b32 %r411, %r410, %r410, 25; + add.s32 %r412, %r383, %r87; + add.s32 %r413, %r412, %r364; + xor.b32 %r414, %r413, %r408; + shf.l.wrap.b32 %r415, %r414, %r414, 16; + add.s32 %r416, %r415, %r395; + xor.b32 %r417, %r416, %r383; + shf.l.wrap.b32 %r418, %r417, %r417, 20; + add.s32 %r419, %r413, %r157; + add.s32 %r420, %r419, %r418; + xor.b32 %r421, %r420, %r415; + shf.l.wrap.b32 %r422, %r421, %r421, 24; + add.s32 %r423, %r422, %r416; + xor.b32 %r424, %r423, %r418; + shf.l.wrap.b32 %r425, %r424, %r424, 25; + add.s32 %r426, %r378, %r164; + add.s32 %r427, %r426, %r397; + xor.b32 %r428, %r366, %r427; + shf.l.wrap.b32 %r429, %r428, %r428, 16; + add.s32 %r430, %r429, %r409; + xor.b32 %r431, %r430, %r397; + shf.l.wrap.b32 %r432, %r431, %r431, 20; + add.s32 %r433, %r427, %r115; + add.s32 %r434, %r433, %r432; + xor.b32 %r435, %r434, %r429; + shf.l.wrap.b32 %r436, %r435, %r435, 24; + add.s32 %r437, %r436, %r430; + xor.b32 %r438, %r437, %r432; + shf.l.wrap.b32 %r439, %r438, %r438, 25; + add.s32 %r440, %r392, %r143; + add.s32 %r441, %r440, %r411; + xor.b32 %r442, %r441, %r380; + shf.l.wrap.b32 %r443, %r442, %r442, 16; + add.s32 %r444, %r443, %r367; + xor.b32 %r445, %r444, %r411; + shf.l.wrap.b32 %r446, %r445, %r445, 20; + add.s32 %r447, %r441, %r178; + add.s32 %r448, %r447, %r446; + xor.b32 %r449, %r448, %r443; + shf.l.wrap.b32 %r450, %r449, %r449, 24; + add.s32 %r451, %r450, %r444; + xor.b32 %r452, %r451, %r446; + shf.l.wrap.b32 %r453, %r452, %r452, 25; + add.s32 %r454, %r406, %r185; + add.s32 %r455, %r454, %r369; + xor.b32 %r456, %r455, %r394; + shf.l.wrap.b32 %r457, %r456, %r456, 16; + add.s32 %r458, %r457, %r381; + xor.b32 %r459, %r458, %r369; + shf.l.wrap.b32 %r460, %r459, %r459, 20; + add.s32 %r461, %r455, %r136; + add.s32 %r462, %r461, %r460; + xor.b32 %r463, %r462, %r457; + shf.l.wrap.b32 %r464, %r463, %r463, 24; + add.s32 %r465, %r464, %r458; + xor.b32 %r466, %r465, %r460; + shf.l.wrap.b32 %r467, %r466, %r466, 25; + add.s32 %r468, %r420, %r101; + add.s32 %r469, %r468, %r467; + xor.b32 %r470, %r469, %r436; + shf.l.wrap.b32 %r471, %r470, %r470, 16; + add.s32 %r472, %r471, %r451; + xor.b32 %r473, %r472, %r467; + shf.l.wrap.b32 %r474, %r473, %r473, 20; + add.s32 %r475, %r469, %r108; + add.s32 %r476, %r475, %r474; + xor.b32 %r477, %r476, %r471; + shf.l.wrap.b32 %r478, %r477, %r477, 24; + add.s32 %r479, %r478, %r472; + xor.b32 %r480, %r479, %r474; + shf.l.wrap.b32 %r481, %r480, %r480, 25; + add.s32 %r482, %r434, %r150; + add.s32 %r483, %r482, %r425; + xor.b32 %r484, %r483, %r450; + shf.l.wrap.b32 %r485, %r484, %r484, 
16; + add.s32 %r486, %r485, %r465; + xor.b32 %r487, %r486, %r425; + shf.l.wrap.b32 %r488, %r487, %r487, 20; + add.s32 %r489, %r483, %r164; + add.s32 %r490, %r489, %r488; + xor.b32 %r491, %r490, %r485; + shf.l.wrap.b32 %r492, %r491, %r491, 24; + add.s32 %r493, %r492, %r486; + xor.b32 %r494, %r493, %r488; + shf.l.wrap.b32 %r495, %r494, %r494, 25; + add.s32 %r496, %r448, %r171; + add.s32 %r497, %r496, %r439; + xor.b32 %r498, %r464, %r497; + shf.l.wrap.b32 %r499, %r498, %r498, 16; + add.s32 %r500, %r499, %r423; + xor.b32 %r501, %r500, %r439; + shf.l.wrap.b32 %r502, %r501, %r501, 20; + add.s32 %r503, %r497, %r94; + add.s32 %r504, %r503, %r502; + xor.b32 %r505, %r504, %r499; + shf.l.wrap.b32 %r506, %r505, %r505, 24; + add.s32 %r507, %r506, %r500; + xor.b32 %r508, %r507, %r502; + shf.l.wrap.b32 %r509, %r508, %r508, 25; + add.s32 %r510, %r453, %r129; + add.s32 %r511, %r510, %r462; + xor.b32 %r512, %r511, %r422; + shf.l.wrap.b32 %r513, %r512, %r512, 16; + add.s32 %r514, %r513, %r437; + xor.b32 %r515, %r514, %r453; + shf.l.wrap.b32 %r516, %r515, %r515, 20; + add.s32 %r517, %r511, %r178; + add.s32 %r518, %r517, %r516; + xor.b32 %r519, %r518, %r513; + shf.l.wrap.b32 %r520, %r519, %r519, 24; + add.s32 %r521, %r520, %r514; + xor.b32 %r522, %r521, %r516; + shf.l.wrap.b32 %r523, %r522, %r522, 25; + add.s32 %r524, %r495, %r122; + add.s32 %r525, %r524, %r476; + xor.b32 %r526, %r525, %r520; + shf.l.wrap.b32 %r527, %r526, %r526, 16; + add.s32 %r528, %r527, %r507; + xor.b32 %r529, %r528, %r495; + shf.l.wrap.b32 %r530, %r529, %r529, 20; + add.s32 %r531, %r525, %r115; + add.s32 %r532, %r531, %r530; + xor.b32 %r533, %r532, %r527; + shf.l.wrap.b32 %r534, %r533, %r533, 24; + add.s32 %r535, %r534, %r528; + xor.b32 %r536, %r535, %r530; + shf.l.wrap.b32 %r537, %r536, %r536, 25; + add.s32 %r538, %r490, %r143; + add.s32 %r539, %r538, %r509; + xor.b32 %r540, %r478, %r539; + shf.l.wrap.b32 %r541, %r540, %r540, 16; + add.s32 %r542, %r541, %r521; + xor.b32 %r543, %r542, %r509; + shf.l.wrap.b32 %r544, %r543, %r543, 20; + add.s32 %r545, %r539, %r80; + add.s32 %r546, %r545, %r544; + xor.b32 %r547, %r546, %r541; + shf.l.wrap.b32 %r548, %r547, %r547, 24; + add.s32 %r549, %r548, %r542; + xor.b32 %r550, %r549, %r544; + shf.l.wrap.b32 %r551, %r550, %r550, 25; + add.s32 %r552, %r504, %r157; + add.s32 %r553, %r552, %r523; + xor.b32 %r554, %r553, %r492; + shf.l.wrap.b32 %r555, %r554, %r554, 16; + add.s32 %r556, %r555, %r479; + xor.b32 %r557, %r556, %r523; + shf.l.wrap.b32 %r558, %r557, %r557, 20; + add.s32 %r559, %r553, %r185; + add.s32 %r560, %r559, %r558; + xor.b32 %r561, %r560, %r555; + shf.l.wrap.b32 %r562, %r561, %r561, 24; + add.s32 %r563, %r562, %r556; + xor.b32 %r564, %r563, %r558; + shf.l.wrap.b32 %r565, %r564, %r564, 25; + add.s32 %r566, %r518, %r136; + add.s32 %r567, %r566, %r481; + xor.b32 %r568, %r567, %r506; + shf.l.wrap.b32 %r569, %r568, %r568, 16; + add.s32 %r570, %r569, %r493; + xor.b32 %r571, %r570, %r481; + shf.l.wrap.b32 %r572, %r571, %r571, 20; + add.s32 %r573, %r567, %r87; + add.s32 %r574, %r573, %r572; + xor.b32 %r575, %r574, %r569; + shf.l.wrap.b32 %r576, %r575, %r575, 24; + add.s32 %r577, %r576, %r570; + xor.b32 %r578, %r577, %r572; + shf.l.wrap.b32 %r579, %r578, %r578, 25; + add.s32 %r580, %r532, %r150; + add.s32 %r581, %r580, %r579; + xor.b32 %r582, %r581, %r548; + shf.l.wrap.b32 %r583, %r582, %r582, 16; + add.s32 %r584, %r583, %r563; + xor.b32 %r585, %r584, %r579; + shf.l.wrap.b32 %r586, %r585, %r585, 20; + add.s32 %r587, %r581, %r129; + add.s32 %r588, %r587, %r586; + xor.b32 %r589, %r588, %r583; + 
shf.l.wrap.b32 %r590, %r589, %r589, 24; + add.s32 %r591, %r590, %r584; + xor.b32 %r592, %r591, %r586; + shf.l.wrap.b32 %r593, %r592, %r592, 25; + add.s32 %r594, %r546, %r164; + add.s32 %r595, %r594, %r537; + xor.b32 %r596, %r595, %r562; + shf.l.wrap.b32 %r597, %r596, %r596, 16; + add.s32 %r598, %r597, %r577; + xor.b32 %r599, %r598, %r537; + shf.l.wrap.b32 %r600, %r599, %r599, 20; + add.s32 %r601, %r595, %r143; + add.s32 %r602, %r601, %r600; + xor.b32 %r603, %r602, %r597; + shf.l.wrap.b32 %r604, %r603, %r603, 24; + add.s32 %r605, %r604, %r598; + xor.b32 %r606, %r605, %r600; + shf.l.wrap.b32 %r607, %r606, %r606, 25; + add.s32 %r608, %r560, %r178; + add.s32 %r609, %r608, %r551; + xor.b32 %r610, %r576, %r609; + shf.l.wrap.b32 %r611, %r610, %r610, 16; + add.s32 %r612, %r611, %r535; + xor.b32 %r613, %r612, %r551; + shf.l.wrap.b32 %r614, %r613, %r613, 20; + add.s32 %r615, %r609, %r101; + add.s32 %r616, %r615, %r614; + xor.b32 %r617, %r616, %r611; + shf.l.wrap.b32 %r618, %r617, %r617, 24; + add.s32 %r619, %r618, %r612; + xor.b32 %r620, %r619, %r614; + shf.l.wrap.b32 %r621, %r620, %r620, 25; + add.s32 %r622, %r565, %r171; + add.s32 %r623, %r622, %r574; + xor.b32 %r624, %r623, %r534; + shf.l.wrap.b32 %r625, %r624, %r624, 16; + add.s32 %r626, %r625, %r549; + xor.b32 %r627, %r626, %r565; + shf.l.wrap.b32 %r628, %r627, %r627, 20; + add.s32 %r629, %r623, %r185; + add.s32 %r630, %r629, %r628; + xor.b32 %r631, %r630, %r625; + shf.l.wrap.b32 %r632, %r631, %r631, 24; + add.s32 %r633, %r632, %r626; + xor.b32 %r634, %r633, %r628; + shf.l.wrap.b32 %r635, %r634, %r634, 25; + add.s32 %r636, %r607, %r108; + add.s32 %r637, %r636, %r588; + xor.b32 %r638, %r637, %r632; + shf.l.wrap.b32 %r639, %r638, %r638, 16; + add.s32 %r640, %r639, %r619; + xor.b32 %r641, %r640, %r607; + shf.l.wrap.b32 %r642, %r641, %r641, 20; + add.s32 %r643, %r637, %r80; + add.s32 %r644, %r643, %r642; + xor.b32 %r645, %r644, %r639; + shf.l.wrap.b32 %r646, %r645, %r645, 24; + add.s32 %r647, %r646, %r640; + xor.b32 %r648, %r647, %r642; + shf.l.wrap.b32 %r649, %r648, %r648, 25; + add.s32 %r650, %r602, %r157; + add.s32 %r651, %r650, %r621; + xor.b32 %r652, %r590, %r651; + shf.l.wrap.b32 %r653, %r652, %r652, 16; + add.s32 %r654, %r653, %r633; + xor.b32 %r655, %r654, %r621; + shf.l.wrap.b32 %r656, %r655, %r655, 20; + add.s32 %r657, %r651, %r94; + add.s32 %r658, %r657, %r656; + xor.b32 %r659, %r658, %r653; + shf.l.wrap.b32 %r660, %r659, %r659, 24; + add.s32 %r661, %r660, %r654; + xor.b32 %r662, %r661, %r656; + shf.l.wrap.b32 %r663, %r662, %r662, 25; + add.s32 %r664, %r616, %r115; + add.s32 %r665, %r664, %r635; + xor.b32 %r666, %r665, %r604; + shf.l.wrap.b32 %r667, %r666, %r666, 16; + add.s32 %r668, %r667, %r591; + xor.b32 %r669, %r668, %r635; + shf.l.wrap.b32 %r670, %r669, %r669, 20; + add.s32 %r671, %r665, %r136; + add.s32 %r672, %r671, %r670; + xor.b32 %r673, %r672, %r667; + shf.l.wrap.b32 %r674, %r673, %r673, 24; + add.s32 %r675, %r674, %r668; + xor.b32 %r676, %r675, %r670; + shf.l.wrap.b32 %r677, %r676, %r676, 25; + add.s32 %r678, %r630, %r87; + add.s32 %r679, %r678, %r593; + xor.b32 %r680, %r679, %r618; + shf.l.wrap.b32 %r681, %r680, %r680, 16; + add.s32 %r682, %r681, %r605; + xor.b32 %r683, %r682, %r593; + shf.l.wrap.b32 %r684, %r683, %r683, 20; + add.s32 %r685, %r679, %r122; + add.s32 %r686, %r685, %r684; + xor.b32 %r687, %r686, %r681; + shf.l.wrap.b32 %r688, %r687, %r687, 24; + add.s32 %r689, %r688, %r682; + xor.b32 %r690, %r689, %r684; + shf.l.wrap.b32 %r691, %r690, %r690, 25; + add.s32 %r692, %r644, %r164; + add.s32 %r693, %r692, %r691; + 
xor.b32 %r694, %r693, %r660; + shf.l.wrap.b32 %r695, %r694, %r694, 16; + add.s32 %r696, %r695, %r675; + xor.b32 %r697, %r696, %r691; + shf.l.wrap.b32 %r698, %r697, %r697, 20; + add.s32 %r699, %r693, %r171; + add.s32 %r700, %r699, %r698; + xor.b32 %r701, %r700, %r695; + shf.l.wrap.b32 %r702, %r701, %r701, 24; + add.s32 %r703, %r702, %r696; + xor.b32 %r704, %r703, %r698; + shf.l.wrap.b32 %r705, %r704, %r704, 25; + add.s32 %r706, %r658, %r143; + add.s32 %r707, %r706, %r649; + xor.b32 %r708, %r707, %r674; + shf.l.wrap.b32 %r709, %r708, %r708, 16; + add.s32 %r710, %r709, %r689; + xor.b32 %r711, %r710, %r649; + shf.l.wrap.b32 %r712, %r711, %r711, 20; + add.s32 %r713, %r707, %r157; + add.s32 %r714, %r713, %r712; + xor.b32 %r715, %r714, %r709; + shf.l.wrap.b32 %r716, %r715, %r715, 24; + add.s32 %r717, %r716, %r710; + xor.b32 %r718, %r717, %r712; + shf.l.wrap.b32 %r719, %r718, %r718, 25; + add.s32 %r720, %r672, %r185; + add.s32 %r721, %r720, %r663; + xor.b32 %r722, %r688, %r721; + shf.l.wrap.b32 %r723, %r722, %r722, 16; + add.s32 %r724, %r723, %r647; + xor.b32 %r725, %r724, %r663; + shf.l.wrap.b32 %r726, %r725, %r725, 20; + add.s32 %r727, %r721, %r150; + add.s32 %r728, %r727, %r726; + xor.b32 %r729, %r728, %r723; + shf.l.wrap.b32 %r730, %r729, %r729, 24; + add.s32 %r731, %r730, %r724; + xor.b32 %r732, %r731, %r726; + shf.l.wrap.b32 %r733, %r732, %r732, 25; + add.s32 %r734, %r677, %r178; + add.s32 %r735, %r734, %r686; + xor.b32 %r736, %r735, %r646; + shf.l.wrap.b32 %r737, %r736, %r736, 16; + add.s32 %r738, %r737, %r661; + xor.b32 %r739, %r738, %r677; + shf.l.wrap.b32 %r740, %r739, %r739, 20; + add.s32 %r741, %r735, %r136; + add.s32 %r742, %r741, %r740; + xor.b32 %r743, %r742, %r737; + shf.l.wrap.b32 %r744, %r743, %r743, 24; + add.s32 %r745, %r744, %r738; + xor.b32 %r746, %r745, %r740; + shf.l.wrap.b32 %r747, %r746, %r746, 25; + add.s32 %r748, %r719, %r129; + add.s32 %r749, %r748, %r700; + xor.b32 %r750, %r749, %r744; + shf.l.wrap.b32 %r751, %r750, %r750, 16; + add.s32 %r752, %r751, %r731; + xor.b32 %r753, %r752, %r719; + shf.l.wrap.b32 %r754, %r753, %r753, 20; + add.s32 %r755, %r749, %r94; + add.s32 %r756, %r755, %r754; + xor.b32 %r757, %r756, %r751; + shf.l.wrap.b32 %r758, %r757, %r757, 24; + add.s32 %r759, %r758, %r752; + xor.b32 %r760, %r759, %r754; + shf.l.wrap.b32 %r761, %r760, %r760, 25; + add.s32 %r762, %r714, %r115; + add.s32 %r763, %r762, %r733; + xor.b32 %r764, %r702, %r763; + shf.l.wrap.b32 %r765, %r764, %r764, 16; + add.s32 %r766, %r765, %r745; + xor.b32 %r767, %r766, %r733; + shf.l.wrap.b32 %r768, %r767, %r767, 20; + add.s32 %r769, %r763, %r101; + add.s32 %r770, %r769, %r768; + xor.b32 %r771, %r770, %r765; + shf.l.wrap.b32 %r772, %r771, %r771, 24; + add.s32 %r773, %r772, %r766; + xor.b32 %r774, %r773, %r768; + shf.l.wrap.b32 %r775, %r774, %r774, 25; + add.s32 %r776, %r728, %r80; + add.s32 %r777, %r776, %r747; + xor.b32 %r778, %r777, %r716; + shf.l.wrap.b32 %r779, %r778, %r778, 16; + add.s32 %r780, %r779, %r703; + xor.b32 %r781, %r780, %r747; + shf.l.wrap.b32 %r782, %r781, %r781, 20; + add.s32 %r783, %r777, %r87; + add.s32 %r784, %r783, %r782; + xor.b32 %r785, %r784, %r779; + shf.l.wrap.b32 %r786, %r785, %r785, 24; + add.s32 %r787, %r786, %r780; + xor.b32 %r788, %r787, %r782; + shf.l.wrap.b32 %r789, %r788, %r788, 25; + add.s32 %r790, %r742, %r122; + add.s32 %r791, %r790, %r705; + xor.b32 %r792, %r791, %r730; + shf.l.wrap.b32 %r793, %r792, %r792, 16; + add.s32 %r794, %r793, %r717; + xor.b32 %r795, %r794, %r705; + shf.l.wrap.b32 %r796, %r795, %r795, 20; + add.s32 %r797, %r791, %r108; + 
add.s32 %r798, %r797, %r796; + xor.b32 %r799, %r798, %r793; + shf.l.wrap.b32 %r800, %r799, %r799, 24; + add.s32 %r801, %r800, %r794; + xor.b32 %r802, %r801, %r796; + shf.l.wrap.b32 %r803, %r802, %r802, 25; + add.s32 %r804, %r756, %r143; + add.s32 %r805, %r804, %r803; + xor.b32 %r806, %r805, %r772; + shf.l.wrap.b32 %r807, %r806, %r806, 16; + add.s32 %r808, %r807, %r787; + xor.b32 %r809, %r808, %r803; + shf.l.wrap.b32 %r810, %r809, %r809, 20; + add.s32 %r811, %r805, %r178; + add.s32 %r812, %r811, %r810; + xor.b32 %r813, %r812, %r807; + shf.l.wrap.b32 %r814, %r813, %r813, 24; + add.s32 %r815, %r814, %r808; + xor.b32 %r816, %r815, %r810; + shf.l.wrap.b32 %r817, %r816, %r816, 25; + add.s32 %r818, %r770, %r157; + add.s32 %r819, %r818, %r761; + xor.b32 %r820, %r819, %r786; + shf.l.wrap.b32 %r821, %r820, %r820, 16; + add.s32 %r822, %r821, %r801; + xor.b32 %r823, %r822, %r761; + shf.l.wrap.b32 %r824, %r823, %r823, 20; + add.s32 %r825, %r819, %r115; + add.s32 %r826, %r825, %r824; + xor.b32 %r827, %r826, %r821; + shf.l.wrap.b32 %r828, %r827, %r827, 24; + add.s32 %r829, %r828, %r822; + xor.b32 %r830, %r829, %r824; + shf.l.wrap.b32 %r831, %r830, %r830, 25; + add.s32 %r832, %r784, %r136; + add.s32 %r833, %r832, %r775; + xor.b32 %r834, %r800, %r833; + shf.l.wrap.b32 %r835, %r834, %r834, 16; + add.s32 %r836, %r835, %r759; + xor.b32 %r837, %r836, %r775; + shf.l.wrap.b32 %r838, %r837, %r837, 20; + add.s32 %r839, %r833, %r164; + add.s32 %r840, %r839, %r838; + xor.b32 %r841, %r840, %r835; + shf.l.wrap.b32 %r842, %r841, %r841, 24; + add.s32 %r843, %r842, %r836; + xor.b32 %r844, %r843, %r838; + shf.l.wrap.b32 %r845, %r844, %r844, 25; + add.s32 %r846, %r789, %r185; + add.s32 %r847, %r846, %r798; + xor.b32 %r848, %r847, %r758; + shf.l.wrap.b32 %r849, %r848, %r848, 16; + add.s32 %r850, %r849, %r773; + xor.b32 %r851, %r850, %r789; + shf.l.wrap.b32 %r852, %r851, %r851, 20; + add.s32 %r853, %r847, %r87; + add.s32 %r854, %r853, %r852; + xor.b32 %r855, %r854, %r849; + shf.l.wrap.b32 %r856, %r855, %r855, 24; + add.s32 %r857, %r856, %r850; + xor.b32 %r858, %r857, %r852; + shf.l.wrap.b32 %r859, %r858, %r858, 25; + add.s32 %r860, %r831, %r171; + add.s32 %r861, %r860, %r812; + xor.b32 %r862, %r861, %r856; + shf.l.wrap.b32 %r863, %r862, %r862, 16; + add.s32 %r864, %r863, %r843; + xor.b32 %r865, %r864, %r831; + shf.l.wrap.b32 %r866, %r865, %r865, 20; + add.s32 %r867, %r861, %r101; + add.s32 %r868, %r867, %r866; + xor.b32 %r869, %r868, %r863; + shf.l.wrap.b32 %r870, %r869, %r869, 24; + add.s32 %r871, %r870, %r864; + xor.b32 %r872, %r871, %r866; + shf.l.wrap.b32 %r873, %r872, %r872, 25; + add.s32 %r874, %r826, %r80; + add.s32 %r875, %r874, %r845; + xor.b32 %r876, %r814, %r875; + shf.l.wrap.b32 %r877, %r876, %r876, 16; + add.s32 %r878, %r877, %r857; + xor.b32 %r879, %r878, %r845; + shf.l.wrap.b32 %r880, %r879, %r879, 20; + add.s32 %r881, %r875, %r150; + add.s32 %r882, %r881, %r880; + xor.b32 %r883, %r882, %r877; + shf.l.wrap.b32 %r884, %r883, %r883, 24; + add.s32 %r885, %r884, %r878; + xor.b32 %r886, %r885, %r880; + shf.l.wrap.b32 %r887, %r886, %r886, 25; + add.s32 %r888, %r840, %r94; + add.s32 %r889, %r888, %r859; + xor.b32 %r890, %r889, %r828; + shf.l.wrap.b32 %r891, %r890, %r890, 16; + add.s32 %r892, %r891, %r815; + xor.b32 %r893, %r892, %r859; + shf.l.wrap.b32 %r894, %r893, %r893, 20; + add.s32 %r895, %r889, %r122; + add.s32 %r896, %r895, %r894; + xor.b32 %r897, %r896, %r891; + shf.l.wrap.b32 %r898, %r897, %r897, 24; + add.s32 %r899, %r898, %r892; + xor.b32 %r900, %r899, %r894; + shf.l.wrap.b32 %r901, %r900, %r900, 25; + 
add.s32 %r902, %r854, %r108; + add.s32 %r903, %r902, %r817; + xor.b32 %r904, %r903, %r842; + shf.l.wrap.b32 %r905, %r904, %r904, 16; + add.s32 %r906, %r905, %r829; + xor.b32 %r907, %r906, %r817; + shf.l.wrap.b32 %r908, %r907, %r907, 20; + add.s32 %r909, %r903, %r129; + add.s32 %r910, %r909, %r908; + xor.b32 %r911, %r910, %r905; + shf.l.wrap.b32 %r912, %r911, %r911, 24; + add.s32 %r913, %r912, %r906; + xor.b32 %r914, %r913, %r908; + shf.l.wrap.b32 %r915, %r914, %r914, 25; + add.s32 %r916, %r868, %r157; + add.s32 %r917, %r916, %r915; + xor.b32 %r918, %r917, %r884; + shf.l.wrap.b32 %r919, %r918, %r918, 16; + add.s32 %r920, %r919, %r899; + xor.b32 %r921, %r920, %r915; + shf.l.wrap.b32 %r922, %r921, %r921, 20; + add.s32 %r923, %r917, %r185; + add.s32 %r924, %r923, %r922; + xor.b32 %r925, %r924, %r919; + shf.l.wrap.b32 %r926, %r925, %r925, 24; + add.s32 %r927, %r926, %r920; + xor.b32 %r928, %r927, %r922; + shf.l.wrap.b32 %r929, %r928, %r928, 25; + add.s32 %r930, %r882, %r115; + add.s32 %r931, %r930, %r873; + xor.b32 %r932, %r931, %r898; + shf.l.wrap.b32 %r933, %r932, %r932, 16; + add.s32 %r934, %r933, %r913; + xor.b32 %r935, %r934, %r873; + shf.l.wrap.b32 %r936, %r935, %r935, 20; + add.s32 %r937, %r931, %r80; + add.s32 %r938, %r937, %r936; + xor.b32 %r939, %r938, %r933; + shf.l.wrap.b32 %r940, %r939, %r939, 24; + add.s32 %r941, %r940, %r934; + xor.b32 %r942, %r941, %r936; + shf.l.wrap.b32 %r943, %r942, %r942, 25; + add.s32 %r944, %r896, %r87; + add.s32 %r945, %r944, %r887; + xor.b32 %r946, %r912, %r945; + shf.l.wrap.b32 %r947, %r946, %r946, 16; + add.s32 %r948, %r947, %r871; + xor.b32 %r949, %r948, %r887; + shf.l.wrap.b32 %r950, %r949, %r949, 20; + add.s32 %r951, %r945, %r143; + add.s32 %r952, %r951, %r950; + xor.b32 %r953, %r952, %r947; + shf.l.wrap.b32 %r954, %r953, %r953, 24; + add.s32 %r955, %r954, %r948; + xor.b32 %r956, %r955, %r950; + shf.l.wrap.b32 %r957, %r956, %r956, 25; + add.s32 %r958, %r901, %r136; + add.s32 %r959, %r958, %r910; + xor.b32 %r960, %r959, %r870; + shf.l.wrap.b32 %r961, %r960, %r960, 16; + add.s32 %r962, %r961, %r885; + xor.b32 %r963, %r962, %r901; + shf.l.wrap.b32 %r964, %r963, %r963, 20; + add.s32 %r965, %r959, %r122; + add.s32 %r966, %r965, %r964; + xor.b32 %r967, %r966, %r961; + shf.l.wrap.b32 %r968, %r967, %r967, 24; + add.s32 %r969, %r968, %r962; + xor.b32 %r970, %r969, %r964; + shf.l.wrap.b32 %r971, %r970, %r970, 25; + add.s32 %r972, %r943, %r178; + add.s32 %r973, %r972, %r924; + xor.b32 %r974, %r973, %r968; + shf.l.wrap.b32 %r975, %r974, %r974, 16; + add.s32 %r976, %r975, %r955; + xor.b32 %r977, %r976, %r943; + shf.l.wrap.b32 %r978, %r977, %r977, 20; + add.s32 %r979, %r973, %r150; + add.s32 %r980, %r979, %r978; + xor.b32 %r981, %r980, %r975; + shf.l.wrap.b32 %r982, %r981, %r981, 24; + add.s32 %r983, %r982, %r976; + xor.b32 %r984, %r983, %r978; + shf.l.wrap.b32 %r985, %r984, %r984, 25; + add.s32 %r986, %r938, %r94; + add.s32 %r987, %r986, %r957; + xor.b32 %r988, %r926, %r987; + shf.l.wrap.b32 %r989, %r988, %r988, 16; + add.s32 %r990, %r989, %r969; + xor.b32 %r991, %r990, %r957; + shf.l.wrap.b32 %r992, %r991, %r991, 20; + add.s32 %r993, %r987, %r164; + add.s32 %r994, %r993, %r992; + xor.b32 %r995, %r994, %r989; + shf.l.wrap.b32 %r996, %r995, %r995, 24; + add.s32 %r997, %r996, %r990; + xor.b32 %r998, %r997, %r992; + shf.l.wrap.b32 %r999, %r998, %r998, 25; + add.s32 %r1000, %r952, %r101; + add.s32 %r1001, %r1000, %r971; + xor.b32 %r1002, %r1001, %r940; + shf.l.wrap.b32 %r1003, %r1002, %r1002, 16; + add.s32 %r1004, %r1003, %r927; + xor.b32 %r1005, %r1004, %r971; + 
shf.l.wrap.b32 %r1006, %r1005, %r1005, 20; + add.s32 %r1007, %r1001, %r108; + add.s32 %r1008, %r1007, %r1006; + xor.b32 %r1009, %r1008, %r1003; + shf.l.wrap.b32 %r1010, %r1009, %r1009, 24; + add.s32 %r1011, %r1010, %r1004; + xor.b32 %r1012, %r1011, %r1006; + shf.l.wrap.b32 %r1013, %r1012, %r1012, 25; + add.s32 %r1014, %r966, %r129; + add.s32 %r1015, %r1014, %r929; + xor.b32 %r1016, %r1015, %r954; + shf.l.wrap.b32 %r1017, %r1016, %r1016, 16; + add.s32 %r1018, %r1017, %r941; + xor.b32 %r1019, %r1018, %r929; + shf.l.wrap.b32 %r1020, %r1019, %r1019, 20; + add.s32 %r1021, %r1015, %r171; + add.s32 %r1022, %r1021, %r1020; + xor.b32 %r1023, %r1022, %r1017; + shf.l.wrap.b32 %r1024, %r1023, %r1023, 24; + add.s32 %r1025, %r1024, %r1018; + xor.b32 %r1026, %r1025, %r1020; + shf.l.wrap.b32 %r1027, %r1026, %r1026, 25; + xor.b32 %r1028, %r1011, %r980; + xor.b32 %r1029, %r1025, %r994; + xor.b32 %r1030, %r983, %r1008; + xor.b32 %r1031, %r1022, %r997; + xor.b32 %r1032, %r1027, %r996; + xor.b32 %r1033, %r985, %r1010; + xor.b32 %r1034, %r1024, %r999; + xor.b32 %r1035, %r1013, %r982; + st.local.u8 [%rd155], %r1028; + shr.u32 %r1036, %r1028, 8; + st.local.u8 [%rd155+1], %r1036; + shr.u32 %r1037, %r1028, 16; + st.local.u8 [%rd155+2], %r1037; + shr.u32 %r1038, %r1028, 24; + st.local.u8 [%rd155+3], %r1038; + st.local.u8 [%rd155+4], %r1029; + shr.u32 %r1039, %r1029, 8; + st.local.u8 [%rd155+5], %r1039; + shr.u32 %r1040, %r1029, 16; + st.local.u8 [%rd155+6], %r1040; + shr.u32 %r1041, %r1029, 24; + st.local.u8 [%rd155+7], %r1041; + st.local.u8 [%rd155+8], %r1030; + shr.u32 %r1042, %r1030, 8; + st.local.u8 [%rd155+9], %r1042; + shr.u32 %r1043, %r1030, 16; + st.local.u8 [%rd155+10], %r1043; + shr.u32 %r1044, %r1030, 24; + st.local.u8 [%rd155+11], %r1044; + st.local.u8 [%rd155+12], %r1031; + shr.u32 %r1045, %r1031, 8; + st.local.u8 [%rd155+13], %r1045; + shr.u32 %r1046, %r1031, 16; + st.local.u8 [%rd155+14], %r1046; + shr.u32 %r1047, %r1031, 24; + st.local.u8 [%rd155+15], %r1047; + st.local.u8 [%rd155+16], %r1032; + shr.u32 %r1048, %r1032, 8; + st.local.u8 [%rd155+17], %r1048; + shr.u32 %r1049, %r1032, 16; + st.local.u8 [%rd155+18], %r1049; + shr.u32 %r1050, %r1032, 24; + st.local.u8 [%rd155+19], %r1050; + st.local.u8 [%rd155+20], %r1033; + shr.u32 %r1051, %r1033, 8; + st.local.u8 [%rd155+21], %r1051; + shr.u32 %r1052, %r1033, 16; + st.local.u8 [%rd155+22], %r1052; + shr.u32 %r1053, %r1033, 24; + st.local.u8 [%rd155+23], %r1053; + st.local.u8 [%rd155+24], %r1034; + shr.u32 %r1054, %r1034, 8; + st.local.u8 [%rd155+25], %r1054; + shr.u32 %r1055, %r1034, 16; + st.local.u8 [%rd155+26], %r1055; + shr.u32 %r1056, %r1034, 24; + st.local.u8 [%rd155+27], %r1056; + st.local.u8 [%rd155+28], %r1035; + shr.u32 %r1057, %r1035, 8; + st.local.u8 [%rd155+29], %r1057; + shr.u32 %r1058, %r1035, 16; + st.local.u8 [%rd155+30], %r1058; + shr.u32 %r1059, %r1035, 24; + st.local.u8 [%rd155+31], %r1059; + add.s64 %rd153, %rd153, 8; + add.s64 %rd155, %rd155, 32; + add.s64 %rd154, %rd154, -1; + setp.ne.s64 %p12, %rd154, 0; + @%p12 bra $L__BB0_7; + +$L__BB0_8: + setp.le.u64 %p13, %rd7, %rd152; + @%p13 bra $L__BB0_30; + + add.u64 %rd144, %SPL, 96; + ld.param.u64 %rd142, [_ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh_param_5]; + cvta.to.local.u64 %rd141, %rd142; + shl.b64 %rd111, %rd151, 6; + shl.b64 %rd112, %rd151, 5; + add.s64 %rd27, %rd141, %rd112; + add.s64 %rd28, %rd144, %rd111; + mov.u64 %rd156, 0; + +$L__BB0_10: + add.s64 %rd113, %rd28, %rd156; + ld.local.u8 %rs77, [%rd113]; + add.s64 %rd114, %rd27, 
%rd156; + st.local.u8 [%rd114], %rs77; + add.s64 %rd156, %rd156, 1; + setp.lt.u64 %p14, %rd156, 32; + @%p14 bra $L__BB0_10; + + add.s64 %rd151, %rd151, 1; + +$L__BB0_30: + st.param.b64 [func_retval0+0], %rd151; + ret; + +} +.func _Z20blake3_hasher_updateP13blake3_hasherPKvy( + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0, + .param .b64 _Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1 +) +{ + .local .align 16 .b8 __local_depot1[144]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<54>; + .reg .b16 %rs<393>; + .reg .b32 %r<11690>; + .reg .b64 %rd<273>; + + + mov.u64 %SPL, __local_depot1; + cvta.local.u64 %SP, %SPL; + ld.param.u64 %rd98, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + ld.param.u64 %rd254, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd261, %rd254; + cvta.to.local.u64 %rd2, %rd98; + add.s64 %rd3, %rd2, 136; + ld.local.v2.u8 {%rs102, %rs103}, [%rd2+136]; + cvt.u64.u16 %rd4, %rs103; + cvt.u32.u16 %r144, %rs103; + mul.wide.u32 %rd101, %r144, 64; + cvt.u64.u16 %rd5, %rs102; + neg.s64 %rd102, %rd5; + setp.eq.s64 %p1, %rd101, %rd102; + mov.u64 %rd262, 80; + @%p1 bra $L__BB1_24; + + shl.b64 %rd103, %rd4, 6; + mov.u64 %rd104, 1024; + sub.s64 %rd105, %rd104, %rd5; + sub.s64 %rd106, %rd105, %rd103; + min.u64 %rd6, %rd106, 80; + setp.eq.s16 %p2, %rs102, 0; + mov.u16 %rs351, 0; + mov.u64 %rd244, %rd6; + @%p2 bra $L__BB1_9; + + cvt.u32.u16 %r145, %rs102; + prmt.b32 %r147, %r144, %r145, 30212; + cvt.u16.u32 %rs350, %r147; + mov.u64 %rd107, 64; + sub.s64 %rd108, %rd107, %rd5; + min.u64 %rd7, %rd108, %rd6; + setp.eq.s64 %p3, %rd7, 0; + @%p3 bra $L__BB1_6; + + add.s64 %rd110, %rd2, %rd5; + add.s64 %rd8, %rd110, 72; + mov.u64 %rd237, 0; + +$L__BB1_4: + add.s64 %rd111, %rd261, %rd237; + ld.local.u8 %rs107, [%rd111]; + add.s64 %rd112, %rd8, %rd237; + st.local.u8 [%rd112], %rs107; + add.s64 %rd237, %rd237, 1; + setp.lt.u64 %p4, %rd237, %rd7; + @%p4 bra $L__BB1_4; + + ld.local.u8 %rs350, [%rd3]; + +$L__BB1_6: + cvt.u16.u64 %rs108, %rd7; + add.s16 %rs351, %rs350, %rs108; + mov.u64 %rd244, 0; + st.local.u8 [%rd3], %rs351; + add.s64 %rd261, %rd261, %rd7; + sub.s64 %rd12, %rd6, %rd7; + setp.eq.s64 %p5, %rd12, 0; + @%p5 bra $L__BB1_9; + + add.s64 %rd13, %rd2, 72; + ld.local.u8 %rs109, [%rd3+1]; + mov.u64 %rd238, 0; + setp.eq.s16 %p6, %rs109, 0; + mov.u16 %rs351, 0; + selp.u16 %rs111, 1, 0, %p6; + ld.local.u8 %rs112, [%rd3+2]; + or.b16 %rs113, %rs112, %rs111; + ld.local.u8 %r148, [%rd3+-64]; + ld.local.u8 %r149, [%rd3+-63]; + prmt.b32 %r150, %r149, %r148, 30212; + ld.local.u8 %r151, [%rd3+-62]; + prmt.b32 %r152, %r151, %r150, 28756; + ld.local.u8 %r153, [%rd3+-61]; + prmt.b32 %r154, %r153, %r152, 1620; + ld.local.u8 %r155, [%rd3+-60]; + ld.local.u8 %r156, [%rd3+-59]; prmt.b32 %r157, %r156, %r155, 30212; - cvt.u32.u16 %r158, %rs39; + ld.local.u8 %r158, [%rd3+-58]; prmt.b32 %r159, %r158, %r157, 28756; - cvt.u32.u16 %r160, %rs47; + ld.local.u8 %r160, [%rd3+-57]; prmt.b32 %r161, %r160, %r159, 1620; - cvt.u32.u16 %r162, %rs48; - and.b32 %r163, %r162, 255; - cvt.u32.u16 %r164, %rs53; - prmt.b32 %r165, %r164, %r163, 30212; - cvt.u32.u16 %r166, %rs49; - prmt.b32 %r167, %r166, %r165, 28756; - cvt.u32.u16 %r168, %rs55; - prmt.b32 %r169, %r168, %r167, 1620; - cvt.u32.u16 %r170, %rs50; - and.b32 %r171, %r170, 255; - cvt.u32.u16 %r172, %rs57; - prmt.b32 %r173, %r172, %r171, 30212; - cvt.u32.u16 %r174, %rs51; - prmt.b32 %r175, %r174, %r173, 28756; - cvt.u32.u16 %r176, %rs59; - prmt.b32 %r177, %r176, %r175, 1620; - cvt.u32.u16 %r178, %rs69; - and.b32 
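+	// Annotation (added for readability; not part of the NVVM output): the
+	// ld.local.u8 + prmt.b32 runs above assemble four bytes into one
+	// little-endian u32: selector 30212 (0x7604) pairs bytes 0-1, 28756
+	// (0x7054) inserts byte 2, 1620 (0x0654) inserts byte 3, and 4180
+	// (0x1054) merges two 16-bit halves.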
%r179, %r178, 255; - add.s32 %r180, %r69, -1156040474; - shf.l.wrap.b32 %r181, %r180, %r180, 16; - add.s32 %r182, %r181, 1779033703; - xor.b32 %r183, %r182, 1359893119; - shf.l.wrap.b32 %r184, %r183, %r183, 20; - add.s32 %r185, %r77, %r180; - add.s32 %r186, %r185, %r184; - xor.b32 %r187, %r186, %r181; - shf.l.wrap.b32 %r188, %r187, %r187, 24; - add.s32 %r189, %r188, %r182; - xor.b32 %r190, %r189, %r184; - shf.l.wrap.b32 %r191, %r190, %r190, 25; - add.s32 %r192, %r81, 1449989905; - shf.l.wrap.b32 %r193, %r192, %r192, 16; - add.s32 %r194, %r193, -1150833019; - xor.b32 %r195, %r194, -1694144372; - shf.l.wrap.b32 %r196, %r195, %r195, 20; - add.s32 %r197, %r89, %r192; - add.s32 %r198, %r197, %r196; - xor.b32 %r199, %r198, %r193; - shf.l.wrap.b32 %r200, %r199, %r199, 24; - add.s32 %r201, %r200, %r194; - xor.b32 %r202, %r201, %r196; - shf.l.wrap.b32 %r203, %r202, %r202, 25; - add.s32 %r204, %r93, 1542638877; - shr.u32 %r205, %r204, 16; - shl.b32 %r206, %r204, 16; - xor.b32 %r207, %r206, 4194304; - or.b32 %r208, %r207, %r205; - add.s32 %r209, %r208, 1013904242; - xor.b32 %r210, %r209, 528734635; - shf.l.wrap.b32 %r211, %r210, %r210, 20; - add.s32 %r212, %r101, %r204; - add.s32 %r213, %r212, %r211; - xor.b32 %r214, %r213, %r208; - shf.l.wrap.b32 %r215, %r214, %r214, 24; - add.s32 %r216, %r215, %r209; - xor.b32 %r217, %r216, %r211; - shf.l.wrap.b32 %r218, %r217, %r217, 25; - add.s32 %r219, %r105, 19972691; - xor.b32 %r220, %r219, %r179; - shr.u32 %r221, %r219, 16; - shl.b32 %r222, %r220, 16; - or.b32 %r223, %r222, %r221; - add.s32 %r224, %r223, -1521486534; - xor.b32 %r225, %r224, 1541459225; - shf.l.wrap.b32 %r226, %r225, %r225, 20; - add.s32 %r227, %r113, %r219; - add.s32 %r228, %r227, %r226; - xor.b32 %r229, %r228, %r223; - shf.l.wrap.b32 %r230, %r229, %r229, 24; - add.s32 %r231, %r230, %r224; - xor.b32 %r232, %r231, %r226; - shf.l.wrap.b32 %r233, %r232, %r232, 25; - add.s32 %r234, %r203, %r186; - add.s32 %r235, %r234, %r121; - xor.b32 %r236, %r230, %r235; - shf.l.wrap.b32 %r237, %r236, %r236, 16; - add.s32 %r238, %r237, %r216; - xor.b32 %r239, %r238, %r203; - shf.l.wrap.b32 %r240, %r239, %r239, 20; - add.s32 %r241, %r129, %r235; - add.s32 %r242, %r241, %r240; - xor.b32 %r243, %r242, %r237; - shf.l.wrap.b32 %r244, %r243, %r243, 24; - add.s32 %r245, %r244, %r238; - xor.b32 %r246, %r245, %r240; - shf.l.wrap.b32 %r247, %r246, %r246, 25; - add.s32 %r248, %r218, %r198; - add.s32 %r249, %r248, %r137; - xor.b32 %r250, %r249, %r188; - shf.l.wrap.b32 %r251, %r250, %r250, 16; - add.s32 %r252, %r251, %r231; - xor.b32 %r253, %r252, %r218; - shf.l.wrap.b32 %r254, %r253, %r253, 20; - add.s32 %r255, %r145, %r249; - add.s32 %r256, %r255, %r254; - xor.b32 %r257, %r256, %r251; - shf.l.wrap.b32 %r258, %r257, %r257, 24; - add.s32 %r259, %r258, %r252; - xor.b32 %r260, %r259, %r254; - shf.l.wrap.b32 %r261, %r260, %r260, 25; - add.s32 %r262, %r233, %r213; - add.s32 %r263, %r262, %r153; - xor.b32 %r264, %r263, %r200; - shf.l.wrap.b32 %r265, %r264, %r264, 16; - add.s32 %r266, %r265, %r189; - xor.b32 %r267, %r266, %r233; - shf.l.wrap.b32 %r268, %r267, %r267, 20; - add.s32 %r269, %r161, %r263; - add.s32 %r270, %r269, %r268; - xor.b32 %r271, %r270, %r265; - shf.l.wrap.b32 %r272, %r271, %r271, 24; - add.s32 %r273, %r272, %r266; - xor.b32 %r274, %r273, %r268; - shf.l.wrap.b32 %r275, %r274, %r274, 25; - add.s32 %r276, %r228, %r191; - add.s32 %r277, %r276, %r169; - xor.b32 %r278, %r277, %r215; - shf.l.wrap.b32 %r279, %r278, %r278, 16; - add.s32 %r280, %r279, %r201; - xor.b32 %r281, %r280, %r191; - shf.l.wrap.b32 %r282, %r281, 
[… machine-generated PTX diff elided: BLAKE3 compression-round code for the heavy_hash kernel (long runs of add.s32, xor.b32, and shf.l.wrap.b32 rotates by 16, 20, 24, and 25), regenerated with renumbered registers, byte-wise message-word loads via ld.local.u8 and prmt.b32, and reworked state stores and loop labels ($L__BB1_8, $L__BB1_9, $L__BB1_11) …]
25; - add.s32 %r1514, %r1470, %r1462; - xor.b32 %r1515, %r1514, %r1486; - shf.l.wrap.b32 %r1516, %r1515, %r1515, 16; - add.s32 %r1517, %r1516, %r1499; - xor.b32 %r1518, %r1517, %r1462; - shf.l.wrap.b32 %r1519, %r1518, %r1518, 20; - add.s32 %r1520, %r1514, %r1519; - xor.b32 %r1521, %r1520, %r1516; - shf.l.wrap.b32 %r1522, %r1521, %r1521, 24; - add.s32 %r1523, %r1522, %r1517; - xor.b32 %r1524, %r1523, %r1519; - shf.l.wrap.b32 %r1525, %r1524, %r1524, 25; - add.s32 %r1526, %r1484, %r1475; - xor.b32 %r1527, %r1498, %r1526; - shf.l.wrap.b32 %r1528, %r1527, %r1527, 16; - add.s32 %r1529, %r1528, %r1460; - xor.b32 %r1530, %r1529, %r1475; - shf.l.wrap.b32 %r1531, %r1530, %r1530, 20; - add.s32 %r1532, %r1526, %r1531; - xor.b32 %r1533, %r1532, %r1528; - shf.l.wrap.b32 %r1534, %r1533, %r1533, 24; - add.s32 %r1535, %r1534, %r1529; - xor.b32 %r1536, %r1535, %r1531; - shf.l.wrap.b32 %r1537, %r1536, %r1536, 25; - add.s32 %r1538, %r1496, %r1489; - xor.b32 %r1539, %r1459, %r1538; - shf.l.wrap.b32 %r1540, %r1539, %r1539, 16; - add.s32 %r1541, %r1540, %r1473; - xor.b32 %r1542, %r1541, %r1489; - shf.l.wrap.b32 %r1543, %r1542, %r1542, 20; - add.s32 %r1544, %r1538, %r980; - add.s32 %r1545, %r1544, %r1543; - xor.b32 %r1546, %r1545, %r1540; - shf.l.wrap.b32 %r1547, %r1546, %r1546, 24; - add.s32 %r1548, %r1547, %r1541; - xor.b32 %r1549, %r1548, %r1543; - shf.l.wrap.b32 %r1550, %r1549, %r1549, 25; - add.s32 %r1551, %r1508, %r1525; - xor.b32 %r1552, %r1551, %r1547; + shf.l.wrap.b32 %r1462, %r1461, %r1461, 24; + add.s32 %r1463, %r1462, %r1456; + xor.b32 %r1464, %r1463, %r1458; + shf.l.wrap.b32 %r1465, %r1464, %r1464, 25; + add.s32 %r1466, %r1418, %r1116; + add.s32 %r1467, %r1466, %r1437; + xor.b32 %r1468, %r1467, %r1462; + shf.l.wrap.b32 %r1469, %r1468, %r1468, 16; + add.s32 %r1470, %r1469, %r1449; + xor.b32 %r1471, %r1470, %r1437; + shf.l.wrap.b32 %r1472, %r1471, %r1471, 20; + add.s32 %r1473, %r1467, %r1109; + add.s32 %r1474, %r1473, %r1472; + xor.b32 %r1475, %r1474, %r1469; + shf.l.wrap.b32 %r1476, %r1475, %r1475, 24; + add.s32 %r1477, %r1476, %r1470; + xor.b32 %r1478, %r1477, %r1472; + shf.l.wrap.b32 %r1479, %r1478, %r1478, 25; + add.s32 %r1480, %r1432, %r1137; + add.s32 %r1481, %r1480, %r1451; + xor.b32 %r1482, %r1481, %r1420; + shf.l.wrap.b32 %r1483, %r1482, %r1482, 16; + add.s32 %r1484, %r1483, %r1463; + xor.b32 %r1485, %r1484, %r1451; + shf.l.wrap.b32 %r1486, %r1485, %r1485, 20; + add.s32 %r1487, %r1481, %r1074; + add.s32 %r1488, %r1487, %r1486; + xor.b32 %r1489, %r1488, %r1483; + shf.l.wrap.b32 %r1490, %r1489, %r1489, 24; + add.s32 %r1491, %r1490, %r1484; + xor.b32 %r1492, %r1491, %r1486; + shf.l.wrap.b32 %r1493, %r1492, %r1492, 25; + add.s32 %r1494, %r1446, %r1151; + add.s32 %r1495, %r1494, %r1465; + xor.b32 %r1496, %r1495, %r1434; + shf.l.wrap.b32 %r1497, %r1496, %r1496, 16; + add.s32 %r1498, %r1497, %r1421; + xor.b32 %r1499, %r1498, %r1465; + shf.l.wrap.b32 %r1500, %r1499, %r1499, 20; + add.s32 %r1501, %r1495, %r1179; + add.s32 %r1502, %r1501, %r1500; + xor.b32 %r1503, %r1502, %r1497; + shf.l.wrap.b32 %r1504, %r1503, %r1503, 24; + add.s32 %r1505, %r1504, %r1498; + xor.b32 %r1506, %r1505, %r1500; + shf.l.wrap.b32 %r1507, %r1506, %r1506, 25; + add.s32 %r1508, %r1460, %r1130; + add.s32 %r1509, %r1508, %r1423; + xor.b32 %r1510, %r1509, %r1448; + shf.l.wrap.b32 %r1511, %r1510, %r1510, 16; + add.s32 %r1512, %r1511, %r1435; + xor.b32 %r1513, %r1512, %r1423; + shf.l.wrap.b32 %r1514, %r1513, %r1513, 20; + add.s32 %r1515, %r1509, %r1081; + add.s32 %r1516, %r1515, %r1514; + xor.b32 %r1517, %r1516, %r1511; + 
shf.l.wrap.b32 %r1518, %r1517, %r1517, 24; + add.s32 %r1519, %r1518, %r1512; + xor.b32 %r1520, %r1519, %r1514; + shf.l.wrap.b32 %r1521, %r1520, %r1520, 25; + add.s32 %r1522, %r1474, %r1144; + add.s32 %r1523, %r1522, %r1521; + xor.b32 %r1524, %r1523, %r1490; + shf.l.wrap.b32 %r1525, %r1524, %r1524, 16; + add.s32 %r1526, %r1525, %r1505; + xor.b32 %r1527, %r1526, %r1521; + shf.l.wrap.b32 %r1528, %r1527, %r1527, 20; + add.s32 %r1529, %r1523, %r1123; + add.s32 %r1530, %r1529, %r1528; + xor.b32 %r1531, %r1530, %r1525; + shf.l.wrap.b32 %r1532, %r1531, %r1531, 24; + add.s32 %r1533, %r1532, %r1526; + xor.b32 %r1534, %r1533, %r1528; + shf.l.wrap.b32 %r1535, %r1534, %r1534, 25; + add.s32 %r1536, %r1488, %r1158; + add.s32 %r1537, %r1536, %r1479; + xor.b32 %r1538, %r1537, %r1504; + shf.l.wrap.b32 %r1539, %r1538, %r1538, 16; + add.s32 %r1540, %r1539, %r1519; + xor.b32 %r1541, %r1540, %r1479; + shf.l.wrap.b32 %r1542, %r1541, %r1541, 20; + add.s32 %r1543, %r1537, %r1137; + add.s32 %r1544, %r1543, %r1542; + xor.b32 %r1545, %r1544, %r1539; + shf.l.wrap.b32 %r1546, %r1545, %r1545, 24; + add.s32 %r1547, %r1546, %r1540; + xor.b32 %r1548, %r1547, %r1542; + shf.l.wrap.b32 %r1549, %r1548, %r1548, 25; + add.s32 %r1550, %r1502, %r1172; + add.s32 %r1551, %r1550, %r1493; + xor.b32 %r1552, %r1551, %r1518; shf.l.wrap.b32 %r1553, %r1552, %r1552, 16; - add.s32 %r1554, %r1553, %r1535; - xor.b32 %r1555, %r1554, %r1525; + add.s32 %r1554, %r1553, %r1477; + xor.b32 %r1555, %r1554, %r1493; shf.l.wrap.b32 %r1556, %r1555, %r1555, 20; - add.s32 %r1557, %r1551, %r996; + add.s32 %r1557, %r1551, %r1095; add.s32 %r1558, %r1557, %r1556; xor.b32 %r1559, %r1558, %r1553; shf.l.wrap.b32 %r1560, %r1559, %r1559, 24; add.s32 %r1561, %r1560, %r1554; xor.b32 %r1562, %r1561, %r1556; shf.l.wrap.b32 %r1563, %r1562, %r1562, 25; - add.s32 %r1564, %r1537, %r972; - add.s32 %r1565, %r1564, %r1520; - xor.b32 %r1566, %r1510, %r1565; + add.s32 %r1564, %r1516, %r1165; + add.s32 %r1565, %r1564, %r1507; + xor.b32 %r1566, %r1565, %r1476; shf.l.wrap.b32 %r1567, %r1566, %r1566, 16; - add.s32 %r1568, %r1567, %r1548; - xor.b32 %r1569, %r1568, %r1537; + add.s32 %r1568, %r1567, %r1491; + xor.b32 %r1569, %r1568, %r1507; shf.l.wrap.b32 %r1570, %r1569, %r1569, 20; - add.s32 %r1571, %r1565, %r1570; - xor.b32 %r1572, %r1571, %r1567; - shf.l.wrap.b32 %r1573, %r1572, %r1572, 24; - add.s32 %r1574, %r1573, %r1568; - xor.b32 %r1575, %r1574, %r1570; - shf.l.wrap.b32 %r1576, %r1575, %r1575, 25; - add.s32 %r1577, %r1532, %r988; - add.s32 %r1578, %r1577, %r1550; - xor.b32 %r1579, %r1522, %r1578; - shf.l.wrap.b32 %r1580, %r1579, %r1579, 16; - add.s32 %r1581, %r1580, %r1511; - xor.b32 %r1582, %r1581, %r1550; - shf.l.wrap.b32 %r1583, %r1582, %r1582, 20; - add.s32 %r1584, %r1578, %r1583; - xor.b32 %r1585, %r1584, %r1580; - shf.l.wrap.b32 %r1586, %r1585, %r1585, 24; - add.s32 %r1587, %r1586, %r1581; - xor.b32 %r1588, %r1587, %r1583; - shf.l.wrap.b32 %r1589, %r1588, %r1588, 25; - add.s32 %r1590, %r1545, %r1513; - xor.b32 %r1591, %r1590, %r1534; - shf.l.wrap.b32 %r1592, %r1591, %r1591, 16; - add.s32 %r1593, %r1592, %r1523; - xor.b32 %r1594, %r1593, %r1513; - shf.l.wrap.b32 %r1595, %r1594, %r1594, 20; - add.s32 %r1596, %r1590, %r1595; - xor.b32 %r1597, %r1596, %r1592; - shf.l.wrap.b32 %r1598, %r1597, %r1597, 24; - add.s32 %r1599, %r1598, %r1593; - xor.b32 %r1600, %r1599, %r1595; - shf.l.wrap.b32 %r1601, %r1600, %r1600, 25; - add.s32 %r1602, %r1558, %r1601; - xor.b32 %r1603, %r1602, %r1573; - shf.l.wrap.b32 %r1604, %r1603, %r1603, 16; - add.s32 %r1605, %r1604, %r1587; - xor.b32 %r1606, 
%r1605, %r1601; - shf.l.wrap.b32 %r1607, %r1606, %r1606, 20; - add.s32 %r1608, %r1602, %r1607; - xor.b32 %r1609, %r1608, %r1604; - shf.l.wrap.b32 %r1610, %r1609, %r1609, 24; - add.s32 %r1611, %r1610, %r1605; - xor.b32 %r1612, %r1611, %r1607; - shf.l.wrap.b32 %r1613, %r1612, %r1612, 25; - add.s32 %r1614, %r1571, %r1563; - xor.b32 %r1615, %r1614, %r1586; - shf.l.wrap.b32 %r1616, %r1615, %r1615, 16; - add.s32 %r1617, %r1616, %r1599; - xor.b32 %r1618, %r1617, %r1563; - shf.l.wrap.b32 %r1619, %r1618, %r1618, 20; - add.s32 %r1620, %r1614, %r972; - add.s32 %r1621, %r1620, %r1619; - xor.b32 %r1622, %r1621, %r1616; - shf.l.wrap.b32 %r1623, %r1622, %r1622, 24; - add.s32 %r1624, %r1623, %r1617; - xor.b32 %r1625, %r1624, %r1619; - shf.l.wrap.b32 %r1626, %r1625, %r1625, 25; - add.s32 %r1627, %r1584, %r980; - add.s32 %r1628, %r1627, %r1576; - xor.b32 %r1629, %r1598, %r1628; - shf.l.wrap.b32 %r1630, %r1629, %r1629, 16; - add.s32 %r1631, %r1630, %r1561; - xor.b32 %r1632, %r1631, %r1576; - shf.l.wrap.b32 %r1633, %r1632, %r1632, 20; - add.s32 %r1634, %r1628, %r1633; - xor.b32 %r1635, %r1634, %r1630; - shf.l.wrap.b32 %r1636, %r1635, %r1635, 24; - add.s32 %r1637, %r1636, %r1631; - xor.b32 %r1638, %r1637, %r1633; - shf.l.wrap.b32 %r1639, %r1638, %r1638, 25; - add.s32 %r1640, %r1596, %r1589; - xor.b32 %r1641, %r1560, %r1640; - shf.l.wrap.b32 %r1642, %r1641, %r1641, 16; - add.s32 %r1643, %r1642, %r1574; - xor.b32 %r1644, %r1643, %r1589; - shf.l.wrap.b32 %r1645, %r1644, %r1644, 20; - add.s32 %r1646, %r1640, %r1645; - xor.b32 %r1647, %r1646, %r1642; - shf.l.wrap.b32 %r1648, %r1647, %r1647, 24; - add.s32 %r1649, %r1648, %r1643; - xor.b32 %r1650, %r1649, %r1645; - shf.l.wrap.b32 %r1651, %r1650, %r1650, 25; - add.s32 %r1652, %r1608, %r1626; - xor.b32 %r1653, %r1652, %r1648; - shf.l.wrap.b32 %r1654, %r1653, %r1653, 16; - add.s32 %r1655, %r1654, %r1637; - xor.b32 %r1656, %r1655, %r1626; - shf.l.wrap.b32 %r1657, %r1656, %r1656, 20; - add.s32 %r1658, %r1652, %r1657; - xor.b32 %r1659, %r1658, %r1654; - shf.l.wrap.b32 %r1660, %r1659, %r1659, 24; - add.s32 %r1661, %r1660, %r1655; - xor.b32 %r1662, %r1661, %r1657; - shf.l.wrap.b32 %r1663, %r1662, %r1662, 25; - add.s32 %r1664, %r1639, %r988; - add.s32 %r1665, %r1664, %r1621; - xor.b32 %r1666, %r1610, %r1665; - shf.l.wrap.b32 %r1667, %r1666, %r1666, 16; - add.s32 %r1668, %r1667, %r1649; - xor.b32 %r1669, %r1668, %r1639; - shf.l.wrap.b32 %r1670, %r1669, %r1669, 20; - add.s32 %r1671, %r1665, %r1670; - xor.b32 %r1672, %r1671, %r1667; - shf.l.wrap.b32 %r1673, %r1672, %r1672, 24; - add.s32 %r1674, %r1673, %r1668; - xor.b32 %r1675, %r1674, %r1670; - shf.l.wrap.b32 %r1676, %r1675, %r1675, 25; - add.s32 %r1677, %r1634, %r996; - add.s32 %r1678, %r1677, %r1651; - xor.b32 %r1679, %r1623, %r1678; - shf.l.wrap.b32 %r1680, %r1679, %r1679, 16; - add.s32 %r1681, %r1680, %r1611; - xor.b32 %r1682, %r1681, %r1651; - shf.l.wrap.b32 %r1683, %r1682, %r1682, 20; - add.s32 %r1684, %r1678, %r1683; - xor.b32 %r1685, %r1684, %r1680; + add.s32 %r1571, %r1565, %r1179; + add.s32 %r1572, %r1571, %r1570; + xor.b32 %r1573, %r1572, %r1567; + shf.l.wrap.b32 %r1574, %r1573, %r1573, 24; + add.s32 %r1575, %r1574, %r1568; + xor.b32 %r1576, %r1575, %r1570; + shf.l.wrap.b32 %r1577, %r1576, %r1576, 25; + add.s32 %r1578, %r1530, %r1102; + add.s32 %r1579, %r1578, %r1549; + xor.b32 %r1580, %r1579, %r1574; + shf.l.wrap.b32 %r1581, %r1580, %r1580, 16; + add.s32 %r1582, %r1581, %r1561; + xor.b32 %r1583, %r1582, %r1549; + shf.l.wrap.b32 %r1584, %r1583, %r1583, 20; + add.s32 %r1585, %r1579, %r1074; + add.s32 %r1586, %r1585, 
%r1584; + xor.b32 %r1587, %r1586, %r1581; + shf.l.wrap.b32 %r1588, %r1587, %r1587, 24; + add.s32 %r1589, %r1588, %r1582; + xor.b32 %r1590, %r1589, %r1584; + shf.l.wrap.b32 %r1591, %r1590, %r1590, 25; + add.s32 %r1592, %r1544, %r1151; + add.s32 %r1593, %r1592, %r1563; + xor.b32 %r1594, %r1593, %r1532; + shf.l.wrap.b32 %r1595, %r1594, %r1594, 16; + add.s32 %r1596, %r1595, %r1575; + xor.b32 %r1597, %r1596, %r1563; + shf.l.wrap.b32 %r1598, %r1597, %r1597, 20; + add.s32 %r1599, %r1593, %r1088; + add.s32 %r1600, %r1599, %r1598; + xor.b32 %r1601, %r1600, %r1595; + shf.l.wrap.b32 %r1602, %r1601, %r1601, 24; + add.s32 %r1603, %r1602, %r1596; + xor.b32 %r1604, %r1603, %r1598; + shf.l.wrap.b32 %r1605, %r1604, %r1604, 25; + add.s32 %r1606, %r1558, %r1109; + add.s32 %r1607, %r1606, %r1577; + xor.b32 %r1608, %r1607, %r1546; + shf.l.wrap.b32 %r1609, %r1608, %r1608, 16; + add.s32 %r1610, %r1609, %r1533; + xor.b32 %r1611, %r1610, %r1577; + shf.l.wrap.b32 %r1612, %r1611, %r1611, 20; + add.s32 %r1613, %r1607, %r1130; + add.s32 %r1614, %r1613, %r1612; + xor.b32 %r1615, %r1614, %r1609; + shf.l.wrap.b32 %r1616, %r1615, %r1615, 24; + add.s32 %r1617, %r1616, %r1610; + xor.b32 %r1618, %r1617, %r1612; + shf.l.wrap.b32 %r1619, %r1618, %r1618, 25; + add.s32 %r1620, %r1572, %r1081; + add.s32 %r1621, %r1620, %r1535; + xor.b32 %r1622, %r1621, %r1560; + shf.l.wrap.b32 %r1623, %r1622, %r1622, 16; + add.s32 %r1624, %r1623, %r1547; + xor.b32 %r1625, %r1624, %r1535; + shf.l.wrap.b32 %r1626, %r1625, %r1625, 20; + add.s32 %r1627, %r1621, %r1116; + add.s32 %r1628, %r1627, %r1626; + xor.b32 %r1629, %r1628, %r1623; + shf.l.wrap.b32 %r1630, %r1629, %r1629, 24; + add.s32 %r1631, %r1630, %r1624; + xor.b32 %r1632, %r1631, %r1626; + shf.l.wrap.b32 %r1633, %r1632, %r1632, 25; + add.s32 %r1634, %r1586, %r1158; + add.s32 %r1635, %r1634, %r1633; + xor.b32 %r1636, %r1635, %r1602; + shf.l.wrap.b32 %r1637, %r1636, %r1636, 16; + add.s32 %r1638, %r1637, %r1617; + xor.b32 %r1639, %r1638, %r1633; + shf.l.wrap.b32 %r1640, %r1639, %r1639, 20; + add.s32 %r1641, %r1635, %r1165; + add.s32 %r1642, %r1641, %r1640; + xor.b32 %r1643, %r1642, %r1637; + shf.l.wrap.b32 %r1644, %r1643, %r1643, 24; + add.s32 %r1645, %r1644, %r1638; + xor.b32 %r1646, %r1645, %r1640; + shf.l.wrap.b32 %r1647, %r1646, %r1646, 25; + add.s32 %r1648, %r1600, %r1137; + add.s32 %r1649, %r1648, %r1591; + xor.b32 %r1650, %r1649, %r1616; + shf.l.wrap.b32 %r1651, %r1650, %r1650, 16; + add.s32 %r1652, %r1651, %r1631; + xor.b32 %r1653, %r1652, %r1591; + shf.l.wrap.b32 %r1654, %r1653, %r1653, 20; + add.s32 %r1655, %r1649, %r1151; + add.s32 %r1656, %r1655, %r1654; + xor.b32 %r1657, %r1656, %r1651; + shf.l.wrap.b32 %r1658, %r1657, %r1657, 24; + add.s32 %r1659, %r1658, %r1652; + xor.b32 %r1660, %r1659, %r1654; + shf.l.wrap.b32 %r1661, %r1660, %r1660, 25; + add.s32 %r1662, %r1614, %r1179; + add.s32 %r1663, %r1662, %r1605; + xor.b32 %r1664, %r1663, %r1630; + shf.l.wrap.b32 %r1665, %r1664, %r1664, 16; + add.s32 %r1666, %r1665, %r1589; + xor.b32 %r1667, %r1666, %r1605; + shf.l.wrap.b32 %r1668, %r1667, %r1667, 20; + add.s32 %r1669, %r1663, %r1144; + add.s32 %r1670, %r1669, %r1668; + xor.b32 %r1671, %r1670, %r1665; + shf.l.wrap.b32 %r1672, %r1671, %r1671, 24; + add.s32 %r1673, %r1672, %r1666; + xor.b32 %r1674, %r1673, %r1668; + shf.l.wrap.b32 %r1675, %r1674, %r1674, 25; + add.s32 %r1676, %r1628, %r1172; + add.s32 %r1677, %r1676, %r1619; + xor.b32 %r1678, %r1677, %r1588; + shf.l.wrap.b32 %r1679, %r1678, %r1678, 16; + add.s32 %r1680, %r1679, %r1603; + xor.b32 %r1681, %r1680, %r1619; + shf.l.wrap.b32 
%r1682, %r1681, %r1681, 20; + add.s32 %r1683, %r1677, %r1130; + add.s32 %r1684, %r1683, %r1682; + xor.b32 %r1685, %r1684, %r1679; shf.l.wrap.b32 %r1686, %r1685, %r1685, 24; - add.s32 %r1687, %r1686, %r1681; - xor.b32 %r1688, %r1687, %r1683; + add.s32 %r1687, %r1686, %r1680; + xor.b32 %r1688, %r1687, %r1682; shf.l.wrap.b32 %r1689, %r1688, %r1688, 25; - add.s32 %r1690, %r1646, %r1613; - xor.b32 %r1691, %r1690, %r1636; - shf.l.wrap.b32 %r1692, %r1691, %r1691, 16; - add.s32 %r1693, %r1692, %r1624; - xor.b32 %r1694, %r1693, %r1613; - shf.l.wrap.b32 %r1695, %r1694, %r1694, 20; - add.s32 %r1696, %r1690, %r1695; - xor.b32 %r1697, %r1696, %r1692; - shf.l.wrap.b32 %r1698, %r1697, %r1697, 24; - add.s32 %r1699, %r1698, %r1693; - xor.b32 %r1700, %r1699, %r1695; - shf.l.wrap.b32 %r1701, %r1700, %r1700, 25; - xor.b32 %r9, %r1658, %r1687; - cvt.u64.u32 %rd132, %r9; - xor.b32 %r1702, %r1699, %r1671; - and.b32 %r1703, %r1702, 255; - cvt.u64.u32 %rd133, %r1703; - bfi.b64 %rd134, %rd133, %rd132, 32, 32; - cvt.u64.u32 %rd135, %r1702; - shl.b64 %rd136, %rd135, 32; - and.b64 %rd137, %rd136, 280375465082880; - or.b64 %rd138, %rd134, %rd137; - and.b64 %rd139, %rd136, 71776119061217280; - shr.u32 %r10, %r1702, 24; - cvt.u64.u32 %rd140, %r10; - shl.b64 %rd141, %rd140, 56; - or.b64 %rd142, %rd138, %rd139; - or.b64 %rd143, %rd142, %rd141; - xor.b32 %r11, %r1661, %r1684; - cvt.u64.u32 %rd144, %r11; - xor.b32 %r1704, %r1696, %r1674; - and.b32 %r1705, %r1704, 255; - cvt.u64.u32 %rd145, %r1705; - bfi.b64 %rd146, %rd145, %rd144, 32, 32; - cvt.u64.u32 %rd147, %r1704; - shl.b64 %rd148, %rd147, 32; - and.b64 %rd149, %rd148, 280375465082880; - or.b64 %rd150, %rd146, %rd149; - and.b64 %rd151, %rd148, 71776119061217280; - shr.u32 %r12, %r1704, 24; - cvt.u64.u32 %rd152, %r12; - shl.b64 %rd153, %rd152, 56; - or.b64 %rd154, %rd150, %rd151; - or.b64 %rd155, %rd154, %rd153; - xor.b32 %r13, %r1701, %r1673; - cvt.u64.u32 %rd156, %r13; - xor.b32 %r1706, %r1663, %r1686; - and.b32 %r1707, %r1706, 255; - cvt.u64.u32 %rd157, %r1707; - bfi.b64 %rd158, %rd157, %rd156, 32, 32; - cvt.u64.u32 %rd159, %r1706; - shl.b64 %rd160, %rd159, 32; - and.b64 %rd161, %rd160, 280375465082880; - or.b64 %rd162, %rd158, %rd161; - and.b64 %rd163, %rd160, 71776119061217280; - shr.u32 %r14, %r1706, 24; - cvt.u64.u32 %rd164, %r14; - shl.b64 %rd165, %rd164, 56; - or.b64 %rd166, %rd162, %rd163; - or.b64 %rd167, %rd166, %rd165; - xor.b32 %r1708, %r1698, %r1676; - cvt.u64.u32 %rd168, %r1708; - xor.b32 %r1709, %r1660, %r1689; - and.b32 %r1710, %r1709, 255; - cvt.u64.u32 %rd169, %r1710; - bfi.b64 %rd170, %rd169, %rd168, 32, 32; - cvt.u64.u32 %rd171, %r1709; - shl.b64 %rd172, %rd171, 32; - and.b64 %rd173, %rd172, 280375465082880; - or.b64 %rd174, %rd170, %rd173; - and.b64 %rd175, %rd172, 71776119061217280; - shr.u32 %r1711, %r1709, 24; - cvt.u64.u32 %rd176, %r1711; - shl.b64 %rd177, %rd176, 56; - or.b64 %rd178, %rd174, %rd175; - or.b64 %rd9, %rd178, %rd177; - shr.u64 %rd10, %rd143, 32; - shr.u64 %rd11, %rd143, 40; - shr.u64 %rd12, %rd143, 48; - shr.u64 %rd13, %rd155, 32; - shr.u64 %rd14, %rd155, 40; - shr.u64 %rd15, %rd155, 48; - shr.u64 %rd16, %rd167, 32; - shr.u64 %rd17, %rd167, 40; - shr.u64 %rd18, %rd167, 48; - shr.u32 %r5809, %r9, 12; - shr.u32 %r5810, %r9, 8; - shr.u32 %r5811, %r9, 4; - and.b32 %r5812, %r5811, 15; - and.b32 %r5813, %r9, 15; - bfi.b32 %r5814, %r5813, %r5812, 8, 4; - shl.b32 %r5815, %r9, 4; - and.b32 %r5816, %r5815, 983040; - or.b32 %r5817, %r5814, %r5816; - shl.b32 %r5818, %r9, 16; - and.b32 %r5819, %r5818, 251658240; - or.b32 %r5746, %r5817, %r5819; 
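// Note: the removed ("-") instructions around this point unpack the 256-bit
// digest into 4-bit nibbles (and.b32 ..., 15 / bfi.b32 ..., 8, 4 / prmt.b32
// byte selects), presumably to feed the heavy_hash matrix-vector product over
// the const `matrix` buffer loaded just below (ld.const.u32 ..., [matrix]);
// the purpose is inferred from the surrounding code, not stated in the diff.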
- shr.u32 %r5820, %r9, 20; - and.b32 %r5821, %r5820, 15; - shr.u32 %r5822, %r9, 16; - and.b32 %r5823, %r5822, 15; - shr.u32 %r5824, %r9, 24; - bfi.b32 %r5825, %r5823, %r5821, 8, 4; - and.b32 %r5826, %r5809, 983040; - or.b32 %r5827, %r5825, %r5826; - and.b32 %r5828, %r9, 251658240; - or.b32 %r5750, %r5827, %r5828; - cvt.u16.u64 %rs83, %rd10; - and.b16 %rs84, %rs83, 240; - shr.u16 %rs85, %rs84, 4; - cvt.u16.u64 %rs86, %rd11; - and.b16 %rs87, %rs86, 240; - shr.u16 %rs88, %rs87, 4; - cvt.u32.u16 %r5829, %rs85; - cvt.u32.u64 %r5830, %rd10; - and.b32 %r5831, %r5830, 15; - prmt.b32 %r5832, %r5831, %r5829, 30212; - cvt.u32.u16 %r5833, %rs88; - prmt.b32 %r5834, %r5833, %r5832, 28756; - cvt.u32.u64 %r5835, %rd11; - shl.b32 %r5836, %r5835, 24; - and.b32 %r5837, %r5836, 251658240; - or.b32 %r5754, %r5834, %r5837; - cvt.u16.u64 %rs89, %rd12; - and.b16 %rs90, %rs89, 240; - shr.u16 %rs91, %rs90, 4; - cvt.u32.u16 %r5838, %rs91; - cvt.u32.u64 %r5839, %rd12; - and.b32 %r5840, %r5839, 15; - prmt.b32 %r5841, %r5840, %r5838, 30212; - shl.b32 %r5842, %r10, 12; - and.b32 %r5843, %r5842, 983040; - or.b32 %r5844, %r5841, %r5843; - shl.b32 %r5845, %r10, 24; - and.b32 %r5846, %r5845, 251658240; - or.b32 %r5758, %r5844, %r5846; - shr.u32 %r5847, %r11, 12; - shr.u32 %r5848, %r11, 8; - shr.u32 %r5849, %r11, 4; - and.b32 %r5850, %r5849, 15; - and.b32 %r5851, %r11, 15; - bfi.b32 %r5852, %r5851, %r5850, 8, 4; - shl.b32 %r5853, %r11, 4; - and.b32 %r5854, %r5853, 983040; - or.b32 %r5855, %r5852, %r5854; - shl.b32 %r5856, %r11, 16; - and.b32 %r5857, %r5856, 251658240; - or.b32 %r5762, %r5855, %r5857; - shr.u32 %r5858, %r11, 20; - and.b32 %r5859, %r5858, 15; - shr.u32 %r5860, %r11, 16; - and.b32 %r5861, %r5860, 15; - shr.u32 %r5862, %r11, 24; - bfi.b32 %r5863, %r5861, %r5859, 8, 4; - and.b32 %r5864, %r5847, 983040; - or.b32 %r5865, %r5863, %r5864; - and.b32 %r5866, %r11, 251658240; - or.b32 %r5766, %r5865, %r5866; - cvt.u16.u64 %rs92, %rd13; - and.b16 %rs93, %rs92, 240; - shr.u16 %rs94, %rs93, 4; - cvt.u16.u64 %rs95, %rd14; - and.b16 %rs96, %rs95, 240; - shr.u16 %rs97, %rs96, 4; - cvt.u32.u16 %r5867, %rs94; - cvt.u32.u64 %r5868, %rd13; - and.b32 %r5869, %r5868, 15; - prmt.b32 %r5870, %r5869, %r5867, 30212; - cvt.u32.u16 %r5871, %rs97; - prmt.b32 %r5872, %r5871, %r5870, 28756; - cvt.u32.u64 %r5873, %rd14; - shl.b32 %r5874, %r5873, 24; - and.b32 %r5875, %r5874, 251658240; - or.b32 %r5770, %r5872, %r5875; - cvt.u16.u64 %rs98, %rd15; - and.b16 %rs99, %rs98, 240; - shr.u16 %rs100, %rs99, 4; - cvt.u32.u16 %r5876, %rs100; - cvt.u32.u64 %r5877, %rd15; - and.b32 %r5878, %r5877, 15; - prmt.b32 %r5879, %r5878, %r5876, 30212; - shl.b32 %r5880, %r12, 12; - and.b32 %r5881, %r5880, 983040; - or.b32 %r5882, %r5879, %r5881; - shl.b32 %r5883, %r12, 24; - and.b32 %r5884, %r5883, 251658240; - or.b32 %r5774, %r5882, %r5884; - shr.u32 %r5885, %r13, 12; - shr.u32 %r5886, %r13, 8; - shr.u32 %r5887, %r13, 4; - and.b32 %r5888, %r5887, 15; - and.b32 %r5889, %r13, 15; - bfi.b32 %r5890, %r5889, %r5888, 8, 4; - shl.b32 %r5891, %r13, 4; - and.b32 %r5892, %r5891, 983040; - or.b32 %r5893, %r5890, %r5892; - shl.b32 %r5894, %r13, 16; - and.b32 %r5895, %r5894, 251658240; - or.b32 %r5778, %r5893, %r5895; - shr.u32 %r5896, %r13, 20; - and.b32 %r5897, %r5896, 15; - shr.u32 %r5898, %r13, 16; - and.b32 %r5899, %r5898, 15; - shr.u32 %r5900, %r13, 24; - bfi.b32 %r5901, %r5899, %r5897, 8, 4; - and.b32 %r5902, %r5885, 983040; - or.b32 %r5903, %r5901, %r5902; - and.b32 %r5904, %r13, 251658240; - or.b32 %r5782, %r5903, %r5904; - cvt.u16.u64 %rs101, %rd16; - and.b16 
%rs102, %rs101, 240; - shr.u16 %rs103, %rs102, 4; - cvt.u16.u64 %rs104, %rd17; - and.b16 %rs105, %rs104, 240; - shr.u16 %rs106, %rs105, 4; - cvt.u32.u16 %r5905, %rs103; - cvt.u32.u64 %r5906, %rd16; - and.b32 %r5907, %r5906, 15; - prmt.b32 %r5908, %r5907, %r5905, 30212; - cvt.u32.u16 %r5909, %rs106; - prmt.b32 %r5910, %r5909, %r5908, 28756; - cvt.u32.u64 %r5911, %rd17; - shl.b32 %r5912, %r5911, 24; - and.b32 %r5913, %r5912, 251658240; - or.b32 %r5786, %r5910, %r5913; - cvt.u16.u64 %rs107, %rd18; - and.b16 %rs108, %rs107, 240; - shr.u16 %rs109, %rs108, 4; - cvt.u32.u16 %r5914, %rs109; - cvt.u32.u64 %r5915, %rd18; - and.b32 %r5916, %r5915, 15; - prmt.b32 %r5917, %r5916, %r5914, 30212; - shl.b32 %r5918, %r14, 12; - and.b32 %r5919, %r5918, 983040; - or.b32 %r5920, %r5917, %r5919; - shl.b32 %r5921, %r14, 24; - and.b32 %r5922, %r5921, 251658240; - or.b32 %r5790, %r5920, %r5922; - cvt.u16.u64 %rs110, %rd9; - and.b16 %rs111, %rs110, 240; - shr.u16 %rs112, %rs111, 4; - shr.u64 %rd201, %rd9, 8; - cvt.u32.u64 %r5923, %rd201; - cvt.u32.u64 %r5924, %rd9; - shr.u32 %r5925, %r5924, 12; - cvt.u32.u16 %r5926, %rs112; - and.b32 %r5927, %r5924, 15; - prmt.b32 %r5928, %r5927, %r5926, 30212; - shl.b32 %r5929, %r5924, 4; - and.b32 %r5930, %r5929, 983040; - or.b32 %r5931, %r5928, %r5930; - shl.b32 %r5932, %r5923, 24; - and.b32 %r5933, %r5932, 251658240; - or.b32 %r5794, %r5931, %r5933; - shr.u64 %rd202, %rd9, 16; - cvt.u32.u64 %r5934, %rd202; - shr.u32 %r5935, %r5924, 20; - and.b32 %r5936, %r5935, 15; - and.b32 %r5937, %r5934, 15; - shr.u64 %rd203, %rd9, 24; - cvt.u32.u64 %r5938, %rd203; - bfi.b32 %r5939, %r5937, %r5936, 8, 4; - and.b32 %r5940, %r5925, 983040; - or.b32 %r5941, %r5939, %r5940; - shl.b32 %r5942, %r5938, 24; - and.b32 %r5943, %r5942, 251658240; - or.b32 %r5798, %r5941, %r5943; - shr.u64 %rd204, %rd9, 32; - cvt.u32.u64 %r5944, %rd204; - shr.u64 %rd205, %rd9, 36; - cvt.u32.u64 %r5945, %rd205; - and.b32 %r5946, %r5945, 15; - and.b32 %r5947, %r5944, 15; - shr.u64 %rd206, %rd9, 40; - cvt.u32.u64 %r5948, %rd206; - shr.u64 %rd207, %rd9, 44; - cvt.u32.u64 %r5949, %rd207; - bfi.b32 %r5950, %r5947, %r5946, 8, 4; - shl.b32 %r5951, %r5949, 16; - and.b32 %r5952, %r5951, 983040; - or.b32 %r5953, %r5950, %r5952; - shl.b32 %r5954, %r5948, 24; - and.b32 %r5955, %r5954, 251658240; - or.b32 %r5802, %r5953, %r5955; - shr.u64 %rd208, %rd9, 48; - cvt.u32.u64 %r5956, %rd208; - shr.u64 %rd209, %rd9, 52; - cvt.u32.u64 %r5957, %rd209; - and.b32 %r5958, %r5957, 15; - and.b32 %r5959, %r5956, 15; - shr.u64 %rd210, %rd9, 56; - cvt.u32.u64 %r5960, %rd210; - bfi.b32 %r5961, %r5959, %r5958, 8, 4; - and.b32 %r5962, %r5949, 983040; - or.b32 %r5963, %r5961, %r5962; - shl.b32 %r5964, %r5960, 24; - and.b32 %r5965, %r5964, 251658240; - or.b32 %r5806, %r5963, %r5965; - ld.const.u32 %r1713, [matrix]; - mov.u32 %r6244, 0; + add.s32 %r1690, %r1642, %r1123; + add.s32 %r1691, %r1690, %r1661; + xor.b32 %r1692, %r1691, %r1686; + shf.l.wrap.b32 %r1693, %r1692, %r1692, 16; + add.s32 %r1694, %r1693, %r1673; + xor.b32 %r1695, %r1694, %r1661; + shf.l.wrap.b32 %r1696, %r1695, %r1695, 20; + add.s32 %r1697, %r1691, %r1088; + add.s32 %r1698, %r1697, %r1696; + xor.b32 %r1699, %r1698, %r1693; + shf.l.wrap.b32 %r1700, %r1699, %r1699, 24; + add.s32 %r1701, %r1700, %r1694; + xor.b32 %r1702, %r1701, %r1696; + shf.l.wrap.b32 %r1703, %r1702, %r1702, 25; + add.s32 %r1704, %r1656, %r1109; + add.s32 %r1705, %r1704, %r1675; + xor.b32 %r1706, %r1705, %r1644; + shf.l.wrap.b32 %r1707, %r1706, %r1706, 16; + add.s32 %r1708, %r1707, %r1687; + xor.b32 %r1709, %r1708, 
%r1675; + shf.l.wrap.b32 %r1710, %r1709, %r1709, 20; + add.s32 %r1711, %r1705, %r1095; + add.s32 %r1712, %r1711, %r1710; + xor.b32 %r1713, %r1712, %r1707; + shf.l.wrap.b32 %r1714, %r1713, %r1713, 24; + add.s32 %r1715, %r1714, %r1708; + xor.b32 %r1716, %r1715, %r1710; + shf.l.wrap.b32 %r1717, %r1716, %r1716, 25; + add.s32 %r1718, %r1670, %r1074; + add.s32 %r1719, %r1718, %r1689; + xor.b32 %r1720, %r1719, %r1658; + shf.l.wrap.b32 %r1721, %r1720, %r1720, 16; + add.s32 %r1722, %r1721, %r1645; + xor.b32 %r1723, %r1722, %r1689; + shf.l.wrap.b32 %r1724, %r1723, %r1723, 20; + add.s32 %r1725, %r1719, %r1081; + add.s32 %r1726, %r1725, %r1724; + xor.b32 %r1727, %r1726, %r1721; + shf.l.wrap.b32 %r1728, %r1727, %r1727, 24; + add.s32 %r1729, %r1728, %r1722; + xor.b32 %r1730, %r1729, %r1724; + shf.l.wrap.b32 %r1731, %r1730, %r1730, 25; + add.s32 %r1732, %r1684, %r1116; + add.s32 %r1733, %r1732, %r1647; + xor.b32 %r1734, %r1733, %r1672; + shf.l.wrap.b32 %r1735, %r1734, %r1734, 16; + add.s32 %r1736, %r1735, %r1659; + xor.b32 %r1737, %r1736, %r1647; + shf.l.wrap.b32 %r1738, %r1737, %r1737, 20; + add.s32 %r1739, %r1733, %r1102; + add.s32 %r1740, %r1739, %r1738; + xor.b32 %r1741, %r1740, %r1735; + shf.l.wrap.b32 %r1742, %r1741, %r1741, 24; + add.s32 %r1743, %r1742, %r1736; + xor.b32 %r1744, %r1743, %r1738; + shf.l.wrap.b32 %r1745, %r1744, %r1744, 25; + add.s32 %r1746, %r1698, %r1137; + add.s32 %r1747, %r1746, %r1745; + xor.b32 %r1748, %r1747, %r1714; + shf.l.wrap.b32 %r1749, %r1748, %r1748, 16; + add.s32 %r1750, %r1749, %r1729; + xor.b32 %r1751, %r1750, %r1745; + shf.l.wrap.b32 %r1752, %r1751, %r1751, 20; + add.s32 %r1753, %r1747, %r1172; + add.s32 %r1754, %r1753, %r1752; + xor.b32 %r1755, %r1754, %r1749; + shf.l.wrap.b32 %r1756, %r1755, %r1755, 24; + add.s32 %r1757, %r1756, %r1750; + xor.b32 %r1758, %r1757, %r1752; + shf.l.wrap.b32 %r1759, %r1758, %r1758, 25; + add.s32 %r1760, %r1712, %r1151; + add.s32 %r1761, %r1760, %r1703; + xor.b32 %r1762, %r1761, %r1728; + shf.l.wrap.b32 %r1763, %r1762, %r1762, 16; + add.s32 %r1764, %r1763, %r1743; + xor.b32 %r1765, %r1764, %r1703; + shf.l.wrap.b32 %r1766, %r1765, %r1765, 20; + add.s32 %r1767, %r1761, %r1109; + add.s32 %r1768, %r1767, %r1766; + xor.b32 %r1769, %r1768, %r1763; + shf.l.wrap.b32 %r1770, %r1769, %r1769, 24; + add.s32 %r1771, %r1770, %r1764; + xor.b32 %r1772, %r1771, %r1766; + shf.l.wrap.b32 %r1773, %r1772, %r1772, 25; + add.s32 %r1774, %r1726, %r1130; + add.s32 %r1775, %r1774, %r1717; + xor.b32 %r1776, %r1775, %r1742; + shf.l.wrap.b32 %r1777, %r1776, %r1776, 16; + add.s32 %r1778, %r1777, %r1701; + xor.b32 %r1779, %r1778, %r1717; + shf.l.wrap.b32 %r1780, %r1779, %r1779, 20; + add.s32 %r1781, %r1775, %r1158; + add.s32 %r1782, %r1781, %r1780; + xor.b32 %r1783, %r1782, %r1777; + shf.l.wrap.b32 %r1784, %r1783, %r1783, 24; + add.s32 %r1785, %r1784, %r1778; + xor.b32 %r1786, %r1785, %r1780; + shf.l.wrap.b32 %r1787, %r1786, %r1786, 25; + add.s32 %r1788, %r1740, %r1179; + add.s32 %r1789, %r1788, %r1731; + xor.b32 %r1790, %r1789, %r1700; + shf.l.wrap.b32 %r1791, %r1790, %r1790, 16; + add.s32 %r1792, %r1791, %r1715; + xor.b32 %r1793, %r1792, %r1731; + shf.l.wrap.b32 %r1794, %r1793, %r1793, 20; + add.s32 %r1795, %r1789, %r1081; + add.s32 %r1796, %r1795, %r1794; + xor.b32 %r1797, %r1796, %r1791; + shf.l.wrap.b32 %r1798, %r1797, %r1797, 24; + add.s32 %r1799, %r1798, %r1792; + xor.b32 %r1800, %r1799, %r1794; + shf.l.wrap.b32 %r1801, %r1800, %r1800, 25; + add.s32 %r1802, %r1754, %r1165; + add.s32 %r1803, %r1802, %r1773; + xor.b32 %r1804, %r1803, %r1798; + shf.l.wrap.b32 
%r1805, %r1804, %r1804, 16; + add.s32 %r1806, %r1805, %r1785; + xor.b32 %r1807, %r1806, %r1773; + shf.l.wrap.b32 %r1808, %r1807, %r1807, 20; + add.s32 %r1809, %r1803, %r1095; + add.s32 %r1810, %r1809, %r1808; + xor.b32 %r1811, %r1810, %r1805; + shf.l.wrap.b32 %r1812, %r1811, %r1811, 24; + add.s32 %r1813, %r1812, %r1806; + xor.b32 %r1814, %r1813, %r1808; + shf.l.wrap.b32 %r1815, %r1814, %r1814, 25; + add.s32 %r1816, %r1768, %r1074; + add.s32 %r1817, %r1816, %r1787; + xor.b32 %r1818, %r1817, %r1756; + shf.l.wrap.b32 %r1819, %r1818, %r1818, 16; + add.s32 %r1820, %r1819, %r1799; + xor.b32 %r1821, %r1820, %r1787; + shf.l.wrap.b32 %r1822, %r1821, %r1821, 20; + add.s32 %r1823, %r1817, %r1144; + add.s32 %r1824, %r1823, %r1822; + xor.b32 %r1825, %r1824, %r1819; + shf.l.wrap.b32 %r1826, %r1825, %r1825, 24; + add.s32 %r1827, %r1826, %r1820; + xor.b32 %r1828, %r1827, %r1822; + shf.l.wrap.b32 %r1829, %r1828, %r1828, 25; + add.s32 %r1830, %r1782, %r1088; + add.s32 %r1831, %r1830, %r1801; + xor.b32 %r1832, %r1831, %r1770; + shf.l.wrap.b32 %r1833, %r1832, %r1832, 16; + add.s32 %r1834, %r1833, %r1757; + xor.b32 %r1835, %r1834, %r1801; + shf.l.wrap.b32 %r1836, %r1835, %r1835, 20; + add.s32 %r1837, %r1831, %r1116; + add.s32 %r1838, %r1837, %r1836; + xor.b32 %r1839, %r1838, %r1833; + shf.l.wrap.b32 %r1840, %r1839, %r1839, 24; + add.s32 %r1841, %r1840, %r1834; + xor.b32 %r1842, %r1841, %r1836; + shf.l.wrap.b32 %r1843, %r1842, %r1842, 25; + add.s32 %r1844, %r1796, %r1102; + add.s32 %r1845, %r1844, %r1759; + xor.b32 %r1846, %r1845, %r1784; + shf.l.wrap.b32 %r1847, %r1846, %r1846, 16; + add.s32 %r1848, %r1847, %r1771; + xor.b32 %r1849, %r1848, %r1759; + shf.l.wrap.b32 %r1850, %r1849, %r1849, 20; + add.s32 %r1851, %r1845, %r1123; + add.s32 %r1852, %r1851, %r1850; + xor.b32 %r1853, %r1852, %r1847; + shf.l.wrap.b32 %r1854, %r1853, %r1853, 24; + add.s32 %r1855, %r1854, %r1848; + xor.b32 %r1856, %r1855, %r1850; + shf.l.wrap.b32 %r1857, %r1856, %r1856, 25; + add.s32 %r1858, %r1810, %r1151; + add.s32 %r1859, %r1858, %r1857; + xor.b32 %r1860, %r1859, %r1826; + shf.l.wrap.b32 %r1861, %r1860, %r1860, 16; + add.s32 %r1862, %r1861, %r1841; + xor.b32 %r1863, %r1862, %r1857; + shf.l.wrap.b32 %r1864, %r1863, %r1863, 20; + add.s32 %r1865, %r1859, %r1179; + add.s32 %r1866, %r1865, %r1864; + xor.b32 %r1867, %r1866, %r1861; + shf.l.wrap.b32 %r1868, %r1867, %r1867, 24; + add.s32 %r1869, %r1868, %r1862; + xor.b32 %r1870, %r1869, %r1864; + shf.l.wrap.b32 %r1871, %r1870, %r1870, 25; + add.s32 %r1872, %r1824, %r1109; + add.s32 %r1873, %r1872, %r1815; + xor.b32 %r1874, %r1873, %r1840; + shf.l.wrap.b32 %r1875, %r1874, %r1874, 16; + add.s32 %r1876, %r1875, %r1855; + xor.b32 %r1877, %r1876, %r1815; + shf.l.wrap.b32 %r1878, %r1877, %r1877, 20; + add.s32 %r1879, %r1873, %r1074; + add.s32 %r1880, %r1879, %r1878; + xor.b32 %r1881, %r1880, %r1875; + shf.l.wrap.b32 %r1882, %r1881, %r1881, 24; + add.s32 %r1883, %r1882, %r1876; + xor.b32 %r1884, %r1883, %r1878; + shf.l.wrap.b32 %r1885, %r1884, %r1884, 25; + add.s32 %r1886, %r1838, %r1081; + add.s32 %r1887, %r1886, %r1829; + xor.b32 %r1888, %r1887, %r1854; + shf.l.wrap.b32 %r1889, %r1888, %r1888, 16; + add.s32 %r1890, %r1889, %r1813; + xor.b32 %r1891, %r1890, %r1829; + shf.l.wrap.b32 %r1892, %r1891, %r1891, 20; + add.s32 %r1893, %r1887, %r1137; + add.s32 %r1894, %r1893, %r1892; + xor.b32 %r1895, %r1894, %r1889; + shf.l.wrap.b32 %r1896, %r1895, %r1895, 24; + add.s32 %r1897, %r1896, %r1890; + xor.b32 %r1898, %r1897, %r1892; + shf.l.wrap.b32 %r1899, %r1898, %r1898, 25; + add.s32 %r1900, %r1852, %r1130; 
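// Each repeated add/xor/shf.l.wrap.b32 group in this unrolled region is one
// BLAKE3 G quarter-round (shf.l.wrap.b32 is a 32-bit left rotate); the
// identification is inferred from the rotation constants and IV values:
//   a += b + m0;  d = rotl32(d ^ a, 16);   // rotl 16 == rotr 16
//   c += d;       b = rotl32(b ^ c, 20);   // rotl 20 == rotr 12
//   a += b + m1;  d = rotl32(d ^ a, 24);   // rotl 24 == rotr  8
//   c += d;       b = rotl32(b ^ c, 25);   // rotl 25 == rotr  7
// a/b/c/d name state words and m0/m1 the two message words mixed per
// quarter-round; register numbers are compiler-assigned.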
+ add.s32 %r1901, %r1900, %r1843; + xor.b32 %r1902, %r1901, %r1812; + shf.l.wrap.b32 %r1903, %r1902, %r1902, 16; + add.s32 %r1904, %r1903, %r1827; + xor.b32 %r1905, %r1904, %r1843; + shf.l.wrap.b32 %r1906, %r1905, %r1905, 20; + add.s32 %r1907, %r1901, %r1116; + add.s32 %r1908, %r1907, %r1906; + xor.b32 %r1909, %r1908, %r1903; + shf.l.wrap.b32 %r1910, %r1909, %r1909, 24; + add.s32 %r1911, %r1910, %r1904; + xor.b32 %r1912, %r1911, %r1906; + shf.l.wrap.b32 %r1913, %r1912, %r1912, 25; + add.s32 %r1914, %r1866, %r1172; + add.s32 %r1915, %r1914, %r1885; + xor.b32 %r1916, %r1915, %r1910; + shf.l.wrap.b32 %r1917, %r1916, %r1916, 16; + add.s32 %r1918, %r1917, %r1897; + xor.b32 %r1919, %r1918, %r1885; + shf.l.wrap.b32 %r1920, %r1919, %r1919, 20; + add.s32 %r1921, %r1915, %r1144; + add.s32 %r1922, %r1921, %r1920; + xor.b32 %r1923, %r1922, %r1917; + shf.l.wrap.b32 %r1924, %r1923, %r1923, 24; + add.s32 %r1925, %r1924, %r1918; + xor.b32 %r1926, %r1925, %r1920; + shf.l.wrap.b32 %r1927, %r1926, %r1926, 25; + add.s32 %r1928, %r1880, %r1088; + add.s32 %r1929, %r1928, %r1899; + xor.b32 %r1930, %r1929, %r1868; + shf.l.wrap.b32 %r1931, %r1930, %r1930, 16; + add.s32 %r1932, %r1931, %r1911; + xor.b32 %r1933, %r1932, %r1899; + shf.l.wrap.b32 %r1934, %r1933, %r1933, 20; + add.s32 %r1935, %r1929, %r1158; + add.s32 %r1936, %r1935, %r1934; + xor.b32 %r1937, %r1936, %r1931; + shf.l.wrap.b32 %r1938, %r1937, %r1937, 24; + add.s32 %r1939, %r1938, %r1932; + xor.b32 %r1940, %r1939, %r1934; + shf.l.wrap.b32 %r1941, %r1940, %r1940, 25; + add.s32 %r1942, %r1894, %r1095; + add.s32 %r1943, %r1942, %r1913; + xor.b32 %r1944, %r1943, %r1882; + shf.l.wrap.b32 %r1945, %r1944, %r1944, 16; + add.s32 %r1946, %r1945, %r1869; + xor.b32 %r1947, %r1946, %r1913; + shf.l.wrap.b32 %r1948, %r1947, %r1947, 20; + add.s32 %r1949, %r1943, %r1102; + add.s32 %r1950, %r1949, %r1948; + xor.b32 %r1951, %r1950, %r1945; + shf.l.wrap.b32 %r1952, %r1951, %r1951, 24; + add.s32 %r1953, %r1952, %r1946; + xor.b32 %r1954, %r1953, %r1948; + shf.l.wrap.b32 %r1955, %r1954, %r1954, 25; + add.s32 %r1956, %r1908, %r1123; + add.s32 %r1957, %r1956, %r1871; + xor.b32 %r1958, %r1957, %r1896; + shf.l.wrap.b32 %r1959, %r1958, %r1958, 16; + add.s32 %r1960, %r1959, %r1883; + xor.b32 %r1961, %r1960, %r1871; + shf.l.wrap.b32 %r1962, %r1961, %r1961, 20; + add.s32 %r1963, %r1957, %r1165; + add.s32 %r1964, %r1963, %r1962; + xor.b32 %r1965, %r1964, %r1959; + shf.l.wrap.b32 %r1966, %r1965, %r1965, 24; + add.s32 %r1967, %r1966, %r1960; + xor.b32 %r1968, %r1967, %r1962; + shf.l.wrap.b32 %r1969, %r1968, %r1968, 25; + xor.b32 %r11657, %r1953, %r1922; + st.local.u32 [%rd3+-104], %r11657; + xor.b32 %r11656, %r1967, %r1936; + st.local.u32 [%rd3+-100], %r11656; + xor.b32 %r11655, %r1925, %r1950; + st.local.u32 [%rd3+-96], %r11655; + xor.b32 %r11654, %r1939, %r1964; + st.local.u32 [%rd3+-92], %r11654; + xor.b32 %r11653, %r1969, %r1938; + st.local.u32 [%rd3+-88], %r11653; + xor.b32 %r11652, %r1927, %r1952; + st.local.u32 [%rd3+-84], %r11652; + xor.b32 %r11651, %r1941, %r1966; + st.local.u32 [%rd3+-80], %r11651; + xor.b32 %r11650, %r1955, %r1924; + st.local.u32 [%rd3+-76], %r11650; + add.s16 %rs352, %rs352, 1; + st.local.u8 [%rd3+1], %rs352; + add.s64 %rd261, %rd261, 64; + add.s64 %rd244, %rd244, -64; + setp.gt.u64 %p10, %rd244, 64; + @%p10 bra $L__BB1_11; + +$L__BB1_12: + cvt.u64.u16 %rd120, %rs351; + and.b64 %rd24, %rd120, 255; + mov.u64 %rd121, 64; + sub.s64 %rd122, %rd121, %rd24; + min.u64 %rd25, %rd122, %rd244; + setp.eq.s64 %p11, %rd25, 0; + @%p11 bra $L__BB1_15; + + add.s64 %rd124, 
%rd2, %rd24; + add.s64 %rd26, %rd124, 72; + mov.u64 %rd245, 0; + +$L__BB1_14: + add.s64 %rd125, %rd261, %rd245; + ld.local.u8 %rs119, [%rd125]; + add.s64 %rd126, %rd26, %rd245; + st.local.u8 [%rd126], %rs119; + add.s64 %rd245, %rd245, 1; + setp.lt.u64 %p12, %rd245, %rd25; + @%p12 bra $L__BB1_14; + +$L__BB1_15: + cvt.u16.u64 %rs120, %rd25; + ld.local.u8 %rs121, [%rd3]; + add.s16 %rs13, %rs121, %rs120; + st.local.u8 [%rd3], %rs13; + mov.u64 %rd127, 80; + sub.s64 %rd29, %rd127, %rd6; + setp.eq.s64 %p13, %rd29, 0; + @%p13 bra $L__BB1_68; + + ld.local.u8 %rs122, [%rd3+1]; + setp.eq.s16 %p14, %rs122, 0; + selp.u16 %rs123, 1, 0, %p14; + ld.local.u8 %rs124, [%rd3+2]; + or.b16 %rs125, %rs124, %rs123; + or.b16 %rs126, %rs125, 2; + ld.local.u8 %r1970, [%rd3+-64]; + ld.local.u8 %r1971, [%rd3+-63]; + prmt.b32 %r1972, %r1971, %r1970, 30212; + ld.local.u8 %r1973, [%rd3+-62]; + prmt.b32 %r1974, %r1973, %r1972, 28756; + ld.local.u8 %r1975, [%rd3+-61]; + prmt.b32 %r1976, %r1975, %r1974, 1620; + ld.local.u8 %r1977, [%rd3+-60]; + ld.local.u8 %r1978, [%rd3+-59]; + prmt.b32 %r1979, %r1978, %r1977, 30212; + ld.local.u8 %r1980, [%rd3+-58]; + prmt.b32 %r1981, %r1980, %r1979, 28756; + ld.local.u8 %r1982, [%rd3+-57]; + prmt.b32 %r1983, %r1982, %r1981, 1620; + ld.local.u8 %r1984, [%rd3+-56]; + ld.local.u8 %r1985, [%rd3+-55]; + prmt.b32 %r1986, %r1985, %r1984, 30212; + ld.local.u8 %r1987, [%rd3+-54]; + prmt.b32 %r1988, %r1987, %r1986, 28756; + ld.local.u8 %r1989, [%rd3+-53]; + prmt.b32 %r1990, %r1989, %r1988, 1620; + ld.local.u8 %r1991, [%rd3+-52]; + ld.local.u8 %r1992, [%rd3+-51]; + prmt.b32 %r1993, %r1992, %r1991, 30212; + ld.local.u8 %r1994, [%rd3+-50]; + prmt.b32 %r1995, %r1994, %r1993, 28756; + ld.local.u8 %r1996, [%rd3+-49]; + prmt.b32 %r1997, %r1996, %r1995, 1620; + ld.local.u8 %r1998, [%rd3+-48]; + ld.local.u8 %r1999, [%rd3+-47]; + prmt.b32 %r2000, %r1999, %r1998, 30212; + ld.local.u8 %r2001, [%rd3+-46]; + prmt.b32 %r2002, %r2001, %r2000, 28756; + ld.local.u8 %r2003, [%rd3+-45]; + prmt.b32 %r2004, %r2003, %r2002, 1620; + ld.local.u8 %r2005, [%rd3+-44]; + ld.local.u8 %r2006, [%rd3+-43]; + prmt.b32 %r2007, %r2006, %r2005, 30212; + ld.local.u8 %r2008, [%rd3+-42]; + prmt.b32 %r2009, %r2008, %r2007, 28756; + ld.local.u8 %r2010, [%rd3+-41]; + prmt.b32 %r2011, %r2010, %r2009, 1620; + ld.local.u8 %r2012, [%rd3+-40]; + ld.local.u8 %r2013, [%rd3+-39]; + prmt.b32 %r2014, %r2013, %r2012, 30212; + ld.local.u8 %r2015, [%rd3+-38]; + prmt.b32 %r2016, %r2015, %r2014, 28756; + ld.local.u8 %r2017, [%rd3+-37]; + prmt.b32 %r2018, %r2017, %r2016, 1620; + ld.local.u8 %r2019, [%rd3+-36]; + ld.local.u8 %r2020, [%rd3+-35]; + prmt.b32 %r2021, %r2020, %r2019, 30212; + ld.local.u8 %r2022, [%rd3+-34]; + prmt.b32 %r2023, %r2022, %r2021, 28756; + ld.local.u8 %r2024, [%rd3+-33]; + prmt.b32 %r2025, %r2024, %r2023, 1620; + ld.local.u8 %r2026, [%rd3+-32]; + ld.local.u8 %r2027, [%rd3+-31]; + prmt.b32 %r2028, %r2027, %r2026, 30212; + ld.local.u8 %r2029, [%rd3+-30]; + prmt.b32 %r2030, %r2029, %r2028, 28756; + ld.local.u8 %r2031, [%rd3+-29]; + prmt.b32 %r2032, %r2031, %r2030, 1620; + ld.local.u8 %r2033, [%rd3+-28]; + ld.local.u8 %r2034, [%rd3+-27]; + prmt.b32 %r2035, %r2034, %r2033, 30212; + ld.local.u8 %r2036, [%rd3+-26]; + prmt.b32 %r2037, %r2036, %r2035, 28756; + ld.local.u8 %r2038, [%rd3+-25]; + prmt.b32 %r2039, %r2038, %r2037, 1620; + ld.local.u8 %r2040, [%rd3+-24]; + ld.local.u8 %r2041, [%rd3+-23]; + prmt.b32 %r2042, %r2041, %r2040, 30212; + ld.local.u8 %r2043, [%rd3+-22]; + prmt.b32 %r2044, %r2043, %r2042, 28756; + ld.local.u8 %r2045, 
[%rd3+-21]; + prmt.b32 %r2046, %r2045, %r2044, 1620; + ld.local.u8 %r2047, [%rd3+-20]; + ld.local.u8 %r2048, [%rd3+-19]; + prmt.b32 %r2049, %r2048, %r2047, 30212; + ld.local.u8 %r2050, [%rd3+-18]; + prmt.b32 %r2051, %r2050, %r2049, 28756; + ld.local.u8 %r2052, [%rd3+-17]; + prmt.b32 %r2053, %r2052, %r2051, 1620; + ld.local.u8 %r2054, [%rd3+-16]; + ld.local.u8 %r2055, [%rd3+-15]; + prmt.b32 %r2056, %r2055, %r2054, 30212; + ld.local.u8 %r2057, [%rd3+-14]; + prmt.b32 %r2058, %r2057, %r2056, 28756; + ld.local.u8 %r2059, [%rd3+-13]; + prmt.b32 %r2060, %r2059, %r2058, 1620; + ld.local.u8 %r2061, [%rd3+-12]; + ld.local.u8 %r2062, [%rd3+-11]; + prmt.b32 %r2063, %r2062, %r2061, 30212; + ld.local.u8 %r2064, [%rd3+-10]; + prmt.b32 %r2065, %r2064, %r2063, 28756; + ld.local.u8 %r2066, [%rd3+-9]; + prmt.b32 %r2067, %r2066, %r2065, 1620; + ld.local.u8 %r2068, [%rd3+-8]; + ld.local.u8 %r2069, [%rd3+-7]; + prmt.b32 %r2070, %r2069, %r2068, 30212; + ld.local.u8 %r2071, [%rd3+-6]; + prmt.b32 %r2072, %r2071, %r2070, 28756; + ld.local.u8 %r2073, [%rd3+-5]; + prmt.b32 %r2074, %r2073, %r2072, 1620; + ld.local.u8 %r2075, [%rd3+-4]; + ld.local.u8 %r2076, [%rd3+-3]; + prmt.b32 %r2077, %r2076, %r2075, 30212; + ld.local.u8 %r2078, [%rd3+-2]; + prmt.b32 %r2079, %r2078, %r2077, 28756; + ld.local.u8 %r2080, [%rd3+-1]; + prmt.b32 %r2081, %r2080, %r2079, 1620; + ld.local.u64 %rd128, [%rd3+-72]; + cvt.u32.u64 %r2082, %rd128; + shr.u64 %rd129, %rd128, 32; + cvt.u32.u64 %r2083, %rd129; + cvt.u32.u16 %r2084, %rs126; + and.b32 %r2085, %r2084, 255; + ld.local.u8 %r2086, [%rd3+-88]; + ld.local.u8 %r2087, [%rd3+-87]; + prmt.b32 %r2088, %r2087, %r2086, 30212; + ld.local.u8 %r2089, [%rd3+-86]; + ld.local.u8 %r2090, [%rd3+-85]; + prmt.b32 %r2091, %r2090, %r2089, 30212; + prmt.b32 %r2092, %r2091, %r2088, 4180; + ld.local.u8 %r2093, [%rd3+-104]; + ld.local.u8 %r2094, [%rd3+-103]; + prmt.b32 %r2095, %r2094, %r2093, 30212; + ld.local.u8 %r2096, [%rd3+-102]; + ld.local.u8 %r2097, [%rd3+-101]; + prmt.b32 %r2098, %r2097, %r2096, 30212; + prmt.b32 %r2099, %r2098, %r2095, 4180; + add.s32 %r2100, %r2092, %r2099; + add.s32 %r2101, %r2100, %r1976; + xor.b32 %r2102, %r2101, %r2082; + shf.l.wrap.b32 %r2103, %r2102, %r2102, 16; + add.s32 %r2104, %r2103, 1779033703; + xor.b32 %r2105, %r2104, %r2092; + shf.l.wrap.b32 %r2106, %r2105, %r2105, 20; + add.s32 %r2107, %r1983, %r2101; + add.s32 %r2108, %r2107, %r2106; + xor.b32 %r2109, %r2108, %r2103; + shf.l.wrap.b32 %r2110, %r2109, %r2109, 24; + add.s32 %r2111, %r2110, %r2104; + xor.b32 %r2112, %r2111, %r2106; + shf.l.wrap.b32 %r2113, %r2112, %r2112, 25; + ld.local.u8 %r2114, [%rd3+-84]; + ld.local.u8 %r2115, [%rd3+-83]; + prmt.b32 %r2116, %r2115, %r2114, 30212; + ld.local.u8 %r2117, [%rd3+-82]; + ld.local.u8 %r2118, [%rd3+-81]; + prmt.b32 %r2119, %r2118, %r2117, 30212; + prmt.b32 %r2120, %r2119, %r2116, 4180; + ld.local.u8 %r2121, [%rd3+-100]; + ld.local.u8 %r2122, [%rd3+-99]; + prmt.b32 %r2123, %r2122, %r2121, 30212; + ld.local.u8 %r2124, [%rd3+-98]; + ld.local.u8 %r2125, [%rd3+-97]; + prmt.b32 %r2126, %r2125, %r2124, 30212; + prmt.b32 %r2127, %r2126, %r2123, 4180; + add.s32 %r2128, %r2120, %r2127; + add.s32 %r2129, %r2128, %r1990; + xor.b32 %r2130, %r2129, %r2083; + shf.l.wrap.b32 %r2131, %r2130, %r2130, 16; + add.s32 %r2132, %r2131, -1150833019; + xor.b32 %r2133, %r2132, %r2120; + shf.l.wrap.b32 %r2134, %r2133, %r2133, 20; + add.s32 %r2135, %r1997, %r2129; + add.s32 %r2136, %r2135, %r2134; + xor.b32 %r2137, %r2136, %r2131; + shf.l.wrap.b32 %r2138, %r2137, %r2137, 24; + add.s32 %r2139, %r2138, %r2132; 
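// The ld.local.u8 / prmt.b32 runs above gather the 64-byte block into sixteen
// little-endian 32-bit message words. The immediates folded into this first
// round (1779033703 = 0x6A09E667, -1150833019 = 0xBB67AE85,
// 1013904242 = 0x3C6EF372, -1521486534 = 0xA54FF53A) are the four BLAKE3 IV
// words that seed state rows v8..v11 of the compression function.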
+ xor.b32 %r2140, %r2139, %r2134; + shf.l.wrap.b32 %r2141, %r2140, %r2140, 25; + ld.local.u8 %r2142, [%rd3+-80]; + ld.local.u8 %r2143, [%rd3+-79]; + prmt.b32 %r2144, %r2143, %r2142, 30212; + ld.local.u8 %r2145, [%rd3+-78]; + ld.local.u8 %r2146, [%rd3+-77]; + prmt.b32 %r2147, %r2146, %r2145, 30212; + prmt.b32 %r2148, %r2147, %r2144, 4180; + ld.local.u8 %r2149, [%rd3+-96]; + ld.local.u8 %r2150, [%rd3+-95]; + prmt.b32 %r2151, %r2150, %r2149, 30212; + ld.local.u8 %r2152, [%rd3+-94]; + ld.local.u8 %r2153, [%rd3+-93]; + prmt.b32 %r2154, %r2153, %r2152, 30212; + prmt.b32 %r2155, %r2154, %r2151, 4180; + add.s32 %r2156, %r2148, %r2155; + add.s32 %r2157, %r2156, %r2004; + cvt.u32.u16 %r2158, %rs13; + and.b32 %r2159, %r2158, 255; + xor.b32 %r2160, %r2157, %r2159; + shr.u32 %r2161, %r2157, 16; + shl.b32 %r2162, %r2160, 16; + or.b32 %r2163, %r2162, %r2161; + add.s32 %r2164, %r2163, 1013904242; + xor.b32 %r2165, %r2164, %r2148; + shf.l.wrap.b32 %r2166, %r2165, %r2165, 20; + add.s32 %r2167, %r2011, %r2157; + add.s32 %r2168, %r2167, %r2166; + xor.b32 %r2169, %r2168, %r2163; + shf.l.wrap.b32 %r2170, %r2169, %r2169, 24; + add.s32 %r2171, %r2170, %r2164; + xor.b32 %r2172, %r2171, %r2166; + shf.l.wrap.b32 %r2173, %r2172, %r2172, 25; + ld.local.u8 %r2174, [%rd3+-76]; + ld.local.u8 %r2175, [%rd3+-75]; + prmt.b32 %r2176, %r2175, %r2174, 30212; + ld.local.u8 %r2177, [%rd3+-74]; + ld.local.u8 %r2178, [%rd3+-73]; + prmt.b32 %r2179, %r2178, %r2177, 30212; + prmt.b32 %r2180, %r2179, %r2176, 4180; + ld.local.u8 %r2181, [%rd3+-92]; + ld.local.u8 %r2182, [%rd3+-91]; + prmt.b32 %r2183, %r2182, %r2181, 30212; + ld.local.u8 %r2184, [%rd3+-90]; + ld.local.u8 %r2185, [%rd3+-89]; + prmt.b32 %r2186, %r2185, %r2184, 30212; + prmt.b32 %r2187, %r2186, %r2183, 4180; + add.s32 %r2188, %r2180, %r2187; + add.s32 %r2189, %r2188, %r2018; + xor.b32 %r2190, %r2189, %r2085; + shr.u32 %r2191, %r2189, 16; + shl.b32 %r2192, %r2190, 16; + or.b32 %r2193, %r2192, %r2191; + add.s32 %r2194, %r2193, -1521486534; + xor.b32 %r2195, %r2194, %r2180; + shf.l.wrap.b32 %r2196, %r2195, %r2195, 20; + add.s32 %r2197, %r2025, %r2189; + add.s32 %r2198, %r2197, %r2196; + xor.b32 %r2199, %r2198, %r2193; + shf.l.wrap.b32 %r2200, %r2199, %r2199, 24; + add.s32 %r2201, %r2200, %r2194; + xor.b32 %r2202, %r2201, %r2196; + shf.l.wrap.b32 %r2203, %r2202, %r2202, 25; + add.s32 %r2204, %r2141, %r2108; + add.s32 %r2205, %r2204, %r2032; + xor.b32 %r2206, %r2200, %r2205; + shf.l.wrap.b32 %r2207, %r2206, %r2206, 16; + add.s32 %r2208, %r2207, %r2171; + xor.b32 %r2209, %r2208, %r2141; + shf.l.wrap.b32 %r2210, %r2209, %r2209, 20; + add.s32 %r2211, %r2039, %r2205; + add.s32 %r2212, %r2211, %r2210; + xor.b32 %r2213, %r2212, %r2207; + shf.l.wrap.b32 %r2214, %r2213, %r2213, 24; + add.s32 %r2215, %r2214, %r2208; + xor.b32 %r2216, %r2215, %r2210; + shf.l.wrap.b32 %r2217, %r2216, %r2216, 25; + add.s32 %r2218, %r2173, %r2136; + add.s32 %r2219, %r2218, %r2046; + xor.b32 %r2220, %r2219, %r2110; + shf.l.wrap.b32 %r2221, %r2220, %r2220, 16; + add.s32 %r2222, %r2221, %r2201; + xor.b32 %r2223, %r2222, %r2173; + shf.l.wrap.b32 %r2224, %r2223, %r2223, 20; + add.s32 %r2225, %r2053, %r2219; + add.s32 %r2226, %r2225, %r2224; + xor.b32 %r2227, %r2226, %r2221; + shf.l.wrap.b32 %r2228, %r2227, %r2227, 24; + add.s32 %r2229, %r2228, %r2222; + xor.b32 %r2230, %r2229, %r2224; + shf.l.wrap.b32 %r2231, %r2230, %r2230, 25; + add.s32 %r2232, %r2203, %r2168; + add.s32 %r2233, %r2232, %r2060; + xor.b32 %r2234, %r2233, %r2138; + shf.l.wrap.b32 %r2235, %r2234, %r2234, 16; + add.s32 %r2236, %r2235, %r2111; + 
xor.b32 %r2237, %r2236, %r2203; + shf.l.wrap.b32 %r2238, %r2237, %r2237, 20; + add.s32 %r2239, %r2067, %r2233; + add.s32 %r2240, %r2239, %r2238; + xor.b32 %r2241, %r2240, %r2235; + shf.l.wrap.b32 %r2242, %r2241, %r2241, 24; + add.s32 %r2243, %r2242, %r2236; + xor.b32 %r2244, %r2243, %r2238; + shf.l.wrap.b32 %r2245, %r2244, %r2244, 25; + add.s32 %r2246, %r2198, %r2113; + add.s32 %r2247, %r2246, %r2074; + xor.b32 %r2248, %r2247, %r2170; + shf.l.wrap.b32 %r2249, %r2248, %r2248, 16; + add.s32 %r2250, %r2249, %r2139; + xor.b32 %r2251, %r2250, %r2113; + shf.l.wrap.b32 %r2252, %r2251, %r2251, 20; + add.s32 %r2253, %r2081, %r2247; + add.s32 %r2254, %r2253, %r2252; + xor.b32 %r2255, %r2254, %r2249; + shf.l.wrap.b32 %r2256, %r2255, %r2255, 24; + add.s32 %r2257, %r2256, %r2250; + xor.b32 %r2258, %r2257, %r2252; + shf.l.wrap.b32 %r2259, %r2258, %r2258, 25; + add.s32 %r2260, %r2212, %r1990; + add.s32 %r2261, %r2260, %r2259; + xor.b32 %r2262, %r2261, %r2228; + shf.l.wrap.b32 %r2263, %r2262, %r2262, 16; + add.s32 %r2264, %r2263, %r2243; + xor.b32 %r2265, %r2264, %r2259; + shf.l.wrap.b32 %r2266, %r2265, %r2265, 20; + add.s32 %r2267, %r2261, %r2018; + add.s32 %r2268, %r2267, %r2266; + xor.b32 %r2269, %r2268, %r2263; + shf.l.wrap.b32 %r2270, %r2269, %r2269, 24; + add.s32 %r2271, %r2270, %r2264; + xor.b32 %r2272, %r2271, %r2266; + shf.l.wrap.b32 %r2273, %r2272, %r2272, 25; + add.s32 %r2274, %r2226, %r1997; + add.s32 %r2275, %r2274, %r2217; + xor.b32 %r2276, %r2242, %r2275; + shf.l.wrap.b32 %r2277, %r2276, %r2276, 16; + add.s32 %r2278, %r2257, %r2277; + xor.b32 %r2279, %r2278, %r2217; + shf.l.wrap.b32 %r2280, %r2279, %r2279, 20; + add.s32 %r2281, %r2275, %r2046; + add.s32 %r2282, %r2281, %r2280; + xor.b32 %r2283, %r2282, %r2277; + shf.l.wrap.b32 %r2284, %r2283, %r2283, 24; + add.s32 %r2285, %r2284, %r2278; + xor.b32 %r2286, %r2285, %r2280; + shf.l.wrap.b32 %r2287, %r2286, %r2286, 25; + add.s32 %r2288, %r2231, %r2025; + add.s32 %r2289, %r2288, %r2240; + xor.b32 %r2290, %r2256, %r2289; + shf.l.wrap.b32 %r2291, %r2290, %r2290, 16; + add.s32 %r2292, %r2291, %r2215; + xor.b32 %r2293, %r2292, %r2231; + shf.l.wrap.b32 %r2294, %r2293, %r2293, 20; + add.s32 %r2295, %r2289, %r1976; + add.s32 %r2296, %r2295, %r2294; + xor.b32 %r2297, %r2296, %r2291; + shf.l.wrap.b32 %r2298, %r2297, %r2297, 24; + add.s32 %r2299, %r2298, %r2292; + xor.b32 %r2300, %r2299, %r2294; + shf.l.wrap.b32 %r2301, %r2300, %r2300, 25; + add.s32 %r2302, %r2245, %r2004; + add.s32 %r2303, %r2302, %r2254; + xor.b32 %r2304, %r2303, %r2214; + shf.l.wrap.b32 %r2305, %r2304, %r2304, 16; + add.s32 %r2306, %r2305, %r2229; + xor.b32 %r2307, %r2306, %r2245; + shf.l.wrap.b32 %r2308, %r2307, %r2307, 20; + add.s32 %r2309, %r2303, %r2067; + add.s32 %r2310, %r2309, %r2308; + xor.b32 %r2311, %r2310, %r2305; + shf.l.wrap.b32 %r2312, %r2311, %r2311, 24; + add.s32 %r2313, %r2312, %r2306; + xor.b32 %r2314, %r2313, %r2308; + shf.l.wrap.b32 %r2315, %r2314, %r2314, 25; + add.s32 %r2316, %r2287, %r1983; + add.s32 %r2317, %r2316, %r2268; + xor.b32 %r2318, %r2317, %r2312; + shf.l.wrap.b32 %r2319, %r2318, %r2318, 16; + add.s32 %r2320, %r2319, %r2299; + xor.b32 %r2321, %r2320, %r2287; + shf.l.wrap.b32 %r2322, %r2321, %r2321, 20; + add.s32 %r2323, %r2317, %r2053; + add.s32 %r2324, %r2323, %r2322; + xor.b32 %r2325, %r2324, %r2319; + shf.l.wrap.b32 %r2326, %r2325, %r2325, 24; + add.s32 %r2327, %r2326, %r2320; + xor.b32 %r2328, %r2327, %r2322; + shf.l.wrap.b32 %r2329, %r2328, %r2328, 25; + add.s32 %r2330, %r2282, %r2060; + add.s32 %r2331, %r2330, %r2301; + xor.b32 %r2332, %r2270, 
%r2331; + shf.l.wrap.b32 %r2333, %r2332, %r2332, 16; + add.s32 %r2334, %r2333, %r2313; + xor.b32 %r2335, %r2334, %r2301; + shf.l.wrap.b32 %r2336, %r2335, %r2335, 20; + add.s32 %r2337, %r2331, %r2011; + add.s32 %r2338, %r2337, %r2336; + xor.b32 %r2339, %r2338, %r2333; + shf.l.wrap.b32 %r2340, %r2339, %r2339, 24; + add.s32 %r2341, %r2340, %r2334; + xor.b32 %r2342, %r2341, %r2336; + shf.l.wrap.b32 %r2343, %r2342, %r2342, 25; + add.s32 %r2344, %r2296, %r2039; + add.s32 %r2345, %r2344, %r2315; + xor.b32 %r2346, %r2345, %r2284; + shf.l.wrap.b32 %r2347, %r2346, %r2346, 16; + add.s32 %r2348, %r2347, %r2271; + xor.b32 %r2349, %r2348, %r2315; + shf.l.wrap.b32 %r2350, %r2349, %r2349, 20; + add.s32 %r2351, %r2345, %r2074; + add.s32 %r2352, %r2351, %r2350; + xor.b32 %r2353, %r2352, %r2347; + shf.l.wrap.b32 %r2354, %r2353, %r2353, 24; + add.s32 %r2355, %r2354, %r2348; + xor.b32 %r2356, %r2355, %r2350; + shf.l.wrap.b32 %r2357, %r2356, %r2356, 25; + add.s32 %r2358, %r2310, %r2081; + add.s32 %r2359, %r2358, %r2273; + xor.b32 %r2360, %r2359, %r2298; + shf.l.wrap.b32 %r2361, %r2360, %r2360, 16; + add.s32 %r2362, %r2361, %r2285; + xor.b32 %r2363, %r2362, %r2273; + shf.l.wrap.b32 %r2364, %r2363, %r2363, 20; + add.s32 %r2365, %r2359, %r2032; + add.s32 %r2366, %r2365, %r2364; + xor.b32 %r2367, %r2366, %r2361; + shf.l.wrap.b32 %r2368, %r2367, %r2367, 24; + add.s32 %r2369, %r2368, %r2362; + xor.b32 %r2370, %r2369, %r2364; + shf.l.wrap.b32 %r2371, %r2370, %r2370, 25; + add.s32 %r2372, %r2324, %r1997; + add.s32 %r2373, %r2372, %r2371; + xor.b32 %r2374, %r2373, %r2340; + shf.l.wrap.b32 %r2375, %r2374, %r2374, 16; + add.s32 %r2376, %r2375, %r2355; + xor.b32 %r2377, %r2376, %r2371; + shf.l.wrap.b32 %r2378, %r2377, %r2377, 20; + add.s32 %r2379, %r2373, %r2004; + add.s32 %r2380, %r2379, %r2378; + xor.b32 %r2381, %r2380, %r2375; + shf.l.wrap.b32 %r2382, %r2381, %r2381, 24; + add.s32 %r2383, %r2382, %r2376; + xor.b32 %r2384, %r2383, %r2378; + shf.l.wrap.b32 %r2385, %r2384, %r2384, 25; + add.s32 %r2386, %r2338, %r2046; + add.s32 %r2387, %r2386, %r2329; + xor.b32 %r2388, %r2387, %r2354; + shf.l.wrap.b32 %r2389, %r2388, %r2388, 16; + add.s32 %r2390, %r2389, %r2369; + xor.b32 %r2391, %r2390, %r2329; + shf.l.wrap.b32 %r2392, %r2391, %r2391, 20; + add.s32 %r2393, %r2387, %r2060; + add.s32 %r2394, %r2393, %r2392; + xor.b32 %r2395, %r2394, %r2389; + shf.l.wrap.b32 %r2396, %r2395, %r2395, 24; + add.s32 %r2397, %r2396, %r2390; + xor.b32 %r2398, %r2397, %r2392; + shf.l.wrap.b32 %r2399, %r2398, %r2398, 25; + add.s32 %r2400, %r2352, %r2067; + add.s32 %r2401, %r2400, %r2343; + xor.b32 %r2402, %r2368, %r2401; + shf.l.wrap.b32 %r2403, %r2402, %r2402, 16; + add.s32 %r2404, %r2403, %r2327; + xor.b32 %r2405, %r2404, %r2343; + shf.l.wrap.b32 %r2406, %r2405, %r2405, 20; + add.s32 %r2407, %r2401, %r1990; + add.s32 %r2408, %r2407, %r2406; + xor.b32 %r2409, %r2408, %r2403; + shf.l.wrap.b32 %r2410, %r2409, %r2409, 24; + add.s32 %r2411, %r2410, %r2404; + xor.b32 %r2412, %r2411, %r2406; + shf.l.wrap.b32 %r2413, %r2412, %r2412, 25; + add.s32 %r2414, %r2357, %r2025; + add.s32 %r2415, %r2414, %r2366; + xor.b32 %r2416, %r2415, %r2326; + shf.l.wrap.b32 %r2417, %r2416, %r2416, 16; + add.s32 %r2418, %r2417, %r2341; + xor.b32 %r2419, %r2418, %r2357; + shf.l.wrap.b32 %r2420, %r2419, %r2419, 20; + add.s32 %r2421, %r2415, %r2074; + add.s32 %r2422, %r2421, %r2420; + xor.b32 %r2423, %r2422, %r2417; + shf.l.wrap.b32 %r2424, %r2423, %r2423, 24; + add.s32 %r2425, %r2424, %r2418; + xor.b32 %r2426, %r2425, %r2420; + shf.l.wrap.b32 %r2427, %r2426, %r2426, 25; + 
add.s32 %r2428, %r2399, %r2018; + add.s32 %r2429, %r2428, %r2380; + xor.b32 %r2430, %r2429, %r2424; + shf.l.wrap.b32 %r2431, %r2430, %r2430, 16; + add.s32 %r2432, %r2431, %r2411; + xor.b32 %r2433, %r2432, %r2399; + shf.l.wrap.b32 %r2434, %r2433, %r2433, 20; + add.s32 %r2435, %r2429, %r2011; + add.s32 %r2436, %r2435, %r2434; + xor.b32 %r2437, %r2436, %r2431; + shf.l.wrap.b32 %r2438, %r2437, %r2437, 24; + add.s32 %r2439, %r2438, %r2432; + xor.b32 %r2440, %r2439, %r2434; + shf.l.wrap.b32 %r2441, %r2440, %r2440, 25; + add.s32 %r2442, %r2394, %r2039; + add.s32 %r2443, %r2442, %r2413; + xor.b32 %r2444, %r2382, %r2443; + shf.l.wrap.b32 %r2445, %r2444, %r2444, 16; + add.s32 %r2446, %r2445, %r2425; + xor.b32 %r2447, %r2446, %r2413; + shf.l.wrap.b32 %r2448, %r2447, %r2447, 20; + add.s32 %r2449, %r2443, %r1976; + add.s32 %r2450, %r2449, %r2448; + xor.b32 %r2451, %r2450, %r2445; + shf.l.wrap.b32 %r2452, %r2451, %r2451, 24; + add.s32 %r2453, %r2452, %r2446; + xor.b32 %r2454, %r2453, %r2448; + shf.l.wrap.b32 %r2455, %r2454, %r2454, 25; + add.s32 %r2456, %r2408, %r2053; + add.s32 %r2457, %r2456, %r2427; + xor.b32 %r2458, %r2457, %r2396; + shf.l.wrap.b32 %r2459, %r2458, %r2458, 16; + add.s32 %r2460, %r2459, %r2383; + xor.b32 %r2461, %r2460, %r2427; + shf.l.wrap.b32 %r2462, %r2461, %r2461, 20; + add.s32 %r2463, %r2457, %r2081; + add.s32 %r2464, %r2463, %r2462; + xor.b32 %r2465, %r2464, %r2459; + shf.l.wrap.b32 %r2466, %r2465, %r2465, 24; + add.s32 %r2467, %r2466, %r2460; + xor.b32 %r2468, %r2467, %r2462; + shf.l.wrap.b32 %r2469, %r2468, %r2468, 25; + add.s32 %r2470, %r2422, %r2032; + add.s32 %r2471, %r2470, %r2385; + xor.b32 %r2472, %r2471, %r2410; + shf.l.wrap.b32 %r2473, %r2472, %r2472, 16; + add.s32 %r2474, %r2473, %r2397; + xor.b32 %r2475, %r2474, %r2385; + shf.l.wrap.b32 %r2476, %r2475, %r2475, 20; + add.s32 %r2477, %r2471, %r1983; + add.s32 %r2478, %r2477, %r2476; + xor.b32 %r2479, %r2478, %r2473; + shf.l.wrap.b32 %r2480, %r2479, %r2479, 24; + add.s32 %r2481, %r2480, %r2474; + xor.b32 %r2482, %r2481, %r2476; + shf.l.wrap.b32 %r2483, %r2482, %r2482, 25; + add.s32 %r2484, %r2436, %r2046; + add.s32 %r2485, %r2484, %r2483; + xor.b32 %r2486, %r2485, %r2452; + shf.l.wrap.b32 %r2487, %r2486, %r2486, 16; + add.s32 %r2488, %r2487, %r2467; + xor.b32 %r2489, %r2488, %r2483; + shf.l.wrap.b32 %r2490, %r2489, %r2489, 20; + add.s32 %r2491, %r2485, %r2025; + add.s32 %r2492, %r2491, %r2490; + xor.b32 %r2493, %r2492, %r2487; + shf.l.wrap.b32 %r2494, %r2493, %r2493, 24; + add.s32 %r2495, %r2494, %r2488; + xor.b32 %r2496, %r2495, %r2490; + shf.l.wrap.b32 %r2497, %r2496, %r2496, 25; + add.s32 %r2498, %r2450, %r2060; + add.s32 %r2499, %r2498, %r2441; + xor.b32 %r2500, %r2499, %r2466; + shf.l.wrap.b32 %r2501, %r2500, %r2500, 16; + add.s32 %r2502, %r2501, %r2481; + xor.b32 %r2503, %r2502, %r2441; + shf.l.wrap.b32 %r2504, %r2503, %r2503, 20; + add.s32 %r2505, %r2499, %r2039; + add.s32 %r2506, %r2505, %r2504; + xor.b32 %r2507, %r2506, %r2501; + shf.l.wrap.b32 %r2508, %r2507, %r2507, 24; + add.s32 %r2509, %r2508, %r2502; + xor.b32 %r2510, %r2509, %r2504; + shf.l.wrap.b32 %r2511, %r2510, %r2510, 25; + add.s32 %r2512, %r2464, %r2074; + add.s32 %r2513, %r2512, %r2455; + xor.b32 %r2514, %r2480, %r2513; + shf.l.wrap.b32 %r2515, %r2514, %r2514, 16; + add.s32 %r2516, %r2515, %r2439; + xor.b32 %r2517, %r2516, %r2455; + shf.l.wrap.b32 %r2518, %r2517, %r2517, 20; + add.s32 %r2519, %r2513, %r1997; + add.s32 %r2520, %r2519, %r2518; + xor.b32 %r2521, %r2520, %r2515; + shf.l.wrap.b32 %r2522, %r2521, %r2521, 24; + add.s32 %r2523, %r2522, 
%r2516; + xor.b32 %r2524, %r2523, %r2518; + shf.l.wrap.b32 %r2525, %r2524, %r2524, 25; + add.s32 %r2526, %r2469, %r2067; + add.s32 %r2527, %r2526, %r2478; + xor.b32 %r2528, %r2527, %r2438; + shf.l.wrap.b32 %r2529, %r2528, %r2528, 16; + add.s32 %r2530, %r2529, %r2453; + xor.b32 %r2531, %r2530, %r2469; + shf.l.wrap.b32 %r2532, %r2531, %r2531, 20; + add.s32 %r2533, %r2527, %r2081; + add.s32 %r2534, %r2533, %r2532; + xor.b32 %r2535, %r2534, %r2529; + shf.l.wrap.b32 %r2536, %r2535, %r2535, 24; + add.s32 %r2537, %r2536, %r2530; + xor.b32 %r2538, %r2537, %r2532; + shf.l.wrap.b32 %r2539, %r2538, %r2538, 25; + add.s32 %r2540, %r2511, %r2004; + add.s32 %r2541, %r2540, %r2492; + xor.b32 %r2542, %r2541, %r2536; + shf.l.wrap.b32 %r2543, %r2542, %r2542, 16; + add.s32 %r2544, %r2543, %r2523; + xor.b32 %r2545, %r2544, %r2511; + shf.l.wrap.b32 %r2546, %r2545, %r2545, 20; + add.s32 %r2547, %r2541, %r1976; + add.s32 %r2548, %r2547, %r2546; + xor.b32 %r2549, %r2548, %r2543; + shf.l.wrap.b32 %r2550, %r2549, %r2549, 24; + add.s32 %r2551, %r2550, %r2544; + xor.b32 %r2552, %r2551, %r2546; + shf.l.wrap.b32 %r2553, %r2552, %r2552, 25; + add.s32 %r2554, %r2506, %r2053; + add.s32 %r2555, %r2554, %r2525; + xor.b32 %r2556, %r2494, %r2555; + shf.l.wrap.b32 %r2557, %r2556, %r2556, 16; + add.s32 %r2558, %r2557, %r2537; + xor.b32 %r2559, %r2558, %r2525; + shf.l.wrap.b32 %r2560, %r2559, %r2559, 20; + add.s32 %r2561, %r2555, %r1990; + add.s32 %r2562, %r2561, %r2560; + xor.b32 %r2563, %r2562, %r2557; + shf.l.wrap.b32 %r2564, %r2563, %r2563, 24; + add.s32 %r2565, %r2564, %r2558; + xor.b32 %r2566, %r2565, %r2560; + shf.l.wrap.b32 %r2567, %r2566, %r2566, 25; + add.s32 %r2568, %r2520, %r2011; + add.s32 %r2569, %r2568, %r2539; + xor.b32 %r2570, %r2569, %r2508; + shf.l.wrap.b32 %r2571, %r2570, %r2570, 16; + add.s32 %r2572, %r2571, %r2495; + xor.b32 %r2573, %r2572, %r2539; + shf.l.wrap.b32 %r2574, %r2573, %r2573, 20; + add.s32 %r2575, %r2569, %r2032; + add.s32 %r2576, %r2575, %r2574; + xor.b32 %r2577, %r2576, %r2571; + shf.l.wrap.b32 %r2578, %r2577, %r2577, 24; + add.s32 %r2579, %r2578, %r2572; + xor.b32 %r2580, %r2579, %r2574; + shf.l.wrap.b32 %r2581, %r2580, %r2580, 25; + add.s32 %r2582, %r2534, %r1983; + add.s32 %r2583, %r2582, %r2497; + xor.b32 %r2584, %r2583, %r2522; + shf.l.wrap.b32 %r2585, %r2584, %r2584, 16; + add.s32 %r2586, %r2585, %r2509; + xor.b32 %r2587, %r2586, %r2497; + shf.l.wrap.b32 %r2588, %r2587, %r2587, 20; + add.s32 %r2589, %r2583, %r2018; + add.s32 %r2590, %r2589, %r2588; + xor.b32 %r2591, %r2590, %r2585; + shf.l.wrap.b32 %r2592, %r2591, %r2591, 24; + add.s32 %r2593, %r2592, %r2586; + xor.b32 %r2594, %r2593, %r2588; + shf.l.wrap.b32 %r2595, %r2594, %r2594, 25; + add.s32 %r2596, %r2548, %r2060; + add.s32 %r2597, %r2596, %r2595; + xor.b32 %r2598, %r2597, %r2564; + shf.l.wrap.b32 %r2599, %r2598, %r2598, 16; + add.s32 %r2600, %r2599, %r2579; + xor.b32 %r2601, %r2600, %r2595; + shf.l.wrap.b32 %r2602, %r2601, %r2601, 20; + add.s32 %r2603, %r2597, %r2067; + add.s32 %r2604, %r2603, %r2602; + xor.b32 %r2605, %r2604, %r2599; + shf.l.wrap.b32 %r2606, %r2605, %r2605, 24; + add.s32 %r2607, %r2606, %r2600; + xor.b32 %r2608, %r2607, %r2602; + shf.l.wrap.b32 %r2609, %r2608, %r2608, 25; + add.s32 %r2610, %r2562, %r2039; + add.s32 %r2611, %r2610, %r2553; + xor.b32 %r2612, %r2611, %r2578; + shf.l.wrap.b32 %r2613, %r2612, %r2612, 16; + add.s32 %r2614, %r2613, %r2593; + xor.b32 %r2615, %r2614, %r2553; + shf.l.wrap.b32 %r2616, %r2615, %r2615, 20; + add.s32 %r2617, %r2611, %r2053; + add.s32 %r2618, %r2617, %r2616; + xor.b32 %r2619, 
%r2618, %r2613; + shf.l.wrap.b32 %r2620, %r2619, %r2619, 24; + add.s32 %r2621, %r2620, %r2614; + xor.b32 %r2622, %r2621, %r2616; + shf.l.wrap.b32 %r2623, %r2622, %r2622, 25; + add.s32 %r2624, %r2576, %r2081; + add.s32 %r2625, %r2624, %r2567; + xor.b32 %r2626, %r2592, %r2625; + shf.l.wrap.b32 %r2627, %r2626, %r2626, 16; + add.s32 %r2628, %r2627, %r2551; + xor.b32 %r2629, %r2628, %r2567; + shf.l.wrap.b32 %r2630, %r2629, %r2629, 20; + add.s32 %r2631, %r2625, %r2046; + add.s32 %r2632, %r2631, %r2630; + xor.b32 %r2633, %r2632, %r2627; + shf.l.wrap.b32 %r2634, %r2633, %r2633, 24; + add.s32 %r2635, %r2634, %r2628; + xor.b32 %r2636, %r2635, %r2630; + shf.l.wrap.b32 %r2637, %r2636, %r2636, 25; + add.s32 %r2638, %r2581, %r2074; + add.s32 %r2639, %r2638, %r2590; + xor.b32 %r2640, %r2639, %r2550; + shf.l.wrap.b32 %r2641, %r2640, %r2640, 16; + add.s32 %r2642, %r2641, %r2565; + xor.b32 %r2643, %r2642, %r2581; + shf.l.wrap.b32 %r2644, %r2643, %r2643, 20; + add.s32 %r2645, %r2639, %r2032; + add.s32 %r2646, %r2645, %r2644; + xor.b32 %r2647, %r2646, %r2641; + shf.l.wrap.b32 %r2648, %r2647, %r2647, 24; + add.s32 %r2649, %r2648, %r2642; + xor.b32 %r2650, %r2649, %r2644; + shf.l.wrap.b32 %r2651, %r2650, %r2650, 25; + add.s32 %r2652, %r2623, %r2025; + add.s32 %r2653, %r2652, %r2604; + xor.b32 %r2654, %r2653, %r2648; + shf.l.wrap.b32 %r2655, %r2654, %r2654, 16; + add.s32 %r2656, %r2655, %r2635; + xor.b32 %r2657, %r2656, %r2623; + shf.l.wrap.b32 %r2658, %r2657, %r2657, 20; + add.s32 %r2659, %r2653, %r1990; + add.s32 %r2660, %r2659, %r2658; + xor.b32 %r2661, %r2660, %r2655; + shf.l.wrap.b32 %r2662, %r2661, %r2661, 24; + add.s32 %r2663, %r2662, %r2656; + xor.b32 %r2664, %r2663, %r2658; + shf.l.wrap.b32 %r2665, %r2664, %r2664, 25; + add.s32 %r2666, %r2618, %r2011; + add.s32 %r2667, %r2666, %r2637; + xor.b32 %r2668, %r2606, %r2667; + shf.l.wrap.b32 %r2669, %r2668, %r2668, 16; + add.s32 %r2670, %r2669, %r2649; + xor.b32 %r2671, %r2670, %r2637; + shf.l.wrap.b32 %r2672, %r2671, %r2671, 20; + add.s32 %r2673, %r2667, %r1997; + add.s32 %r2674, %r2673, %r2672; + xor.b32 %r2675, %r2674, %r2669; + shf.l.wrap.b32 %r2676, %r2675, %r2675, 24; + add.s32 %r2677, %r2676, %r2670; + xor.b32 %r2678, %r2677, %r2672; + shf.l.wrap.b32 %r2679, %r2678, %r2678, 25; + add.s32 %r2680, %r2632, %r1976; + add.s32 %r2681, %r2680, %r2651; + xor.b32 %r2682, %r2681, %r2620; + shf.l.wrap.b32 %r2683, %r2682, %r2682, 16; + add.s32 %r2684, %r2683, %r2607; + xor.b32 %r2685, %r2684, %r2651; + shf.l.wrap.b32 %r2686, %r2685, %r2685, 20; + add.s32 %r2687, %r2681, %r1983; + add.s32 %r2688, %r2687, %r2686; + xor.b32 %r2689, %r2688, %r2683; + shf.l.wrap.b32 %r2690, %r2689, %r2689, 24; + add.s32 %r2691, %r2690, %r2684; + xor.b32 %r2692, %r2691, %r2686; + shf.l.wrap.b32 %r2693, %r2692, %r2692, 25; + add.s32 %r2694, %r2646, %r2018; + add.s32 %r2695, %r2694, %r2609; + xor.b32 %r2696, %r2695, %r2634; + shf.l.wrap.b32 %r2697, %r2696, %r2696, 16; + add.s32 %r2698, %r2697, %r2621; + xor.b32 %r2699, %r2698, %r2609; + shf.l.wrap.b32 %r2700, %r2699, %r2699, 20; + add.s32 %r2701, %r2695, %r2004; + add.s32 %r2702, %r2701, %r2700; + xor.b32 %r2703, %r2702, %r2697; + shf.l.wrap.b32 %r2704, %r2703, %r2703, 24; + add.s32 %r2705, %r2704, %r2698; + xor.b32 %r2706, %r2705, %r2700; + shf.l.wrap.b32 %r2707, %r2706, %r2706, 25; + add.s32 %r2708, %r2660, %r2039; + add.s32 %r2709, %r2708, %r2707; + xor.b32 %r2710, %r2709, %r2676; + shf.l.wrap.b32 %r2711, %r2710, %r2710, 16; + add.s32 %r2712, %r2711, %r2691; + xor.b32 %r2713, %r2712, %r2707; + shf.l.wrap.b32 %r2714, %r2713, %r2713, 20; 
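+// Eight G applications -- four down the state columns, then four across
+// the diagonals -- make one BLAKE3 round, and the compiler has fully
+// unrolled all seven rounds per 64-byte block, which is why the same
+// add/xor/rotate pattern repeats here with no loop structure.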
+ add.s32 %r2715, %r2709, %r2074; + add.s32 %r2716, %r2715, %r2714; + xor.b32 %r2717, %r2716, %r2711; + shf.l.wrap.b32 %r2718, %r2717, %r2717, 24; + add.s32 %r2719, %r2718, %r2712; + xor.b32 %r2720, %r2719, %r2714; + shf.l.wrap.b32 %r2721, %r2720, %r2720, 25; + add.s32 %r2722, %r2674, %r2053; + add.s32 %r2723, %r2722, %r2665; + xor.b32 %r2724, %r2723, %r2690; + shf.l.wrap.b32 %r2725, %r2724, %r2724, 16; + add.s32 %r2726, %r2725, %r2705; + xor.b32 %r2727, %r2726, %r2665; + shf.l.wrap.b32 %r2728, %r2727, %r2727, 20; + add.s32 %r2729, %r2723, %r2011; + add.s32 %r2730, %r2729, %r2728; + xor.b32 %r2731, %r2730, %r2725; + shf.l.wrap.b32 %r2732, %r2731, %r2731, 24; + add.s32 %r2733, %r2732, %r2726; + xor.b32 %r2734, %r2733, %r2728; + shf.l.wrap.b32 %r2735, %r2734, %r2734, 25; + add.s32 %r2736, %r2688, %r2032; + add.s32 %r2737, %r2736, %r2679; + xor.b32 %r2738, %r2704, %r2737; + shf.l.wrap.b32 %r2739, %r2738, %r2738, 16; + add.s32 %r2740, %r2739, %r2663; + xor.b32 %r2741, %r2740, %r2679; + shf.l.wrap.b32 %r2742, %r2741, %r2741, 20; + add.s32 %r2743, %r2737, %r2060; + add.s32 %r2744, %r2743, %r2742; + xor.b32 %r2745, %r2744, %r2739; + shf.l.wrap.b32 %r2746, %r2745, %r2745, 24; + add.s32 %r2747, %r2746, %r2740; + xor.b32 %r2748, %r2747, %r2742; + shf.l.wrap.b32 %r2749, %r2748, %r2748, 25; + add.s32 %r2750, %r2693, %r2081; + add.s32 %r2751, %r2750, %r2702; + xor.b32 %r2752, %r2751, %r2662; + shf.l.wrap.b32 %r2753, %r2752, %r2752, 16; + add.s32 %r2754, %r2753, %r2677; + xor.b32 %r2755, %r2754, %r2693; + shf.l.wrap.b32 %r2756, %r2755, %r2755, 20; + add.s32 %r2757, %r2751, %r1983; + add.s32 %r2758, %r2757, %r2756; + xor.b32 %r2759, %r2758, %r2753; + shf.l.wrap.b32 %r2760, %r2759, %r2759, 24; + add.s32 %r2761, %r2760, %r2754; + xor.b32 %r2762, %r2761, %r2756; + shf.l.wrap.b32 %r2763, %r2762, %r2762, 25; + add.s32 %r2764, %r2735, %r2067; + add.s32 %r2765, %r2764, %r2716; + xor.b32 %r2766, %r2765, %r2760; + shf.l.wrap.b32 %r2767, %r2766, %r2766, 16; + add.s32 %r2768, %r2767, %r2747; + xor.b32 %r2769, %r2768, %r2735; + shf.l.wrap.b32 %r2770, %r2769, %r2769, 20; + add.s32 %r2771, %r2765, %r1997; + add.s32 %r2772, %r2771, %r2770; + xor.b32 %r2773, %r2772, %r2767; + shf.l.wrap.b32 %r2774, %r2773, %r2773, 24; + add.s32 %r2775, %r2774, %r2768; + xor.b32 %r2776, %r2775, %r2770; + shf.l.wrap.b32 %r2777, %r2776, %r2776, 25; + add.s32 %r2778, %r2730, %r1976; + add.s32 %r2779, %r2778, %r2749; + xor.b32 %r2780, %r2718, %r2779; + shf.l.wrap.b32 %r2781, %r2780, %r2780, 16; + add.s32 %r2782, %r2781, %r2761; + xor.b32 %r2783, %r2782, %r2749; + shf.l.wrap.b32 %r2784, %r2783, %r2783, 20; + add.s32 %r2785, %r2779, %r2046; + add.s32 %r2786, %r2785, %r2784; + xor.b32 %r2787, %r2786, %r2781; + shf.l.wrap.b32 %r2788, %r2787, %r2787, 24; + add.s32 %r2789, %r2788, %r2782; + xor.b32 %r2790, %r2789, %r2784; + shf.l.wrap.b32 %r2791, %r2790, %r2790, 25; + add.s32 %r2792, %r2744, %r1990; + add.s32 %r2793, %r2792, %r2763; + xor.b32 %r2794, %r2793, %r2732; + shf.l.wrap.b32 %r2795, %r2794, %r2794, 16; + add.s32 %r2796, %r2795, %r2719; + xor.b32 %r2797, %r2796, %r2763; + shf.l.wrap.b32 %r2798, %r2797, %r2797, 20; + add.s32 %r2799, %r2793, %r2018; + add.s32 %r2800, %r2799, %r2798; + xor.b32 %r2801, %r2800, %r2795; + shf.l.wrap.b32 %r2802, %r2801, %r2801, 24; + add.s32 %r2803, %r2802, %r2796; + xor.b32 %r2804, %r2803, %r2798; + shf.l.wrap.b32 %r2805, %r2804, %r2804, 25; + add.s32 %r2806, %r2758, %r2004; + add.s32 %r2807, %r2806, %r2721; + xor.b32 %r2808, %r2807, %r2746; + shf.l.wrap.b32 %r2809, %r2808, %r2808, 16; + add.s32 %r2810, %r2809, 
%r2733; + xor.b32 %r2811, %r2810, %r2721; + shf.l.wrap.b32 %r2812, %r2811, %r2811, 20; + add.s32 %r2813, %r2807, %r2025; + add.s32 %r2814, %r2813, %r2812; + xor.b32 %r2815, %r2814, %r2809; + shf.l.wrap.b32 %r2816, %r2815, %r2815, 24; + add.s32 %r2817, %r2816, %r2810; + xor.b32 %r2818, %r2817, %r2812; + shf.l.wrap.b32 %r2819, %r2818, %r2818, 25; + add.s32 %r2820, %r2772, %r2053; + add.s32 %r2821, %r2820, %r2819; + xor.b32 %r2822, %r2821, %r2788; + shf.l.wrap.b32 %r2823, %r2822, %r2822, 16; + add.s32 %r2824, %r2823, %r2803; + xor.b32 %r2825, %r2824, %r2819; + shf.l.wrap.b32 %r2826, %r2825, %r2825, 20; + add.s32 %r2827, %r2821, %r2081; + add.s32 %r2828, %r2827, %r2826; + xor.b32 %r2829, %r2828, %r2823; + shf.l.wrap.b32 %r2830, %r2829, %r2829, 24; + add.s32 %r2831, %r2830, %r2824; + xor.b32 %r2832, %r2831, %r2826; + shf.l.wrap.b32 %r2833, %r2832, %r2832, 25; + add.s32 %r2834, %r2786, %r2011; + add.s32 %r2835, %r2834, %r2777; + xor.b32 %r2836, %r2835, %r2802; + shf.l.wrap.b32 %r2837, %r2836, %r2836, 16; + add.s32 %r2838, %r2837, %r2817; + xor.b32 %r2839, %r2838, %r2777; + shf.l.wrap.b32 %r2840, %r2839, %r2839, 20; + add.s32 %r2841, %r2835, %r1976; + add.s32 %r2842, %r2841, %r2840; + xor.b32 %r2843, %r2842, %r2837; + shf.l.wrap.b32 %r2844, %r2843, %r2843, 24; + add.s32 %r2845, %r2844, %r2838; + xor.b32 %r2846, %r2845, %r2840; + shf.l.wrap.b32 %r2847, %r2846, %r2846, 25; + add.s32 %r2848, %r2800, %r1983; + add.s32 %r2849, %r2848, %r2791; + xor.b32 %r2850, %r2816, %r2849; + shf.l.wrap.b32 %r2851, %r2850, %r2850, 16; + add.s32 %r2852, %r2851, %r2775; + xor.b32 %r2853, %r2852, %r2791; + shf.l.wrap.b32 %r2854, %r2853, %r2853, 20; + add.s32 %r2855, %r2849, %r2039; + add.s32 %r2856, %r2855, %r2854; + xor.b32 %r2857, %r2856, %r2851; + shf.l.wrap.b32 %r2858, %r2857, %r2857, 24; + add.s32 %r2859, %r2858, %r2852; + xor.b32 %r2860, %r2859, %r2854; + shf.l.wrap.b32 %r2861, %r2860, %r2860, 25; + add.s32 %r2862, %r2805, %r2032; + add.s32 %r2863, %r2862, %r2814; + xor.b32 %r2864, %r2863, %r2774; + shf.l.wrap.b32 %r2865, %r2864, %r2864, 16; + add.s32 %r2866, %r2865, %r2789; + xor.b32 %r2867, %r2866, %r2805; + shf.l.wrap.b32 %r2868, %r2867, %r2867, 20; + add.s32 %r2869, %r2863, %r2018; + add.s32 %r2870, %r2869, %r2868; + xor.b32 %r2871, %r2870, %r2865; + shf.l.wrap.b32 %r2872, %r2871, %r2871, 24; + add.s32 %r2873, %r2872, %r2866; + xor.b32 %r2874, %r2873, %r2868; + shf.l.wrap.b32 %r2875, %r2874, %r2874, 25; + add.s32 %r2876, %r2847, %r2074; + add.s32 %r2877, %r2876, %r2828; + xor.b32 %r2878, %r2877, %r2872; + shf.l.wrap.b32 %r2879, %r2878, %r2878, 16; + add.s32 %r2880, %r2879, %r2859; + xor.b32 %r2881, %r2880, %r2847; + shf.l.wrap.b32 %r2882, %r2881, %r2881, 20; + add.s32 %r2883, %r2877, %r2046; + add.s32 %r2884, %r2883, %r2882; + xor.b32 %r2885, %r2884, %r2879; + shf.l.wrap.b32 %r2886, %r2885, %r2885, 24; + add.s32 %r2887, %r2886, %r2880; + xor.b32 %r2888, %r2887, %r2882; + shf.l.wrap.b32 %r2889, %r2888, %r2888, 25; + add.s32 %r2890, %r2842, %r1990; + add.s32 %r2891, %r2890, %r2861; + xor.b32 %r2892, %r2830, %r2891; + shf.l.wrap.b32 %r2893, %r2892, %r2892, 16; + add.s32 %r2894, %r2893, %r2873; + xor.b32 %r2895, %r2894, %r2861; + shf.l.wrap.b32 %r2896, %r2895, %r2895, 20; + add.s32 %r2897, %r2891, %r2060; + add.s32 %r2898, %r2897, %r2896; + xor.b32 %r2899, %r2898, %r2893; + shf.l.wrap.b32 %r2900, %r2899, %r2899, 24; + add.s32 %r2901, %r2900, %r2894; + xor.b32 %r2902, %r2901, %r2896; + shf.l.wrap.b32 %r2903, %r2902, %r2902, 25; + add.s32 %r2904, %r2856, %r1997; + add.s32 %r2905, %r2904, %r2875; + xor.b32 %r2906, 
%r2905, %r2844; + shf.l.wrap.b32 %r2907, %r2906, %r2906, 16; + add.s32 %r2908, %r2907, %r2831; + xor.b32 %r2909, %r2908, %r2875; + shf.l.wrap.b32 %r2910, %r2909, %r2909, 20; + add.s32 %r2911, %r2905, %r2004; + add.s32 %r2912, %r2911, %r2910; + xor.b32 %r2913, %r2912, %r2907; + shf.l.wrap.b32 %r2914, %r2913, %r2913, 24; + add.s32 %r2915, %r2914, %r2908; + xor.b32 %r2916, %r2915, %r2910; + shf.l.wrap.b32 %r2917, %r2916, %r2916, 25; + add.s32 %r2918, %r2870, %r2025; + add.s32 %r2919, %r2918, %r2833; + xor.b32 %r2920, %r2919, %r2858; + shf.l.wrap.b32 %r2921, %r2920, %r2920, 16; + add.s32 %r2922, %r2921, %r2845; + xor.b32 %r2923, %r2922, %r2833; + shf.l.wrap.b32 %r2924, %r2923, %r2923, 20; + add.s32 %r2925, %r2919, %r2067; + add.s32 %r2926, %r2925, %r2924; + xor.b32 %r2927, %r2926, %r2921; + shf.l.wrap.b32 %r2928, %r2927, %r2927, 24; + add.s32 %r2929, %r2928, %r2922; + xor.b32 %r2930, %r2929, %r2924; + shf.l.wrap.b32 %r2931, %r2930, %r2930, 25; + xor.b32 %r27, %r2915, %r2884; + xor.b32 %r28, %r2929, %r2898; + xor.b32 %r29, %r2887, %r2912; + xor.b32 %r30, %r2926, %r2901; + xor.b32 %r31, %r2931, %r2900; + xor.b32 %r32, %r2889, %r2914; + xor.b32 %r33, %r2928, %r2903; + xor.b32 %r34, %r2917, %r2886; + popc.b64 %r2932, %rd128; + cvt.u64.u32 %rd30, %r2932; + ld.local.u8 %rs127, [%rd3+8]; + cvt.u64.u16 %rd130, %rs127; + setp.ge.u64 %p15, %rd30, %rd130; + mul.wide.u16 %r11659, %rs127, 32; + @%p15 bra $L__BB1_19; + +$L__BB1_18: + add.s32 %r2933, %r11659, -64; + cvt.s64.s32 %rd131, %r2933; + add.s64 %rd132, %rd2, %rd131; + ld.local.u8 %r2934, [%rd3+2]; + ld.local.u8 %r2935, [%rd132+145]; + ld.local.u8 %r2936, [%rd132+146]; + prmt.b32 %r2937, %r2936, %r2935, 30212; + ld.local.u8 %r2938, [%rd132+147]; + prmt.b32 %r2939, %r2938, %r2937, 28756; + ld.local.u8 %r2940, [%rd132+148]; + prmt.b32 %r2941, %r2940, %r2939, 1620; + ld.local.u8 %r2942, [%rd132+149]; + ld.local.u8 %r2943, [%rd132+150]; + prmt.b32 %r2944, %r2943, %r2942, 30212; + ld.local.u8 %r2945, [%rd132+151]; + prmt.b32 %r2946, %r2945, %r2944, 28756; + ld.local.u8 %r2947, [%rd132+152]; + prmt.b32 %r2948, %r2947, %r2946, 1620; + ld.local.u8 %r2949, [%rd132+153]; + ld.local.u8 %r2950, [%rd132+154]; + prmt.b32 %r2951, %r2950, %r2949, 30212; + ld.local.u8 %r2952, [%rd132+155]; + prmt.b32 %r2953, %r2952, %r2951, 28756; + ld.local.u8 %r2954, [%rd132+156]; + prmt.b32 %r2955, %r2954, %r2953, 1620; + ld.local.u8 %r2956, [%rd132+157]; + ld.local.u8 %r2957, [%rd132+158]; + prmt.b32 %r2958, %r2957, %r2956, 30212; + ld.local.u8 %r2959, [%rd132+159]; + prmt.b32 %r2960, %r2959, %r2958, 28756; + ld.local.u8 %r2961, [%rd132+160]; + prmt.b32 %r2962, %r2961, %r2960, 1620; + ld.local.u8 %r2963, [%rd132+161]; + ld.local.u8 %r2964, [%rd132+162]; + prmt.b32 %r2965, %r2964, %r2963, 30212; + ld.local.u8 %r2966, [%rd132+163]; + prmt.b32 %r2967, %r2966, %r2965, 28756; + ld.local.u8 %r2968, [%rd132+164]; + prmt.b32 %r2969, %r2968, %r2967, 1620; + ld.local.u8 %r2970, [%rd132+165]; + ld.local.u8 %r2971, [%rd132+166]; + prmt.b32 %r2972, %r2971, %r2970, 30212; + ld.local.u8 %r2973, [%rd132+167]; + prmt.b32 %r2974, %r2973, %r2972, 28756; + ld.local.u8 %r2975, [%rd132+168]; + prmt.b32 %r2976, %r2975, %r2974, 1620; + ld.local.u8 %r2977, [%rd132+169]; + ld.local.u8 %r2978, [%rd132+170]; + prmt.b32 %r2979, %r2978, %r2977, 30212; + ld.local.u8 %r2980, [%rd132+171]; + prmt.b32 %r2981, %r2980, %r2979, 28756; + ld.local.u8 %r2982, [%rd132+172]; + prmt.b32 %r2983, %r2982, %r2981, 1620; + ld.local.u8 %r2984, [%rd132+173]; + ld.local.u8 %r2985, [%rd132+174]; + prmt.b32 %r2986, %r2985, %r2984, 
30212; + ld.local.u8 %r2987, [%rd132+175]; + prmt.b32 %r2988, %r2987, %r2986, 28756; + ld.local.u8 %r2989, [%rd132+176]; + prmt.b32 %r2990, %r2989, %r2988, 1620; + ld.local.u8 %r2991, [%rd132+177]; + ld.local.u8 %r2992, [%rd132+178]; + prmt.b32 %r2993, %r2992, %r2991, 30212; + ld.local.u8 %r2994, [%rd132+179]; + prmt.b32 %r2995, %r2994, %r2993, 28756; + ld.local.u8 %r2996, [%rd132+180]; + prmt.b32 %r2997, %r2996, %r2995, 1620; + ld.local.u8 %r2998, [%rd132+181]; + ld.local.u8 %r2999, [%rd132+182]; + prmt.b32 %r3000, %r2999, %r2998, 30212; + ld.local.u8 %r3001, [%rd132+183]; + prmt.b32 %r3002, %r3001, %r3000, 28756; + ld.local.u8 %r3003, [%rd132+184]; + prmt.b32 %r3004, %r3003, %r3002, 1620; + ld.local.u8 %r3005, [%rd132+185]; + ld.local.u8 %r3006, [%rd132+186]; + prmt.b32 %r3007, %r3006, %r3005, 30212; + ld.local.u8 %r3008, [%rd132+187]; + prmt.b32 %r3009, %r3008, %r3007, 28756; + ld.local.u8 %r3010, [%rd132+188]; + prmt.b32 %r3011, %r3010, %r3009, 1620; + ld.local.u8 %r3012, [%rd132+189]; + ld.local.u8 %r3013, [%rd132+190]; + prmt.b32 %r3014, %r3013, %r3012, 30212; + ld.local.u8 %r3015, [%rd132+191]; + prmt.b32 %r3016, %r3015, %r3014, 28756; + ld.local.u8 %r3017, [%rd132+192]; + prmt.b32 %r3018, %r3017, %r3016, 1620; + ld.local.u8 %r3019, [%rd132+193]; + ld.local.u8 %r3020, [%rd132+194]; + prmt.b32 %r3021, %r3020, %r3019, 30212; + ld.local.u8 %r3022, [%rd132+195]; + prmt.b32 %r3023, %r3022, %r3021, 28756; + ld.local.u8 %r3024, [%rd132+196]; + prmt.b32 %r3025, %r3024, %r3023, 1620; + ld.local.u8 %r3026, [%rd132+197]; + ld.local.u8 %r3027, [%rd132+198]; + prmt.b32 %r3028, %r3027, %r3026, 30212; + ld.local.u8 %r3029, [%rd132+199]; + prmt.b32 %r3030, %r3029, %r3028, 28756; + ld.local.u8 %r3031, [%rd132+200]; + prmt.b32 %r3032, %r3031, %r3030, 1620; + ld.local.u8 %r3033, [%rd132+201]; + ld.local.u8 %r3034, [%rd132+202]; + prmt.b32 %r3035, %r3034, %r3033, 30212; + ld.local.u8 %r3036, [%rd132+203]; + prmt.b32 %r3037, %r3036, %r3035, 28756; + ld.local.u8 %r3038, [%rd132+204]; + prmt.b32 %r3039, %r3038, %r3037, 1620; + ld.local.u8 %r3040, [%rd132+205]; + ld.local.u8 %r3041, [%rd132+206]; + prmt.b32 %r3042, %r3041, %r3040, 30212; + ld.local.u8 %r3043, [%rd132+207]; + prmt.b32 %r3044, %r3043, %r3042, 28756; + ld.local.u8 %r3045, [%rd132+208]; + prmt.b32 %r3046, %r3045, %r3044, 1620; + or.b32 %r3047, %r2934, 4; + ld.local.u8 %r3048, [%rd3+-120]; + ld.local.u8 %r3049, [%rd3+-119]; + prmt.b32 %r3050, %r3049, %r3048, 30212; + ld.local.u8 %r3051, [%rd3+-118]; + ld.local.u8 %r3052, [%rd3+-117]; + prmt.b32 %r3053, %r3052, %r3051, 30212; + prmt.b32 %r3054, %r3053, %r3050, 4180; + ld.local.u8 %r3055, [%rd3+-136]; + ld.local.u8 %r3056, [%rd3+-135]; + prmt.b32 %r3057, %r3056, %r3055, 30212; + ld.local.u8 %r3058, [%rd3+-134]; + ld.local.u8 %r3059, [%rd3+-133]; + prmt.b32 %r3060, %r3059, %r3058, 30212; + prmt.b32 %r3061, %r3060, %r3057, 4180; + add.s32 %r3062, %r3054, %r3061; + add.s32 %r3063, %r3062, %r2941; + shf.l.wrap.b32 %r3064, %r3063, %r3063, 16; + add.s32 %r3065, %r3064, 1779033703; + xor.b32 %r3066, %r3065, %r3054; + shf.l.wrap.b32 %r3067, %r3066, %r3066, 20; + add.s32 %r3068, %r2948, %r3063; + add.s32 %r3069, %r3068, %r3067; + xor.b32 %r3070, %r3069, %r3064; + shf.l.wrap.b32 %r3071, %r3070, %r3070, 24; + add.s32 %r3072, %r3071, %r3065; + xor.b32 %r3073, %r3072, %r3067; + shf.l.wrap.b32 %r3074, %r3073, %r3073, 25; + ld.local.u8 %r3075, [%rd3+-116]; + ld.local.u8 %r3076, [%rd3+-115]; + prmt.b32 %r3077, %r3076, %r3075, 30212; + ld.local.u8 %r3078, [%rd3+-114]; + ld.local.u8 %r3079, [%rd3+-113]; + prmt.b32 
%r3080, %r3079, %r3078, 30212; + prmt.b32 %r3081, %r3080, %r3077, 4180; + ld.local.u8 %r3082, [%rd3+-132]; + ld.local.u8 %r3083, [%rd3+-131]; + prmt.b32 %r3084, %r3083, %r3082, 30212; + ld.local.u8 %r3085, [%rd3+-130]; + ld.local.u8 %r3086, [%rd3+-129]; + prmt.b32 %r3087, %r3086, %r3085, 30212; + prmt.b32 %r3088, %r3087, %r3084, 4180; + add.s32 %r3089, %r3081, %r3088; + add.s32 %r3090, %r3089, %r2955; + shf.l.wrap.b32 %r3091, %r3090, %r3090, 16; + add.s32 %r3092, %r3091, -1150833019; + xor.b32 %r3093, %r3092, %r3081; + shf.l.wrap.b32 %r3094, %r3093, %r3093, 20; + add.s32 %r3095, %r2962, %r3090; + add.s32 %r3096, %r3095, %r3094; + xor.b32 %r3097, %r3096, %r3091; + shf.l.wrap.b32 %r3098, %r3097, %r3097, 24; + add.s32 %r3099, %r3098, %r3092; + xor.b32 %r3100, %r3099, %r3094; + shf.l.wrap.b32 %r3101, %r3100, %r3100, 25; + ld.local.u8 %r3102, [%rd3+-112]; + ld.local.u8 %r3103, [%rd3+-111]; + prmt.b32 %r3104, %r3103, %r3102, 30212; + ld.local.u8 %r3105, [%rd3+-110]; + ld.local.u8 %r3106, [%rd3+-109]; + prmt.b32 %r3107, %r3106, %r3105, 30212; + prmt.b32 %r3108, %r3107, %r3104, 4180; + ld.local.u8 %r3109, [%rd3+-128]; + ld.local.u8 %r3110, [%rd3+-127]; + prmt.b32 %r3111, %r3110, %r3109, 30212; + ld.local.u8 %r3112, [%rd3+-126]; + ld.local.u8 %r3113, [%rd3+-125]; + prmt.b32 %r3114, %r3113, %r3112, 30212; + prmt.b32 %r3115, %r3114, %r3111, 4180; + add.s32 %r3116, %r3108, %r3115; + add.s32 %r3117, %r3116, %r2969; + shr.u32 %r3118, %r3117, 16; + shl.b32 %r3119, %r3117, 16; + xor.b32 %r3120, %r3119, 4194304; + or.b32 %r3121, %r3120, %r3118; + add.s32 %r3122, %r3121, 1013904242; + xor.b32 %r3123, %r3122, %r3108; + shf.l.wrap.b32 %r3124, %r3123, %r3123, 20; + add.s32 %r3125, %r2976, %r3117; + add.s32 %r3126, %r3125, %r3124; + xor.b32 %r3127, %r3126, %r3121; + shf.l.wrap.b32 %r3128, %r3127, %r3127, 24; + add.s32 %r3129, %r3128, %r3122; + xor.b32 %r3130, %r3129, %r3124; + shf.l.wrap.b32 %r3131, %r3130, %r3130, 25; + ld.local.u8 %r3132, [%rd3+-108]; + ld.local.u8 %r3133, [%rd3+-107]; + prmt.b32 %r3134, %r3133, %r3132, 30212; + ld.local.u8 %r3135, [%rd3+-106]; + ld.local.u8 %r3136, [%rd3+-105]; + prmt.b32 %r3137, %r3136, %r3135, 30212; + prmt.b32 %r3138, %r3137, %r3134, 4180; + ld.local.u8 %r3139, [%rd3+-124]; + ld.local.u8 %r3140, [%rd3+-123]; + prmt.b32 %r3141, %r3140, %r3139, 30212; + ld.local.u8 %r3142, [%rd3+-122]; + ld.local.u8 %r3143, [%rd3+-121]; + prmt.b32 %r3144, %r3143, %r3142, 30212; + prmt.b32 %r3145, %r3144, %r3141, 4180; + add.s32 %r3146, %r3138, %r3145; + add.s32 %r3147, %r3146, %r2983; + xor.b32 %r3148, %r3147, %r3047; + shr.u32 %r3149, %r3147, 16; + shl.b32 %r3150, %r3148, 16; + or.b32 %r3151, %r3150, %r3149; + add.s32 %r3152, %r3151, -1521486534; + xor.b32 %r3153, %r3152, %r3138; + shf.l.wrap.b32 %r3154, %r3153, %r3153, 20; + add.s32 %r3155, %r2990, %r3147; + add.s32 %r3156, %r3155, %r3154; + xor.b32 %r3157, %r3156, %r3151; + shf.l.wrap.b32 %r3158, %r3157, %r3157, 24; + add.s32 %r3159, %r3158, %r3152; + xor.b32 %r3160, %r3159, %r3154; + shf.l.wrap.b32 %r3161, %r3160, %r3160, 25; + add.s32 %r3162, %r3101, %r3069; + add.s32 %r3163, %r3162, %r2997; + xor.b32 %r3164, %r3158, %r3163; + shf.l.wrap.b32 %r3165, %r3164, %r3164, 16; + add.s32 %r3166, %r3165, %r3129; + xor.b32 %r3167, %r3166, %r3101; + shf.l.wrap.b32 %r3168, %r3167, %r3167, 20; + add.s32 %r3169, %r3004, %r3163; + add.s32 %r3170, %r3169, %r3168; + xor.b32 %r3171, %r3170, %r3165; + shf.l.wrap.b32 %r3172, %r3171, %r3171, 24; + add.s32 %r3173, %r3172, %r3166; + xor.b32 %r3174, %r3173, %r3168; + shf.l.wrap.b32 %r3175, %r3174, %r3174, 25; 
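+// The inlined compression starting at $L__BB1_18 above appears to be the
+// chaining-value-stack merge: it reads a 64-byte parent block from the
+// local CV stack (the ld.local.u8/prmt.b32 runs assemble four byte loads
+// into one little-endian 32-bit word) and compresses it keyed by the
+// hasher's key words. The literals 1779033703, -1150833019, 1013904242
+// and -1521486534 are the BLAKE3 IV words 0x6A09E667, 0xBB67AE85,
+// 0x3C6EF372 and 0xA54FF53A in state words v8..v11; the xor with
+// 4194304 (64 << 16) folds in block_len = 64, and `or.b32 ... 4` sets
+// the PARENT domain flag.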
+ add.s32 %r3176, %r3131, %r3096; + add.s32 %r3177, %r3176, %r3011; + xor.b32 %r3178, %r3177, %r3071; + shf.l.wrap.b32 %r3179, %r3178, %r3178, 16; + add.s32 %r3180, %r3179, %r3159; + xor.b32 %r3181, %r3180, %r3131; + shf.l.wrap.b32 %r3182, %r3181, %r3181, 20; + add.s32 %r3183, %r3018, %r3177; + add.s32 %r3184, %r3183, %r3182; + xor.b32 %r3185, %r3184, %r3179; + shf.l.wrap.b32 %r3186, %r3185, %r3185, 24; + add.s32 %r3187, %r3186, %r3180; + xor.b32 %r3188, %r3187, %r3182; + shf.l.wrap.b32 %r3189, %r3188, %r3188, 25; + add.s32 %r3190, %r3161, %r3126; + add.s32 %r3191, %r3190, %r3025; + xor.b32 %r3192, %r3191, %r3098; + shf.l.wrap.b32 %r3193, %r3192, %r3192, 16; + add.s32 %r3194, %r3193, %r3072; + xor.b32 %r3195, %r3194, %r3161; + shf.l.wrap.b32 %r3196, %r3195, %r3195, 20; + add.s32 %r3197, %r3032, %r3191; + add.s32 %r3198, %r3197, %r3196; + xor.b32 %r3199, %r3198, %r3193; + shf.l.wrap.b32 %r3200, %r3199, %r3199, 24; + add.s32 %r3201, %r3200, %r3194; + xor.b32 %r3202, %r3201, %r3196; + shf.l.wrap.b32 %r3203, %r3202, %r3202, 25; + add.s32 %r3204, %r3156, %r3074; + add.s32 %r3205, %r3204, %r3039; + xor.b32 %r3206, %r3205, %r3128; + shf.l.wrap.b32 %r3207, %r3206, %r3206, 16; + add.s32 %r3208, %r3207, %r3099; + xor.b32 %r3209, %r3208, %r3074; + shf.l.wrap.b32 %r3210, %r3209, %r3209, 20; + add.s32 %r3211, %r3046, %r3205; + add.s32 %r3212, %r3211, %r3210; + xor.b32 %r3213, %r3212, %r3207; + shf.l.wrap.b32 %r3214, %r3213, %r3213, 24; + add.s32 %r3215, %r3214, %r3208; + xor.b32 %r3216, %r3215, %r3210; + shf.l.wrap.b32 %r3217, %r3216, %r3216, 25; + add.s32 %r3218, %r3170, %r2955; + add.s32 %r3219, %r3218, %r3217; + xor.b32 %r3220, %r3219, %r3186; + shf.l.wrap.b32 %r3221, %r3220, %r3220, 16; + add.s32 %r3222, %r3221, %r3201; + xor.b32 %r3223, %r3222, %r3217; + shf.l.wrap.b32 %r3224, %r3223, %r3223, 20; + add.s32 %r3225, %r3219, %r2983; + add.s32 %r3226, %r3225, %r3224; + xor.b32 %r3227, %r3226, %r3221; + shf.l.wrap.b32 %r3228, %r3227, %r3227, 24; + add.s32 %r3229, %r3228, %r3222; + xor.b32 %r3230, %r3229, %r3224; + shf.l.wrap.b32 %r3231, %r3230, %r3230, 25; + add.s32 %r3232, %r3184, %r2962; + add.s32 %r3233, %r3232, %r3175; + xor.b32 %r3234, %r3200, %r3233; + shf.l.wrap.b32 %r3235, %r3234, %r3234, 16; + add.s32 %r3236, %r3215, %r3235; + xor.b32 %r3237, %r3236, %r3175; + shf.l.wrap.b32 %r3238, %r3237, %r3237, 20; + add.s32 %r3239, %r3233, %r3011; + add.s32 %r3240, %r3239, %r3238; + xor.b32 %r3241, %r3240, %r3235; + shf.l.wrap.b32 %r3242, %r3241, %r3241, 24; + add.s32 %r3243, %r3242, %r3236; + xor.b32 %r3244, %r3243, %r3238; + shf.l.wrap.b32 %r3245, %r3244, %r3244, 25; + add.s32 %r3246, %r3189, %r2990; + add.s32 %r3247, %r3246, %r3198; + xor.b32 %r3248, %r3214, %r3247; + shf.l.wrap.b32 %r3249, %r3248, %r3248, 16; + add.s32 %r3250, %r3249, %r3173; + xor.b32 %r3251, %r3250, %r3189; + shf.l.wrap.b32 %r3252, %r3251, %r3251, 20; + add.s32 %r3253, %r3247, %r2941; + add.s32 %r3254, %r3253, %r3252; + xor.b32 %r3255, %r3254, %r3249; + shf.l.wrap.b32 %r3256, %r3255, %r3255, 24; + add.s32 %r3257, %r3256, %r3250; + xor.b32 %r3258, %r3257, %r3252; + shf.l.wrap.b32 %r3259, %r3258, %r3258, 25; + add.s32 %r3260, %r3203, %r2969; + add.s32 %r3261, %r3260, %r3212; + xor.b32 %r3262, %r3261, %r3172; + shf.l.wrap.b32 %r3263, %r3262, %r3262, 16; + add.s32 %r3264, %r3263, %r3187; + xor.b32 %r3265, %r3264, %r3203; + shf.l.wrap.b32 %r3266, %r3265, %r3265, 20; + add.s32 %r3267, %r3261, %r3032; + add.s32 %r3268, %r3267, %r3266; + xor.b32 %r3269, %r3268, %r3263; + shf.l.wrap.b32 %r3270, %r3269, %r3269, 24; + add.s32 %r3271, %r3270, 
%r3264; + xor.b32 %r3272, %r3271, %r3266; + shf.l.wrap.b32 %r3273, %r3272, %r3272, 25; + add.s32 %r3274, %r3245, %r2948; + add.s32 %r3275, %r3274, %r3226; + xor.b32 %r3276, %r3275, %r3270; + shf.l.wrap.b32 %r3277, %r3276, %r3276, 16; + add.s32 %r3278, %r3277, %r3257; + xor.b32 %r3279, %r3278, %r3245; + shf.l.wrap.b32 %r3280, %r3279, %r3279, 20; + add.s32 %r3281, %r3275, %r3018; + add.s32 %r3282, %r3281, %r3280; + xor.b32 %r3283, %r3282, %r3277; + shf.l.wrap.b32 %r3284, %r3283, %r3283, 24; + add.s32 %r3285, %r3284, %r3278; + xor.b32 %r3286, %r3285, %r3280; + shf.l.wrap.b32 %r3287, %r3286, %r3286, 25; + add.s32 %r3288, %r3240, %r3025; + add.s32 %r3289, %r3288, %r3259; + xor.b32 %r3290, %r3228, %r3289; + shf.l.wrap.b32 %r3291, %r3290, %r3290, 16; + add.s32 %r3292, %r3291, %r3271; + xor.b32 %r3293, %r3292, %r3259; + shf.l.wrap.b32 %r3294, %r3293, %r3293, 20; + add.s32 %r3295, %r3289, %r2976; + add.s32 %r3296, %r3295, %r3294; + xor.b32 %r3297, %r3296, %r3291; + shf.l.wrap.b32 %r3298, %r3297, %r3297, 24; + add.s32 %r3299, %r3298, %r3292; + xor.b32 %r3300, %r3299, %r3294; + shf.l.wrap.b32 %r3301, %r3300, %r3300, 25; + add.s32 %r3302, %r3254, %r3004; + add.s32 %r3303, %r3302, %r3273; + xor.b32 %r3304, %r3303, %r3242; + shf.l.wrap.b32 %r3305, %r3304, %r3304, 16; + add.s32 %r3306, %r3305, %r3229; + xor.b32 %r3307, %r3306, %r3273; + shf.l.wrap.b32 %r3308, %r3307, %r3307, 20; + add.s32 %r3309, %r3303, %r3039; + add.s32 %r3310, %r3309, %r3308; + xor.b32 %r3311, %r3310, %r3305; + shf.l.wrap.b32 %r3312, %r3311, %r3311, 24; + add.s32 %r3313, %r3312, %r3306; + xor.b32 %r3314, %r3313, %r3308; + shf.l.wrap.b32 %r3315, %r3314, %r3314, 25; + add.s32 %r3316, %r3268, %r3046; + add.s32 %r3317, %r3316, %r3231; + xor.b32 %r3318, %r3317, %r3256; + shf.l.wrap.b32 %r3319, %r3318, %r3318, 16; + add.s32 %r3320, %r3319, %r3243; + xor.b32 %r3321, %r3320, %r3231; + shf.l.wrap.b32 %r3322, %r3321, %r3321, 20; + add.s32 %r3323, %r3317, %r2997; + add.s32 %r3324, %r3323, %r3322; + xor.b32 %r3325, %r3324, %r3319; + shf.l.wrap.b32 %r3326, %r3325, %r3325, 24; + add.s32 %r3327, %r3326, %r3320; + xor.b32 %r3328, %r3327, %r3322; + shf.l.wrap.b32 %r3329, %r3328, %r3328, 25; + add.s32 %r3330, %r3282, %r2962; + add.s32 %r3331, %r3330, %r3329; + xor.b32 %r3332, %r3331, %r3298; + shf.l.wrap.b32 %r3333, %r3332, %r3332, 16; + add.s32 %r3334, %r3333, %r3313; + xor.b32 %r3335, %r3334, %r3329; + shf.l.wrap.b32 %r3336, %r3335, %r3335, 20; + add.s32 %r3337, %r3331, %r2969; + add.s32 %r3338, %r3337, %r3336; + xor.b32 %r3339, %r3338, %r3333; + shf.l.wrap.b32 %r3340, %r3339, %r3339, 24; + add.s32 %r3341, %r3340, %r3334; + xor.b32 %r3342, %r3341, %r3336; + shf.l.wrap.b32 %r3343, %r3342, %r3342, 25; + add.s32 %r3344, %r3296, %r3011; + add.s32 %r3345, %r3344, %r3287; + xor.b32 %r3346, %r3345, %r3312; + shf.l.wrap.b32 %r3347, %r3346, %r3346, 16; + add.s32 %r3348, %r3347, %r3327; + xor.b32 %r3349, %r3348, %r3287; + shf.l.wrap.b32 %r3350, %r3349, %r3349, 20; + add.s32 %r3351, %r3345, %r3025; + add.s32 %r3352, %r3351, %r3350; + xor.b32 %r3353, %r3352, %r3347; + shf.l.wrap.b32 %r3354, %r3353, %r3353, 24; + add.s32 %r3355, %r3354, %r3348; + xor.b32 %r3356, %r3355, %r3350; + shf.l.wrap.b32 %r3357, %r3356, %r3356, 25; + add.s32 %r3358, %r3310, %r3032; + add.s32 %r3359, %r3358, %r3301; + xor.b32 %r3360, %r3326, %r3359; + shf.l.wrap.b32 %r3361, %r3360, %r3360, 16; + add.s32 %r3362, %r3361, %r3285; + xor.b32 %r3363, %r3362, %r3301; + shf.l.wrap.b32 %r3364, %r3363, %r3363, 20; + add.s32 %r3365, %r3359, %r2955; + add.s32 %r3366, %r3365, %r3364; + xor.b32 %r3367, 
%r3366, %r3361; + shf.l.wrap.b32 %r3368, %r3367, %r3367, 24; + add.s32 %r3369, %r3368, %r3362; + xor.b32 %r3370, %r3369, %r3364; + shf.l.wrap.b32 %r3371, %r3370, %r3370, 25; + add.s32 %r3372, %r3315, %r2990; + add.s32 %r3373, %r3372, %r3324; + xor.b32 %r3374, %r3373, %r3284; + shf.l.wrap.b32 %r3375, %r3374, %r3374, 16; + add.s32 %r3376, %r3375, %r3299; + xor.b32 %r3377, %r3376, %r3315; + shf.l.wrap.b32 %r3378, %r3377, %r3377, 20; + add.s32 %r3379, %r3373, %r3039; + add.s32 %r3380, %r3379, %r3378; + xor.b32 %r3381, %r3380, %r3375; + shf.l.wrap.b32 %r3382, %r3381, %r3381, 24; + add.s32 %r3383, %r3382, %r3376; + xor.b32 %r3384, %r3383, %r3378; + shf.l.wrap.b32 %r3385, %r3384, %r3384, 25; + add.s32 %r3386, %r3357, %r2983; + add.s32 %r3387, %r3386, %r3338; + xor.b32 %r3388, %r3387, %r3382; + shf.l.wrap.b32 %r3389, %r3388, %r3388, 16; + add.s32 %r3390, %r3389, %r3369; + xor.b32 %r3391, %r3390, %r3357; + shf.l.wrap.b32 %r3392, %r3391, %r3391, 20; + add.s32 %r3393, %r3387, %r2976; + add.s32 %r3394, %r3393, %r3392; + xor.b32 %r3395, %r3394, %r3389; + shf.l.wrap.b32 %r3396, %r3395, %r3395, 24; + add.s32 %r3397, %r3396, %r3390; + xor.b32 %r3398, %r3397, %r3392; + shf.l.wrap.b32 %r3399, %r3398, %r3398, 25; + add.s32 %r3400, %r3352, %r3004; + add.s32 %r3401, %r3400, %r3371; + xor.b32 %r3402, %r3340, %r3401; + shf.l.wrap.b32 %r3403, %r3402, %r3402, 16; + add.s32 %r3404, %r3403, %r3383; + xor.b32 %r3405, %r3404, %r3371; + shf.l.wrap.b32 %r3406, %r3405, %r3405, 20; + add.s32 %r3407, %r3401, %r2941; + add.s32 %r3408, %r3407, %r3406; + xor.b32 %r3409, %r3408, %r3403; + shf.l.wrap.b32 %r3410, %r3409, %r3409, 24; + add.s32 %r3411, %r3410, %r3404; + xor.b32 %r3412, %r3411, %r3406; + shf.l.wrap.b32 %r3413, %r3412, %r3412, 25; + add.s32 %r3414, %r3366, %r3018; + add.s32 %r3415, %r3414, %r3385; + xor.b32 %r3416, %r3415, %r3354; + shf.l.wrap.b32 %r3417, %r3416, %r3416, 16; + add.s32 %r3418, %r3417, %r3341; + xor.b32 %r3419, %r3418, %r3385; + shf.l.wrap.b32 %r3420, %r3419, %r3419, 20; + add.s32 %r3421, %r3415, %r3046; + add.s32 %r3422, %r3421, %r3420; + xor.b32 %r3423, %r3422, %r3417; + shf.l.wrap.b32 %r3424, %r3423, %r3423, 24; + add.s32 %r3425, %r3424, %r3418; + xor.b32 %r3426, %r3425, %r3420; + shf.l.wrap.b32 %r3427, %r3426, %r3426, 25; + add.s32 %r3428, %r3380, %r2997; + add.s32 %r3429, %r3428, %r3343; + xor.b32 %r3430, %r3429, %r3368; + shf.l.wrap.b32 %r3431, %r3430, %r3430, 16; + add.s32 %r3432, %r3431, %r3355; + xor.b32 %r3433, %r3432, %r3343; + shf.l.wrap.b32 %r3434, %r3433, %r3433, 20; + add.s32 %r3435, %r3429, %r2948; + add.s32 %r3436, %r3435, %r3434; + xor.b32 %r3437, %r3436, %r3431; + shf.l.wrap.b32 %r3438, %r3437, %r3437, 24; + add.s32 %r3439, %r3438, %r3432; + xor.b32 %r3440, %r3439, %r3434; + shf.l.wrap.b32 %r3441, %r3440, %r3440, 25; + add.s32 %r3442, %r3394, %r3011; + add.s32 %r3443, %r3442, %r3441; + xor.b32 %r3444, %r3443, %r3410; + shf.l.wrap.b32 %r3445, %r3444, %r3444, 16; + add.s32 %r3446, %r3445, %r3425; + xor.b32 %r3447, %r3446, %r3441; + shf.l.wrap.b32 %r3448, %r3447, %r3447, 20; + add.s32 %r3449, %r3443, %r2990; + add.s32 %r3450, %r3449, %r3448; + xor.b32 %r3451, %r3450, %r3445; + shf.l.wrap.b32 %r3452, %r3451, %r3451, 24; + add.s32 %r3453, %r3452, %r3446; + xor.b32 %r3454, %r3453, %r3448; + shf.l.wrap.b32 %r3455, %r3454, %r3454, 25; + add.s32 %r3456, %r3408, %r3025; + add.s32 %r3457, %r3456, %r3399; + xor.b32 %r3458, %r3457, %r3424; + shf.l.wrap.b32 %r3459, %r3458, %r3458, 16; + add.s32 %r3460, %r3459, %r3439; + xor.b32 %r3461, %r3460, %r3399; + shf.l.wrap.b32 %r3462, %r3461, %r3461, 20; 
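+// The sixteen message words are loaded once; each unrolled round simply
+// reads them in a different fixed order (BLAKE3's per-round message
+// permutation, cf. the MSG_SCHEDULE global declared at the top of this
+// module), which is why registers such as %r2941 and %r2955 reappear
+// above in shuffled positions rather than being recomputed.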
+ add.s32 %r3463, %r3457, %r3004; + add.s32 %r3464, %r3463, %r3462; + xor.b32 %r3465, %r3464, %r3459; + shf.l.wrap.b32 %r3466, %r3465, %r3465, 24; + add.s32 %r3467, %r3466, %r3460; + xor.b32 %r3468, %r3467, %r3462; + shf.l.wrap.b32 %r3469, %r3468, %r3468, 25; + add.s32 %r3470, %r3422, %r3039; + add.s32 %r3471, %r3470, %r3413; + xor.b32 %r3472, %r3438, %r3471; + shf.l.wrap.b32 %r3473, %r3472, %r3472, 16; + add.s32 %r3474, %r3473, %r3397; + xor.b32 %r3475, %r3474, %r3413; + shf.l.wrap.b32 %r3476, %r3475, %r3475, 20; + add.s32 %r3477, %r3471, %r2962; + add.s32 %r3478, %r3477, %r3476; + xor.b32 %r3479, %r3478, %r3473; + shf.l.wrap.b32 %r3480, %r3479, %r3479, 24; + add.s32 %r3481, %r3480, %r3474; + xor.b32 %r3482, %r3481, %r3476; + shf.l.wrap.b32 %r3483, %r3482, %r3482, 25; + add.s32 %r3484, %r3427, %r3032; + add.s32 %r3485, %r3484, %r3436; + xor.b32 %r3486, %r3485, %r3396; + shf.l.wrap.b32 %r3487, %r3486, %r3486, 16; + add.s32 %r3488, %r3487, %r3411; + xor.b32 %r3489, %r3488, %r3427; + shf.l.wrap.b32 %r3490, %r3489, %r3489, 20; + add.s32 %r3491, %r3485, %r3046; + add.s32 %r3492, %r3491, %r3490; + xor.b32 %r3493, %r3492, %r3487; + shf.l.wrap.b32 %r3494, %r3493, %r3493, 24; + add.s32 %r3495, %r3494, %r3488; + xor.b32 %r3496, %r3495, %r3490; + shf.l.wrap.b32 %r3497, %r3496, %r3496, 25; + add.s32 %r3498, %r3469, %r2969; + add.s32 %r3499, %r3498, %r3450; + xor.b32 %r3500, %r3499, %r3494; + shf.l.wrap.b32 %r3501, %r3500, %r3500, 16; + add.s32 %r3502, %r3501, %r3481; + xor.b32 %r3503, %r3502, %r3469; + shf.l.wrap.b32 %r3504, %r3503, %r3503, 20; + add.s32 %r3505, %r3499, %r2941; + add.s32 %r3506, %r3505, %r3504; + xor.b32 %r3507, %r3506, %r3501; + shf.l.wrap.b32 %r3508, %r3507, %r3507, 24; + add.s32 %r3509, %r3508, %r3502; + xor.b32 %r3510, %r3509, %r3504; + shf.l.wrap.b32 %r3511, %r3510, %r3510, 25; + add.s32 %r3512, %r3464, %r3018; + add.s32 %r3513, %r3512, %r3483; + xor.b32 %r3514, %r3452, %r3513; + shf.l.wrap.b32 %r3515, %r3514, %r3514, 16; + add.s32 %r3516, %r3515, %r3495; + xor.b32 %r3517, %r3516, %r3483; + shf.l.wrap.b32 %r3518, %r3517, %r3517, 20; + add.s32 %r3519, %r3513, %r2955; + add.s32 %r3520, %r3519, %r3518; + xor.b32 %r3521, %r3520, %r3515; + shf.l.wrap.b32 %r3522, %r3521, %r3521, 24; + add.s32 %r3523, %r3522, %r3516; + xor.b32 %r3524, %r3523, %r3518; + shf.l.wrap.b32 %r3525, %r3524, %r3524, 25; + add.s32 %r3526, %r3478, %r2976; + add.s32 %r3527, %r3526, %r3497; + xor.b32 %r3528, %r3527, %r3466; + shf.l.wrap.b32 %r3529, %r3528, %r3528, 16; + add.s32 %r3530, %r3529, %r3453; + xor.b32 %r3531, %r3530, %r3497; + shf.l.wrap.b32 %r3532, %r3531, %r3531, 20; + add.s32 %r3533, %r3527, %r2997; + add.s32 %r3534, %r3533, %r3532; + xor.b32 %r3535, %r3534, %r3529; + shf.l.wrap.b32 %r3536, %r3535, %r3535, 24; + add.s32 %r3537, %r3536, %r3530; + xor.b32 %r3538, %r3537, %r3532; + shf.l.wrap.b32 %r3539, %r3538, %r3538, 25; + add.s32 %r3540, %r3492, %r2948; + add.s32 %r3541, %r3540, %r3455; + xor.b32 %r3542, %r3541, %r3480; + shf.l.wrap.b32 %r3543, %r3542, %r3542, 16; + add.s32 %r3544, %r3543, %r3467; + xor.b32 %r3545, %r3544, %r3455; + shf.l.wrap.b32 %r3546, %r3545, %r3545, 20; + add.s32 %r3547, %r3541, %r2983; + add.s32 %r3548, %r3547, %r3546; + xor.b32 %r3549, %r3548, %r3543; + shf.l.wrap.b32 %r3550, %r3549, %r3549, 24; + add.s32 %r3551, %r3550, %r3544; + xor.b32 %r3552, %r3551, %r3546; + shf.l.wrap.b32 %r3553, %r3552, %r3552, 25; + add.s32 %r3554, %r3506, %r3025; + add.s32 %r3555, %r3554, %r3553; + xor.b32 %r3556, %r3555, %r3522; + shf.l.wrap.b32 %r3557, %r3556, %r3556, 16; + add.s32 %r3558, %r3557, 
%r3537; + xor.b32 %r3559, %r3558, %r3553; + shf.l.wrap.b32 %r3560, %r3559, %r3559, 20; + add.s32 %r3561, %r3555, %r3032; + add.s32 %r3562, %r3561, %r3560; + xor.b32 %r3563, %r3562, %r3557; + shf.l.wrap.b32 %r3564, %r3563, %r3563, 24; + add.s32 %r3565, %r3564, %r3558; + xor.b32 %r3566, %r3565, %r3560; + shf.l.wrap.b32 %r3567, %r3566, %r3566, 25; + add.s32 %r3568, %r3520, %r3004; + add.s32 %r3569, %r3568, %r3511; + xor.b32 %r3570, %r3569, %r3536; + shf.l.wrap.b32 %r3571, %r3570, %r3570, 16; + add.s32 %r3572, %r3571, %r3551; + xor.b32 %r3573, %r3572, %r3511; + shf.l.wrap.b32 %r3574, %r3573, %r3573, 20; + add.s32 %r3575, %r3569, %r3018; + add.s32 %r3576, %r3575, %r3574; + xor.b32 %r3577, %r3576, %r3571; + shf.l.wrap.b32 %r3578, %r3577, %r3577, 24; + add.s32 %r3579, %r3578, %r3572; + xor.b32 %r3580, %r3579, %r3574; + shf.l.wrap.b32 %r3581, %r3580, %r3580, 25; + add.s32 %r3582, %r3534, %r3046; + add.s32 %r3583, %r3582, %r3525; + xor.b32 %r3584, %r3550, %r3583; + shf.l.wrap.b32 %r3585, %r3584, %r3584, 16; + add.s32 %r3586, %r3585, %r3509; + xor.b32 %r3587, %r3586, %r3525; + shf.l.wrap.b32 %r3588, %r3587, %r3587, 20; + add.s32 %r3589, %r3583, %r3011; + add.s32 %r3590, %r3589, %r3588; + xor.b32 %r3591, %r3590, %r3585; + shf.l.wrap.b32 %r3592, %r3591, %r3591, 24; + add.s32 %r3593, %r3592, %r3586; + xor.b32 %r3594, %r3593, %r3588; + shf.l.wrap.b32 %r3595, %r3594, %r3594, 25; + add.s32 %r3596, %r3539, %r3039; + add.s32 %r3597, %r3596, %r3548; + xor.b32 %r3598, %r3597, %r3508; + shf.l.wrap.b32 %r3599, %r3598, %r3598, 16; + add.s32 %r3600, %r3599, %r3523; + xor.b32 %r3601, %r3600, %r3539; + shf.l.wrap.b32 %r3602, %r3601, %r3601, 20; + add.s32 %r3603, %r3597, %r2997; + add.s32 %r3604, %r3603, %r3602; + xor.b32 %r3605, %r3604, %r3599; + shf.l.wrap.b32 %r3606, %r3605, %r3605, 24; + add.s32 %r3607, %r3606, %r3600; + xor.b32 %r3608, %r3607, %r3602; + shf.l.wrap.b32 %r3609, %r3608, %r3608, 25; + add.s32 %r3610, %r3581, %r2990; + add.s32 %r3611, %r3610, %r3562; + xor.b32 %r3612, %r3611, %r3606; + shf.l.wrap.b32 %r3613, %r3612, %r3612, 16; + add.s32 %r3614, %r3613, %r3593; + xor.b32 %r3615, %r3614, %r3581; + shf.l.wrap.b32 %r3616, %r3615, %r3615, 20; + add.s32 %r3617, %r3611, %r2955; + add.s32 %r3618, %r3617, %r3616; + xor.b32 %r3619, %r3618, %r3613; + shf.l.wrap.b32 %r3620, %r3619, %r3619, 24; + add.s32 %r3621, %r3620, %r3614; + xor.b32 %r3622, %r3621, %r3616; + shf.l.wrap.b32 %r3623, %r3622, %r3622, 25; + add.s32 %r3624, %r3576, %r2976; + add.s32 %r3625, %r3624, %r3595; + xor.b32 %r3626, %r3564, %r3625; + shf.l.wrap.b32 %r3627, %r3626, %r3626, 16; + add.s32 %r3628, %r3627, %r3607; + xor.b32 %r3629, %r3628, %r3595; + shf.l.wrap.b32 %r3630, %r3629, %r3629, 20; + add.s32 %r3631, %r3625, %r2962; + add.s32 %r3632, %r3631, %r3630; + xor.b32 %r3633, %r3632, %r3627; + shf.l.wrap.b32 %r3634, %r3633, %r3633, 24; + add.s32 %r3635, %r3634, %r3628; + xor.b32 %r3636, %r3635, %r3630; + shf.l.wrap.b32 %r3637, %r3636, %r3636, 25; + add.s32 %r3638, %r3590, %r2941; + add.s32 %r3639, %r3638, %r3609; + xor.b32 %r3640, %r3639, %r3578; + shf.l.wrap.b32 %r3641, %r3640, %r3640, 16; + add.s32 %r3642, %r3641, %r3565; + xor.b32 %r3643, %r3642, %r3609; + shf.l.wrap.b32 %r3644, %r3643, %r3643, 20; + add.s32 %r3645, %r3639, %r2948; + add.s32 %r3646, %r3645, %r3644; + xor.b32 %r3647, %r3646, %r3641; + shf.l.wrap.b32 %r3648, %r3647, %r3647, 24; + add.s32 %r3649, %r3648, %r3642; + xor.b32 %r3650, %r3649, %r3644; + shf.l.wrap.b32 %r3651, %r3650, %r3650, 25; + add.s32 %r3652, %r3604, %r2983; + add.s32 %r3653, %r3652, %r3567; + xor.b32 %r3654, 
%r3653, %r3592; + shf.l.wrap.b32 %r3655, %r3654, %r3654, 16; + add.s32 %r3656, %r3655, %r3579; + xor.b32 %r3657, %r3656, %r3567; + shf.l.wrap.b32 %r3658, %r3657, %r3657, 20; + add.s32 %r3659, %r3653, %r2969; + add.s32 %r3660, %r3659, %r3658; + xor.b32 %r3661, %r3660, %r3655; + shf.l.wrap.b32 %r3662, %r3661, %r3661, 24; + add.s32 %r3663, %r3662, %r3656; + xor.b32 %r3664, %r3663, %r3658; + shf.l.wrap.b32 %r3665, %r3664, %r3664, 25; + add.s32 %r3666, %r3618, %r3004; + add.s32 %r3667, %r3666, %r3665; + xor.b32 %r3668, %r3667, %r3634; + shf.l.wrap.b32 %r3669, %r3668, %r3668, 16; + add.s32 %r3670, %r3669, %r3649; + xor.b32 %r3671, %r3670, %r3665; + shf.l.wrap.b32 %r3672, %r3671, %r3671, 20; + add.s32 %r3673, %r3667, %r3039; + add.s32 %r3674, %r3673, %r3672; + xor.b32 %r3675, %r3674, %r3669; + shf.l.wrap.b32 %r3676, %r3675, %r3675, 24; + add.s32 %r3677, %r3676, %r3670; + xor.b32 %r3678, %r3677, %r3672; + shf.l.wrap.b32 %r3679, %r3678, %r3678, 25; + add.s32 %r3680, %r3632, %r3018; + add.s32 %r3681, %r3680, %r3623; + xor.b32 %r3682, %r3681, %r3648; + shf.l.wrap.b32 %r3683, %r3682, %r3682, 16; + add.s32 %r3684, %r3683, %r3663; + xor.b32 %r3685, %r3684, %r3623; + shf.l.wrap.b32 %r3686, %r3685, %r3685, 20; + add.s32 %r3687, %r3681, %r2976; + add.s32 %r3688, %r3687, %r3686; + xor.b32 %r3689, %r3688, %r3683; + shf.l.wrap.b32 %r3690, %r3689, %r3689, 24; + add.s32 %r3691, %r3690, %r3684; + xor.b32 %r3692, %r3691, %r3686; + shf.l.wrap.b32 %r3693, %r3692, %r3692, 25; + add.s32 %r3694, %r3646, %r2997; + add.s32 %r3695, %r3694, %r3637; + xor.b32 %r3696, %r3662, %r3695; + shf.l.wrap.b32 %r3697, %r3696, %r3696, 16; + add.s32 %r3698, %r3697, %r3621; + xor.b32 %r3699, %r3698, %r3637; + shf.l.wrap.b32 %r3700, %r3699, %r3699, 20; + add.s32 %r3701, %r3695, %r3025; + add.s32 %r3702, %r3701, %r3700; + xor.b32 %r3703, %r3702, %r3697; + shf.l.wrap.b32 %r3704, %r3703, %r3703, 24; + add.s32 %r3705, %r3704, %r3698; + xor.b32 %r3706, %r3705, %r3700; + shf.l.wrap.b32 %r3707, %r3706, %r3706, 25; + add.s32 %r3708, %r3651, %r3046; + add.s32 %r3709, %r3708, %r3660; + xor.b32 %r3710, %r3709, %r3620; + shf.l.wrap.b32 %r3711, %r3710, %r3710, 16; + add.s32 %r3712, %r3711, %r3635; + xor.b32 %r3713, %r3712, %r3651; + shf.l.wrap.b32 %r3714, %r3713, %r3713, 20; + add.s32 %r3715, %r3709, %r2948; + add.s32 %r3716, %r3715, %r3714; + xor.b32 %r3717, %r3716, %r3711; + shf.l.wrap.b32 %r3718, %r3717, %r3717, 24; + add.s32 %r3719, %r3718, %r3712; + xor.b32 %r3720, %r3719, %r3714; + shf.l.wrap.b32 %r3721, %r3720, %r3720, 25; + add.s32 %r3722, %r3693, %r3032; + add.s32 %r3723, %r3722, %r3674; + xor.b32 %r3724, %r3723, %r3718; + shf.l.wrap.b32 %r3725, %r3724, %r3724, 16; + add.s32 %r3726, %r3725, %r3705; + xor.b32 %r3727, %r3726, %r3693; + shf.l.wrap.b32 %r3728, %r3727, %r3727, 20; + add.s32 %r3729, %r3723, %r2962; + add.s32 %r3730, %r3729, %r3728; + xor.b32 %r3731, %r3730, %r3725; + shf.l.wrap.b32 %r3732, %r3731, %r3731, 24; + add.s32 %r3733, %r3732, %r3726; + xor.b32 %r3734, %r3733, %r3728; + shf.l.wrap.b32 %r3735, %r3734, %r3734, 25; + add.s32 %r3736, %r3688, %r2941; + add.s32 %r3737, %r3736, %r3707; + xor.b32 %r3738, %r3676, %r3737; + shf.l.wrap.b32 %r3739, %r3738, %r3738, 16; + add.s32 %r3740, %r3739, %r3719; + xor.b32 %r3741, %r3740, %r3707; + shf.l.wrap.b32 %r3742, %r3741, %r3741, 20; + add.s32 %r3743, %r3737, %r3011; + add.s32 %r3744, %r3743, %r3742; + xor.b32 %r3745, %r3744, %r3739; + shf.l.wrap.b32 %r3746, %r3745, %r3745, 24; + add.s32 %r3747, %r3746, %r3740; + xor.b32 %r3748, %r3747, %r3742; + shf.l.wrap.b32 %r3749, %r3748, %r3748, 25; 
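+// After the seventh round, the eight output chaining-value words are
+// formed by XORing the two halves of the state (the run of eight
+// xor.b32 below, e.g. %r3890 = %r3873 ^ %r3842, apparently v0 ^ v8) and
+// are then spilled back into the stack slot one byte at a time via
+// st.local.u8 with shr.u32 by 8/16/24, i.e. serialized little-endian.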
+ add.s32 %r3750, %r3702, %r2955; + add.s32 %r3751, %r3750, %r3721; + xor.b32 %r3752, %r3751, %r3690; + shf.l.wrap.b32 %r3753, %r3752, %r3752, 16; + add.s32 %r3754, %r3753, %r3677; + xor.b32 %r3755, %r3754, %r3721; + shf.l.wrap.b32 %r3756, %r3755, %r3755, 20; + add.s32 %r3757, %r3751, %r2983; + add.s32 %r3758, %r3757, %r3756; + xor.b32 %r3759, %r3758, %r3753; + shf.l.wrap.b32 %r3760, %r3759, %r3759, 24; + add.s32 %r3761, %r3760, %r3754; + xor.b32 %r3762, %r3761, %r3756; + shf.l.wrap.b32 %r3763, %r3762, %r3762, 25; + add.s32 %r3764, %r3716, %r2969; + add.s32 %r3765, %r3764, %r3679; + xor.b32 %r3766, %r3765, %r3704; + shf.l.wrap.b32 %r3767, %r3766, %r3766, 16; + add.s32 %r3768, %r3767, %r3691; + xor.b32 %r3769, %r3768, %r3679; + shf.l.wrap.b32 %r3770, %r3769, %r3769, 20; + add.s32 %r3771, %r3765, %r2990; + add.s32 %r3772, %r3771, %r3770; + xor.b32 %r3773, %r3772, %r3767; + shf.l.wrap.b32 %r3774, %r3773, %r3773, 24; + add.s32 %r3775, %r3774, %r3768; + xor.b32 %r3776, %r3775, %r3770; + shf.l.wrap.b32 %r3777, %r3776, %r3776, 25; + add.s32 %r3778, %r3730, %r3018; + add.s32 %r3779, %r3778, %r3777; + xor.b32 %r3780, %r3779, %r3746; + shf.l.wrap.b32 %r3781, %r3780, %r3780, 16; + add.s32 %r3782, %r3781, %r3761; + xor.b32 %r3783, %r3782, %r3777; + shf.l.wrap.b32 %r3784, %r3783, %r3783, 20; + add.s32 %r3785, %r3779, %r3046; + add.s32 %r3786, %r3785, %r3784; + xor.b32 %r3787, %r3786, %r3781; + shf.l.wrap.b32 %r3788, %r3787, %r3787, 24; + add.s32 %r3789, %r3788, %r3782; + xor.b32 %r3790, %r3789, %r3784; + shf.l.wrap.b32 %r3791, %r3790, %r3790, 25; + add.s32 %r3792, %r3744, %r2976; + add.s32 %r3793, %r3792, %r3735; + xor.b32 %r3794, %r3793, %r3760; + shf.l.wrap.b32 %r3795, %r3794, %r3794, 16; + add.s32 %r3796, %r3795, %r3775; + xor.b32 %r3797, %r3796, %r3735; + shf.l.wrap.b32 %r3798, %r3797, %r3797, 20; + add.s32 %r3799, %r3793, %r2941; + add.s32 %r3800, %r3799, %r3798; + xor.b32 %r3801, %r3800, %r3795; + shf.l.wrap.b32 %r3802, %r3801, %r3801, 24; + add.s32 %r3803, %r3802, %r3796; + xor.b32 %r3804, %r3803, %r3798; + shf.l.wrap.b32 %r3805, %r3804, %r3804, 25; + add.s32 %r3806, %r3758, %r2948; + add.s32 %r3807, %r3806, %r3749; + xor.b32 %r3808, %r3774, %r3807; + shf.l.wrap.b32 %r3809, %r3808, %r3808, 16; + add.s32 %r3810, %r3809, %r3733; + xor.b32 %r3811, %r3810, %r3749; + shf.l.wrap.b32 %r3812, %r3811, %r3811, 20; + add.s32 %r3813, %r3807, %r3004; + add.s32 %r3814, %r3813, %r3812; + xor.b32 %r3815, %r3814, %r3809; + shf.l.wrap.b32 %r3816, %r3815, %r3815, 24; + add.s32 %r3817, %r3816, %r3810; + xor.b32 %r3818, %r3817, %r3812; + shf.l.wrap.b32 %r3819, %r3818, %r3818, 25; + add.s32 %r3820, %r3763, %r2997; + add.s32 %r3821, %r3820, %r3772; + xor.b32 %r3822, %r3821, %r3732; + shf.l.wrap.b32 %r3823, %r3822, %r3822, 16; + add.s32 %r3824, %r3823, %r3747; + xor.b32 %r3825, %r3824, %r3763; + shf.l.wrap.b32 %r3826, %r3825, %r3825, 20; + add.s32 %r3827, %r3821, %r2983; + add.s32 %r3828, %r3827, %r3826; + xor.b32 %r3829, %r3828, %r3823; + shf.l.wrap.b32 %r3830, %r3829, %r3829, 24; + add.s32 %r3831, %r3830, %r3824; + xor.b32 %r3832, %r3831, %r3826; + shf.l.wrap.b32 %r3833, %r3832, %r3832, 25; + add.s32 %r3834, %r3805, %r3039; + add.s32 %r3835, %r3834, %r3786; + xor.b32 %r3836, %r3835, %r3830; + shf.l.wrap.b32 %r3837, %r3836, %r3836, 16; + add.s32 %r3838, %r3837, %r3817; + xor.b32 %r3839, %r3838, %r3805; + shf.l.wrap.b32 %r3840, %r3839, %r3839, 20; + add.s32 %r3841, %r3835, %r3011; + add.s32 %r3842, %r3841, %r3840; + xor.b32 %r3843, %r3842, %r3837; + shf.l.wrap.b32 %r3844, %r3843, %r3843, 24; + add.s32 %r3845, %r3844, 
%r3838; + xor.b32 %r3846, %r3845, %r3840; + shf.l.wrap.b32 %r3847, %r3846, %r3846, 25; + add.s32 %r3848, %r3800, %r2955; + add.s32 %r3849, %r3848, %r3819; + xor.b32 %r3850, %r3788, %r3849; + shf.l.wrap.b32 %r3851, %r3850, %r3850, 16; + add.s32 %r3852, %r3851, %r3831; + xor.b32 %r3853, %r3852, %r3819; + shf.l.wrap.b32 %r3854, %r3853, %r3853, 20; + add.s32 %r3855, %r3849, %r3025; + add.s32 %r3856, %r3855, %r3854; + xor.b32 %r3857, %r3856, %r3851; + shf.l.wrap.b32 %r3858, %r3857, %r3857, 24; + add.s32 %r3859, %r3858, %r3852; + xor.b32 %r3860, %r3859, %r3854; + shf.l.wrap.b32 %r3861, %r3860, %r3860, 25; + add.s32 %r3862, %r3814, %r2962; + add.s32 %r3863, %r3862, %r3833; + xor.b32 %r3864, %r3863, %r3802; + shf.l.wrap.b32 %r3865, %r3864, %r3864, 16; + add.s32 %r3866, %r3865, %r3789; + xor.b32 %r3867, %r3866, %r3833; + shf.l.wrap.b32 %r3868, %r3867, %r3867, 20; + add.s32 %r3869, %r3863, %r2969; + add.s32 %r3870, %r3869, %r3868; + xor.b32 %r3871, %r3870, %r3865; + shf.l.wrap.b32 %r3872, %r3871, %r3871, 24; + add.s32 %r3873, %r3872, %r3866; + xor.b32 %r3874, %r3873, %r3868; + shf.l.wrap.b32 %r3875, %r3874, %r3874, 25; + add.s32 %r3876, %r3828, %r2990; + add.s32 %r3877, %r3876, %r3791; + xor.b32 %r3878, %r3877, %r3816; + shf.l.wrap.b32 %r3879, %r3878, %r3878, 16; + add.s32 %r3880, %r3879, %r3803; + xor.b32 %r3881, %r3880, %r3791; + shf.l.wrap.b32 %r3882, %r3881, %r3881, 20; + add.s32 %r3883, %r3877, %r3032; + add.s32 %r3884, %r3883, %r3882; + xor.b32 %r3885, %r3884, %r3879; + shf.l.wrap.b32 %r3886, %r3885, %r3885, 24; + add.s32 %r3887, %r3886, %r3880; + xor.b32 %r3888, %r3887, %r3882; + shf.l.wrap.b32 %r3889, %r3888, %r3888, 25; + xor.b32 %r3890, %r3873, %r3842; + xor.b32 %r3891, %r3887, %r3856; + xor.b32 %r3892, %r3845, %r3870; + xor.b32 %r3893, %r3884, %r3859; + xor.b32 %r3894, %r3889, %r3858; + xor.b32 %r3895, %r3847, %r3872; + xor.b32 %r3896, %r3886, %r3861; + xor.b32 %r3897, %r3875, %r3844; + st.local.u8 [%rd132+145], %r3890; + shr.u32 %r3898, %r3890, 8; + st.local.u8 [%rd132+146], %r3898; + shr.u32 %r3899, %r3890, 16; + st.local.u8 [%rd132+147], %r3899; + shr.u32 %r3900, %r3890, 24; + st.local.u8 [%rd132+148], %r3900; + st.local.u8 [%rd132+149], %r3891; + shr.u32 %r3901, %r3891, 8; + st.local.u8 [%rd132+150], %r3901; + shr.u32 %r3902, %r3891, 16; + st.local.u8 [%rd132+151], %r3902; + shr.u32 %r3903, %r3891, 24; + st.local.u8 [%rd132+152], %r3903; + st.local.u8 [%rd132+153], %r3892; + shr.u32 %r3904, %r3892, 8; + st.local.u8 [%rd132+154], %r3904; + shr.u32 %r3905, %r3892, 16; + st.local.u8 [%rd132+155], %r3905; + shr.u32 %r3906, %r3892, 24; + st.local.u8 [%rd132+156], %r3906; + st.local.u8 [%rd132+157], %r3893; + shr.u32 %r3907, %r3893, 8; + st.local.u8 [%rd132+158], %r3907; + shr.u32 %r3908, %r3893, 16; + st.local.u8 [%rd132+159], %r3908; + shr.u32 %r3909, %r3893, 24; + st.local.u8 [%rd132+160], %r3909; + st.local.u8 [%rd132+161], %r3894; + shr.u32 %r3910, %r3894, 8; + st.local.u8 [%rd132+162], %r3910; + shr.u32 %r3911, %r3894, 16; + st.local.u8 [%rd132+163], %r3911; + shr.u32 %r3912, %r3894, 24; + st.local.u8 [%rd132+164], %r3912; + st.local.u8 [%rd132+165], %r3895; + shr.u32 %r3913, %r3895, 8; + st.local.u8 [%rd132+166], %r3913; + shr.u32 %r3914, %r3895, 16; + st.local.u8 [%rd132+167], %r3914; + shr.u32 %r3915, %r3895, 24; + st.local.u8 [%rd132+168], %r3915; + st.local.u8 [%rd132+169], %r3896; + shr.u32 %r3916, %r3896, 8; + st.local.u8 [%rd132+170], %r3916; + shr.u32 %r3917, %r3896, 16; + st.local.u8 [%rd132+171], %r3917; + shr.u32 %r3918, %r3896, 24; + st.local.u8 [%rd132+172], %r3918; + 
st.local.u8 [%rd132+173], %r3897; + shr.u32 %r3919, %r3897, 8; + st.local.u8 [%rd132+174], %r3919; + shr.u32 %r3920, %r3897, 16; + st.local.u8 [%rd132+175], %r3920; + shr.u32 %r3921, %r3897, 24; + st.local.u8 [%rd132+176], %r3921; + ld.local.u8 %rs128, [%rd3+8]; + add.s16 %rs129, %rs128, -1; + st.local.u8 [%rd3+8], %rs129; + cvt.u64.u16 %rd133, %rs129; + and.b64 %rd134, %rd133, 255; + setp.lt.u64 %p16, %rd30, %rd134; + and.b16 %rs130, %rs129, 255; + mul.wide.u16 %r11659, %rs130, 32; + @%p16 bra $L__BB1_18; + +$L__BB1_19: + ld.param.u64 %rd223, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvt.s64.s32 %rd136, %r11659; + add.s64 %rd137, %rd2, %rd136; + mov.u64 %rd246, 0; + st.local.u8 [%rd137+145], %r27; + shr.u32 %r3922, %r27, 8; + st.local.u8 [%rd137+146], %r3922; + shr.u32 %r3923, %r27, 16; + st.local.u8 [%rd137+147], %r3923; + shr.u32 %r3924, %r27, 24; + st.local.u8 [%rd137+148], %r3924; + st.local.u8 [%rd137+149], %r28; + shr.u32 %r3925, %r28, 8; + st.local.u8 [%rd137+150], %r3925; + shr.u32 %r3926, %r28, 16; + st.local.u8 [%rd137+151], %r3926; + shr.u32 %r3927, %r28, 24; + st.local.u8 [%rd137+152], %r3927; + st.local.u8 [%rd137+153], %r29; + shr.u32 %r3928, %r29, 8; + st.local.u8 [%rd137+154], %r3928; + shr.u32 %r3929, %r29, 16; + st.local.u8 [%rd137+155], %r3929; + shr.u32 %r3930, %r29, 24; + st.local.u8 [%rd137+156], %r3930; + st.local.u8 [%rd137+157], %r30; + shr.u32 %r3931, %r30, 8; + st.local.u8 [%rd137+158], %r3931; + shr.u32 %r3932, %r30, 16; + st.local.u8 [%rd137+159], %r3932; + shr.u32 %r3933, %r30, 24; + st.local.u8 [%rd137+160], %r3933; + st.local.u8 [%rd137+161], %r31; + shr.u32 %r3934, %r31, 8; + st.local.u8 [%rd137+162], %r3934; + shr.u32 %r3935, %r31, 16; + st.local.u8 [%rd137+163], %r3935; + shr.u32 %r3936, %r31, 24; + st.local.u8 [%rd137+164], %r3936; + st.local.u8 [%rd137+165], %r32; + shr.u32 %r3937, %r32, 8; + st.local.u8 [%rd137+166], %r3937; + shr.u32 %r3938, %r32, 16; + st.local.u8 [%rd137+167], %r3938; + shr.u32 %r3939, %r32, 24; + st.local.u8 [%rd137+168], %r3939; + st.local.u8 [%rd137+169], %r33; + shr.u32 %r3940, %r33, 8; + st.local.u8 [%rd137+170], %r3940; + shr.u32 %r3941, %r33, 16; + st.local.u8 [%rd137+171], %r3941; + shr.u32 %r3942, %r33, 24; + st.local.u8 [%rd137+172], %r3942; + st.local.u8 [%rd137+173], %r34; + shr.u32 %r3943, %r34, 8; + st.local.u8 [%rd137+174], %r3943; + shr.u32 %r3944, %r34, 16; + st.local.u8 [%rd137+175], %r3944; + shr.u32 %r3945, %r34, 24; + st.local.u8 [%rd137+176], %r3945; + ld.local.u8 %rs131, [%rd3+8]; + add.s16 %rs132, %rs131, 1; + st.local.u8 [%rd3+8], %rs132; + ld.local.u64 %rd138, [%rd3+-72]; + add.s64 %rd32, %rd138, 1; + add.s64 %rd254, %rd223, %rd6; + +$L__BB1_20: + add.s64 %rd139, %rd2, %rd246; + ld.local.u8 %rs133, [%rd139]; + st.local.u8 [%rd139+32], %rs133; + add.s64 %rd246, %rd246, 1; + setp.lt.u64 %p17, %rd246, 32; + @%p17 bra $L__BB1_20; + + mov.u64 %rd247, 0; + st.local.u64 [%rd3+-72], %rd32; + mov.u16 %rs134, 0; + st.local.u8 [%rd3+1], %rs134; + +$L__BB1_22: + add.s64 %rd141, %rd2, %rd247; + st.local.u8 [%rd141+72], %rs134; + add.s64 %rd247, %rd247, 1; + setp.lt.u64 %p18, %rd247, 64; + @%p18 bra $L__BB1_22; + + ld.param.u64 %rd236, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_1]; + cvta.to.local.u64 %rd235, %rd236; + add.s64 %rd261, %rd235, %rd6; + mov.u64 %rd225, 80; + sub.s64 %rd262, %rd225, %rd6; + mov.u16 %rs136, 0; + st.local.u8 [%rd3], %rs136; + +$L__BB1_24: + setp.lt.u64 %p19, %rd262, 1025; + @%p19 bra $L__BB1_48; + + ld.local.u64 %rd251, [%rd3+-72]; + add.u64 %rd142, %SP, 0; + add.u64 
%rd42, %SPL, 0; + +$L__BB1_26: + or.b64 %rd143, %rd262, 1; + mov.u64 %rd144, 1; + setp.gt.u64 %p20, %rd143, 4294967295; + shr.u64 %rd145, %rd262, 32; + selp.b64 %rd146, %rd145, %rd143, %p20; + selp.b32 %r3946, 32, 0, %p20; + and.b64 %rd147, %rd146, 4294901760; + setp.ne.s64 %p21, %rd147, 0; + shr.u64 %rd148, %rd146, 16; + or.b32 %r3947, %r3946, 16; + selp.b64 %rd149, %rd148, %rd146, %p21; + selp.b32 %r3948, %r3947, %r3946, %p21; + and.b64 %rd150, %rd149, 65280; + setp.ne.s64 %p22, %rd150, 0; + shr.u64 %rd151, %rd149, 8; + or.b32 %r3949, %r3948, 8; + selp.b64 %rd152, %rd151, %rd149, %p22; + selp.b32 %r3950, %r3949, %r3948, %p22; + and.b64 %rd153, %rd152, 240; + setp.ne.s64 %p23, %rd153, 0; + shr.u64 %rd154, %rd152, 4; + or.b32 %r3951, %r3950, 4; + selp.b64 %rd155, %rd154, %rd152, %p23; + selp.b32 %r3952, %r3951, %r3950, %p23; + and.b64 %rd156, %rd155, 12; + setp.ne.s64 %p24, %rd156, 0; + shr.u64 %rd157, %rd155, 2; + add.s32 %r3953, %r3952, 2; + selp.b64 %rd158, %rd157, %rd155, %p24; + selp.b32 %r3954, %r3953, %r3952, %p24; + and.b64 %rd159, %rd158, 2; + shr.u64 %rd160, %rd159, 1; + cvt.u32.u64 %r3955, %rd160; + add.s32 %r3956, %r3954, %r3955; + shl.b64 %rd255, %rd144, %r3956; + shl.b64 %rd48, %rd251, 10; + +$L__BB1_27: + mov.u64 %rd49, %rd255; + add.s64 %rd161, %rd49, -1; + and.b64 %rd162, %rd161, %rd48; + setp.ne.s64 %p25, %rd162, 0; + shr.u64 %rd255, %rd49, 1; + @%p25 bra $L__BB1_27; + + ld.local.u8 %rs14, [%rd3+2]; + setp.lt.u64 %p26, %rd49, 1025; + @%p26 bra $L__BB1_36; + bra.uni $L__BB1_29; + +$L__BB1_36: + ld.local.u8 %r5955, [%rd3+-136]; + ld.local.u8 %r5956, [%rd3+-135]; + prmt.b32 %r5957, %r5956, %r5955, 30212; + ld.local.u8 %r5958, [%rd3+-134]; + ld.local.u8 %r5959, [%rd3+-133]; + prmt.b32 %r5960, %r5959, %r5958, 30212; + prmt.b32 %r11679, %r5960, %r5957, 4180; + ld.local.u8 %r5961, [%rd3+-132]; + ld.local.u8 %r5962, [%rd3+-131]; + prmt.b32 %r5963, %r5962, %r5961, 30212; + ld.local.u8 %r5964, [%rd3+-130]; + ld.local.u8 %r5965, [%rd3+-129]; + prmt.b32 %r5966, %r5965, %r5964, 30212; + prmt.b32 %r11678, %r5966, %r5963, 4180; + ld.local.u8 %r5967, [%rd3+-128]; + ld.local.u8 %r5968, [%rd3+-127]; + prmt.b32 %r5969, %r5968, %r5967, 30212; + ld.local.u8 %r5970, [%rd3+-126]; + ld.local.u8 %r5971, [%rd3+-125]; + prmt.b32 %r5972, %r5971, %r5970, 30212; + prmt.b32 %r11677, %r5972, %r5969, 4180; + ld.local.u8 %r5973, [%rd3+-124]; + ld.local.u8 %r5974, [%rd3+-123]; + prmt.b32 %r5975, %r5974, %r5973, 30212; + ld.local.u8 %r5976, [%rd3+-122]; + ld.local.u8 %r5977, [%rd3+-121]; + prmt.b32 %r5978, %r5977, %r5976, 30212; + prmt.b32 %r11676, %r5978, %r5975, 4180; + ld.local.u8 %r5979, [%rd3+-120]; + ld.local.u8 %r5980, [%rd3+-119]; + prmt.b32 %r5981, %r5980, %r5979, 30212; + ld.local.u8 %r5982, [%rd3+-118]; + ld.local.u8 %r5983, [%rd3+-117]; + prmt.b32 %r5984, %r5983, %r5982, 30212; + prmt.b32 %r11675, %r5984, %r5981, 4180; + ld.local.u8 %r5985, [%rd3+-116]; + ld.local.u8 %r5986, [%rd3+-115]; + prmt.b32 %r5987, %r5986, %r5985, 30212; + ld.local.u8 %r5988, [%rd3+-114]; + ld.local.u8 %r5989, [%rd3+-113]; + prmt.b32 %r5990, %r5989, %r5988, 30212; + prmt.b32 %r11674, %r5990, %r5987, 4180; + ld.local.u8 %r5991, [%rd3+-112]; + ld.local.u8 %r5992, [%rd3+-111]; + prmt.b32 %r5993, %r5992, %r5991, 30212; + ld.local.u8 %r5994, [%rd3+-110]; + ld.local.u8 %r5995, [%rd3+-109]; + prmt.b32 %r5996, %r5995, %r5994, 30212; + prmt.b32 %r11673, %r5996, %r5993, 4180; + ld.local.u8 %r5997, [%rd3+-108]; + ld.local.u8 %r5998, [%rd3+-107]; + prmt.b32 %r5999, %r5998, %r5997, 30212; + ld.local.u8 %r6000, [%rd3+-106]; + 
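// --- annotation (not NVVM compiler output): reading of the generated code, not part of the build.
// The setp/selp cascade in $L__BB1_26 above is a branchless highest-set-bit search
// (testing 32/16/8/4/2/1-bit halves), i.e. it rounds the remaining input length down
// to a power of two; the $L__BB1_27 loop then halves that size while
// ((subtree_len - 1) & (chunk_counter << 10)) is nonzero. This matches the
// subtree-size selection in the BLAKE3 reference hasher_update, with 1 << 10 = 1024
// the chunk length. $L__BB1_36 then reloads the 8-word chaining value, the prmt.b32
// byte-permutes packing little-endian bytes into 32-bit words.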
ld.local.u8 %r6001, [%rd3+-105]; + prmt.b32 %r6002, %r6001, %r6000, 30212; + prmt.b32 %r11672, %r6002, %r5999, 4180; + add.u64 %rd53, %SPL, 64; + mov.u32 %r6003, 0; + st.local.v2.u32 [%rd53], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+8], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+16], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+24], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+32], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+40], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+48], {%r6003, %r6003}; + st.local.v2.u32 [%rd53+56], {%r6003, %r6003}; + mov.u16 %rs354, 0; + st.local.v2.u8 [%rd53+64], {%rs354, %rs354}; + st.local.u8 [%rd53+66], %rs14; + cvt.u32.u64 %r71, %rd251; + shr.u64 %rd185, %rd251, 32; + cvt.u32.u64 %r72, %rd185; + setp.lt.u64 %p31, %rd49, 65; + mov.u64 %rd258, %rd261; + mov.u64 %rd259, %rd49; + @%p31 bra $L__BB1_39; + + add.s64 %rd54, %rd53, 64; + mov.u16 %rs353, 0; + mov.u64 %rd259, %rd49; + mov.u64 %rd258, %rd261; + +$L__BB1_38: + and.b16 %rs213, %rs353, 255; + setp.eq.s16 %p32, %rs213, 0; + selp.u16 %rs214, 1, 0, %p32; + or.b16 %rs215, %rs14, %rs214; + ld.local.u8 %r6004, [%rd258]; + ld.local.u8 %r6005, [%rd258+1]; + prmt.b32 %r6006, %r6005, %r6004, 30212; + ld.local.u8 %r6007, [%rd258+2]; + prmt.b32 %r6008, %r6007, %r6006, 28756; + ld.local.u8 %r6009, [%rd258+3]; + prmt.b32 %r6010, %r6009, %r6008, 1620; + ld.local.u8 %r6011, [%rd258+4]; + ld.local.u8 %r6012, [%rd258+5]; + prmt.b32 %r6013, %r6012, %r6011, 30212; + ld.local.u8 %r6014, [%rd258+6]; + prmt.b32 %r6015, %r6014, %r6013, 28756; + ld.local.u8 %r6016, [%rd258+7]; + prmt.b32 %r6017, %r6016, %r6015, 1620; + ld.local.u8 %r6018, [%rd258+8]; + ld.local.u8 %r6019, [%rd258+9]; + prmt.b32 %r6020, %r6019, %r6018, 30212; + ld.local.u8 %r6021, [%rd258+10]; + prmt.b32 %r6022, %r6021, %r6020, 28756; + ld.local.u8 %r6023, [%rd258+11]; + prmt.b32 %r6024, %r6023, %r6022, 1620; + ld.local.u8 %r6025, [%rd258+12]; + ld.local.u8 %r6026, [%rd258+13]; + prmt.b32 %r6027, %r6026, %r6025, 30212; + ld.local.u8 %r6028, [%rd258+14]; + prmt.b32 %r6029, %r6028, %r6027, 28756; + ld.local.u8 %r6030, [%rd258+15]; + prmt.b32 %r6031, %r6030, %r6029, 1620; + ld.local.u8 %r6032, [%rd258+16]; + ld.local.u8 %r6033, [%rd258+17]; + prmt.b32 %r6034, %r6033, %r6032, 30212; + ld.local.u8 %r6035, [%rd258+18]; + prmt.b32 %r6036, %r6035, %r6034, 28756; + ld.local.u8 %r6037, [%rd258+19]; + prmt.b32 %r6038, %r6037, %r6036, 1620; + ld.local.u8 %r6039, [%rd258+20]; + ld.local.u8 %r6040, [%rd258+21]; + prmt.b32 %r6041, %r6040, %r6039, 30212; + ld.local.u8 %r6042, [%rd258+22]; + prmt.b32 %r6043, %r6042, %r6041, 28756; + ld.local.u8 %r6044, [%rd258+23]; + prmt.b32 %r6045, %r6044, %r6043, 1620; + ld.local.u8 %r6046, [%rd258+24]; + ld.local.u8 %r6047, [%rd258+25]; + prmt.b32 %r6048, %r6047, %r6046, 30212; + ld.local.u8 %r6049, [%rd258+26]; + prmt.b32 %r6050, %r6049, %r6048, 28756; + ld.local.u8 %r6051, [%rd258+27]; + prmt.b32 %r6052, %r6051, %r6050, 1620; + ld.local.u8 %r6053, [%rd258+28]; + ld.local.u8 %r6054, [%rd258+29]; + prmt.b32 %r6055, %r6054, %r6053, 30212; + ld.local.u8 %r6056, [%rd258+30]; + prmt.b32 %r6057, %r6056, %r6055, 28756; + ld.local.u8 %r6058, [%rd258+31]; + prmt.b32 %r6059, %r6058, %r6057, 1620; + ld.local.u8 %r6060, [%rd258+32]; + ld.local.u8 %r6061, [%rd258+33]; + prmt.b32 %r6062, %r6061, %r6060, 30212; + ld.local.u8 %r6063, [%rd258+34]; + prmt.b32 %r6064, %r6063, %r6062, 28756; + ld.local.u8 %r6065, [%rd258+35]; + prmt.b32 %r6066, %r6065, %r6064, 1620; + ld.local.u8 %r6067, [%rd258+36]; + ld.local.u8 %r6068, [%rd258+37]; + prmt.b32 %r6069, %r6068, 
%r6067, 30212; + ld.local.u8 %r6070, [%rd258+38]; + prmt.b32 %r6071, %r6070, %r6069, 28756; + ld.local.u8 %r6072, [%rd258+39]; + prmt.b32 %r6073, %r6072, %r6071, 1620; + ld.local.u8 %r6074, [%rd258+40]; + ld.local.u8 %r6075, [%rd258+41]; + prmt.b32 %r6076, %r6075, %r6074, 30212; + ld.local.u8 %r6077, [%rd258+42]; + prmt.b32 %r6078, %r6077, %r6076, 28756; + ld.local.u8 %r6079, [%rd258+43]; + prmt.b32 %r6080, %r6079, %r6078, 1620; + ld.local.u8 %r6081, [%rd258+44]; + ld.local.u8 %r6082, [%rd258+45]; + prmt.b32 %r6083, %r6082, %r6081, 30212; + ld.local.u8 %r6084, [%rd258+46]; + prmt.b32 %r6085, %r6084, %r6083, 28756; + ld.local.u8 %r6086, [%rd258+47]; + prmt.b32 %r6087, %r6086, %r6085, 1620; + ld.local.u8 %r6088, [%rd258+48]; + ld.local.u8 %r6089, [%rd258+49]; + prmt.b32 %r6090, %r6089, %r6088, 30212; + ld.local.u8 %r6091, [%rd258+50]; + prmt.b32 %r6092, %r6091, %r6090, 28756; + ld.local.u8 %r6093, [%rd258+51]; + prmt.b32 %r6094, %r6093, %r6092, 1620; + ld.local.u8 %r6095, [%rd258+52]; + ld.local.u8 %r6096, [%rd258+53]; + prmt.b32 %r6097, %r6096, %r6095, 30212; + ld.local.u8 %r6098, [%rd258+54]; + prmt.b32 %r6099, %r6098, %r6097, 28756; + ld.local.u8 %r6100, [%rd258+55]; + prmt.b32 %r6101, %r6100, %r6099, 1620; + ld.local.u8 %r6102, [%rd258+56]; + ld.local.u8 %r6103, [%rd258+57]; + prmt.b32 %r6104, %r6103, %r6102, 30212; + ld.local.u8 %r6105, [%rd258+58]; + prmt.b32 %r6106, %r6105, %r6104, 28756; + ld.local.u8 %r6107, [%rd258+59]; + prmt.b32 %r6108, %r6107, %r6106, 1620; + ld.local.u8 %r6109, [%rd258+60]; + ld.local.u8 %r6110, [%rd258+61]; + prmt.b32 %r6111, %r6110, %r6109, 30212; + ld.local.u8 %r6112, [%rd258+62]; + prmt.b32 %r6113, %r6112, %r6111, 28756; + ld.local.u8 %r6114, [%rd258+63]; + prmt.b32 %r6115, %r6114, %r6113, 1620; + cvt.u32.u16 %r6116, %rs215; + and.b32 %r6117, %r6116, 255; + add.s32 %r6118, %r11679, %r11675; + add.s32 %r6119, %r6118, %r6010; + xor.b32 %r6120, %r6119, %r71; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 16; + add.s32 %r6122, %r6121, 1779033703; + xor.b32 %r6123, %r6122, %r11675; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 20; + add.s32 %r6125, %r6017, %r6119; + add.s32 %r6126, %r6125, %r6124; + xor.b32 %r6127, %r6126, %r6121; + shf.l.wrap.b32 %r6128, %r6127, %r6127, 24; + add.s32 %r6129, %r6128, %r6122; + xor.b32 %r6130, %r6129, %r6124; + shf.l.wrap.b32 %r6131, %r6130, %r6130, 25; + add.s32 %r6132, %r11678, %r11674; + add.s32 %r6133, %r6132, %r6024; + xor.b32 %r6134, %r6133, %r72; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 16; + add.s32 %r6136, %r6135, -1150833019; + xor.b32 %r6137, %r6136, %r11674; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 20; + add.s32 %r6139, %r6031, %r6133; + add.s32 %r6140, %r6139, %r6138; + xor.b32 %r6141, %r6140, %r6135; + shf.l.wrap.b32 %r6142, %r6141, %r6141, 24; + add.s32 %r6143, %r6142, %r6136; + xor.b32 %r6144, %r6143, %r6138; + shf.l.wrap.b32 %r6145, %r6144, %r6144, 25; + add.s32 %r6146, %r11677, %r11673; + add.s32 %r6147, %r6146, %r6038; + shr.u32 %r6148, %r6147, 16; + shl.b32 %r6149, %r6147, 16; + xor.b32 %r6150, %r6149, 4194304; + or.b32 %r6151, %r6150, %r6148; + add.s32 %r6152, %r6151, 1013904242; + xor.b32 %r6153, %r6152, %r11673; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6045, %r6147; + add.s32 %r6156, %r6155, %r6154; + xor.b32 %r6157, %r6156, %r6151; + shf.l.wrap.b32 %r6158, %r6157, %r6157, 24; + add.s32 %r6159, %r6158, %r6152; + xor.b32 %r6160, %r6159, %r6154; + shf.l.wrap.b32 %r6161, %r6160, %r6160, 25; + add.s32 %r6162, %r11676, %r11672; + add.s32 %r6163, %r6162, %r6052; + xor.b32 %r6164, %r6163, %r6117; + 
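// --- annotation (not NVVM compiler output): $L__BB1_38 is the per-64-byte-block
// compression loop. Each add/xor/shf.l.wrap quartet is one BLAKE3 G mix step, roughly:
//   a += b + m0; d = rotr32(d ^ a, 16); c += d; b = rotr32(b ^ c, 12);
//   a += b + m1; d = rotr32(d ^ a, 8);  c += d; b = rotr32(b ^ c, 7);
// (the left-rotates by 16/20/24/25 here equal right-rotates by 16/12/8/7). The
// constants 1779033703, -1150833019, 1013904242, -1521486534 are the BLAKE3 IV words
// 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A reinterpreted as signed i32.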
shr.u32 %r6165, %r6163, 16; + shl.b32 %r6166, %r6164, 16; + or.b32 %r6167, %r6166, %r6165; + add.s32 %r6168, %r6167, -1521486534; + xor.b32 %r6169, %r6168, %r11672; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 20; + add.s32 %r6171, %r6059, %r6163; + add.s32 %r6172, %r6171, %r6170; + xor.b32 %r6173, %r6172, %r6167; + shf.l.wrap.b32 %r6174, %r6173, %r6173, 24; + add.s32 %r6175, %r6174, %r6168; + xor.b32 %r6176, %r6175, %r6170; + shf.l.wrap.b32 %r6177, %r6176, %r6176, 25; + add.s32 %r6178, %r6145, %r6126; + add.s32 %r6179, %r6178, %r6066; + xor.b32 %r6180, %r6174, %r6179; + shf.l.wrap.b32 %r6181, %r6180, %r6180, 16; + add.s32 %r6182, %r6181, %r6159; + xor.b32 %r6183, %r6182, %r6145; + shf.l.wrap.b32 %r6184, %r6183, %r6183, 20; + add.s32 %r6185, %r6073, %r6179; + add.s32 %r6186, %r6185, %r6184; + xor.b32 %r6187, %r6186, %r6181; + shf.l.wrap.b32 %r6188, %r6187, %r6187, 24; + add.s32 %r6189, %r6188, %r6182; + xor.b32 %r6190, %r6189, %r6184; + shf.l.wrap.b32 %r6191, %r6190, %r6190, 25; + add.s32 %r6192, %r6161, %r6140; + add.s32 %r6193, %r6192, %r6080; + xor.b32 %r6194, %r6193, %r6128; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 16; + add.s32 %r6196, %r6195, %r6175; + xor.b32 %r6197, %r6196, %r6161; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 20; + add.s32 %r6199, %r6087, %r6193; + add.s32 %r6200, %r6199, %r6198; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 24; + add.s32 %r6203, %r6202, %r6196; + xor.b32 %r6204, %r6203, %r6198; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 25; + add.s32 %r6206, %r6177, %r6156; + add.s32 %r6207, %r6206, %r6094; + xor.b32 %r6208, %r6207, %r6142; + shf.l.wrap.b32 %r6209, %r6208, %r6208, 16; + add.s32 %r6210, %r6209, %r6129; + xor.b32 %r6211, %r6210, %r6177; + shf.l.wrap.b32 %r6212, %r6211, %r6211, 20; + add.s32 %r6213, %r6101, %r6207; + add.s32 %r6214, %r6213, %r6212; + xor.b32 %r6215, %r6214, %r6209; + shf.l.wrap.b32 %r6216, %r6215, %r6215, 24; + add.s32 %r6217, %r6216, %r6210; + xor.b32 %r6218, %r6217, %r6212; + shf.l.wrap.b32 %r6219, %r6218, %r6218, 25; + add.s32 %r6220, %r6172, %r6131; + add.s32 %r6221, %r6220, %r6108; + xor.b32 %r6222, %r6221, %r6158; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 16; + add.s32 %r6224, %r6223, %r6143; + xor.b32 %r6225, %r6224, %r6131; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 20; + add.s32 %r6227, %r6115, %r6221; + add.s32 %r6228, %r6227, %r6226; + xor.b32 %r6229, %r6228, %r6223; + shf.l.wrap.b32 %r6230, %r6229, %r6229, 24; + add.s32 %r6231, %r6230, %r6224; + xor.b32 %r6232, %r6231, %r6226; + shf.l.wrap.b32 %r6233, %r6232, %r6232, 25; + add.s32 %r6234, %r6186, %r6024; + add.s32 %r6235, %r6234, %r6233; + xor.b32 %r6236, %r6235, %r6202; + shf.l.wrap.b32 %r6237, %r6236, %r6236, 16; + add.s32 %r6238, %r6237, %r6217; + xor.b32 %r6239, %r6238, %r6233; + shf.l.wrap.b32 %r6240, %r6239, %r6239, 20; + add.s32 %r6241, %r6235, %r6052; + add.s32 %r6242, %r6241, %r6240; + xor.b32 %r6243, %r6242, %r6237; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6238; + xor.b32 %r6246, %r6245, %r6240; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6200, %r6031; + add.s32 %r6249, %r6248, %r6191; + xor.b32 %r6250, %r6216, %r6249; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6231, %r6251; + xor.b32 %r6253, %r6252, %r6191; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6080; + add.s32 %r6256, %r6255, %r6254; + xor.b32 %r6257, %r6256, %r6251; + shf.l.wrap.b32 %r6258, %r6257, %r6257, 24; + add.s32 %r6259, %r6258, %r6252; + xor.b32 %r6260, %r6259, %r6254; + 
shf.l.wrap.b32 %r6261, %r6260, %r6260, 25; + add.s32 %r6262, %r6205, %r6059; + add.s32 %r6263, %r6262, %r6214; + xor.b32 %r6264, %r6230, %r6263; + shf.l.wrap.b32 %r6265, %r6264, %r6264, 16; + add.s32 %r6266, %r6265, %r6189; + xor.b32 %r6267, %r6266, %r6205; + shf.l.wrap.b32 %r6268, %r6267, %r6267, 20; + add.s32 %r6269, %r6263, %r6010; + add.s32 %r6270, %r6269, %r6268; + xor.b32 %r6271, %r6270, %r6265; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 24; + add.s32 %r6273, %r6272, %r6266; + xor.b32 %r6274, %r6273, %r6268; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 25; + add.s32 %r6276, %r6219, %r6038; + add.s32 %r6277, %r6276, %r6228; + xor.b32 %r6278, %r6277, %r6188; + shf.l.wrap.b32 %r6279, %r6278, %r6278, 16; + add.s32 %r6280, %r6279, %r6203; + xor.b32 %r6281, %r6280, %r6219; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 20; + add.s32 %r6283, %r6277, %r6101; + add.s32 %r6284, %r6283, %r6282; + xor.b32 %r6285, %r6284, %r6279; + shf.l.wrap.b32 %r6286, %r6285, %r6285, 24; + add.s32 %r6287, %r6286, %r6280; + xor.b32 %r6288, %r6287, %r6282; + shf.l.wrap.b32 %r6289, %r6288, %r6288, 25; + add.s32 %r6290, %r6242, %r6017; + add.s32 %r6291, %r6290, %r6261; + xor.b32 %r6292, %r6291, %r6286; + shf.l.wrap.b32 %r6293, %r6292, %r6292, 16; + add.s32 %r6294, %r6293, %r6273; + xor.b32 %r6295, %r6294, %r6261; + shf.l.wrap.b32 %r6296, %r6295, %r6295, 20; + add.s32 %r6297, %r6291, %r6087; + add.s32 %r6298, %r6297, %r6296; + xor.b32 %r6299, %r6298, %r6293; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 24; + add.s32 %r6301, %r6300, %r6294; + xor.b32 %r6302, %r6301, %r6296; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 25; + add.s32 %r6304, %r6256, %r6094; + add.s32 %r6305, %r6304, %r6275; + xor.b32 %r6306, %r6305, %r6244; + shf.l.wrap.b32 %r6307, %r6306, %r6306, 16; + add.s32 %r6308, %r6307, %r6287; + xor.b32 %r6309, %r6308, %r6275; + shf.l.wrap.b32 %r6310, %r6309, %r6309, 20; + add.s32 %r6311, %r6305, %r6045; + add.s32 %r6312, %r6311, %r6310; + xor.b32 %r6313, %r6312, %r6307; + shf.l.wrap.b32 %r6314, %r6313, %r6313, 24; + add.s32 %r6315, %r6314, %r6308; + xor.b32 %r6316, %r6315, %r6310; + shf.l.wrap.b32 %r6317, %r6316, %r6316, 25; + add.s32 %r6318, %r6270, %r6073; + add.s32 %r6319, %r6318, %r6289; + xor.b32 %r6320, %r6319, %r6258; + shf.l.wrap.b32 %r6321, %r6320, %r6320, 16; + add.s32 %r6322, %r6321, %r6245; + xor.b32 %r6323, %r6322, %r6289; + shf.l.wrap.b32 %r6324, %r6323, %r6323, 20; + add.s32 %r6325, %r6319, %r6108; + add.s32 %r6326, %r6325, %r6324; + xor.b32 %r6327, %r6326, %r6321; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 24; + add.s32 %r6329, %r6328, %r6322; + xor.b32 %r6330, %r6329, %r6324; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 25; + add.s32 %r6332, %r6284, %r6115; + add.s32 %r6333, %r6332, %r6247; + xor.b32 %r6334, %r6333, %r6272; + shf.l.wrap.b32 %r6335, %r6334, %r6334, 16; + add.s32 %r6336, %r6335, %r6259; + xor.b32 %r6337, %r6336, %r6247; + shf.l.wrap.b32 %r6338, %r6337, %r6337, 20; + add.s32 %r6339, %r6333, %r6066; + add.s32 %r6340, %r6339, %r6338; + xor.b32 %r6341, %r6340, %r6335; + shf.l.wrap.b32 %r6342, %r6341, %r6341, 24; + add.s32 %r6343, %r6342, %r6336; + xor.b32 %r6344, %r6343, %r6338; + shf.l.wrap.b32 %r6345, %r6344, %r6344, 25; + add.s32 %r6346, %r6298, %r6031; + add.s32 %r6347, %r6346, %r6345; + xor.b32 %r6348, %r6347, %r6314; + shf.l.wrap.b32 %r6349, %r6348, %r6348, 16; + add.s32 %r6350, %r6349, %r6329; + xor.b32 %r6351, %r6350, %r6345; + shf.l.wrap.b32 %r6352, %r6351, %r6351, 20; + add.s32 %r6353, %r6347, %r6038; + add.s32 %r6354, %r6353, %r6352; + xor.b32 %r6355, %r6354, %r6349; + shf.l.wrap.b32 %r6356, 
%r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6350; + xor.b32 %r6358, %r6357, %r6352; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6312, %r6080; + add.s32 %r6361, %r6360, %r6303; + xor.b32 %r6362, %r6361, %r6328; + shf.l.wrap.b32 %r6363, %r6362, %r6362, 16; + add.s32 %r6364, %r6363, %r6343; + xor.b32 %r6365, %r6364, %r6303; + shf.l.wrap.b32 %r6366, %r6365, %r6365, 20; + add.s32 %r6367, %r6361, %r6094; + add.s32 %r6368, %r6367, %r6366; + xor.b32 %r6369, %r6368, %r6363; + shf.l.wrap.b32 %r6370, %r6369, %r6369, 24; + add.s32 %r6371, %r6370, %r6364; + xor.b32 %r6372, %r6371, %r6366; + shf.l.wrap.b32 %r6373, %r6372, %r6372, 25; + add.s32 %r6374, %r6326, %r6101; + add.s32 %r6375, %r6374, %r6317; + xor.b32 %r6376, %r6375, %r6342; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 16; + add.s32 %r6378, %r6377, %r6301; + xor.b32 %r6379, %r6378, %r6317; + shf.l.wrap.b32 %r6380, %r6379, %r6379, 20; + add.s32 %r6381, %r6375, %r6024; + add.s32 %r6382, %r6381, %r6380; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 24; + add.s32 %r6385, %r6384, %r6378; + xor.b32 %r6386, %r6385, %r6380; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 25; + add.s32 %r6388, %r6340, %r6059; + add.s32 %r6389, %r6388, %r6331; + xor.b32 %r6390, %r6389, %r6300; + shf.l.wrap.b32 %r6391, %r6390, %r6390, 16; + add.s32 %r6392, %r6391, %r6315; + xor.b32 %r6393, %r6392, %r6331; + shf.l.wrap.b32 %r6394, %r6393, %r6393, 20; + add.s32 %r6395, %r6389, %r6108; + add.s32 %r6396, %r6395, %r6394; + xor.b32 %r6397, %r6396, %r6391; + shf.l.wrap.b32 %r6398, %r6397, %r6397, 24; + add.s32 %r6399, %r6398, %r6392; + xor.b32 %r6400, %r6399, %r6394; + shf.l.wrap.b32 %r6401, %r6400, %r6400, 25; + add.s32 %r6402, %r6354, %r6052; + add.s32 %r6403, %r6402, %r6373; + xor.b32 %r6404, %r6403, %r6398; + shf.l.wrap.b32 %r6405, %r6404, %r6404, 16; + add.s32 %r6406, %r6405, %r6385; + xor.b32 %r6407, %r6406, %r6373; + shf.l.wrap.b32 %r6408, %r6407, %r6407, 20; + add.s32 %r6409, %r6403, %r6045; + add.s32 %r6410, %r6409, %r6408; + xor.b32 %r6411, %r6410, %r6405; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 24; + add.s32 %r6413, %r6412, %r6406; + xor.b32 %r6414, %r6413, %r6408; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 25; + add.s32 %r6416, %r6368, %r6073; + add.s32 %r6417, %r6416, %r6387; + xor.b32 %r6418, %r6417, %r6356; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 16; + add.s32 %r6420, %r6419, %r6399; + xor.b32 %r6421, %r6420, %r6387; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 20; + add.s32 %r6423, %r6417, %r6010; + add.s32 %r6424, %r6423, %r6422; + xor.b32 %r6425, %r6424, %r6419; + shf.l.wrap.b32 %r6426, %r6425, %r6425, 24; + add.s32 %r6427, %r6426, %r6420; + xor.b32 %r6428, %r6427, %r6422; + shf.l.wrap.b32 %r6429, %r6428, %r6428, 25; + add.s32 %r6430, %r6382, %r6087; + add.s32 %r6431, %r6430, %r6401; + xor.b32 %r6432, %r6431, %r6370; + shf.l.wrap.b32 %r6433, %r6432, %r6432, 16; + add.s32 %r6434, %r6433, %r6357; + xor.b32 %r6435, %r6434, %r6401; + shf.l.wrap.b32 %r6436, %r6435, %r6435, 20; + add.s32 %r6437, %r6431, %r6115; + add.s32 %r6438, %r6437, %r6436; + xor.b32 %r6439, %r6438, %r6433; + shf.l.wrap.b32 %r6440, %r6439, %r6439, 24; + add.s32 %r6441, %r6440, %r6434; + xor.b32 %r6442, %r6441, %r6436; + shf.l.wrap.b32 %r6443, %r6442, %r6442, 25; + add.s32 %r6444, %r6396, %r6066; + add.s32 %r6445, %r6444, %r6359; + xor.b32 %r6446, %r6445, %r6384; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 16; + add.s32 %r6448, %r6447, %r6371; + xor.b32 %r6449, %r6448, %r6359; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 20; + add.s32 %r6451, %r6445, %r6017; + 
add.s32 %r6452, %r6451, %r6450; + xor.b32 %r6453, %r6452, %r6447; + shf.l.wrap.b32 %r6454, %r6453, %r6453, 24; + add.s32 %r6455, %r6454, %r6448; + xor.b32 %r6456, %r6455, %r6450; + shf.l.wrap.b32 %r6457, %r6456, %r6456, 25; + add.s32 %r6458, %r6410, %r6080; + add.s32 %r6459, %r6458, %r6457; + xor.b32 %r6460, %r6459, %r6426; + shf.l.wrap.b32 %r6461, %r6460, %r6460, 16; + add.s32 %r6462, %r6461, %r6441; + xor.b32 %r6463, %r6462, %r6457; + shf.l.wrap.b32 %r6464, %r6463, %r6463, 20; + add.s32 %r6465, %r6459, %r6059; + add.s32 %r6466, %r6465, %r6464; + xor.b32 %r6467, %r6466, %r6461; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6462; + xor.b32 %r6470, %r6469, %r6464; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6424, %r6094; + add.s32 %r6473, %r6472, %r6415; + xor.b32 %r6474, %r6473, %r6440; + shf.l.wrap.b32 %r6475, %r6474, %r6474, 16; + add.s32 %r6476, %r6475, %r6455; + xor.b32 %r6477, %r6476, %r6415; + shf.l.wrap.b32 %r6478, %r6477, %r6477, 20; + add.s32 %r6479, %r6473, %r6073; + add.s32 %r6480, %r6479, %r6478; + xor.b32 %r6481, %r6480, %r6475; + shf.l.wrap.b32 %r6482, %r6481, %r6481, 24; + add.s32 %r6483, %r6482, %r6476; + xor.b32 %r6484, %r6483, %r6478; + shf.l.wrap.b32 %r6485, %r6484, %r6484, 25; + add.s32 %r6486, %r6438, %r6108; + add.s32 %r6487, %r6486, %r6429; + xor.b32 %r6488, %r6487, %r6454; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 16; + add.s32 %r6490, %r6489, %r6413; + xor.b32 %r6491, %r6490, %r6429; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 20; + add.s32 %r6493, %r6487, %r6031; + add.s32 %r6494, %r6493, %r6492; + xor.b32 %r6495, %r6494, %r6489; + shf.l.wrap.b32 %r6496, %r6495, %r6495, 24; + add.s32 %r6497, %r6496, %r6490; + xor.b32 %r6498, %r6497, %r6492; + shf.l.wrap.b32 %r6499, %r6498, %r6498, 25; + add.s32 %r6500, %r6452, %r6101; + add.s32 %r6501, %r6500, %r6443; + xor.b32 %r6502, %r6501, %r6412; + shf.l.wrap.b32 %r6503, %r6502, %r6502, 16; + add.s32 %r6504, %r6503, %r6427; + xor.b32 %r6505, %r6504, %r6443; + shf.l.wrap.b32 %r6506, %r6505, %r6505, 20; + add.s32 %r6507, %r6501, %r6115; + add.s32 %r6508, %r6507, %r6506; + xor.b32 %r6509, %r6508, %r6503; + shf.l.wrap.b32 %r6510, %r6509, %r6509, 24; + add.s32 %r6511, %r6510, %r6504; + xor.b32 %r6512, %r6511, %r6506; + shf.l.wrap.b32 %r6513, %r6512, %r6512, 25; + add.s32 %r6514, %r6466, %r6038; + add.s32 %r6515, %r6514, %r6485; + xor.b32 %r6516, %r6515, %r6510; + shf.l.wrap.b32 %r6517, %r6516, %r6516, 16; + add.s32 %r6518, %r6517, %r6497; + xor.b32 %r6519, %r6518, %r6485; + shf.l.wrap.b32 %r6520, %r6519, %r6519, 20; + add.s32 %r6521, %r6515, %r6010; + add.s32 %r6522, %r6521, %r6520; + xor.b32 %r6523, %r6522, %r6517; + shf.l.wrap.b32 %r6524, %r6523, %r6523, 24; + add.s32 %r6525, %r6524, %r6518; + xor.b32 %r6526, %r6525, %r6520; + shf.l.wrap.b32 %r6527, %r6526, %r6526, 25; + add.s32 %r6528, %r6480, %r6087; + add.s32 %r6529, %r6528, %r6499; + xor.b32 %r6530, %r6529, %r6468; + shf.l.wrap.b32 %r6531, %r6530, %r6530, 16; + add.s32 %r6532, %r6531, %r6511; + xor.b32 %r6533, %r6532, %r6499; + shf.l.wrap.b32 %r6534, %r6533, %r6533, 20; + add.s32 %r6535, %r6529, %r6024; + add.s32 %r6536, %r6535, %r6534; + xor.b32 %r6537, %r6536, %r6531; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 24; + add.s32 %r6539, %r6538, %r6532; + xor.b32 %r6540, %r6539, %r6534; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 25; + add.s32 %r6542, %r6494, %r6045; + add.s32 %r6543, %r6542, %r6513; + xor.b32 %r6544, %r6543, %r6482; + shf.l.wrap.b32 %r6545, %r6544, %r6544, 16; + add.s32 %r6546, %r6545, %r6469; + xor.b32 %r6547, %r6546, 
%r6513; + shf.l.wrap.b32 %r6548, %r6547, %r6547, 20; + add.s32 %r6549, %r6543, %r6066; + add.s32 %r6550, %r6549, %r6548; + xor.b32 %r6551, %r6550, %r6545; + shf.l.wrap.b32 %r6552, %r6551, %r6551, 24; + add.s32 %r6553, %r6552, %r6546; + xor.b32 %r6554, %r6553, %r6548; + shf.l.wrap.b32 %r6555, %r6554, %r6554, 25; + add.s32 %r6556, %r6508, %r6017; + add.s32 %r6557, %r6556, %r6471; + xor.b32 %r6558, %r6557, %r6496; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 16; + add.s32 %r6560, %r6559, %r6483; + xor.b32 %r6561, %r6560, %r6471; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 20; + add.s32 %r6563, %r6557, %r6052; + add.s32 %r6564, %r6563, %r6562; + xor.b32 %r6565, %r6564, %r6559; + shf.l.wrap.b32 %r6566, %r6565, %r6565, 24; + add.s32 %r6567, %r6566, %r6560; + xor.b32 %r6568, %r6567, %r6562; + shf.l.wrap.b32 %r6569, %r6568, %r6568, 25; + add.s32 %r6570, %r6522, %r6094; + add.s32 %r6571, %r6570, %r6569; + xor.b32 %r6572, %r6571, %r6538; + shf.l.wrap.b32 %r6573, %r6572, %r6572, 16; + add.s32 %r6574, %r6573, %r6553; + xor.b32 %r6575, %r6574, %r6569; + shf.l.wrap.b32 %r6576, %r6575, %r6575, 20; + add.s32 %r6577, %r6571, %r6101; + add.s32 %r6578, %r6577, %r6576; + xor.b32 %r6579, %r6578, %r6573; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6574; + xor.b32 %r6582, %r6581, %r6576; + shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6536, %r6073; + add.s32 %r6585, %r6584, %r6527; + xor.b32 %r6586, %r6585, %r6552; + shf.l.wrap.b32 %r6587, %r6586, %r6586, 16; + add.s32 %r6588, %r6587, %r6567; + xor.b32 %r6589, %r6588, %r6527; + shf.l.wrap.b32 %r6590, %r6589, %r6589, 20; + add.s32 %r6591, %r6585, %r6087; + add.s32 %r6592, %r6591, %r6590; + xor.b32 %r6593, %r6592, %r6587; + shf.l.wrap.b32 %r6594, %r6593, %r6593, 24; + add.s32 %r6595, %r6594, %r6588; + xor.b32 %r6596, %r6595, %r6590; + shf.l.wrap.b32 %r6597, %r6596, %r6596, 25; + add.s32 %r6598, %r6550, %r6115; + add.s32 %r6599, %r6598, %r6541; + xor.b32 %r6600, %r6599, %r6566; + shf.l.wrap.b32 %r6601, %r6600, %r6600, 16; + add.s32 %r6602, %r6601, %r6525; + xor.b32 %r6603, %r6602, %r6541; + shf.l.wrap.b32 %r6604, %r6603, %r6603, 20; + add.s32 %r6605, %r6599, %r6080; + add.s32 %r6606, %r6605, %r6604; + xor.b32 %r6607, %r6606, %r6601; + shf.l.wrap.b32 %r6608, %r6607, %r6607, 24; + add.s32 %r6609, %r6608, %r6602; + xor.b32 %r6610, %r6609, %r6604; + shf.l.wrap.b32 %r6611, %r6610, %r6610, 25; + add.s32 %r6612, %r6564, %r6108; + add.s32 %r6613, %r6612, %r6555; + xor.b32 %r6614, %r6613, %r6524; + shf.l.wrap.b32 %r6615, %r6614, %r6614, 16; + add.s32 %r6616, %r6615, %r6539; + xor.b32 %r6617, %r6616, %r6555; + shf.l.wrap.b32 %r6618, %r6617, %r6617, 20; + add.s32 %r6619, %r6613, %r6066; + add.s32 %r6620, %r6619, %r6618; + xor.b32 %r6621, %r6620, %r6615; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 24; + add.s32 %r6623, %r6622, %r6616; + xor.b32 %r6624, %r6623, %r6618; + shf.l.wrap.b32 %r6625, %r6624, %r6624, 25; + add.s32 %r6626, %r6578, %r6059; + add.s32 %r6627, %r6626, %r6597; + xor.b32 %r6628, %r6627, %r6622; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 16; + add.s32 %r6630, %r6629, %r6609; + xor.b32 %r6631, %r6630, %r6597; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 20; + add.s32 %r6633, %r6627, %r6024; + add.s32 %r6634, %r6633, %r6632; + xor.b32 %r6635, %r6634, %r6629; + shf.l.wrap.b32 %r6636, %r6635, %r6635, 24; + add.s32 %r6637, %r6636, %r6630; + xor.b32 %r6638, %r6637, %r6632; + shf.l.wrap.b32 %r6639, %r6638, %r6638, 25; + add.s32 %r6640, %r6592, %r6045; + add.s32 %r6641, %r6640, %r6611; + xor.b32 %r6642, %r6641, %r6580; + shf.l.wrap.b32 
%r6643, %r6642, %r6642, 16; + add.s32 %r6644, %r6643, %r6623; + xor.b32 %r6645, %r6644, %r6611; + shf.l.wrap.b32 %r6646, %r6645, %r6645, 20; + add.s32 %r6647, %r6641, %r6031; + add.s32 %r6648, %r6647, %r6646; + xor.b32 %r6649, %r6648, %r6643; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 24; + add.s32 %r6651, %r6650, %r6644; + xor.b32 %r6652, %r6651, %r6646; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 25; + add.s32 %r6654, %r6606, %r6010; + add.s32 %r6655, %r6654, %r6625; + xor.b32 %r6656, %r6655, %r6594; + shf.l.wrap.b32 %r6657, %r6656, %r6656, 16; + add.s32 %r6658, %r6657, %r6581; + xor.b32 %r6659, %r6658, %r6625; + shf.l.wrap.b32 %r6660, %r6659, %r6659, 20; + add.s32 %r6661, %r6655, %r6017; + add.s32 %r6662, %r6661, %r6660; + xor.b32 %r6663, %r6662, %r6657; + shf.l.wrap.b32 %r6664, %r6663, %r6663, 24; + add.s32 %r6665, %r6664, %r6658; + xor.b32 %r6666, %r6665, %r6660; + shf.l.wrap.b32 %r6667, %r6666, %r6666, 25; + add.s32 %r6668, %r6620, %r6052; + add.s32 %r6669, %r6668, %r6583; + xor.b32 %r6670, %r6669, %r6608; + shf.l.wrap.b32 %r6671, %r6670, %r6670, 16; + add.s32 %r6672, %r6671, %r6595; + xor.b32 %r6673, %r6672, %r6583; + shf.l.wrap.b32 %r6674, %r6673, %r6673, 20; + add.s32 %r6675, %r6669, %r6038; + add.s32 %r6676, %r6675, %r6674; + xor.b32 %r6677, %r6676, %r6671; + shf.l.wrap.b32 %r6678, %r6677, %r6677, 24; + add.s32 %r6679, %r6678, %r6672; + xor.b32 %r6680, %r6679, %r6674; + shf.l.wrap.b32 %r6681, %r6680, %r6680, 25; + add.s32 %r6682, %r6634, %r6073; + add.s32 %r6683, %r6682, %r6681; + xor.b32 %r6684, %r6683, %r6650; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 16; + add.s32 %r6686, %r6685, %r6665; + xor.b32 %r6687, %r6686, %r6681; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 20; + add.s32 %r6689, %r6683, %r6108; + add.s32 %r6690, %r6689, %r6688; + xor.b32 %r6691, %r6690, %r6685; + shf.l.wrap.b32 %r6692, %r6691, %r6691, 24; + add.s32 %r6693, %r6692, %r6686; + xor.b32 %r6694, %r6693, %r6688; + shf.l.wrap.b32 %r6695, %r6694, %r6694, 25; + add.s32 %r6696, %r6648, %r6087; + add.s32 %r6697, %r6696, %r6639; + xor.b32 %r6698, %r6697, %r6664; + shf.l.wrap.b32 %r6699, %r6698, %r6698, 16; + add.s32 %r6700, %r6699, %r6679; + xor.b32 %r6701, %r6700, %r6639; + shf.l.wrap.b32 %r6702, %r6701, %r6701, 20; + add.s32 %r6703, %r6697, %r6045; + add.s32 %r6704, %r6703, %r6702; + xor.b32 %r6705, %r6704, %r6699; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6700; + xor.b32 %r6708, %r6707, %r6702; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6662, %r6066; + add.s32 %r6711, %r6710, %r6653; + xor.b32 %r6712, %r6711, %r6678; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6637; + xor.b32 %r6715, %r6714, %r6653; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6094; + add.s32 %r6718, %r6717, %r6716; + xor.b32 %r6719, %r6718, %r6713; + shf.l.wrap.b32 %r6720, %r6719, %r6719, 24; + add.s32 %r6721, %r6720, %r6714; + xor.b32 %r6722, %r6721, %r6716; + shf.l.wrap.b32 %r6723, %r6722, %r6722, 25; + add.s32 %r6724, %r6676, %r6115; + add.s32 %r6725, %r6724, %r6667; + xor.b32 %r6726, %r6725, %r6636; + shf.l.wrap.b32 %r6727, %r6726, %r6726, 16; + add.s32 %r6728, %r6727, %r6651; + xor.b32 %r6729, %r6728, %r6667; + shf.l.wrap.b32 %r6730, %r6729, %r6729, 20; + add.s32 %r6731, %r6725, %r6017; + add.s32 %r6732, %r6731, %r6730; + xor.b32 %r6733, %r6732, %r6727; + shf.l.wrap.b32 %r6734, %r6733, %r6733, 24; + add.s32 %r6735, %r6734, %r6728; + xor.b32 %r6736, %r6735, %r6730; + shf.l.wrap.b32 %r6737, %r6736, %r6736, 25; + add.s32 %r6738, %r6690, %r6101; 
+ add.s32 %r6739, %r6738, %r6709; + xor.b32 %r6740, %r6739, %r6734; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 16; + add.s32 %r6742, %r6741, %r6721; + xor.b32 %r6743, %r6742, %r6709; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 20; + add.s32 %r6745, %r6739, %r6031; + add.s32 %r6746, %r6745, %r6744; + xor.b32 %r6747, %r6746, %r6741; + shf.l.wrap.b32 %r6748, %r6747, %r6747, 24; + add.s32 %r6749, %r6748, %r6742; + xor.b32 %r6750, %r6749, %r6744; + shf.l.wrap.b32 %r6751, %r6750, %r6750, 25; + add.s32 %r6752, %r6704, %r6010; + add.s32 %r6753, %r6752, %r6723; + xor.b32 %r6754, %r6753, %r6692; + shf.l.wrap.b32 %r6755, %r6754, %r6754, 16; + add.s32 %r6756, %r6755, %r6735; + xor.b32 %r6757, %r6756, %r6723; + shf.l.wrap.b32 %r6758, %r6757, %r6757, 20; + add.s32 %r6759, %r6753, %r6080; + add.s32 %r6760, %r6759, %r6758; + xor.b32 %r6761, %r6760, %r6755; + shf.l.wrap.b32 %r6762, %r6761, %r6761, 24; + add.s32 %r6763, %r6762, %r6756; + xor.b32 %r6764, %r6763, %r6758; + shf.l.wrap.b32 %r6765, %r6764, %r6764, 25; + add.s32 %r6766, %r6718, %r6024; + add.s32 %r6767, %r6766, %r6737; + xor.b32 %r6768, %r6767, %r6706; + shf.l.wrap.b32 %r6769, %r6768, %r6768, 16; + add.s32 %r6770, %r6769, %r6693; + xor.b32 %r6771, %r6770, %r6737; + shf.l.wrap.b32 %r6772, %r6771, %r6771, 20; + add.s32 %r6773, %r6767, %r6052; + add.s32 %r6774, %r6773, %r6772; + xor.b32 %r6775, %r6774, %r6769; + shf.l.wrap.b32 %r6776, %r6775, %r6775, 24; + add.s32 %r6777, %r6776, %r6770; + xor.b32 %r6778, %r6777, %r6772; + shf.l.wrap.b32 %r6779, %r6778, %r6778, 25; + add.s32 %r6780, %r6732, %r6038; + add.s32 %r6781, %r6780, %r6695; + xor.b32 %r6782, %r6781, %r6720; + shf.l.wrap.b32 %r6783, %r6782, %r6782, 16; + add.s32 %r6784, %r6783, %r6707; + xor.b32 %r6785, %r6784, %r6695; + shf.l.wrap.b32 %r6786, %r6785, %r6785, 20; + add.s32 %r6787, %r6781, %r6059; + add.s32 %r6788, %r6787, %r6786; + xor.b32 %r6789, %r6788, %r6783; + shf.l.wrap.b32 %r6790, %r6789, %r6789, 24; + add.s32 %r6791, %r6790, %r6784; + xor.b32 %r6792, %r6791, %r6786; + shf.l.wrap.b32 %r6793, %r6792, %r6792, 25; + add.s32 %r6794, %r6746, %r6087; + add.s32 %r6795, %r6794, %r6793; + xor.b32 %r6796, %r6795, %r6762; + shf.l.wrap.b32 %r6797, %r6796, %r6796, 16; + add.s32 %r6798, %r6797, %r6777; + xor.b32 %r6799, %r6798, %r6793; + shf.l.wrap.b32 %r6800, %r6799, %r6799, 20; + add.s32 %r6801, %r6795, %r6115; + add.s32 %r6802, %r6801, %r6800; + xor.b32 %r6803, %r6802, %r6797; + shf.l.wrap.b32 %r6804, %r6803, %r6803, 24; + add.s32 %r6805, %r6804, %r6798; + xor.b32 %r6806, %r6805, %r6800; + shf.l.wrap.b32 %r6807, %r6806, %r6806, 25; + add.s32 %r6808, %r6760, %r6045; + add.s32 %r6809, %r6808, %r6751; + xor.b32 %r6810, %r6809, %r6776; + shf.l.wrap.b32 %r6811, %r6810, %r6810, 16; + add.s32 %r6812, %r6811, %r6791; + xor.b32 %r6813, %r6812, %r6751; + shf.l.wrap.b32 %r6814, %r6813, %r6813, 20; + add.s32 %r6815, %r6809, %r6010; + add.s32 %r6816, %r6815, %r6814; + xor.b32 %r6817, %r6816, %r6811; + shf.l.wrap.b32 %r6818, %r6817, %r6817, 24; + add.s32 %r6819, %r6818, %r6812; + xor.b32 %r6820, %r6819, %r6814; + shf.l.wrap.b32 %r6821, %r6820, %r6820, 25; + add.s32 %r6822, %r6774, %r6017; + add.s32 %r6823, %r6822, %r6765; + xor.b32 %r6824, %r6823, %r6790; + shf.l.wrap.b32 %r6825, %r6824, %r6824, 16; + add.s32 %r6826, %r6825, %r6749; + xor.b32 %r6827, %r6826, %r6765; + shf.l.wrap.b32 %r6828, %r6827, %r6827, 20; + add.s32 %r6829, %r6823, %r6073; + add.s32 %r6830, %r6829, %r6828; + xor.b32 %r6831, %r6830, %r6825; + shf.l.wrap.b32 %r6832, %r6831, %r6831, 24; + add.s32 %r6833, %r6832, %r6826; + xor.b32 %r6834, %r6833, 
%r6828; + shf.l.wrap.b32 %r6835, %r6834, %r6834, 25; + add.s32 %r6836, %r6788, %r6066; + add.s32 %r6837, %r6836, %r6779; + xor.b32 %r6838, %r6837, %r6748; + shf.l.wrap.b32 %r6839, %r6838, %r6838, 16; + add.s32 %r6840, %r6839, %r6763; + xor.b32 %r6841, %r6840, %r6779; + shf.l.wrap.b32 %r6842, %r6841, %r6841, 20; + add.s32 %r6843, %r6837, %r6052; + add.s32 %r6844, %r6843, %r6842; + xor.b32 %r6845, %r6844, %r6839; + shf.l.wrap.b32 %r6846, %r6845, %r6845, 24; + add.s32 %r6847, %r6846, %r6840; + xor.b32 %r6848, %r6847, %r6842; + shf.l.wrap.b32 %r6849, %r6848, %r6848, 25; + add.s32 %r6850, %r6802, %r6108; + add.s32 %r6851, %r6850, %r6821; + xor.b32 %r6852, %r6851, %r6846; + shf.l.wrap.b32 %r6853, %r6852, %r6852, 16; + add.s32 %r6854, %r6853, %r6833; + xor.b32 %r6855, %r6854, %r6821; + shf.l.wrap.b32 %r6856, %r6855, %r6855, 20; + add.s32 %r6857, %r6851, %r6080; + add.s32 %r6858, %r6857, %r6856; + xor.b32 %r6859, %r6858, %r6853; + shf.l.wrap.b32 %r6860, %r6859, %r6859, 24; + add.s32 %r6861, %r6860, %r6854; + xor.b32 %r6862, %r6861, %r6856; + shf.l.wrap.b32 %r6863, %r6862, %r6862, 25; + add.s32 %r6864, %r6816, %r6024; + add.s32 %r6865, %r6864, %r6835; + xor.b32 %r6866, %r6865, %r6804; + shf.l.wrap.b32 %r6867, %r6866, %r6866, 16; + add.s32 %r6868, %r6867, %r6847; + xor.b32 %r6869, %r6868, %r6835; + shf.l.wrap.b32 %r6870, %r6869, %r6869, 20; + add.s32 %r6871, %r6865, %r6094; + add.s32 %r6872, %r6871, %r6870; + xor.b32 %r6873, %r6872, %r6867; + shf.l.wrap.b32 %r6874, %r6873, %r6873, 24; + add.s32 %r6875, %r6874, %r6868; + xor.b32 %r6876, %r6875, %r6870; + shf.l.wrap.b32 %r6877, %r6876, %r6876, 25; + add.s32 %r6878, %r6830, %r6031; + add.s32 %r6879, %r6878, %r6849; + xor.b32 %r6880, %r6879, %r6818; + shf.l.wrap.b32 %r6881, %r6880, %r6880, 16; + add.s32 %r6882, %r6881, %r6805; + xor.b32 %r6883, %r6882, %r6849; + shf.l.wrap.b32 %r6884, %r6883, %r6883, 20; + add.s32 %r6885, %r6879, %r6038; + add.s32 %r6886, %r6885, %r6884; + xor.b32 %r6887, %r6886, %r6881; + shf.l.wrap.b32 %r6888, %r6887, %r6887, 24; + add.s32 %r6889, %r6888, %r6882; + xor.b32 %r6890, %r6889, %r6884; + shf.l.wrap.b32 %r6891, %r6890, %r6890, 25; + add.s32 %r6892, %r6844, %r6059; + add.s32 %r6893, %r6892, %r6807; + xor.b32 %r6894, %r6893, %r6832; + shf.l.wrap.b32 %r6895, %r6894, %r6894, 16; + add.s32 %r6896, %r6895, %r6819; + xor.b32 %r6897, %r6896, %r6807; + shf.l.wrap.b32 %r6898, %r6897, %r6897, 20; + add.s32 %r6899, %r6893, %r6101; + add.s32 %r6900, %r6899, %r6898; + xor.b32 %r6901, %r6900, %r6895; + shf.l.wrap.b32 %r6902, %r6901, %r6901, 24; + add.s32 %r6903, %r6902, %r6896; + xor.b32 %r6904, %r6903, %r6898; + shf.l.wrap.b32 %r6905, %r6904, %r6904, 25; + xor.b32 %r11679, %r6889, %r6858; + xor.b32 %r11678, %r6903, %r6872; + xor.b32 %r11677, %r6861, %r6886; + xor.b32 %r11676, %r6900, %r6875; + xor.b32 %r11675, %r6905, %r6874; + xor.b32 %r11674, %r6863, %r6888; + xor.b32 %r11673, %r6902, %r6877; + xor.b32 %r11672, %r6891, %r6860; + add.s16 %rs353, %rs353, 1; + st.local.u8 [%rd54+1], %rs353; + add.s64 %rd258, %rd258, 64; + add.s64 %rd259, %rd259, -64; + setp.gt.u64 %p33, %rd259, 64; + @%p33 bra $L__BB1_38; + +$L__BB1_39: + min.u64 %rd61, %rd259, 64; + setp.eq.s64 %p34, %rd61, 0; + mov.u16 %rs355, %rs354; + mov.u16 %rs356, %rs354; + mov.u16 %rs357, %rs354; + mov.u16 %rs358, %rs354; + mov.u16 %rs359, %rs354; + mov.u16 %rs360, %rs354; + mov.u16 %rs361, %rs354; + mov.u16 %rs362, %rs354; + mov.u16 %rs363, %rs354; + mov.u16 %rs364, %rs354; + mov.u16 %rs365, %rs354; + mov.u16 %rs366, %rs354; + mov.u16 %rs367, %rs354; + mov.u16 %rs368, %rs354; + 
mov.u16 %rs369, %rs354; + mov.u16 %rs370, %rs354; + mov.u16 %rs371, %rs354; + mov.u16 %rs372, %rs354; + mov.u16 %rs373, %rs354; + mov.u16 %rs374, %rs354; + mov.u16 %rs375, %rs354; + mov.u16 %rs376, %rs354; + mov.u16 %rs377, %rs354; + mov.u16 %rs378, %rs354; + mov.u16 %rs379, %rs354; + mov.u16 %rs380, %rs354; + mov.u16 %rs381, %rs354; + mov.u16 %rs382, %rs354; + mov.u16 %rs383, %rs354; + mov.u16 %rs384, %rs354; + mov.u16 %rs385, %rs354; + mov.u16 %rs386, %rs354; + mov.u16 %rs387, %rs354; + @%p34 bra $L__BB1_43; + + mov.u64 %rd260, 0; + +$L__BB1_41: + add.s64 %rd187, %rd258, %rd260; + ld.local.u8 %rs251, [%rd187]; + add.s64 %rd188, %rd53, %rd260; + st.local.u8 [%rd188], %rs251; + add.s64 %rd260, %rd260, 1; + setp.lt.u64 %p35, %rd260, %rd61; + @%p35 bra $L__BB1_41; + + ld.local.v4.u16 {%rs384, %rs385, %rs386, %rs387}, [%rd53]; + ld.local.v4.u16 {%rs380, %rs381, %rs382, %rs383}, [%rd53+8]; + ld.local.v4.u16 {%rs376, %rs377, %rs378, %rs379}, [%rd53+16]; + ld.local.v4.u16 {%rs372, %rs373, %rs374, %rs375}, [%rd53+24]; + ld.local.v4.u16 {%rs368, %rs369, %rs370, %rs371}, [%rd53+32]; + ld.local.v4.u16 {%rs364, %rs365, %rs366, %rs367}, [%rd53+40]; + ld.local.v4.u16 {%rs360, %rs361, %rs362, %rs363}, [%rd53+48]; + ld.local.v4.u16 {%rs357, %rs358, %rs359, %rs283}, [%rd53+56]; + ld.local.u8 %rs356, [%rd53+61]; + ld.local.v2.u8 {%rs354, %rs355}, [%rd53+62]; + +$L__BB1_43: + ld.local.v4.u8 {%rs286, %rs287, %rs288, %rs289}, [%rd53+64]; + cvt.u16.u64 %rs292, %rd61; + add.s16 %rs293, %rs286, %rs292; + st.local.u8 [%rd53+64], %rs293; + setp.eq.s16 %p36, %rs287, 0; + selp.u16 %rs294, 1, 0, %p36; + or.b16 %rs295, %rs288, %rs294; + or.b16 %rs296, %rs295, 2; + shr.u16 %rs297, %rs384, 8; + shr.u16 %rs298, %rs385, 8; + shr.u16 %rs299, %rs386, 8; + shr.u16 %rs300, %rs387, 8; + shr.u16 %rs301, %rs380, 8; + shr.u16 %rs302, %rs381, 8; + shr.u16 %rs303, %rs382, 8; + shr.u16 %rs304, %rs383, 8; + shr.u16 %rs305, %rs376, 8; + shr.u16 %rs306, %rs377, 8; + shr.u16 %rs307, %rs378, 8; + shr.u16 %rs308, %rs379, 8; + shr.u16 %rs309, %rs372, 8; + shr.u16 %rs310, %rs373, 8; + shr.u16 %rs311, %rs374, 8; + shr.u16 %rs312, %rs375, 8; + shr.u16 %rs313, %rs368, 8; + shr.u16 %rs314, %rs369, 8; + shr.u16 %rs315, %rs370, 8; + shr.u16 %rs316, %rs371, 8; + shr.u16 %rs317, %rs364, 8; + shr.u16 %rs318, %rs365, 8; + shr.u16 %rs319, %rs366, 8; + shr.u16 %rs320, %rs367, 8; + shr.u16 %rs321, %rs360, 8; + shr.u16 %rs322, %rs361, 8; + shr.u16 %rs323, %rs362, 8; + shr.u16 %rs324, %rs363, 8; + shr.u16 %rs325, %rs357, 8; + shr.u16 %rs326, %rs358, 8; + cvt.u32.u16 %r6906, %rs384; + and.b32 %r6907, %r6906, 255; + cvt.u32.u16 %r6908, %rs297; + prmt.b32 %r6909, %r6908, %r6907, 30212; + cvt.u32.u16 %r6910, %rs385; + prmt.b32 %r6911, %r6910, %r6909, 28756; + cvt.u32.u16 %r6912, %rs298; + prmt.b32 %r6913, %r6912, %r6911, 1620; + cvt.u32.u16 %r6914, %rs386; + and.b32 %r6915, %r6914, 255; + cvt.u32.u16 %r6916, %rs299; + prmt.b32 %r6917, %r6916, %r6915, 30212; + cvt.u32.u16 %r6918, %rs387; + prmt.b32 %r6919, %r6918, %r6917, 28756; + cvt.u32.u16 %r6920, %rs300; + prmt.b32 %r6921, %r6920, %r6919, 1620; + cvt.u32.u16 %r6922, %rs380; + and.b32 %r6923, %r6922, 255; + cvt.u32.u16 %r6924, %rs301; + prmt.b32 %r6925, %r6924, %r6923, 30212; + cvt.u32.u16 %r6926, %rs381; + prmt.b32 %r6927, %r6926, %r6925, 28756; + cvt.u32.u16 %r6928, %rs302; + prmt.b32 %r6929, %r6928, %r6927, 1620; + cvt.u32.u16 %r6930, %rs382; + and.b32 %r6931, %r6930, 255; + cvt.u32.u16 %r6932, %rs303; + prmt.b32 %r6933, %r6932, %r6931, 30212; + cvt.u32.u16 %r6934, %rs383; + prmt.b32 %r6935, %r6934, 
%r6933, 28756; + cvt.u32.u16 %r6936, %rs304; + prmt.b32 %r6937, %r6936, %r6935, 1620; + cvt.u32.u16 %r6938, %rs376; + and.b32 %r6939, %r6938, 255; + cvt.u32.u16 %r6940, %rs305; + prmt.b32 %r6941, %r6940, %r6939, 30212; + cvt.u32.u16 %r6942, %rs377; + prmt.b32 %r6943, %r6942, %r6941, 28756; + cvt.u32.u16 %r6944, %rs306; + prmt.b32 %r6945, %r6944, %r6943, 1620; + cvt.u32.u16 %r6946, %rs378; + and.b32 %r6947, %r6946, 255; + cvt.u32.u16 %r6948, %rs307; + prmt.b32 %r6949, %r6948, %r6947, 30212; + cvt.u32.u16 %r6950, %rs379; + prmt.b32 %r6951, %r6950, %r6949, 28756; + cvt.u32.u16 %r6952, %rs308; + prmt.b32 %r6953, %r6952, %r6951, 1620; + cvt.u32.u16 %r6954, %rs372; + and.b32 %r6955, %r6954, 255; + cvt.u32.u16 %r6956, %rs309; + prmt.b32 %r6957, %r6956, %r6955, 30212; + cvt.u32.u16 %r6958, %rs373; + prmt.b32 %r6959, %r6958, %r6957, 28756; + cvt.u32.u16 %r6960, %rs310; + prmt.b32 %r6961, %r6960, %r6959, 1620; + cvt.u32.u16 %r6962, %rs374; + and.b32 %r6963, %r6962, 255; + cvt.u32.u16 %r6964, %rs311; + prmt.b32 %r6965, %r6964, %r6963, 30212; + cvt.u32.u16 %r6966, %rs375; + prmt.b32 %r6967, %r6966, %r6965, 28756; + cvt.u32.u16 %r6968, %rs312; + prmt.b32 %r6969, %r6968, %r6967, 1620; + cvt.u32.u16 %r6970, %rs368; + and.b32 %r6971, %r6970, 255; + cvt.u32.u16 %r6972, %rs313; + prmt.b32 %r6973, %r6972, %r6971, 30212; + cvt.u32.u16 %r6974, %rs369; + prmt.b32 %r6975, %r6974, %r6973, 28756; + cvt.u32.u16 %r6976, %rs314; + prmt.b32 %r6977, %r6976, %r6975, 1620; + cvt.u32.u16 %r6978, %rs370; + and.b32 %r6979, %r6978, 255; + cvt.u32.u16 %r6980, %rs315; + prmt.b32 %r6981, %r6980, %r6979, 30212; + cvt.u32.u16 %r6982, %rs371; + prmt.b32 %r6983, %r6982, %r6981, 28756; + cvt.u32.u16 %r6984, %rs316; + prmt.b32 %r6985, %r6984, %r6983, 1620; + cvt.u32.u16 %r6986, %rs364; + and.b32 %r6987, %r6986, 255; + cvt.u32.u16 %r6988, %rs317; + prmt.b32 %r6989, %r6988, %r6987, 30212; + cvt.u32.u16 %r6990, %rs365; + prmt.b32 %r6991, %r6990, %r6989, 28756; + cvt.u32.u16 %r6992, %rs318; + prmt.b32 %r6993, %r6992, %r6991, 1620; + cvt.u32.u16 %r6994, %rs366; + and.b32 %r6995, %r6994, 255; + cvt.u32.u16 %r6996, %rs319; + prmt.b32 %r6997, %r6996, %r6995, 30212; + cvt.u32.u16 %r6998, %rs367; + prmt.b32 %r6999, %r6998, %r6997, 28756; + cvt.u32.u16 %r7000, %rs320; + prmt.b32 %r7001, %r7000, %r6999, 1620; + cvt.u32.u16 %r7002, %rs360; + and.b32 %r7003, %r7002, 255; + cvt.u32.u16 %r7004, %rs321; + prmt.b32 %r7005, %r7004, %r7003, 30212; + cvt.u32.u16 %r7006, %rs361; + prmt.b32 %r7007, %r7006, %r7005, 28756; + cvt.u32.u16 %r7008, %rs322; + prmt.b32 %r7009, %r7008, %r7007, 1620; + cvt.u32.u16 %r7010, %rs362; + and.b32 %r7011, %r7010, 255; + cvt.u32.u16 %r7012, %rs323; + prmt.b32 %r7013, %r7012, %r7011, 30212; + cvt.u32.u16 %r7014, %rs363; + prmt.b32 %r7015, %r7014, %r7013, 28756; + cvt.u32.u16 %r7016, %rs324; + prmt.b32 %r7017, %r7016, %r7015, 1620; + cvt.u32.u16 %r7018, %rs357; + and.b32 %r7019, %r7018, 255; + cvt.u32.u16 %r7020, %rs325; + prmt.b32 %r7021, %r7020, %r7019, 30212; + cvt.u32.u16 %r7022, %rs358; + prmt.b32 %r7023, %r7022, %r7021, 28756; + cvt.u32.u16 %r7024, %rs326; + prmt.b32 %r7025, %r7024, %r7023, 1620; + cvt.u32.u16 %r7026, %rs359; + and.b32 %r7027, %r7026, 255; + cvt.u32.u16 %r7028, %rs356; + prmt.b32 %r7029, %r7028, %r7027, 30212; + cvt.u32.u16 %r7030, %rs354; + shl.b32 %r7031, %r7030, 16; + and.b32 %r7032, %r7031, 16711680; + or.b32 %r7033, %r7029, %r7032; + cvt.u32.u16 %r7034, %rs355; + shl.b32 %r7035, %r7034, 24; + or.b32 %r7036, %r7033, %r7035; + cvt.u32.u16 %r7037, %rs293; + and.b32 %r7038, %r7037, 255; + cvt.u32.u16 
%r7039, %rs296; + and.b32 %r7040, %r7039, 255; + add.s32 %r7041, %r11675, %r11679; + add.s32 %r7042, %r7041, %r6913; + xor.b32 %r7043, %r7042, %r71; + shf.l.wrap.b32 %r7044, %r7043, %r7043, 16; + add.s32 %r7045, %r7044, 1779033703; + xor.b32 %r7046, %r7045, %r11675; + shf.l.wrap.b32 %r7047, %r7046, %r7046, 20; + add.s32 %r7048, %r6921, %r7042; + add.s32 %r7049, %r7048, %r7047; + xor.b32 %r7050, %r7049, %r7044; + shf.l.wrap.b32 %r7051, %r7050, %r7050, 24; + add.s32 %r7052, %r7051, %r7045; + xor.b32 %r7053, %r7052, %r7047; + shf.l.wrap.b32 %r7054, %r7053, %r7053, 25; + add.s32 %r7055, %r11674, %r11678; + add.s32 %r7056, %r7055, %r6929; + xor.b32 %r7057, %r7056, %r72; + shf.l.wrap.b32 %r7058, %r7057, %r7057, 16; + add.s32 %r7059, %r7058, -1150833019; + xor.b32 %r7060, %r7059, %r11674; + shf.l.wrap.b32 %r7061, %r7060, %r7060, 20; + add.s32 %r7062, %r6937, %r7056; + add.s32 %r7063, %r7062, %r7061; + xor.b32 %r7064, %r7063, %r7058; + shf.l.wrap.b32 %r7065, %r7064, %r7064, 24; + add.s32 %r7066, %r7065, %r7059; + xor.b32 %r7067, %r7066, %r7061; + shf.l.wrap.b32 %r7068, %r7067, %r7067, 25; + add.s32 %r7069, %r11673, %r11677; + add.s32 %r7070, %r7069, %r6945; + xor.b32 %r7071, %r7070, %r7038; + shr.u32 %r7072, %r7070, 16; + shl.b32 %r7073, %r7071, 16; + or.b32 %r7074, %r7073, %r7072; + add.s32 %r7075, %r7074, 1013904242; + xor.b32 %r7076, %r7075, %r11673; + shf.l.wrap.b32 %r7077, %r7076, %r7076, 20; + add.s32 %r7078, %r6953, %r7070; + add.s32 %r7079, %r7078, %r7077; + xor.b32 %r7080, %r7079, %r7074; + shf.l.wrap.b32 %r7081, %r7080, %r7080, 24; + add.s32 %r7082, %r7081, %r7075; + xor.b32 %r7083, %r7082, %r7077; + shf.l.wrap.b32 %r7084, %r7083, %r7083, 25; + add.s32 %r7085, %r11672, %r11676; + add.s32 %r7086, %r7085, %r6961; + xor.b32 %r7087, %r7086, %r7040; + shr.u32 %r7088, %r7086, 16; + shl.b32 %r7089, %r7087, 16; + or.b32 %r7090, %r7089, %r7088; + add.s32 %r7091, %r7090, -1521486534; + xor.b32 %r7092, %r7091, %r11672; + shf.l.wrap.b32 %r7093, %r7092, %r7092, 20; + add.s32 %r7094, %r6969, %r7086; + add.s32 %r7095, %r7094, %r7093; + xor.b32 %r7096, %r7095, %r7090; + shf.l.wrap.b32 %r7097, %r7096, %r7096, 24; + add.s32 %r7098, %r7097, %r7091; + xor.b32 %r7099, %r7098, %r7093; + shf.l.wrap.b32 %r7100, %r7099, %r7099, 25; + add.s32 %r7101, %r7068, %r7049; + add.s32 %r7102, %r7101, %r6977; + xor.b32 %r7103, %r7097, %r7102; + shf.l.wrap.b32 %r7104, %r7103, %r7103, 16; + add.s32 %r7105, %r7104, %r7082; + xor.b32 %r7106, %r7105, %r7068; + shf.l.wrap.b32 %r7107, %r7106, %r7106, 20; + add.s32 %r7108, %r6985, %r7102; + add.s32 %r7109, %r7108, %r7107; + xor.b32 %r7110, %r7109, %r7104; + shf.l.wrap.b32 %r7111, %r7110, %r7110, 24; + add.s32 %r7112, %r7111, %r7105; + xor.b32 %r7113, %r7112, %r7107; + shf.l.wrap.b32 %r7114, %r7113, %r7113, 25; + add.s32 %r7115, %r7084, %r7063; + add.s32 %r7116, %r7115, %r6993; + xor.b32 %r7117, %r7116, %r7051; + shf.l.wrap.b32 %r7118, %r7117, %r7117, 16; + add.s32 %r7119, %r7118, %r7098; + xor.b32 %r7120, %r7119, %r7084; + shf.l.wrap.b32 %r7121, %r7120, %r7120, 20; + add.s32 %r7122, %r7001, %r7116; + add.s32 %r7123, %r7122, %r7121; + xor.b32 %r7124, %r7123, %r7118; + shf.l.wrap.b32 %r7125, %r7124, %r7124, 24; + add.s32 %r7126, %r7125, %r7119; + xor.b32 %r7127, %r7126, %r7121; + shf.l.wrap.b32 %r7128, %r7127, %r7127, 25; + add.s32 %r7129, %r7100, %r7079; + add.s32 %r7130, %r7129, %r7009; + xor.b32 %r7131, %r7130, %r7065; + shf.l.wrap.b32 %r7132, %r7131, %r7131, 16; + add.s32 %r7133, %r7132, %r7052; + xor.b32 %r7134, %r7133, %r7100; + shf.l.wrap.b32 %r7135, %r7134, %r7134, 20; + 
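// --- annotation (not NVVM compiler output): this path appears to handle the chunk's
// final (possibly partial) block: $L__BB1_41 copies the remaining min(len, 64) bytes
// into a zero-initialized local buffer, the copied byte count is stored at offset 64
// of that state, and the flags byte is built as
// (blocks_compressed == 0 ? 1 : 0) | flags | 2 -- 1 and 2 being BLAKE3's CHUNK_START
// and CHUNK_END flags -- before the same unrolled G rounds run on the padded block.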
add.s32 %r7136, %r7017, %r7130; + add.s32 %r7137, %r7136, %r7135; + xor.b32 %r7138, %r7137, %r7132; + shf.l.wrap.b32 %r7139, %r7138, %r7138, 24; + add.s32 %r7140, %r7139, %r7133; + xor.b32 %r7141, %r7140, %r7135; + shf.l.wrap.b32 %r7142, %r7141, %r7141, 25; + add.s32 %r7143, %r7095, %r7054; + add.s32 %r7144, %r7143, %r7025; + xor.b32 %r7145, %r7144, %r7081; + shf.l.wrap.b32 %r7146, %r7145, %r7145, 16; + add.s32 %r7147, %r7146, %r7066; + xor.b32 %r7148, %r7147, %r7054; + shf.l.wrap.b32 %r7149, %r7148, %r7148, 20; + add.s32 %r7150, %r7036, %r7144; + add.s32 %r7151, %r7150, %r7149; + xor.b32 %r7152, %r7151, %r7146; + shf.l.wrap.b32 %r7153, %r7152, %r7152, 24; + add.s32 %r7154, %r7153, %r7147; + xor.b32 %r7155, %r7154, %r7149; + shf.l.wrap.b32 %r7156, %r7155, %r7155, 25; + add.s32 %r7157, %r7109, %r6929; + add.s32 %r7158, %r7157, %r7156; + xor.b32 %r7159, %r7158, %r7125; + shf.l.wrap.b32 %r7160, %r7159, %r7159, 16; + add.s32 %r7161, %r7160, %r7140; + xor.b32 %r7162, %r7161, %r7156; + shf.l.wrap.b32 %r7163, %r7162, %r7162, 20; + add.s32 %r7164, %r7158, %r6961; + add.s32 %r7165, %r7164, %r7163; + xor.b32 %r7166, %r7165, %r7160; + shf.l.wrap.b32 %r7167, %r7166, %r7166, 24; + add.s32 %r7168, %r7167, %r7161; + xor.b32 %r7169, %r7168, %r7163; + shf.l.wrap.b32 %r7170, %r7169, %r7169, 25; + add.s32 %r7171, %r7123, %r6937; + add.s32 %r7172, %r7171, %r7114; + xor.b32 %r7173, %r7139, %r7172; + shf.l.wrap.b32 %r7174, %r7173, %r7173, 16; + add.s32 %r7175, %r7154, %r7174; + xor.b32 %r7176, %r7175, %r7114; + shf.l.wrap.b32 %r7177, %r7176, %r7176, 20; + add.s32 %r7178, %r7172, %r6993; + add.s32 %r7179, %r7178, %r7177; + xor.b32 %r7180, %r7179, %r7174; + shf.l.wrap.b32 %r7181, %r7180, %r7180, 24; + add.s32 %r7182, %r7181, %r7175; + xor.b32 %r7183, %r7182, %r7177; + shf.l.wrap.b32 %r7184, %r7183, %r7183, 25; + add.s32 %r7185, %r7128, %r6969; + add.s32 %r7186, %r7185, %r7137; + xor.b32 %r7187, %r7153, %r7186; + shf.l.wrap.b32 %r7188, %r7187, %r7187, 16; + add.s32 %r7189, %r7188, %r7112; + xor.b32 %r7190, %r7189, %r7128; + shf.l.wrap.b32 %r7191, %r7190, %r7190, 20; + add.s32 %r7192, %r7186, %r6913; + add.s32 %r7193, %r7192, %r7191; + xor.b32 %r7194, %r7193, %r7188; + shf.l.wrap.b32 %r7195, %r7194, %r7194, 24; + add.s32 %r7196, %r7195, %r7189; + xor.b32 %r7197, %r7196, %r7191; + shf.l.wrap.b32 %r7198, %r7197, %r7197, 25; + add.s32 %r7199, %r7142, %r6945; + add.s32 %r7200, %r7199, %r7151; + xor.b32 %r7201, %r7200, %r7111; + shf.l.wrap.b32 %r7202, %r7201, %r7201, 16; + add.s32 %r7203, %r7202, %r7126; + xor.b32 %r7204, %r7203, %r7142; + shf.l.wrap.b32 %r7205, %r7204, %r7204, 20; + add.s32 %r7206, %r7200, %r7017; + add.s32 %r7207, %r7206, %r7205; + xor.b32 %r7208, %r7207, %r7202; + shf.l.wrap.b32 %r7209, %r7208, %r7208, 24; + add.s32 %r7210, %r7209, %r7203; + xor.b32 %r7211, %r7210, %r7205; + shf.l.wrap.b32 %r7212, %r7211, %r7211, 25; + add.s32 %r7213, %r7184, %r6921; + add.s32 %r7214, %r7213, %r7165; + xor.b32 %r7215, %r7214, %r7209; + shf.l.wrap.b32 %r7216, %r7215, %r7215, 16; + add.s32 %r7217, %r7216, %r7196; + xor.b32 %r7218, %r7217, %r7184; + shf.l.wrap.b32 %r7219, %r7218, %r7218, 20; + add.s32 %r7220, %r7214, %r7001; + add.s32 %r7221, %r7220, %r7219; + xor.b32 %r7222, %r7221, %r7216; + shf.l.wrap.b32 %r7223, %r7222, %r7222, 24; + add.s32 %r7224, %r7223, %r7217; + xor.b32 %r7225, %r7224, %r7219; + shf.l.wrap.b32 %r7226, %r7225, %r7225, 25; + add.s32 %r7227, %r7179, %r7009; + add.s32 %r7228, %r7227, %r7198; + xor.b32 %r7229, %r7167, %r7228; + shf.l.wrap.b32 %r7230, %r7229, %r7229, 16; + add.s32 %r7231, %r7230, 
%r7210; + xor.b32 %r7232, %r7231, %r7198; + shf.l.wrap.b32 %r7233, %r7232, %r7232, 20; + add.s32 %r7234, %r7228, %r6953; + add.s32 %r7235, %r7234, %r7233; + xor.b32 %r7236, %r7235, %r7230; + shf.l.wrap.b32 %r7237, %r7236, %r7236, 24; + add.s32 %r7238, %r7237, %r7231; + xor.b32 %r7239, %r7238, %r7233; + shf.l.wrap.b32 %r7240, %r7239, %r7239, 25; + add.s32 %r7241, %r7193, %r6985; + add.s32 %r7242, %r7241, %r7212; + xor.b32 %r7243, %r7242, %r7181; + shf.l.wrap.b32 %r7244, %r7243, %r7243, 16; + add.s32 %r7245, %r7244, %r7168; + xor.b32 %r7246, %r7245, %r7212; + shf.l.wrap.b32 %r7247, %r7246, %r7246, 20; + add.s32 %r7248, %r7242, %r7025; + add.s32 %r7249, %r7248, %r7247; + xor.b32 %r7250, %r7249, %r7244; + shf.l.wrap.b32 %r7251, %r7250, %r7250, 24; + add.s32 %r7252, %r7251, %r7245; + xor.b32 %r7253, %r7252, %r7247; + shf.l.wrap.b32 %r7254, %r7253, %r7253, 25; + add.s32 %r7255, %r7207, %r7036; + add.s32 %r7256, %r7255, %r7170; + xor.b32 %r7257, %r7256, %r7195; + shf.l.wrap.b32 %r7258, %r7257, %r7257, 16; + add.s32 %r7259, %r7258, %r7182; + xor.b32 %r7260, %r7259, %r7170; + shf.l.wrap.b32 %r7261, %r7260, %r7260, 20; + add.s32 %r7262, %r7256, %r6977; + add.s32 %r7263, %r7262, %r7261; + xor.b32 %r7264, %r7263, %r7258; + shf.l.wrap.b32 %r7265, %r7264, %r7264, 24; + add.s32 %r7266, %r7265, %r7259; + xor.b32 %r7267, %r7266, %r7261; + shf.l.wrap.b32 %r7268, %r7267, %r7267, 25; + add.s32 %r7269, %r7221, %r6937; + add.s32 %r7270, %r7269, %r7268; + xor.b32 %r7271, %r7270, %r7237; + shf.l.wrap.b32 %r7272, %r7271, %r7271, 16; + add.s32 %r7273, %r7272, %r7252; + xor.b32 %r7274, %r7273, %r7268; + shf.l.wrap.b32 %r7275, %r7274, %r7274, 20; + add.s32 %r7276, %r7270, %r6945; + add.s32 %r7277, %r7276, %r7275; + xor.b32 %r7278, %r7277, %r7272; + shf.l.wrap.b32 %r7279, %r7278, %r7278, 24; + add.s32 %r7280, %r7279, %r7273; + xor.b32 %r7281, %r7280, %r7275; + shf.l.wrap.b32 %r7282, %r7281, %r7281, 25; + add.s32 %r7283, %r7235, %r6993; + add.s32 %r7284, %r7283, %r7226; + xor.b32 %r7285, %r7284, %r7251; + shf.l.wrap.b32 %r7286, %r7285, %r7285, 16; + add.s32 %r7287, %r7286, %r7266; + xor.b32 %r7288, %r7287, %r7226; + shf.l.wrap.b32 %r7289, %r7288, %r7288, 20; + add.s32 %r7290, %r7284, %r7009; + add.s32 %r7291, %r7290, %r7289; + xor.b32 %r7292, %r7291, %r7286; + shf.l.wrap.b32 %r7293, %r7292, %r7292, 24; + add.s32 %r7294, %r7293, %r7287; + xor.b32 %r7295, %r7294, %r7289; + shf.l.wrap.b32 %r7296, %r7295, %r7295, 25; + add.s32 %r7297, %r7249, %r7017; + add.s32 %r7298, %r7297, %r7240; + xor.b32 %r7299, %r7265, %r7298; + shf.l.wrap.b32 %r7300, %r7299, %r7299, 16; + add.s32 %r7301, %r7300, %r7224; + xor.b32 %r7302, %r7301, %r7240; + shf.l.wrap.b32 %r7303, %r7302, %r7302, 20; + add.s32 %r7304, %r7298, %r6929; + add.s32 %r7305, %r7304, %r7303; + xor.b32 %r7306, %r7305, %r7300; + shf.l.wrap.b32 %r7307, %r7306, %r7306, 24; + add.s32 %r7308, %r7307, %r7301; + xor.b32 %r7309, %r7308, %r7303; + shf.l.wrap.b32 %r7310, %r7309, %r7309, 25; + add.s32 %r7311, %r7254, %r6969; + add.s32 %r7312, %r7311, %r7263; + xor.b32 %r7313, %r7312, %r7223; + shf.l.wrap.b32 %r7314, %r7313, %r7313, 16; + add.s32 %r7315, %r7314, %r7238; + xor.b32 %r7316, %r7315, %r7254; + shf.l.wrap.b32 %r7317, %r7316, %r7316, 20; + add.s32 %r7318, %r7312, %r7025; + add.s32 %r7319, %r7318, %r7317; + xor.b32 %r7320, %r7319, %r7314; + shf.l.wrap.b32 %r7321, %r7320, %r7320, 24; + add.s32 %r7322, %r7321, %r7315; + xor.b32 %r7323, %r7322, %r7317; + shf.l.wrap.b32 %r7324, %r7323, %r7323, 25; + add.s32 %r7325, %r7277, %r6961; + add.s32 %r7326, %r7325, %r7296; + xor.b32 %r7327, 
%r7326, %r7321; + shf.l.wrap.b32 %r7328, %r7327, %r7327, 16; + add.s32 %r7329, %r7328, %r7308; + xor.b32 %r7330, %r7329, %r7296; + shf.l.wrap.b32 %r7331, %r7330, %r7330, 20; + add.s32 %r7332, %r7326, %r6953; + add.s32 %r7333, %r7332, %r7331; + xor.b32 %r7334, %r7333, %r7328; + shf.l.wrap.b32 %r7335, %r7334, %r7334, 24; + add.s32 %r7336, %r7335, %r7329; + xor.b32 %r7337, %r7336, %r7331; + shf.l.wrap.b32 %r7338, %r7337, %r7337, 25; + add.s32 %r7339, %r7291, %r6985; + add.s32 %r7340, %r7339, %r7310; + xor.b32 %r7341, %r7279, %r7340; + shf.l.wrap.b32 %r7342, %r7341, %r7341, 16; + add.s32 %r7343, %r7342, %r7322; + xor.b32 %r7344, %r7343, %r7310; + shf.l.wrap.b32 %r7345, %r7344, %r7344, 20; + add.s32 %r7346, %r7340, %r6913; + add.s32 %r7347, %r7346, %r7345; + xor.b32 %r7348, %r7347, %r7342; + shf.l.wrap.b32 %r7349, %r7348, %r7348, 24; + add.s32 %r7350, %r7349, %r7343; + xor.b32 %r7351, %r7350, %r7345; + shf.l.wrap.b32 %r7352, %r7351, %r7351, 25; + add.s32 %r7353, %r7305, %r7001; + add.s32 %r7354, %r7353, %r7324; + xor.b32 %r7355, %r7354, %r7293; + shf.l.wrap.b32 %r7356, %r7355, %r7355, 16; + add.s32 %r7357, %r7356, %r7280; + xor.b32 %r7358, %r7357, %r7324; + shf.l.wrap.b32 %r7359, %r7358, %r7358, 20; + add.s32 %r7360, %r7354, %r7036; + add.s32 %r7361, %r7360, %r7359; + xor.b32 %r7362, %r7361, %r7356; + shf.l.wrap.b32 %r7363, %r7362, %r7362, 24; + add.s32 %r7364, %r7363, %r7357; + xor.b32 %r7365, %r7364, %r7359; + shf.l.wrap.b32 %r7366, %r7365, %r7365, 25; + add.s32 %r7367, %r7319, %r6977; + add.s32 %r7368, %r7367, %r7282; + xor.b32 %r7369, %r7368, %r7307; + shf.l.wrap.b32 %r7370, %r7369, %r7369, 16; + add.s32 %r7371, %r7370, %r7294; + xor.b32 %r7372, %r7371, %r7282; + shf.l.wrap.b32 %r7373, %r7372, %r7372, 20; + add.s32 %r7374, %r7368, %r6921; + add.s32 %r7375, %r7374, %r7373; + xor.b32 %r7376, %r7375, %r7370; + shf.l.wrap.b32 %r7377, %r7376, %r7376, 24; + add.s32 %r7378, %r7377, %r7371; + xor.b32 %r7379, %r7378, %r7373; + shf.l.wrap.b32 %r7380, %r7379, %r7379, 25; + add.s32 %r7381, %r7333, %r6993; + add.s32 %r7382, %r7381, %r7380; + xor.b32 %r7383, %r7382, %r7349; + shf.l.wrap.b32 %r7384, %r7383, %r7383, 16; + add.s32 %r7385, %r7384, %r7364; + xor.b32 %r7386, %r7385, %r7380; + shf.l.wrap.b32 %r7387, %r7386, %r7386, 20; + add.s32 %r7388, %r7382, %r6969; + add.s32 %r7389, %r7388, %r7387; + xor.b32 %r7390, %r7389, %r7384; + shf.l.wrap.b32 %r7391, %r7390, %r7390, 24; + add.s32 %r7392, %r7391, %r7385; + xor.b32 %r7393, %r7392, %r7387; + shf.l.wrap.b32 %r7394, %r7393, %r7393, 25; + add.s32 %r7395, %r7347, %r7009; + add.s32 %r7396, %r7395, %r7338; + xor.b32 %r7397, %r7396, %r7363; + shf.l.wrap.b32 %r7398, %r7397, %r7397, 16; + add.s32 %r7399, %r7398, %r7378; + xor.b32 %r7400, %r7399, %r7338; + shf.l.wrap.b32 %r7401, %r7400, %r7400, 20; + add.s32 %r7402, %r7396, %r6985; + add.s32 %r7403, %r7402, %r7401; + xor.b32 %r7404, %r7403, %r7398; + shf.l.wrap.b32 %r7405, %r7404, %r7404, 24; + add.s32 %r7406, %r7405, %r7399; + xor.b32 %r7407, %r7406, %r7401; + shf.l.wrap.b32 %r7408, %r7407, %r7407, 25; + add.s32 %r7409, %r7361, %r7025; + add.s32 %r7410, %r7409, %r7352; + xor.b32 %r7411, %r7377, %r7410; + shf.l.wrap.b32 %r7412, %r7411, %r7411, 16; + add.s32 %r7413, %r7412, %r7336; + xor.b32 %r7414, %r7413, %r7352; + shf.l.wrap.b32 %r7415, %r7414, %r7414, 20; + add.s32 %r7416, %r7410, %r6937; + add.s32 %r7417, %r7416, %r7415; + xor.b32 %r7418, %r7417, %r7412; + shf.l.wrap.b32 %r7419, %r7418, %r7418, 24; + add.s32 %r7420, %r7419, %r7413; + xor.b32 %r7421, %r7420, %r7415; + shf.l.wrap.b32 %r7422, %r7421, %r7421, 25; 
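// --- annotation (not NVVM compiler output): BLAKE3 compression runs 7 such rounds,
// and the compiler has fully unrolled them: rather than reloading a message schedule,
// the same sixteen message-word registers (%r6913..%r7036 in this block) reappear in
// permuted order at the start of each successive round.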
+ add.s32 %r7423, %r7375, %r7017; + add.s32 %r7424, %r7423, %r7366; + xor.b32 %r7425, %r7424, %r7335; + shf.l.wrap.b32 %r7426, %r7425, %r7425, 16; + add.s32 %r7427, %r7426, %r7350; + xor.b32 %r7428, %r7427, %r7366; + shf.l.wrap.b32 %r7429, %r7428, %r7428, 20; + add.s32 %r7430, %r7424, %r7036; + add.s32 %r7431, %r7430, %r7429; + xor.b32 %r7432, %r7431, %r7426; + shf.l.wrap.b32 %r7433, %r7432, %r7432, 24; + add.s32 %r7434, %r7433, %r7427; + xor.b32 %r7435, %r7434, %r7429; + shf.l.wrap.b32 %r7436, %r7435, %r7435, 25; + add.s32 %r7437, %r7389, %r6945; + add.s32 %r7438, %r7437, %r7408; + xor.b32 %r7439, %r7438, %r7433; + shf.l.wrap.b32 %r7440, %r7439, %r7439, 16; + add.s32 %r7441, %r7440, %r7420; + xor.b32 %r7442, %r7441, %r7408; + shf.l.wrap.b32 %r7443, %r7442, %r7442, 20; + add.s32 %r7444, %r7438, %r6913; + add.s32 %r7445, %r7444, %r7443; + xor.b32 %r7446, %r7445, %r7440; + shf.l.wrap.b32 %r7447, %r7446, %r7446, 24; + add.s32 %r7448, %r7447, %r7441; + xor.b32 %r7449, %r7448, %r7443; + shf.l.wrap.b32 %r7450, %r7449, %r7449, 25; + add.s32 %r7451, %r7403, %r7001; + add.s32 %r7452, %r7451, %r7422; + xor.b32 %r7453, %r7391, %r7452; + shf.l.wrap.b32 %r7454, %r7453, %r7453, 16; + add.s32 %r7455, %r7454, %r7434; + xor.b32 %r7456, %r7455, %r7422; + shf.l.wrap.b32 %r7457, %r7456, %r7456, 20; + add.s32 %r7458, %r7452, %r6929; + add.s32 %r7459, %r7458, %r7457; + xor.b32 %r7460, %r7459, %r7454; + shf.l.wrap.b32 %r7461, %r7460, %r7460, 24; + add.s32 %r7462, %r7461, %r7455; + xor.b32 %r7463, %r7462, %r7457; + shf.l.wrap.b32 %r7464, %r7463, %r7463, 25; + add.s32 %r7465, %r7417, %r6953; + add.s32 %r7466, %r7465, %r7436; + xor.b32 %r7467, %r7466, %r7405; + shf.l.wrap.b32 %r7468, %r7467, %r7467, 16; + add.s32 %r7469, %r7468, %r7392; + xor.b32 %r7470, %r7469, %r7436; + shf.l.wrap.b32 %r7471, %r7470, %r7470, 20; + add.s32 %r7472, %r7466, %r6977; + add.s32 %r7473, %r7472, %r7471; + xor.b32 %r7474, %r7473, %r7468; + shf.l.wrap.b32 %r7475, %r7474, %r7474, 24; + add.s32 %r7476, %r7475, %r7469; + xor.b32 %r7477, %r7476, %r7471; + shf.l.wrap.b32 %r7478, %r7477, %r7477, 25; + add.s32 %r7479, %r7431, %r6921; + add.s32 %r7480, %r7479, %r7394; + xor.b32 %r7481, %r7480, %r7419; + shf.l.wrap.b32 %r7482, %r7481, %r7481, 16; + add.s32 %r7483, %r7482, %r7406; + xor.b32 %r7484, %r7483, %r7394; + shf.l.wrap.b32 %r7485, %r7484, %r7484, 20; + add.s32 %r7486, %r7480, %r6961; + add.s32 %r7487, %r7486, %r7485; + xor.b32 %r7488, %r7487, %r7482; + shf.l.wrap.b32 %r7489, %r7488, %r7488, 24; + add.s32 %r7490, %r7489, %r7483; + xor.b32 %r7491, %r7490, %r7485; + shf.l.wrap.b32 %r7492, %r7491, %r7491, 25; + add.s32 %r7493, %r7445, %r7009; + add.s32 %r7494, %r7493, %r7492; + xor.b32 %r7495, %r7494, %r7461; + shf.l.wrap.b32 %r7496, %r7495, %r7495, 16; + add.s32 %r7497, %r7496, %r7476; + xor.b32 %r7498, %r7497, %r7492; + shf.l.wrap.b32 %r7499, %r7498, %r7498, 20; + add.s32 %r7500, %r7494, %r7017; + add.s32 %r7501, %r7500, %r7499; + xor.b32 %r7502, %r7501, %r7496; + shf.l.wrap.b32 %r7503, %r7502, %r7502, 24; + add.s32 %r7504, %r7503, %r7497; + xor.b32 %r7505, %r7504, %r7499; + shf.l.wrap.b32 %r7506, %r7505, %r7505, 25; + add.s32 %r7507, %r7459, %r6985; + add.s32 %r7508, %r7507, %r7450; + xor.b32 %r7509, %r7508, %r7475; + shf.l.wrap.b32 %r7510, %r7509, %r7509, 16; + add.s32 %r7511, %r7510, %r7490; + xor.b32 %r7512, %r7511, %r7450; + shf.l.wrap.b32 %r7513, %r7512, %r7512, 20; + add.s32 %r7514, %r7508, %r7001; + add.s32 %r7515, %r7514, %r7513; + xor.b32 %r7516, %r7515, %r7510; + shf.l.wrap.b32 %r7517, %r7516, %r7516, 24; + add.s32 %r7518, %r7517, 
%r7511; + xor.b32 %r7519, %r7518, %r7513; + shf.l.wrap.b32 %r7520, %r7519, %r7519, 25; + add.s32 %r7521, %r7473, %r7036; + add.s32 %r7522, %r7521, %r7464; + xor.b32 %r7523, %r7489, %r7522; + shf.l.wrap.b32 %r7524, %r7523, %r7523, 16; + add.s32 %r7525, %r7524, %r7448; + xor.b32 %r7526, %r7525, %r7464; + shf.l.wrap.b32 %r7527, %r7526, %r7526, 20; + add.s32 %r7528, %r7522, %r6993; + add.s32 %r7529, %r7528, %r7527; + xor.b32 %r7530, %r7529, %r7524; + shf.l.wrap.b32 %r7531, %r7530, %r7530, 24; + add.s32 %r7532, %r7531, %r7525; + xor.b32 %r7533, %r7532, %r7527; + shf.l.wrap.b32 %r7534, %r7533, %r7533, 25; + add.s32 %r7535, %r7487, %r7025; + add.s32 %r7536, %r7535, %r7478; + xor.b32 %r7537, %r7536, %r7447; + shf.l.wrap.b32 %r7538, %r7537, %r7537, 16; + add.s32 %r7539, %r7538, %r7462; + xor.b32 %r7540, %r7539, %r7478; + shf.l.wrap.b32 %r7541, %r7540, %r7540, 20; + add.s32 %r7542, %r7536, %r6977; + add.s32 %r7543, %r7542, %r7541; + xor.b32 %r7544, %r7543, %r7538; + shf.l.wrap.b32 %r7545, %r7544, %r7544, 24; + add.s32 %r7546, %r7545, %r7539; + xor.b32 %r7547, %r7546, %r7541; + shf.l.wrap.b32 %r7548, %r7547, %r7547, 25; + add.s32 %r7549, %r7501, %r6969; + add.s32 %r7550, %r7549, %r7520; + xor.b32 %r7551, %r7550, %r7545; + shf.l.wrap.b32 %r7552, %r7551, %r7551, 16; + add.s32 %r7553, %r7552, %r7532; + xor.b32 %r7554, %r7553, %r7520; + shf.l.wrap.b32 %r7555, %r7554, %r7554, 20; + add.s32 %r7556, %r7550, %r6929; + add.s32 %r7557, %r7556, %r7555; + xor.b32 %r7558, %r7557, %r7552; + shf.l.wrap.b32 %r7559, %r7558, %r7558, 24; + add.s32 %r7560, %r7559, %r7553; + xor.b32 %r7561, %r7560, %r7555; + shf.l.wrap.b32 %r7562, %r7561, %r7561, 25; + add.s32 %r7563, %r7515, %r6953; + add.s32 %r7564, %r7563, %r7534; + xor.b32 %r7565, %r7503, %r7564; + shf.l.wrap.b32 %r7566, %r7565, %r7565, 16; + add.s32 %r7567, %r7566, %r7546; + xor.b32 %r7568, %r7567, %r7534; + shf.l.wrap.b32 %r7569, %r7568, %r7568, 20; + add.s32 %r7570, %r7564, %r6937; + add.s32 %r7571, %r7570, %r7569; + xor.b32 %r7572, %r7571, %r7566; + shf.l.wrap.b32 %r7573, %r7572, %r7572, 24; + add.s32 %r7574, %r7573, %r7567; + xor.b32 %r7575, %r7574, %r7569; + shf.l.wrap.b32 %r7576, %r7575, %r7575, 25; + add.s32 %r7577, %r7529, %r6913; + add.s32 %r7578, %r7577, %r7548; + xor.b32 %r7579, %r7578, %r7517; + shf.l.wrap.b32 %r7580, %r7579, %r7579, 16; + add.s32 %r7581, %r7580, %r7504; + xor.b32 %r7582, %r7581, %r7548; + shf.l.wrap.b32 %r7583, %r7582, %r7582, 20; + add.s32 %r7584, %r7578, %r6921; + add.s32 %r7585, %r7584, %r7583; + xor.b32 %r7586, %r7585, %r7580; + shf.l.wrap.b32 %r7587, %r7586, %r7586, 24; + add.s32 %r7588, %r7587, %r7581; + xor.b32 %r7589, %r7588, %r7583; + shf.l.wrap.b32 %r7590, %r7589, %r7589, 25; + add.s32 %r7591, %r7543, %r6961; + add.s32 %r7592, %r7591, %r7506; + xor.b32 %r7593, %r7592, %r7531; + shf.l.wrap.b32 %r7594, %r7593, %r7593, 16; + add.s32 %r7595, %r7594, %r7518; + xor.b32 %r7596, %r7595, %r7506; + shf.l.wrap.b32 %r7597, %r7596, %r7596, 20; + add.s32 %r7598, %r7592, %r6945; + add.s32 %r7599, %r7598, %r7597; + xor.b32 %r7600, %r7599, %r7594; + shf.l.wrap.b32 %r7601, %r7600, %r7600, 24; + add.s32 %r7602, %r7601, %r7595; + xor.b32 %r7603, %r7602, %r7597; + shf.l.wrap.b32 %r7604, %r7603, %r7603, 25; + add.s32 %r7605, %r7557, %r6985; + add.s32 %r7606, %r7605, %r7604; + xor.b32 %r7607, %r7606, %r7573; + shf.l.wrap.b32 %r7608, %r7607, %r7607, 16; + add.s32 %r7609, %r7608, %r7588; + xor.b32 %r7610, %r7609, %r7604; + shf.l.wrap.b32 %r7611, %r7610, %r7610, 20; + add.s32 %r7612, %r7606, %r7025; + add.s32 %r7613, %r7612, %r7611; + xor.b32 %r7614, 
%r7613, %r7608; + shf.l.wrap.b32 %r7615, %r7614, %r7614, 24; + add.s32 %r7616, %r7615, %r7609; + xor.b32 %r7617, %r7616, %r7611; + shf.l.wrap.b32 %r7618, %r7617, %r7617, 25; + add.s32 %r7619, %r7571, %r7001; + add.s32 %r7620, %r7619, %r7562; + xor.b32 %r7621, %r7620, %r7587; + shf.l.wrap.b32 %r7622, %r7621, %r7621, 16; + add.s32 %r7623, %r7622, %r7602; + xor.b32 %r7624, %r7623, %r7562; + shf.l.wrap.b32 %r7625, %r7624, %r7624, 20; + add.s32 %r7626, %r7620, %r6953; + add.s32 %r7627, %r7626, %r7625; + xor.b32 %r7628, %r7627, %r7622; + shf.l.wrap.b32 %r7629, %r7628, %r7628, 24; + add.s32 %r7630, %r7629, %r7623; + xor.b32 %r7631, %r7630, %r7625; + shf.l.wrap.b32 %r7632, %r7631, %r7631, 25; + add.s32 %r7633, %r7585, %r6977; + add.s32 %r7634, %r7633, %r7576; + xor.b32 %r7635, %r7601, %r7634; + shf.l.wrap.b32 %r7636, %r7635, %r7635, 16; + add.s32 %r7637, %r7636, %r7560; + xor.b32 %r7638, %r7637, %r7576; + shf.l.wrap.b32 %r7639, %r7638, %r7638, 20; + add.s32 %r7640, %r7634, %r7009; + add.s32 %r7641, %r7640, %r7639; + xor.b32 %r7642, %r7641, %r7636; + shf.l.wrap.b32 %r7643, %r7642, %r7642, 24; + add.s32 %r7644, %r7643, %r7637; + xor.b32 %r7645, %r7644, %r7639; + shf.l.wrap.b32 %r7646, %r7645, %r7645, 25; + add.s32 %r7647, %r7599, %r7036; + add.s32 %r7648, %r7647, %r7590; + xor.b32 %r7649, %r7648, %r7559; + shf.l.wrap.b32 %r7650, %r7649, %r7649, 16; + add.s32 %r7651, %r7650, %r7574; + xor.b32 %r7652, %r7651, %r7590; + shf.l.wrap.b32 %r7653, %r7652, %r7652, 20; + add.s32 %r7654, %r7648, %r6921; + add.s32 %r7655, %r7654, %r7653; + xor.b32 %r7656, %r7655, %r7650; + shf.l.wrap.b32 %r7657, %r7656, %r7656, 24; + add.s32 %r7658, %r7657, %r7651; + xor.b32 %r7659, %r7658, %r7653; + shf.l.wrap.b32 %r7660, %r7659, %r7659, 25; + add.s32 %r7661, %r7613, %r7017; + add.s32 %r7662, %r7661, %r7632; + xor.b32 %r7663, %r7662, %r7657; + shf.l.wrap.b32 %r7664, %r7663, %r7663, 16; + add.s32 %r7665, %r7664, %r7644; + xor.b32 %r7666, %r7665, %r7632; + shf.l.wrap.b32 %r7667, %r7666, %r7666, 20; + add.s32 %r7668, %r7662, %r6937; + add.s32 %r7669, %r7668, %r7667; + xor.b32 %r7670, %r7669, %r7664; + shf.l.wrap.b32 %r7671, %r7670, %r7670, 24; + add.s32 %r7672, %r7671, %r7665; + xor.b32 %r7673, %r7672, %r7667; + shf.l.wrap.b32 %r7674, %r7673, %r7673, 25; + add.s32 %r7675, %r7627, %r6913; + add.s32 %r7676, %r7675, %r7646; + xor.b32 %r7677, %r7615, %r7676; + shf.l.wrap.b32 %r7678, %r7677, %r7677, 16; + add.s32 %r7679, %r7678, %r7658; + xor.b32 %r7680, %r7679, %r7646; + shf.l.wrap.b32 %r7681, %r7680, %r7680, 20; + add.s32 %r7682, %r7676, %r6993; + add.s32 %r7683, %r7682, %r7681; + xor.b32 %r7684, %r7683, %r7678; + shf.l.wrap.b32 %r7685, %r7684, %r7684, 24; + add.s32 %r7686, %r7685, %r7679; + xor.b32 %r7687, %r7686, %r7681; + shf.l.wrap.b32 %r7688, %r7687, %r7687, 25; + add.s32 %r7689, %r7641, %r6929; + add.s32 %r7690, %r7689, %r7660; + xor.b32 %r7691, %r7690, %r7629; + shf.l.wrap.b32 %r7692, %r7691, %r7691, 16; + add.s32 %r7693, %r7692, %r7616; + xor.b32 %r7694, %r7693, %r7660; + shf.l.wrap.b32 %r7695, %r7694, %r7694, 20; + add.s32 %r7696, %r7690, %r6961; + add.s32 %r7697, %r7696, %r7695; + xor.b32 %r7698, %r7697, %r7692; + shf.l.wrap.b32 %r7699, %r7698, %r7698, 24; + add.s32 %r7700, %r7699, %r7693; + xor.b32 %r7701, %r7700, %r7695; + shf.l.wrap.b32 %r7702, %r7701, %r7701, 25; + add.s32 %r7703, %r7655, %r6945; + add.s32 %r7704, %r7703, %r7618; + xor.b32 %r7705, %r7704, %r7643; + shf.l.wrap.b32 %r7706, %r7705, %r7705, 16; + add.s32 %r7707, %r7706, %r7630; + xor.b32 %r7708, %r7707, %r7618; + shf.l.wrap.b32 %r7709, %r7708, %r7708, 20; 
+ add.s32 %r7710, %r7704, %r6969; + add.s32 %r7711, %r7710, %r7709; + xor.b32 %r7712, %r7711, %r7706; + shf.l.wrap.b32 %r7713, %r7712, %r7712, 24; + add.s32 %r7714, %r7713, %r7707; + xor.b32 %r7715, %r7714, %r7709; + shf.l.wrap.b32 %r7716, %r7715, %r7715, 25; + add.s32 %r7717, %r7669, %r7001; + add.s32 %r7718, %r7717, %r7716; + xor.b32 %r7719, %r7718, %r7685; + shf.l.wrap.b32 %r7720, %r7719, %r7719, 16; + add.s32 %r7721, %r7720, %r7700; + xor.b32 %r7722, %r7721, %r7716; + shf.l.wrap.b32 %r7723, %r7722, %r7722, 20; + add.s32 %r7724, %r7718, %r7036; + add.s32 %r7725, %r7724, %r7723; + xor.b32 %r7726, %r7725, %r7720; + shf.l.wrap.b32 %r7727, %r7726, %r7726, 24; + add.s32 %r7728, %r7727, %r7721; + xor.b32 %r7729, %r7728, %r7723; + shf.l.wrap.b32 %r7730, %r7729, %r7729, 25; + add.s32 %r7731, %r7683, %r6953; + add.s32 %r7732, %r7731, %r7674; + xor.b32 %r7733, %r7732, %r7699; + shf.l.wrap.b32 %r7734, %r7733, %r7733, 16; + add.s32 %r7735, %r7734, %r7714; + xor.b32 %r7736, %r7735, %r7674; + shf.l.wrap.b32 %r7737, %r7736, %r7736, 20; + add.s32 %r7738, %r7732, %r6913; + add.s32 %r7739, %r7738, %r7737; + xor.b32 %r7740, %r7739, %r7734; + shf.l.wrap.b32 %r7741, %r7740, %r7740, 24; + add.s32 %r7742, %r7741, %r7735; + xor.b32 %r7743, %r7742, %r7737; + shf.l.wrap.b32 %r7744, %r7743, %r7743, 25; + add.s32 %r7745, %r7697, %r6921; + add.s32 %r7746, %r7745, %r7688; + xor.b32 %r7747, %r7713, %r7746; + shf.l.wrap.b32 %r7748, %r7747, %r7747, 16; + add.s32 %r7749, %r7748, %r7672; + xor.b32 %r7750, %r7749, %r7688; + shf.l.wrap.b32 %r7751, %r7750, %r7750, 20; + add.s32 %r7752, %r7746, %r6985; + add.s32 %r7753, %r7752, %r7751; + xor.b32 %r7754, %r7753, %r7748; + shf.l.wrap.b32 %r7755, %r7754, %r7754, 24; + add.s32 %r7756, %r7755, %r7749; + xor.b32 %r7757, %r7756, %r7751; + shf.l.wrap.b32 %r7758, %r7757, %r7757, 25; + add.s32 %r7759, %r7711, %r6977; + add.s32 %r7760, %r7759, %r7702; + xor.b32 %r7761, %r7760, %r7671; + shf.l.wrap.b32 %r7762, %r7761, %r7761, 16; + add.s32 %r7763, %r7762, %r7686; + xor.b32 %r7764, %r7763, %r7702; + shf.l.wrap.b32 %r7765, %r7764, %r7764, 20; + add.s32 %r7766, %r7760, %r6961; + add.s32 %r7767, %r7766, %r7765; + xor.b32 %r7768, %r7767, %r7762; + shf.l.wrap.b32 %r7769, %r7768, %r7768, 24; + add.s32 %r7770, %r7769, %r7763; + xor.b32 %r7771, %r7770, %r7765; + shf.l.wrap.b32 %r7772, %r7771, %r7771, 25; + add.s32 %r7773, %r7725, %r7025; + add.s32 %r7774, %r7773, %r7744; + xor.b32 %r7775, %r7774, %r7769; + shf.l.wrap.b32 %r7776, %r7775, %r7775, 16; + add.s32 %r7777, %r7776, %r7756; + xor.b32 %r7778, %r7777, %r7744; + shf.l.wrap.b32 %r7779, %r7778, %r7778, 20; + add.s32 %r7780, %r7774, %r6993; + add.s32 %r7781, %r7780, %r7779; + xor.b32 %r7782, %r7781, %r7776; + shf.l.wrap.b32 %r7783, %r7782, %r7782, 24; + add.s32 %r7784, %r7783, %r7777; + xor.b32 %r7785, %r7784, %r7779; + shf.l.wrap.b32 %r7786, %r7785, %r7785, 25; + add.s32 %r7787, %r7739, %r6929; + add.s32 %r7788, %r7787, %r7758; + xor.b32 %r7789, %r7727, %r7788; + shf.l.wrap.b32 %r7790, %r7789, %r7789, 16; + add.s32 %r7791, %r7790, %r7770; + xor.b32 %r7792, %r7791, %r7758; + shf.l.wrap.b32 %r7793, %r7792, %r7792, 20; + add.s32 %r7794, %r7788, %r7009; + add.s32 %r7795, %r7794, %r7793; + xor.b32 %r7796, %r7795, %r7790; + shf.l.wrap.b32 %r7797, %r7796, %r7796, 24; + add.s32 %r7798, %r7797, %r7791; + xor.b32 %r7799, %r7798, %r7793; + shf.l.wrap.b32 %r7800, %r7799, %r7799, 25; + add.s32 %r7801, %r7753, %r6937; + add.s32 %r7802, %r7801, %r7772; + xor.b32 %r7803, %r7802, %r7741; + shf.l.wrap.b32 %r7804, %r7803, %r7803, 16; + add.s32 %r7805, %r7804, 
%r7728; + xor.b32 %r7806, %r7805, %r7772; + shf.l.wrap.b32 %r7807, %r7806, %r7806, 20; + add.s32 %r7808, %r7802, %r6945; + add.s32 %r7809, %r7808, %r7807; + xor.b32 %r7810, %r7809, %r7804; + shf.l.wrap.b32 %r7811, %r7810, %r7810, 24; + add.s32 %r7812, %r7811, %r7805; + xor.b32 %r7813, %r7812, %r7807; + shf.l.wrap.b32 %r7814, %r7813, %r7813, 25; + add.s32 %r7815, %r7767, %r6969; + add.s32 %r7816, %r7815, %r7730; + xor.b32 %r7817, %r7816, %r7755; + shf.l.wrap.b32 %r7818, %r7817, %r7817, 16; + add.s32 %r7819, %r7818, %r7742; + xor.b32 %r7820, %r7819, %r7730; + shf.l.wrap.b32 %r7821, %r7820, %r7820, 20; + add.s32 %r7822, %r7816, %r7017; + add.s32 %r7823, %r7822, %r7821; + xor.b32 %r7824, %r7823, %r7818; + shf.l.wrap.b32 %r7825, %r7824, %r7824, 24; + add.s32 %r7826, %r7825, %r7819; + xor.b32 %r7827, %r7826, %r7821; + shf.l.wrap.b32 %r7828, %r7827, %r7827, 25; + xor.b32 %r97, %r7812, %r7781; + xor.b32 %r98, %r7826, %r7795; + xor.b32 %r99, %r7784, %r7809; + xor.b32 %r100, %r7823, %r7798; + xor.b32 %r101, %r7828, %r7797; + xor.b32 %r102, %r7786, %r7811; + xor.b32 %r103, %r7825, %r7800; + xor.b32 %r104, %r7814, %r7783; + ld.local.u8 %rs327, [%rd3+8]; + cvt.u64.u16 %rd189, %rs327; + popc.b64 %r7829, %rd251; + cvt.u64.u32 %rd64, %r7829; + setp.ge.u64 %p37, %rd64, %rd189; + mul.wide.u16 %r11681, %rs327, 32; + @%p37 bra $L__BB1_46; + +$L__BB1_45: + popc.b64 %r11649, %rd251; + cvt.u64.u32 %rd230, %r11649; + add.s32 %r7830, %r11681, -64; + cvt.s64.s32 %rd190, %r7830; + add.s64 %rd191, %rd2, %rd190; + ld.local.u8 %r7831, [%rd3+2]; + ld.local.u8 %r7832, [%rd191+145]; + ld.local.u8 %r7833, [%rd191+146]; + prmt.b32 %r7834, %r7833, %r7832, 30212; + ld.local.u8 %r7835, [%rd191+147]; + prmt.b32 %r7836, %r7835, %r7834, 28756; + ld.local.u8 %r7837, [%rd191+148]; + prmt.b32 %r7838, %r7837, %r7836, 1620; + ld.local.u8 %r7839, [%rd191+149]; + ld.local.u8 %r7840, [%rd191+150]; + prmt.b32 %r7841, %r7840, %r7839, 30212; + ld.local.u8 %r7842, [%rd191+151]; + prmt.b32 %r7843, %r7842, %r7841, 28756; + ld.local.u8 %r7844, [%rd191+152]; + prmt.b32 %r7845, %r7844, %r7843, 1620; + ld.local.u8 %r7846, [%rd191+153]; + ld.local.u8 %r7847, [%rd191+154]; + prmt.b32 %r7848, %r7847, %r7846, 30212; + ld.local.u8 %r7849, [%rd191+155]; + prmt.b32 %r7850, %r7849, %r7848, 28756; + ld.local.u8 %r7851, [%rd191+156]; + prmt.b32 %r7852, %r7851, %r7850, 1620; + ld.local.u8 %r7853, [%rd191+157]; + ld.local.u8 %r7854, [%rd191+158]; + prmt.b32 %r7855, %r7854, %r7853, 30212; + ld.local.u8 %r7856, [%rd191+159]; + prmt.b32 %r7857, %r7856, %r7855, 28756; + ld.local.u8 %r7858, [%rd191+160]; + prmt.b32 %r7859, %r7858, %r7857, 1620; + ld.local.u8 %r7860, [%rd191+161]; + ld.local.u8 %r7861, [%rd191+162]; + prmt.b32 %r7862, %r7861, %r7860, 30212; + ld.local.u8 %r7863, [%rd191+163]; + prmt.b32 %r7864, %r7863, %r7862, 28756; + ld.local.u8 %r7865, [%rd191+164]; + prmt.b32 %r7866, %r7865, %r7864, 1620; + ld.local.u8 %r7867, [%rd191+165]; + ld.local.u8 %r7868, [%rd191+166]; + prmt.b32 %r7869, %r7868, %r7867, 30212; + ld.local.u8 %r7870, [%rd191+167]; + prmt.b32 %r7871, %r7870, %r7869, 28756; + ld.local.u8 %r7872, [%rd191+168]; + prmt.b32 %r7873, %r7872, %r7871, 1620; + ld.local.u8 %r7874, [%rd191+169]; + ld.local.u8 %r7875, [%rd191+170]; + prmt.b32 %r7876, %r7875, %r7874, 30212; + ld.local.u8 %r7877, [%rd191+171]; + prmt.b32 %r7878, %r7877, %r7876, 28756; + ld.local.u8 %r7879, [%rd191+172]; + prmt.b32 %r7880, %r7879, %r7878, 1620; + ld.local.u8 %r7881, [%rd191+173]; + ld.local.u8 %r7882, [%rd191+174]; + prmt.b32 %r7883, %r7882, %r7881, 30212; + ld.local.u8 
%r7884, [%rd191+175]; + prmt.b32 %r7885, %r7884, %r7883, 28756; + ld.local.u8 %r7886, [%rd191+176]; + prmt.b32 %r7887, %r7886, %r7885, 1620; + ld.local.u8 %r7888, [%rd191+177]; + ld.local.u8 %r7889, [%rd191+178]; + prmt.b32 %r7890, %r7889, %r7888, 30212; + ld.local.u8 %r7891, [%rd191+179]; + prmt.b32 %r7892, %r7891, %r7890, 28756; + ld.local.u8 %r7893, [%rd191+180]; + prmt.b32 %r7894, %r7893, %r7892, 1620; + ld.local.u8 %r7895, [%rd191+181]; + ld.local.u8 %r7896, [%rd191+182]; + prmt.b32 %r7897, %r7896, %r7895, 30212; + ld.local.u8 %r7898, [%rd191+183]; + prmt.b32 %r7899, %r7898, %r7897, 28756; + ld.local.u8 %r7900, [%rd191+184]; + prmt.b32 %r7901, %r7900, %r7899, 1620; + ld.local.u8 %r7902, [%rd191+185]; + ld.local.u8 %r7903, [%rd191+186]; + prmt.b32 %r7904, %r7903, %r7902, 30212; + ld.local.u8 %r7905, [%rd191+187]; + prmt.b32 %r7906, %r7905, %r7904, 28756; + ld.local.u8 %r7907, [%rd191+188]; + prmt.b32 %r7908, %r7907, %r7906, 1620; + ld.local.u8 %r7909, [%rd191+189]; + ld.local.u8 %r7910, [%rd191+190]; + prmt.b32 %r7911, %r7910, %r7909, 30212; + ld.local.u8 %r7912, [%rd191+191]; + prmt.b32 %r7913, %r7912, %r7911, 28756; + ld.local.u8 %r7914, [%rd191+192]; + prmt.b32 %r7915, %r7914, %r7913, 1620; + ld.local.u8 %r7916, [%rd191+193]; + ld.local.u8 %r7917, [%rd191+194]; + prmt.b32 %r7918, %r7917, %r7916, 30212; + ld.local.u8 %r7919, [%rd191+195]; + prmt.b32 %r7920, %r7919, %r7918, 28756; + ld.local.u8 %r7921, [%rd191+196]; + prmt.b32 %r7922, %r7921, %r7920, 1620; + ld.local.u8 %r7923, [%rd191+197]; + ld.local.u8 %r7924, [%rd191+198]; + prmt.b32 %r7925, %r7924, %r7923, 30212; + ld.local.u8 %r7926, [%rd191+199]; + prmt.b32 %r7927, %r7926, %r7925, 28756; + ld.local.u8 %r7928, [%rd191+200]; + prmt.b32 %r7929, %r7928, %r7927, 1620; + ld.local.u8 %r7930, [%rd191+201]; + ld.local.u8 %r7931, [%rd191+202]; + prmt.b32 %r7932, %r7931, %r7930, 30212; + ld.local.u8 %r7933, [%rd191+203]; + prmt.b32 %r7934, %r7933, %r7932, 28756; + ld.local.u8 %r7935, [%rd191+204]; + prmt.b32 %r7936, %r7935, %r7934, 1620; + ld.local.u8 %r7937, [%rd191+205]; + ld.local.u8 %r7938, [%rd191+206]; + prmt.b32 %r7939, %r7938, %r7937, 30212; + ld.local.u8 %r7940, [%rd191+207]; + prmt.b32 %r7941, %r7940, %r7939, 28756; + ld.local.u8 %r7942, [%rd191+208]; + prmt.b32 %r7943, %r7942, %r7941, 1620; + or.b32 %r7944, %r7831, 4; + ld.local.u8 %r7945, [%rd3+-120]; + ld.local.u8 %r7946, [%rd3+-119]; + prmt.b32 %r7947, %r7946, %r7945, 30212; + ld.local.u8 %r7948, [%rd3+-118]; + ld.local.u8 %r7949, [%rd3+-117]; + prmt.b32 %r7950, %r7949, %r7948, 30212; + prmt.b32 %r7951, %r7950, %r7947, 4180; + ld.local.u8 %r7952, [%rd3+-136]; + ld.local.u8 %r7953, [%rd3+-135]; + prmt.b32 %r7954, %r7953, %r7952, 30212; + ld.local.u8 %r7955, [%rd3+-134]; + ld.local.u8 %r7956, [%rd3+-133]; + prmt.b32 %r7957, %r7956, %r7955, 30212; + prmt.b32 %r7958, %r7957, %r7954, 4180; + add.s32 %r7959, %r7951, %r7958; + add.s32 %r7960, %r7959, %r7838; + shf.l.wrap.b32 %r7961, %r7960, %r7960, 16; + add.s32 %r7962, %r7961, 1779033703; + xor.b32 %r7963, %r7962, %r7951; + shf.l.wrap.b32 %r7964, %r7963, %r7963, 20; + add.s32 %r7965, %r7845, %r7960; + add.s32 %r7966, %r7965, %r7964; + xor.b32 %r7967, %r7966, %r7961; + shf.l.wrap.b32 %r7968, %r7967, %r7967, 24; + add.s32 %r7969, %r7968, %r7962; + xor.b32 %r7970, %r7969, %r7964; + shf.l.wrap.b32 %r7971, %r7970, %r7970, 25; + ld.local.u8 %r7972, [%rd3+-116]; + ld.local.u8 %r7973, [%rd3+-115]; + prmt.b32 %r7974, %r7973, %r7972, 30212; + ld.local.u8 %r7975, [%rd3+-114]; + ld.local.u8 %r7976, [%rd3+-113]; + prmt.b32 %r7977, %r7976, 
%r7975, 30212; + prmt.b32 %r7978, %r7977, %r7974, 4180; + ld.local.u8 %r7979, [%rd3+-132]; + ld.local.u8 %r7980, [%rd3+-131]; + prmt.b32 %r7981, %r7980, %r7979, 30212; + ld.local.u8 %r7982, [%rd3+-130]; + ld.local.u8 %r7983, [%rd3+-129]; + prmt.b32 %r7984, %r7983, %r7982, 30212; + prmt.b32 %r7985, %r7984, %r7981, 4180; + add.s32 %r7986, %r7978, %r7985; + add.s32 %r7987, %r7986, %r7852; + shf.l.wrap.b32 %r7988, %r7987, %r7987, 16; + add.s32 %r7989, %r7988, -1150833019; + xor.b32 %r7990, %r7989, %r7978; + shf.l.wrap.b32 %r7991, %r7990, %r7990, 20; + add.s32 %r7992, %r7859, %r7987; + add.s32 %r7993, %r7992, %r7991; + xor.b32 %r7994, %r7993, %r7988; + shf.l.wrap.b32 %r7995, %r7994, %r7994, 24; + add.s32 %r7996, %r7995, %r7989; + xor.b32 %r7997, %r7996, %r7991; + shf.l.wrap.b32 %r7998, %r7997, %r7997, 25; + ld.local.u8 %r7999, [%rd3+-112]; + ld.local.u8 %r8000, [%rd3+-111]; + prmt.b32 %r8001, %r8000, %r7999, 30212; + ld.local.u8 %r8002, [%rd3+-110]; + ld.local.u8 %r8003, [%rd3+-109]; + prmt.b32 %r8004, %r8003, %r8002, 30212; + prmt.b32 %r8005, %r8004, %r8001, 4180; + ld.local.u8 %r8006, [%rd3+-128]; + ld.local.u8 %r8007, [%rd3+-127]; + prmt.b32 %r8008, %r8007, %r8006, 30212; + ld.local.u8 %r8009, [%rd3+-126]; + ld.local.u8 %r8010, [%rd3+-125]; + prmt.b32 %r8011, %r8010, %r8009, 30212; + prmt.b32 %r8012, %r8011, %r8008, 4180; + add.s32 %r8013, %r8005, %r8012; + add.s32 %r8014, %r8013, %r7866; + shr.u32 %r8015, %r8014, 16; + shl.b32 %r8016, %r8014, 16; + xor.b32 %r8017, %r8016, 4194304; + or.b32 %r8018, %r8017, %r8015; + add.s32 %r8019, %r8018, 1013904242; + xor.b32 %r8020, %r8019, %r8005; + shf.l.wrap.b32 %r8021, %r8020, %r8020, 20; + add.s32 %r8022, %r7873, %r8014; + add.s32 %r8023, %r8022, %r8021; + xor.b32 %r8024, %r8023, %r8018; + shf.l.wrap.b32 %r8025, %r8024, %r8024, 24; + add.s32 %r8026, %r8025, %r8019; + xor.b32 %r8027, %r8026, %r8021; + shf.l.wrap.b32 %r8028, %r8027, %r8027, 25; + ld.local.u8 %r8029, [%rd3+-108]; + ld.local.u8 %r8030, [%rd3+-107]; + prmt.b32 %r8031, %r8030, %r8029, 30212; + ld.local.u8 %r8032, [%rd3+-106]; + ld.local.u8 %r8033, [%rd3+-105]; + prmt.b32 %r8034, %r8033, %r8032, 30212; + prmt.b32 %r8035, %r8034, %r8031, 4180; + ld.local.u8 %r8036, [%rd3+-124]; + ld.local.u8 %r8037, [%rd3+-123]; + prmt.b32 %r8038, %r8037, %r8036, 30212; + ld.local.u8 %r8039, [%rd3+-122]; + ld.local.u8 %r8040, [%rd3+-121]; + prmt.b32 %r8041, %r8040, %r8039, 30212; + prmt.b32 %r8042, %r8041, %r8038, 4180; + add.s32 %r8043, %r8035, %r8042; + add.s32 %r8044, %r8043, %r7880; + xor.b32 %r8045, %r8044, %r7944; + shr.u32 %r8046, %r8044, 16; + shl.b32 %r8047, %r8045, 16; + or.b32 %r8048, %r8047, %r8046; + add.s32 %r8049, %r8048, -1521486534; + xor.b32 %r8050, %r8049, %r8035; + shf.l.wrap.b32 %r8051, %r8050, %r8050, 20; + add.s32 %r8052, %r7887, %r8044; + add.s32 %r8053, %r8052, %r8051; + xor.b32 %r8054, %r8053, %r8048; + shf.l.wrap.b32 %r8055, %r8054, %r8054, 24; + add.s32 %r8056, %r8055, %r8049; + xor.b32 %r8057, %r8056, %r8051; + shf.l.wrap.b32 %r8058, %r8057, %r8057, 25; + add.s32 %r8059, %r7998, %r7966; + add.s32 %r8060, %r8059, %r7894; + xor.b32 %r8061, %r8055, %r8060; + shf.l.wrap.b32 %r8062, %r8061, %r8061, 16; + add.s32 %r8063, %r8062, %r8026; + xor.b32 %r8064, %r8063, %r7998; + shf.l.wrap.b32 %r8065, %r8064, %r8064, 20; + add.s32 %r8066, %r7901, %r8060; + add.s32 %r8067, %r8066, %r8065; + xor.b32 %r8068, %r8067, %r8062; + shf.l.wrap.b32 %r8069, %r8068, %r8068, 24; + add.s32 %r8070, %r8069, %r8063; + xor.b32 %r8071, %r8070, %r8065; + shf.l.wrap.b32 %r8072, %r8071, %r8071, 25; + add.s32 
%r8073, %r8028, %r7993; + add.s32 %r8074, %r8073, %r7908; + xor.b32 %r8075, %r8074, %r7968; + shf.l.wrap.b32 %r8076, %r8075, %r8075, 16; + add.s32 %r8077, %r8076, %r8056; + xor.b32 %r8078, %r8077, %r8028; + shf.l.wrap.b32 %r8079, %r8078, %r8078, 20; + add.s32 %r8080, %r7915, %r8074; + add.s32 %r8081, %r8080, %r8079; + xor.b32 %r8082, %r8081, %r8076; + shf.l.wrap.b32 %r8083, %r8082, %r8082, 24; + add.s32 %r8084, %r8083, %r8077; + xor.b32 %r8085, %r8084, %r8079; + shf.l.wrap.b32 %r8086, %r8085, %r8085, 25; + add.s32 %r8087, %r8058, %r8023; + add.s32 %r8088, %r8087, %r7922; + xor.b32 %r8089, %r8088, %r7995; + shf.l.wrap.b32 %r8090, %r8089, %r8089, 16; + add.s32 %r8091, %r8090, %r7969; + xor.b32 %r8092, %r8091, %r8058; + shf.l.wrap.b32 %r8093, %r8092, %r8092, 20; + add.s32 %r8094, %r7929, %r8088; + add.s32 %r8095, %r8094, %r8093; + xor.b32 %r8096, %r8095, %r8090; + shf.l.wrap.b32 %r8097, %r8096, %r8096, 24; + add.s32 %r8098, %r8097, %r8091; + xor.b32 %r8099, %r8098, %r8093; + shf.l.wrap.b32 %r8100, %r8099, %r8099, 25; + add.s32 %r8101, %r8053, %r7971; + add.s32 %r8102, %r8101, %r7936; + xor.b32 %r8103, %r8102, %r8025; + shf.l.wrap.b32 %r8104, %r8103, %r8103, 16; + add.s32 %r8105, %r8104, %r7996; + xor.b32 %r8106, %r8105, %r7971; + shf.l.wrap.b32 %r8107, %r8106, %r8106, 20; + add.s32 %r8108, %r7943, %r8102; + add.s32 %r8109, %r8108, %r8107; + xor.b32 %r8110, %r8109, %r8104; + shf.l.wrap.b32 %r8111, %r8110, %r8110, 24; + add.s32 %r8112, %r8111, %r8105; + xor.b32 %r8113, %r8112, %r8107; + shf.l.wrap.b32 %r8114, %r8113, %r8113, 25; + add.s32 %r8115, %r8067, %r7852; + add.s32 %r8116, %r8115, %r8114; + xor.b32 %r8117, %r8116, %r8083; + shf.l.wrap.b32 %r8118, %r8117, %r8117, 16; + add.s32 %r8119, %r8118, %r8098; + xor.b32 %r8120, %r8119, %r8114; + shf.l.wrap.b32 %r8121, %r8120, %r8120, 20; + add.s32 %r8122, %r8116, %r7880; + add.s32 %r8123, %r8122, %r8121; + xor.b32 %r8124, %r8123, %r8118; + shf.l.wrap.b32 %r8125, %r8124, %r8124, 24; + add.s32 %r8126, %r8125, %r8119; + xor.b32 %r8127, %r8126, %r8121; + shf.l.wrap.b32 %r8128, %r8127, %r8127, 25; + add.s32 %r8129, %r8081, %r7859; + add.s32 %r8130, %r8129, %r8072; + xor.b32 %r8131, %r8097, %r8130; + shf.l.wrap.b32 %r8132, %r8131, %r8131, 16; + add.s32 %r8133, %r8112, %r8132; + xor.b32 %r8134, %r8133, %r8072; + shf.l.wrap.b32 %r8135, %r8134, %r8134, 20; + add.s32 %r8136, %r8130, %r7908; + add.s32 %r8137, %r8136, %r8135; + xor.b32 %r8138, %r8137, %r8132; + shf.l.wrap.b32 %r8139, %r8138, %r8138, 24; + add.s32 %r8140, %r8139, %r8133; + xor.b32 %r8141, %r8140, %r8135; + shf.l.wrap.b32 %r8142, %r8141, %r8141, 25; + add.s32 %r8143, %r8086, %r7887; + add.s32 %r8144, %r8143, %r8095; + xor.b32 %r8145, %r8111, %r8144; + shf.l.wrap.b32 %r8146, %r8145, %r8145, 16; + add.s32 %r8147, %r8146, %r8070; + xor.b32 %r8148, %r8147, %r8086; + shf.l.wrap.b32 %r8149, %r8148, %r8148, 20; + add.s32 %r8150, %r8144, %r7838; + add.s32 %r8151, %r8150, %r8149; + xor.b32 %r8152, %r8151, %r8146; + shf.l.wrap.b32 %r8153, %r8152, %r8152, 24; + add.s32 %r8154, %r8153, %r8147; + xor.b32 %r8155, %r8154, %r8149; + shf.l.wrap.b32 %r8156, %r8155, %r8155, 25; + add.s32 %r8157, %r8100, %r7866; + add.s32 %r8158, %r8157, %r8109; + xor.b32 %r8159, %r8158, %r8069; + shf.l.wrap.b32 %r8160, %r8159, %r8159, 16; + add.s32 %r8161, %r8160, %r8084; + xor.b32 %r8162, %r8161, %r8100; + shf.l.wrap.b32 %r8163, %r8162, %r8162, 20; + add.s32 %r8164, %r8158, %r7929; + add.s32 %r8165, %r8164, %r8163; + xor.b32 %r8166, %r8165, %r8160; + shf.l.wrap.b32 %r8167, %r8166, %r8166, 24; + add.s32 %r8168, %r8167, %r8161; + 
xor.b32 %r8169, %r8168, %r8163; + shf.l.wrap.b32 %r8170, %r8169, %r8169, 25; + add.s32 %r8171, %r8142, %r7845; + add.s32 %r8172, %r8171, %r8123; + xor.b32 %r8173, %r8172, %r8167; + shf.l.wrap.b32 %r8174, %r8173, %r8173, 16; + add.s32 %r8175, %r8174, %r8154; + xor.b32 %r8176, %r8175, %r8142; + shf.l.wrap.b32 %r8177, %r8176, %r8176, 20; + add.s32 %r8178, %r8172, %r7915; + add.s32 %r8179, %r8178, %r8177; + xor.b32 %r8180, %r8179, %r8174; + shf.l.wrap.b32 %r8181, %r8180, %r8180, 24; + add.s32 %r8182, %r8181, %r8175; + xor.b32 %r8183, %r8182, %r8177; + shf.l.wrap.b32 %r8184, %r8183, %r8183, 25; + add.s32 %r8185, %r8137, %r7922; + add.s32 %r8186, %r8185, %r8156; + xor.b32 %r8187, %r8125, %r8186; + shf.l.wrap.b32 %r8188, %r8187, %r8187, 16; + add.s32 %r8189, %r8188, %r8168; + xor.b32 %r8190, %r8189, %r8156; + shf.l.wrap.b32 %r8191, %r8190, %r8190, 20; + add.s32 %r8192, %r8186, %r7873; + add.s32 %r8193, %r8192, %r8191; + xor.b32 %r8194, %r8193, %r8188; + shf.l.wrap.b32 %r8195, %r8194, %r8194, 24; + add.s32 %r8196, %r8195, %r8189; + xor.b32 %r8197, %r8196, %r8191; + shf.l.wrap.b32 %r8198, %r8197, %r8197, 25; + add.s32 %r8199, %r8151, %r7901; + add.s32 %r8200, %r8199, %r8170; + xor.b32 %r8201, %r8200, %r8139; + shf.l.wrap.b32 %r8202, %r8201, %r8201, 16; + add.s32 %r8203, %r8202, %r8126; + xor.b32 %r8204, %r8203, %r8170; + shf.l.wrap.b32 %r8205, %r8204, %r8204, 20; + add.s32 %r8206, %r8200, %r7936; + add.s32 %r8207, %r8206, %r8205; + xor.b32 %r8208, %r8207, %r8202; + shf.l.wrap.b32 %r8209, %r8208, %r8208, 24; + add.s32 %r8210, %r8209, %r8203; + xor.b32 %r8211, %r8210, %r8205; + shf.l.wrap.b32 %r8212, %r8211, %r8211, 25; + add.s32 %r8213, %r8165, %r7943; + add.s32 %r8214, %r8213, %r8128; + xor.b32 %r8215, %r8214, %r8153; + shf.l.wrap.b32 %r8216, %r8215, %r8215, 16; + add.s32 %r8217, %r8216, %r8140; + xor.b32 %r8218, %r8217, %r8128; + shf.l.wrap.b32 %r8219, %r8218, %r8218, 20; + add.s32 %r8220, %r8214, %r7894; + add.s32 %r8221, %r8220, %r8219; + xor.b32 %r8222, %r8221, %r8216; + shf.l.wrap.b32 %r8223, %r8222, %r8222, 24; + add.s32 %r8224, %r8223, %r8217; + xor.b32 %r8225, %r8224, %r8219; + shf.l.wrap.b32 %r8226, %r8225, %r8225, 25; + add.s32 %r8227, %r8179, %r7859; + add.s32 %r8228, %r8227, %r8226; + xor.b32 %r8229, %r8228, %r8195; + shf.l.wrap.b32 %r8230, %r8229, %r8229, 16; + add.s32 %r8231, %r8230, %r8210; + xor.b32 %r8232, %r8231, %r8226; + shf.l.wrap.b32 %r8233, %r8232, %r8232, 20; + add.s32 %r8234, %r8228, %r7866; + add.s32 %r8235, %r8234, %r8233; + xor.b32 %r8236, %r8235, %r8230; + shf.l.wrap.b32 %r8237, %r8236, %r8236, 24; + add.s32 %r8238, %r8237, %r8231; + xor.b32 %r8239, %r8238, %r8233; + shf.l.wrap.b32 %r8240, %r8239, %r8239, 25; + add.s32 %r8241, %r8193, %r7908; + add.s32 %r8242, %r8241, %r8184; + xor.b32 %r8243, %r8242, %r8209; + shf.l.wrap.b32 %r8244, %r8243, %r8243, 16; + add.s32 %r8245, %r8244, %r8224; + xor.b32 %r8246, %r8245, %r8184; + shf.l.wrap.b32 %r8247, %r8246, %r8246, 20; + add.s32 %r8248, %r8242, %r7922; + add.s32 %r8249, %r8248, %r8247; + xor.b32 %r8250, %r8249, %r8244; + shf.l.wrap.b32 %r8251, %r8250, %r8250, 24; + add.s32 %r8252, %r8251, %r8245; + xor.b32 %r8253, %r8252, %r8247; + shf.l.wrap.b32 %r8254, %r8253, %r8253, 25; + add.s32 %r8255, %r8207, %r7929; + add.s32 %r8256, %r8255, %r8198; + xor.b32 %r8257, %r8223, %r8256; + shf.l.wrap.b32 %r8258, %r8257, %r8257, 16; + add.s32 %r8259, %r8258, %r8182; + xor.b32 %r8260, %r8259, %r8198; + shf.l.wrap.b32 %r8261, %r8260, %r8260, 20; + add.s32 %r8262, %r8256, %r7852; + add.s32 %r8263, %r8262, %r8261; + xor.b32 %r8264, %r8263, 
%r8258; + shf.l.wrap.b32 %r8265, %r8264, %r8264, 24; + add.s32 %r8266, %r8265, %r8259; + xor.b32 %r8267, %r8266, %r8261; + shf.l.wrap.b32 %r8268, %r8267, %r8267, 25; + add.s32 %r8269, %r8212, %r7887; + add.s32 %r8270, %r8269, %r8221; + xor.b32 %r8271, %r8270, %r8181; + shf.l.wrap.b32 %r8272, %r8271, %r8271, 16; + add.s32 %r8273, %r8272, %r8196; + xor.b32 %r8274, %r8273, %r8212; + shf.l.wrap.b32 %r8275, %r8274, %r8274, 20; + add.s32 %r8276, %r8270, %r7936; + add.s32 %r8277, %r8276, %r8275; + xor.b32 %r8278, %r8277, %r8272; + shf.l.wrap.b32 %r8279, %r8278, %r8278, 24; + add.s32 %r8280, %r8279, %r8273; + xor.b32 %r8281, %r8280, %r8275; + shf.l.wrap.b32 %r8282, %r8281, %r8281, 25; + add.s32 %r8283, %r8235, %r7880; + add.s32 %r8284, %r8283, %r8254; + xor.b32 %r8285, %r8284, %r8279; + shf.l.wrap.b32 %r8286, %r8285, %r8285, 16; + add.s32 %r8287, %r8286, %r8266; + xor.b32 %r8288, %r8287, %r8254; + shf.l.wrap.b32 %r8289, %r8288, %r8288, 20; + add.s32 %r8290, %r8284, %r7873; + add.s32 %r8291, %r8290, %r8289; + xor.b32 %r8292, %r8291, %r8286; + shf.l.wrap.b32 %r8293, %r8292, %r8292, 24; + add.s32 %r8294, %r8293, %r8287; + xor.b32 %r8295, %r8294, %r8289; + shf.l.wrap.b32 %r8296, %r8295, %r8295, 25; + add.s32 %r8297, %r8249, %r7901; + add.s32 %r8298, %r8297, %r8268; + xor.b32 %r8299, %r8237, %r8298; + shf.l.wrap.b32 %r8300, %r8299, %r8299, 16; + add.s32 %r8301, %r8300, %r8280; + xor.b32 %r8302, %r8301, %r8268; + shf.l.wrap.b32 %r8303, %r8302, %r8302, 20; + add.s32 %r8304, %r8298, %r7838; + add.s32 %r8305, %r8304, %r8303; + xor.b32 %r8306, %r8305, %r8300; + shf.l.wrap.b32 %r8307, %r8306, %r8306, 24; + add.s32 %r8308, %r8307, %r8301; + xor.b32 %r8309, %r8308, %r8303; + shf.l.wrap.b32 %r8310, %r8309, %r8309, 25; + add.s32 %r8311, %r8263, %r7915; + add.s32 %r8312, %r8311, %r8282; + xor.b32 %r8313, %r8312, %r8251; + shf.l.wrap.b32 %r8314, %r8313, %r8313, 16; + add.s32 %r8315, %r8314, %r8238; + xor.b32 %r8316, %r8315, %r8282; + shf.l.wrap.b32 %r8317, %r8316, %r8316, 20; + add.s32 %r8318, %r8312, %r7943; + add.s32 %r8319, %r8318, %r8317; + xor.b32 %r8320, %r8319, %r8314; + shf.l.wrap.b32 %r8321, %r8320, %r8320, 24; + add.s32 %r8322, %r8321, %r8315; + xor.b32 %r8323, %r8322, %r8317; + shf.l.wrap.b32 %r8324, %r8323, %r8323, 25; + add.s32 %r8325, %r8277, %r7894; + add.s32 %r8326, %r8325, %r8240; + xor.b32 %r8327, %r8326, %r8265; + shf.l.wrap.b32 %r8328, %r8327, %r8327, 16; + add.s32 %r8329, %r8328, %r8252; + xor.b32 %r8330, %r8329, %r8240; + shf.l.wrap.b32 %r8331, %r8330, %r8330, 20; + add.s32 %r8332, %r8326, %r7845; + add.s32 %r8333, %r8332, %r8331; + xor.b32 %r8334, %r8333, %r8328; + shf.l.wrap.b32 %r8335, %r8334, %r8334, 24; + add.s32 %r8336, %r8335, %r8329; + xor.b32 %r8337, %r8336, %r8331; + shf.l.wrap.b32 %r8338, %r8337, %r8337, 25; + add.s32 %r8339, %r8291, %r7908; + add.s32 %r8340, %r8339, %r8338; + xor.b32 %r8341, %r8340, %r8307; + shf.l.wrap.b32 %r8342, %r8341, %r8341, 16; + add.s32 %r8343, %r8342, %r8322; + xor.b32 %r8344, %r8343, %r8338; + shf.l.wrap.b32 %r8345, %r8344, %r8344, 20; + add.s32 %r8346, %r8340, %r7887; + add.s32 %r8347, %r8346, %r8345; + xor.b32 %r8348, %r8347, %r8342; + shf.l.wrap.b32 %r8349, %r8348, %r8348, 24; + add.s32 %r8350, %r8349, %r8343; + xor.b32 %r8351, %r8350, %r8345; + shf.l.wrap.b32 %r8352, %r8351, %r8351, 25; + add.s32 %r8353, %r8305, %r7922; + add.s32 %r8354, %r8353, %r8296; + xor.b32 %r8355, %r8354, %r8321; + shf.l.wrap.b32 %r8356, %r8355, %r8355, 16; + add.s32 %r8357, %r8356, %r8336; + xor.b32 %r8358, %r8357, %r8296; + shf.l.wrap.b32 %r8359, %r8358, %r8358, 20; + 
add.s32 %r8360, %r8354, %r7901; + add.s32 %r8361, %r8360, %r8359; + xor.b32 %r8362, %r8361, %r8356; + shf.l.wrap.b32 %r8363, %r8362, %r8362, 24; + add.s32 %r8364, %r8363, %r8357; + xor.b32 %r8365, %r8364, %r8359; + shf.l.wrap.b32 %r8366, %r8365, %r8365, 25; + add.s32 %r8367, %r8319, %r7936; + add.s32 %r8368, %r8367, %r8310; + xor.b32 %r8369, %r8335, %r8368; + shf.l.wrap.b32 %r8370, %r8369, %r8369, 16; + add.s32 %r8371, %r8370, %r8294; + xor.b32 %r8372, %r8371, %r8310; + shf.l.wrap.b32 %r8373, %r8372, %r8372, 20; + add.s32 %r8374, %r8368, %r7859; + add.s32 %r8375, %r8374, %r8373; + xor.b32 %r8376, %r8375, %r8370; + shf.l.wrap.b32 %r8377, %r8376, %r8376, 24; + add.s32 %r8378, %r8377, %r8371; + xor.b32 %r8379, %r8378, %r8373; + shf.l.wrap.b32 %r8380, %r8379, %r8379, 25; + add.s32 %r8381, %r8333, %r7929; + add.s32 %r8382, %r8381, %r8324; + xor.b32 %r8383, %r8382, %r8293; + shf.l.wrap.b32 %r8384, %r8383, %r8383, 16; + add.s32 %r8385, %r8384, %r8308; + xor.b32 %r8386, %r8385, %r8324; + shf.l.wrap.b32 %r8387, %r8386, %r8386, 20; + add.s32 %r8388, %r8382, %r7943; + add.s32 %r8389, %r8388, %r8387; + xor.b32 %r8390, %r8389, %r8384; + shf.l.wrap.b32 %r8391, %r8390, %r8390, 24; + add.s32 %r8392, %r8391, %r8385; + xor.b32 %r8393, %r8392, %r8387; + shf.l.wrap.b32 %r8394, %r8393, %r8393, 25; + add.s32 %r8395, %r8347, %r7866; + add.s32 %r8396, %r8395, %r8366; + xor.b32 %r8397, %r8396, %r8391; + shf.l.wrap.b32 %r8398, %r8397, %r8397, 16; + add.s32 %r8399, %r8398, %r8378; + xor.b32 %r8400, %r8399, %r8366; + shf.l.wrap.b32 %r8401, %r8400, %r8400, 20; + add.s32 %r8402, %r8396, %r7838; + add.s32 %r8403, %r8402, %r8401; + xor.b32 %r8404, %r8403, %r8398; + shf.l.wrap.b32 %r8405, %r8404, %r8404, 24; + add.s32 %r8406, %r8405, %r8399; + xor.b32 %r8407, %r8406, %r8401; + shf.l.wrap.b32 %r8408, %r8407, %r8407, 25; + add.s32 %r8409, %r8361, %r7915; + add.s32 %r8410, %r8409, %r8380; + xor.b32 %r8411, %r8349, %r8410; + shf.l.wrap.b32 %r8412, %r8411, %r8411, 16; + add.s32 %r8413, %r8412, %r8392; + xor.b32 %r8414, %r8413, %r8380; + shf.l.wrap.b32 %r8415, %r8414, %r8414, 20; + add.s32 %r8416, %r8410, %r7852; + add.s32 %r8417, %r8416, %r8415; + xor.b32 %r8418, %r8417, %r8412; + shf.l.wrap.b32 %r8419, %r8418, %r8418, 24; + add.s32 %r8420, %r8419, %r8413; + xor.b32 %r8421, %r8420, %r8415; + shf.l.wrap.b32 %r8422, %r8421, %r8421, 25; + add.s32 %r8423, %r8375, %r7873; + add.s32 %r8424, %r8423, %r8394; + xor.b32 %r8425, %r8424, %r8363; + shf.l.wrap.b32 %r8426, %r8425, %r8425, 16; + add.s32 %r8427, %r8426, %r8350; + xor.b32 %r8428, %r8427, %r8394; + shf.l.wrap.b32 %r8429, %r8428, %r8428, 20; + add.s32 %r8430, %r8424, %r7894; + add.s32 %r8431, %r8430, %r8429; + xor.b32 %r8432, %r8431, %r8426; + shf.l.wrap.b32 %r8433, %r8432, %r8432, 24; + add.s32 %r8434, %r8433, %r8427; + xor.b32 %r8435, %r8434, %r8429; + shf.l.wrap.b32 %r8436, %r8435, %r8435, 25; + add.s32 %r8437, %r8389, %r7845; + add.s32 %r8438, %r8437, %r8352; + xor.b32 %r8439, %r8438, %r8377; + shf.l.wrap.b32 %r8440, %r8439, %r8439, 16; + add.s32 %r8441, %r8440, %r8364; + xor.b32 %r8442, %r8441, %r8352; + shf.l.wrap.b32 %r8443, %r8442, %r8442, 20; + add.s32 %r8444, %r8438, %r7880; + add.s32 %r8445, %r8444, %r8443; + xor.b32 %r8446, %r8445, %r8440; + shf.l.wrap.b32 %r8447, %r8446, %r8446, 24; + add.s32 %r8448, %r8447, %r8441; + xor.b32 %r8449, %r8448, %r8443; + shf.l.wrap.b32 %r8450, %r8449, %r8449, 25; + add.s32 %r8451, %r8403, %r7922; + add.s32 %r8452, %r8451, %r8450; + xor.b32 %r8453, %r8452, %r8419; + shf.l.wrap.b32 %r8454, %r8453, %r8453, 16; + add.s32 %r8455, %r8454, 
%r8434; + xor.b32 %r8456, %r8455, %r8450; + shf.l.wrap.b32 %r8457, %r8456, %r8456, 20; + add.s32 %r8458, %r8452, %r7929; + add.s32 %r8459, %r8458, %r8457; + xor.b32 %r8460, %r8459, %r8454; + shf.l.wrap.b32 %r8461, %r8460, %r8460, 24; + add.s32 %r8462, %r8461, %r8455; + xor.b32 %r8463, %r8462, %r8457; + shf.l.wrap.b32 %r8464, %r8463, %r8463, 25; + add.s32 %r8465, %r8417, %r7901; + add.s32 %r8466, %r8465, %r8408; + xor.b32 %r8467, %r8466, %r8433; + shf.l.wrap.b32 %r8468, %r8467, %r8467, 16; + add.s32 %r8469, %r8468, %r8448; + xor.b32 %r8470, %r8469, %r8408; + shf.l.wrap.b32 %r8471, %r8470, %r8470, 20; + add.s32 %r8472, %r8466, %r7915; + add.s32 %r8473, %r8472, %r8471; + xor.b32 %r8474, %r8473, %r8468; + shf.l.wrap.b32 %r8475, %r8474, %r8474, 24; + add.s32 %r8476, %r8475, %r8469; + xor.b32 %r8477, %r8476, %r8471; + shf.l.wrap.b32 %r8478, %r8477, %r8477, 25; + add.s32 %r8479, %r8431, %r7943; + add.s32 %r8480, %r8479, %r8422; + xor.b32 %r8481, %r8447, %r8480; + shf.l.wrap.b32 %r8482, %r8481, %r8481, 16; + add.s32 %r8483, %r8482, %r8406; + xor.b32 %r8484, %r8483, %r8422; + shf.l.wrap.b32 %r8485, %r8484, %r8484, 20; + add.s32 %r8486, %r8480, %r7908; + add.s32 %r8487, %r8486, %r8485; + xor.b32 %r8488, %r8487, %r8482; + shf.l.wrap.b32 %r8489, %r8488, %r8488, 24; + add.s32 %r8490, %r8489, %r8483; + xor.b32 %r8491, %r8490, %r8485; + shf.l.wrap.b32 %r8492, %r8491, %r8491, 25; + add.s32 %r8493, %r8445, %r7936; + add.s32 %r8494, %r8493, %r8436; + xor.b32 %r8495, %r8494, %r8405; + shf.l.wrap.b32 %r8496, %r8495, %r8495, 16; + add.s32 %r8497, %r8496, %r8420; + xor.b32 %r8498, %r8497, %r8436; + shf.l.wrap.b32 %r8499, %r8498, %r8498, 20; + add.s32 %r8500, %r8494, %r7894; + add.s32 %r8501, %r8500, %r8499; + xor.b32 %r8502, %r8501, %r8496; + shf.l.wrap.b32 %r8503, %r8502, %r8502, 24; + add.s32 %r8504, %r8503, %r8497; + xor.b32 %r8505, %r8504, %r8499; + shf.l.wrap.b32 %r8506, %r8505, %r8505, 25; + add.s32 %r8507, %r8459, %r7887; + add.s32 %r8508, %r8507, %r8478; + xor.b32 %r8509, %r8508, %r8503; + shf.l.wrap.b32 %r8510, %r8509, %r8509, 16; + add.s32 %r8511, %r8510, %r8490; + xor.b32 %r8512, %r8511, %r8478; + shf.l.wrap.b32 %r8513, %r8512, %r8512, 20; + add.s32 %r8514, %r8508, %r7852; + add.s32 %r8515, %r8514, %r8513; + xor.b32 %r8516, %r8515, %r8510; + shf.l.wrap.b32 %r8517, %r8516, %r8516, 24; + add.s32 %r8518, %r8517, %r8511; + xor.b32 %r8519, %r8518, %r8513; + shf.l.wrap.b32 %r8520, %r8519, %r8519, 25; + add.s32 %r8521, %r8473, %r7873; + add.s32 %r8522, %r8521, %r8492; + xor.b32 %r8523, %r8461, %r8522; + shf.l.wrap.b32 %r8524, %r8523, %r8523, 16; + add.s32 %r8525, %r8524, %r8504; + xor.b32 %r8526, %r8525, %r8492; + shf.l.wrap.b32 %r8527, %r8526, %r8526, 20; + add.s32 %r8528, %r8522, %r7859; + add.s32 %r8529, %r8528, %r8527; + xor.b32 %r8530, %r8529, %r8524; + shf.l.wrap.b32 %r8531, %r8530, %r8530, 24; + add.s32 %r8532, %r8531, %r8525; + xor.b32 %r8533, %r8532, %r8527; + shf.l.wrap.b32 %r8534, %r8533, %r8533, 25; + add.s32 %r8535, %r8487, %r7838; + add.s32 %r8536, %r8535, %r8506; + xor.b32 %r8537, %r8536, %r8475; + shf.l.wrap.b32 %r8538, %r8537, %r8537, 16; + add.s32 %r8539, %r8538, %r8462; + xor.b32 %r8540, %r8539, %r8506; + shf.l.wrap.b32 %r8541, %r8540, %r8540, 20; + add.s32 %r8542, %r8536, %r7845; + add.s32 %r8543, %r8542, %r8541; + xor.b32 %r8544, %r8543, %r8538; + shf.l.wrap.b32 %r8545, %r8544, %r8544, 24; + add.s32 %r8546, %r8545, %r8539; + xor.b32 %r8547, %r8546, %r8541; + shf.l.wrap.b32 %r8548, %r8547, %r8547, 25; + add.s32 %r8549, %r8501, %r7880; + add.s32 %r8550, %r8549, %r8464; + xor.b32 %r8551, 
%r8550, %r8489; + shf.l.wrap.b32 %r8552, %r8551, %r8551, 16; + add.s32 %r8553, %r8552, %r8476; + xor.b32 %r8554, %r8553, %r8464; + shf.l.wrap.b32 %r8555, %r8554, %r8554, 20; + add.s32 %r8556, %r8550, %r7866; + add.s32 %r8557, %r8556, %r8555; + xor.b32 %r8558, %r8557, %r8552; + shf.l.wrap.b32 %r8559, %r8558, %r8558, 24; + add.s32 %r8560, %r8559, %r8553; + xor.b32 %r8561, %r8560, %r8555; + shf.l.wrap.b32 %r8562, %r8561, %r8561, 25; + add.s32 %r8563, %r8515, %r7901; + add.s32 %r8564, %r8563, %r8562; + xor.b32 %r8565, %r8564, %r8531; + shf.l.wrap.b32 %r8566, %r8565, %r8565, 16; + add.s32 %r8567, %r8566, %r8546; + xor.b32 %r8568, %r8567, %r8562; + shf.l.wrap.b32 %r8569, %r8568, %r8568, 20; + add.s32 %r8570, %r8564, %r7936; + add.s32 %r8571, %r8570, %r8569; + xor.b32 %r8572, %r8571, %r8566; + shf.l.wrap.b32 %r8573, %r8572, %r8572, 24; + add.s32 %r8574, %r8573, %r8567; + xor.b32 %r8575, %r8574, %r8569; + shf.l.wrap.b32 %r8576, %r8575, %r8575, 25; + add.s32 %r8577, %r8529, %r7915; + add.s32 %r8578, %r8577, %r8520; + xor.b32 %r8579, %r8578, %r8545; + shf.l.wrap.b32 %r8580, %r8579, %r8579, 16; + add.s32 %r8581, %r8580, %r8560; + xor.b32 %r8582, %r8581, %r8520; + shf.l.wrap.b32 %r8583, %r8582, %r8582, 20; + add.s32 %r8584, %r8578, %r7873; + add.s32 %r8585, %r8584, %r8583; + xor.b32 %r8586, %r8585, %r8580; + shf.l.wrap.b32 %r8587, %r8586, %r8586, 24; + add.s32 %r8588, %r8587, %r8581; + xor.b32 %r8589, %r8588, %r8583; + shf.l.wrap.b32 %r8590, %r8589, %r8589, 25; + add.s32 %r8591, %r8543, %r7894; + add.s32 %r8592, %r8591, %r8534; + xor.b32 %r8593, %r8559, %r8592; + shf.l.wrap.b32 %r8594, %r8593, %r8593, 16; + add.s32 %r8595, %r8594, %r8518; + xor.b32 %r8596, %r8595, %r8534; + shf.l.wrap.b32 %r8597, %r8596, %r8596, 20; + add.s32 %r8598, %r8592, %r7922; + add.s32 %r8599, %r8598, %r8597; + xor.b32 %r8600, %r8599, %r8594; + shf.l.wrap.b32 %r8601, %r8600, %r8600, 24; + add.s32 %r8602, %r8601, %r8595; + xor.b32 %r8603, %r8602, %r8597; + shf.l.wrap.b32 %r8604, %r8603, %r8603, 25; + add.s32 %r8605, %r8557, %r7943; + add.s32 %r8606, %r8605, %r8548; + xor.b32 %r8607, %r8606, %r8517; + shf.l.wrap.b32 %r8608, %r8607, %r8607, 16; + add.s32 %r8609, %r8608, %r8532; + xor.b32 %r8610, %r8609, %r8548; + shf.l.wrap.b32 %r8611, %r8610, %r8610, 20; + add.s32 %r8612, %r8606, %r7845; + add.s32 %r8613, %r8612, %r8611; + xor.b32 %r8614, %r8613, %r8608; + shf.l.wrap.b32 %r8615, %r8614, %r8614, 24; + add.s32 %r8616, %r8615, %r8609; + xor.b32 %r8617, %r8616, %r8611; + shf.l.wrap.b32 %r8618, %r8617, %r8617, 25; + add.s32 %r8619, %r8571, %r7929; + add.s32 %r8620, %r8619, %r8590; + xor.b32 %r8621, %r8620, %r8615; + shf.l.wrap.b32 %r8622, %r8621, %r8621, 16; + add.s32 %r8623, %r8622, %r8602; + xor.b32 %r8624, %r8623, %r8590; + shf.l.wrap.b32 %r8625, %r8624, %r8624, 20; + add.s32 %r8626, %r8620, %r7859; + add.s32 %r8627, %r8626, %r8625; + xor.b32 %r8628, %r8627, %r8622; + shf.l.wrap.b32 %r8629, %r8628, %r8628, 24; + add.s32 %r8630, %r8629, %r8623; + xor.b32 %r8631, %r8630, %r8625; + shf.l.wrap.b32 %r8632, %r8631, %r8631, 25; + add.s32 %r8633, %r8585, %r7838; + add.s32 %r8634, %r8633, %r8604; + xor.b32 %r8635, %r8573, %r8634; + shf.l.wrap.b32 %r8636, %r8635, %r8635, 16; + add.s32 %r8637, %r8636, %r8616; + xor.b32 %r8638, %r8637, %r8604; + shf.l.wrap.b32 %r8639, %r8638, %r8638, 20; + add.s32 %r8640, %r8634, %r7908; + add.s32 %r8641, %r8640, %r8639; + xor.b32 %r8642, %r8641, %r8636; + shf.l.wrap.b32 %r8643, %r8642, %r8642, 24; + add.s32 %r8644, %r8643, %r8637; + xor.b32 %r8645, %r8644, %r8639; + shf.l.wrap.b32 %r8646, %r8645, %r8645, 25; 
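+ // Annotation: the compression rounds finish below with the feed-forward
+ // h[i] = v[i] ^ v[i + 8] (the eight xor.b32 results %r8787..%r8794), whose bytes
+ // are then spilled back into the chaining-value stack slot at [%rd191+145..176].
+ // The enclosing $L__BB1_45 loop appears to be BLAKE3 parent-node merging: it
+ // re-compresses 64-byte CV pairs while the stack depth byte at [%rd3+8] exceeds
+ // the popcount of the chunk counter in %rd251.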
+ add.s32 %r8647, %r8599, %r7852; + add.s32 %r8648, %r8647, %r8618; + xor.b32 %r8649, %r8648, %r8587; + shf.l.wrap.b32 %r8650, %r8649, %r8649, 16; + add.s32 %r8651, %r8650, %r8574; + xor.b32 %r8652, %r8651, %r8618; + shf.l.wrap.b32 %r8653, %r8652, %r8652, 20; + add.s32 %r8654, %r8648, %r7880; + add.s32 %r8655, %r8654, %r8653; + xor.b32 %r8656, %r8655, %r8650; + shf.l.wrap.b32 %r8657, %r8656, %r8656, 24; + add.s32 %r8658, %r8657, %r8651; + xor.b32 %r8659, %r8658, %r8653; + shf.l.wrap.b32 %r8660, %r8659, %r8659, 25; + add.s32 %r8661, %r8613, %r7866; + add.s32 %r8662, %r8661, %r8576; + xor.b32 %r8663, %r8662, %r8601; + shf.l.wrap.b32 %r8664, %r8663, %r8663, 16; + add.s32 %r8665, %r8664, %r8588; + xor.b32 %r8666, %r8665, %r8576; + shf.l.wrap.b32 %r8667, %r8666, %r8666, 20; + add.s32 %r8668, %r8662, %r7887; + add.s32 %r8669, %r8668, %r8667; + xor.b32 %r8670, %r8669, %r8664; + shf.l.wrap.b32 %r8671, %r8670, %r8670, 24; + add.s32 %r8672, %r8671, %r8665; + xor.b32 %r8673, %r8672, %r8667; + shf.l.wrap.b32 %r8674, %r8673, %r8673, 25; + add.s32 %r8675, %r8627, %r7915; + add.s32 %r8676, %r8675, %r8674; + xor.b32 %r8677, %r8676, %r8643; + shf.l.wrap.b32 %r8678, %r8677, %r8677, 16; + add.s32 %r8679, %r8678, %r8658; + xor.b32 %r8680, %r8679, %r8674; + shf.l.wrap.b32 %r8681, %r8680, %r8680, 20; + add.s32 %r8682, %r8676, %r7943; + add.s32 %r8683, %r8682, %r8681; + xor.b32 %r8684, %r8683, %r8678; + shf.l.wrap.b32 %r8685, %r8684, %r8684, 24; + add.s32 %r8686, %r8685, %r8679; + xor.b32 %r8687, %r8686, %r8681; + shf.l.wrap.b32 %r8688, %r8687, %r8687, 25; + add.s32 %r8689, %r8641, %r7873; + add.s32 %r8690, %r8689, %r8632; + xor.b32 %r8691, %r8690, %r8657; + shf.l.wrap.b32 %r8692, %r8691, %r8691, 16; + add.s32 %r8693, %r8692, %r8672; + xor.b32 %r8694, %r8693, %r8632; + shf.l.wrap.b32 %r8695, %r8694, %r8694, 20; + add.s32 %r8696, %r8690, %r7838; + add.s32 %r8697, %r8696, %r8695; + xor.b32 %r8698, %r8697, %r8692; + shf.l.wrap.b32 %r8699, %r8698, %r8698, 24; + add.s32 %r8700, %r8699, %r8693; + xor.b32 %r8701, %r8700, %r8695; + shf.l.wrap.b32 %r8702, %r8701, %r8701, 25; + add.s32 %r8703, %r8655, %r7845; + add.s32 %r8704, %r8703, %r8646; + xor.b32 %r8705, %r8671, %r8704; + shf.l.wrap.b32 %r8706, %r8705, %r8705, 16; + add.s32 %r8707, %r8706, %r8630; + xor.b32 %r8708, %r8707, %r8646; + shf.l.wrap.b32 %r8709, %r8708, %r8708, 20; + add.s32 %r8710, %r8704, %r7901; + add.s32 %r8711, %r8710, %r8709; + xor.b32 %r8712, %r8711, %r8706; + shf.l.wrap.b32 %r8713, %r8712, %r8712, 24; + add.s32 %r8714, %r8713, %r8707; + xor.b32 %r8715, %r8714, %r8709; + shf.l.wrap.b32 %r8716, %r8715, %r8715, 25; + add.s32 %r8717, %r8669, %r7894; + add.s32 %r8718, %r8717, %r8660; + xor.b32 %r8719, %r8718, %r8629; + shf.l.wrap.b32 %r8720, %r8719, %r8719, 16; + add.s32 %r8721, %r8720, %r8644; + xor.b32 %r8722, %r8721, %r8660; + shf.l.wrap.b32 %r8723, %r8722, %r8722, 20; + add.s32 %r8724, %r8718, %r7880; + add.s32 %r8725, %r8724, %r8723; + xor.b32 %r8726, %r8725, %r8720; + shf.l.wrap.b32 %r8727, %r8726, %r8726, 24; + add.s32 %r8728, %r8727, %r8721; + xor.b32 %r8729, %r8728, %r8723; + shf.l.wrap.b32 %r8730, %r8729, %r8729, 25; + add.s32 %r8731, %r8683, %r7936; + add.s32 %r8732, %r8731, %r8702; + xor.b32 %r8733, %r8732, %r8727; + shf.l.wrap.b32 %r8734, %r8733, %r8733, 16; + add.s32 %r8735, %r8734, %r8714; + xor.b32 %r8736, %r8735, %r8702; + shf.l.wrap.b32 %r8737, %r8736, %r8736, 20; + add.s32 %r8738, %r8732, %r7908; + add.s32 %r8739, %r8738, %r8737; + xor.b32 %r8740, %r8739, %r8734; + shf.l.wrap.b32 %r8741, %r8740, %r8740, 24; + add.s32 %r8742, %r8741, 
%r8735; + xor.b32 %r8743, %r8742, %r8737; + shf.l.wrap.b32 %r8744, %r8743, %r8743, 25; + add.s32 %r8745, %r8697, %r7852; + add.s32 %r8746, %r8745, %r8716; + xor.b32 %r8747, %r8685, %r8746; + shf.l.wrap.b32 %r8748, %r8747, %r8747, 16; + add.s32 %r8749, %r8748, %r8728; + xor.b32 %r8750, %r8749, %r8716; + shf.l.wrap.b32 %r8751, %r8750, %r8750, 20; + add.s32 %r8752, %r8746, %r7922; + add.s32 %r8753, %r8752, %r8751; + xor.b32 %r8754, %r8753, %r8748; + shf.l.wrap.b32 %r8755, %r8754, %r8754, 24; + add.s32 %r8756, %r8755, %r8749; + xor.b32 %r8757, %r8756, %r8751; + shf.l.wrap.b32 %r8758, %r8757, %r8757, 25; + add.s32 %r8759, %r8711, %r7859; + add.s32 %r8760, %r8759, %r8730; + xor.b32 %r8761, %r8760, %r8699; + shf.l.wrap.b32 %r8762, %r8761, %r8761, 16; + add.s32 %r8763, %r8762, %r8686; + xor.b32 %r8764, %r8763, %r8730; + shf.l.wrap.b32 %r8765, %r8764, %r8764, 20; + add.s32 %r8766, %r8760, %r7866; + add.s32 %r8767, %r8766, %r8765; + xor.b32 %r8768, %r8767, %r8762; + shf.l.wrap.b32 %r8769, %r8768, %r8768, 24; + add.s32 %r8770, %r8769, %r8763; + xor.b32 %r8771, %r8770, %r8765; + shf.l.wrap.b32 %r8772, %r8771, %r8771, 25; + add.s32 %r8773, %r8725, %r7887; + add.s32 %r8774, %r8773, %r8688; + xor.b32 %r8775, %r8774, %r8713; + shf.l.wrap.b32 %r8776, %r8775, %r8775, 16; + add.s32 %r8777, %r8776, %r8700; + xor.b32 %r8778, %r8777, %r8688; + shf.l.wrap.b32 %r8779, %r8778, %r8778, 20; + add.s32 %r8780, %r8774, %r7929; + add.s32 %r8781, %r8780, %r8779; + xor.b32 %r8782, %r8781, %r8776; + shf.l.wrap.b32 %r8783, %r8782, %r8782, 24; + add.s32 %r8784, %r8783, %r8777; + xor.b32 %r8785, %r8784, %r8779; + shf.l.wrap.b32 %r8786, %r8785, %r8785, 25; + xor.b32 %r8787, %r8770, %r8739; + xor.b32 %r8788, %r8784, %r8753; + xor.b32 %r8789, %r8742, %r8767; + xor.b32 %r8790, %r8781, %r8756; + xor.b32 %r8791, %r8786, %r8755; + xor.b32 %r8792, %r8744, %r8769; + xor.b32 %r8793, %r8783, %r8758; + xor.b32 %r8794, %r8772, %r8741; + st.local.u8 [%rd191+145], %r8787; + shr.u32 %r8795, %r8787, 8; + st.local.u8 [%rd191+146], %r8795; + shr.u32 %r8796, %r8787, 16; + st.local.u8 [%rd191+147], %r8796; + shr.u32 %r8797, %r8787, 24; + st.local.u8 [%rd191+148], %r8797; + st.local.u8 [%rd191+149], %r8788; + shr.u32 %r8798, %r8788, 8; + st.local.u8 [%rd191+150], %r8798; + shr.u32 %r8799, %r8788, 16; + st.local.u8 [%rd191+151], %r8799; + shr.u32 %r8800, %r8788, 24; + st.local.u8 [%rd191+152], %r8800; + st.local.u8 [%rd191+153], %r8789; + shr.u32 %r8801, %r8789, 8; + st.local.u8 [%rd191+154], %r8801; + shr.u32 %r8802, %r8789, 16; + st.local.u8 [%rd191+155], %r8802; + shr.u32 %r8803, %r8789, 24; + st.local.u8 [%rd191+156], %r8803; + st.local.u8 [%rd191+157], %r8790; + shr.u32 %r8804, %r8790, 8; + st.local.u8 [%rd191+158], %r8804; + shr.u32 %r8805, %r8790, 16; + st.local.u8 [%rd191+159], %r8805; + shr.u32 %r8806, %r8790, 24; + st.local.u8 [%rd191+160], %r8806; + st.local.u8 [%rd191+161], %r8791; + shr.u32 %r8807, %r8791, 8; + st.local.u8 [%rd191+162], %r8807; + shr.u32 %r8808, %r8791, 16; + st.local.u8 [%rd191+163], %r8808; + shr.u32 %r8809, %r8791, 24; + st.local.u8 [%rd191+164], %r8809; + st.local.u8 [%rd191+165], %r8792; + shr.u32 %r8810, %r8792, 8; + st.local.u8 [%rd191+166], %r8810; + shr.u32 %r8811, %r8792, 16; + st.local.u8 [%rd191+167], %r8811; + shr.u32 %r8812, %r8792, 24; + st.local.u8 [%rd191+168], %r8812; + st.local.u8 [%rd191+169], %r8793; + shr.u32 %r8813, %r8793, 8; + st.local.u8 [%rd191+170], %r8813; + shr.u32 %r8814, %r8793, 16; + st.local.u8 [%rd191+171], %r8814; + shr.u32 %r8815, %r8793, 24; + st.local.u8 [%rd191+172], %r8815; + 
st.local.u8 [%rd191+173], %r8794; + shr.u32 %r8816, %r8794, 8; + st.local.u8 [%rd191+174], %r8816; + shr.u32 %r8817, %r8794, 16; + st.local.u8 [%rd191+175], %r8817; + shr.u32 %r8818, %r8794, 24; + st.local.u8 [%rd191+176], %r8818; + ld.local.u8 %rs328, [%rd3+8]; + add.s16 %rs329, %rs328, -1; + st.local.u8 [%rd3+8], %rs329; + cvt.u64.u16 %rd192, %rs329; + and.b64 %rd193, %rd192, 255; + setp.lt.u64 %p38, %rd230, %rd193; + and.b16 %rs330, %rs329, 255; + mul.wide.u16 %r11681, %rs330, 32; + @%p38 bra $L__BB1_45; + +$L__BB1_46: + cvt.s64.s32 %rd194, %r11681; + add.s64 %rd195, %rd2, %rd194; + st.local.u8 [%rd195+145], %r97; + shr.u32 %r8819, %r97, 8; + st.local.u8 [%rd195+146], %r8819; + shr.u32 %r8820, %r97, 16; + st.local.u8 [%rd195+147], %r8820; + shr.u32 %r8821, %r97, 24; + st.local.u8 [%rd195+148], %r8821; + st.local.u8 [%rd195+149], %r98; + shr.u32 %r8822, %r98, 8; + st.local.u8 [%rd195+150], %r8822; + shr.u32 %r8823, %r98, 16; + st.local.u8 [%rd195+151], %r8823; + shr.u32 %r8824, %r98, 24; + st.local.u8 [%rd195+152], %r8824; + st.local.u8 [%rd195+153], %r99; + shr.u32 %r8825, %r99, 8; + st.local.u8 [%rd195+154], %r8825; + shr.u32 %r8826, %r99, 16; + st.local.u8 [%rd195+155], %r8826; + shr.u32 %r8827, %r99, 24; + st.local.u8 [%rd195+156], %r8827; + st.local.u8 [%rd195+157], %r100; + shr.u32 %r8828, %r100, 8; + st.local.u8 [%rd195+158], %r8828; + shr.u32 %r8829, %r100, 16; + st.local.u8 [%rd195+159], %r8829; + shr.u32 %r8830, %r100, 24; + st.local.u8 [%rd195+160], %r8830; + st.local.u8 [%rd195+161], %r101; + shr.u32 %r8831, %r101, 8; + st.local.u8 [%rd195+162], %r8831; + shr.u32 %r8832, %r101, 16; + st.local.u8 [%rd195+163], %r8832; + shr.u32 %r8833, %r101, 24; + st.local.u8 [%rd195+164], %r8833; + st.local.u8 [%rd195+165], %r102; + shr.u32 %r8834, %r102, 8; + st.local.u8 [%rd195+166], %r8834; + shr.u32 %r8835, %r102, 16; + st.local.u8 [%rd195+167], %r8835; + shr.u32 %r8836, %r102, 24; + st.local.u8 [%rd195+168], %r8836; + st.local.u8 [%rd195+169], %r103; + shr.u32 %r8837, %r103, 8; + st.local.u8 [%rd195+170], %r8837; + shr.u32 %r8838, %r103, 16; + st.local.u8 [%rd195+171], %r8838; + shr.u32 %r8839, %r103, 24; + st.local.u8 [%rd195+172], %r8839; + st.local.u8 [%rd195+173], %r104; + shr.u32 %r8840, %r104, 8; + st.local.u8 [%rd195+174], %r8840; + shr.u32 %r8841, %r104, 16; + st.local.u8 [%rd195+175], %r8841; + shr.u32 %r8842, %r104, 24; + st.local.u8 [%rd195+176], %r8842; + ld.local.u8 %rs388, [%rd3+8]; + bra.uni $L__BB1_47; + +$L__BB1_29: + cvt.u32.u16 %r3957, %rs14; + and.b32 %r3958, %r3957, 255; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd254; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b64 param2; + st.param.b64 [param2+0], %rd98; + .param .b64 param3; + st.param.b64 [param3+0], %rd251; + .param .b32 param4; + st.param.b32 [param4+0], %r3958; + .param .b64 param5; + st.param.b64 [param5+0], %rd142; + .param .b64 retval0; + call.uni (retval0), + _ZN44_INTERNAL_bc27aae3_13_kaspa_cuda_cu_01cb461828blake3_compress_subtree_wideEPKhyPKjyhPh, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b64 %rd164, [retval0+0]; + } // callseq 2 + ld.local.v4.u32 {%r3959, %r3960, %r3961, %r3962}, [%rd42]; + ld.local.v4.u32 {%r3963, %r3964, %r3965, %r3966}, [%rd42+16]; + ld.local.v4.u32 {%r3967, %r3968, %r3969, %r3970}, [%rd42+32]; + ld.local.v4.u32 {%r3971, %r3972, %r3973, %r3974}, [%rd42+48]; + ld.local.u64 %rd165, [%rd3+-72]; + popc.b64 %r3975, %rd165; + cvt.u64.u32 %rd51, %r3975; + ld.local.u8 %rs137, 
[%rd3+8]; + cvt.u64.u16 %rd166, %rs137; + setp.ge.u64 %p27, %rd51, %rd166; + mul.wide.u16 %r11661, %rs137, 32; + @%p27 bra $L__BB1_32; + +$L__BB1_31: + popc.b64 %r11647, %rd165; + cvt.u64.u32 %rd226, %r11647; + add.s32 %r3976, %r11661, -64; + cvt.s64.s32 %rd167, %r3976; + add.s64 %rd168, %rd2, %rd167; + ld.local.u8 %r3977, [%rd3+2]; + ld.local.u8 %r3978, [%rd168+145]; + ld.local.u8 %r3979, [%rd168+146]; + prmt.b32 %r3980, %r3979, %r3978, 30212; + ld.local.u8 %r3981, [%rd168+147]; + prmt.b32 %r3982, %r3981, %r3980, 28756; + ld.local.u8 %r3983, [%rd168+148]; + prmt.b32 %r3984, %r3983, %r3982, 1620; + ld.local.u8 %r3985, [%rd168+149]; + ld.local.u8 %r3986, [%rd168+150]; + prmt.b32 %r3987, %r3986, %r3985, 30212; + ld.local.u8 %r3988, [%rd168+151]; + prmt.b32 %r3989, %r3988, %r3987, 28756; + ld.local.u8 %r3990, [%rd168+152]; + prmt.b32 %r3991, %r3990, %r3989, 1620; + ld.local.u8 %r3992, [%rd168+153]; + ld.local.u8 %r3993, [%rd168+154]; + prmt.b32 %r3994, %r3993, %r3992, 30212; + ld.local.u8 %r3995, [%rd168+155]; + prmt.b32 %r3996, %r3995, %r3994, 28756; + ld.local.u8 %r3997, [%rd168+156]; + prmt.b32 %r3998, %r3997, %r3996, 1620; + ld.local.u8 %r3999, [%rd168+157]; + ld.local.u8 %r4000, [%rd168+158]; + prmt.b32 %r4001, %r4000, %r3999, 30212; + ld.local.u8 %r4002, [%rd168+159]; + prmt.b32 %r4003, %r4002, %r4001, 28756; + ld.local.u8 %r4004, [%rd168+160]; + prmt.b32 %r4005, %r4004, %r4003, 1620; + ld.local.u8 %r4006, [%rd168+161]; + ld.local.u8 %r4007, [%rd168+162]; + prmt.b32 %r4008, %r4007, %r4006, 30212; + ld.local.u8 %r4009, [%rd168+163]; + prmt.b32 %r4010, %r4009, %r4008, 28756; + ld.local.u8 %r4011, [%rd168+164]; + prmt.b32 %r4012, %r4011, %r4010, 1620; + ld.local.u8 %r4013, [%rd168+165]; + ld.local.u8 %r4014, [%rd168+166]; + prmt.b32 %r4015, %r4014, %r4013, 30212; + ld.local.u8 %r4016, [%rd168+167]; + prmt.b32 %r4017, %r4016, %r4015, 28756; + ld.local.u8 %r4018, [%rd168+168]; + prmt.b32 %r4019, %r4018, %r4017, 1620; + ld.local.u8 %r4020, [%rd168+169]; + ld.local.u8 %r4021, [%rd168+170]; + prmt.b32 %r4022, %r4021, %r4020, 30212; + ld.local.u8 %r4023, [%rd168+171]; + prmt.b32 %r4024, %r4023, %r4022, 28756; + ld.local.u8 %r4025, [%rd168+172]; + prmt.b32 %r4026, %r4025, %r4024, 1620; + ld.local.u8 %r4027, [%rd168+173]; + ld.local.u8 %r4028, [%rd168+174]; + prmt.b32 %r4029, %r4028, %r4027, 30212; + ld.local.u8 %r4030, [%rd168+175]; + prmt.b32 %r4031, %r4030, %r4029, 28756; + ld.local.u8 %r4032, [%rd168+176]; + prmt.b32 %r4033, %r4032, %r4031, 1620; + ld.local.u8 %r4034, [%rd168+177]; + ld.local.u8 %r4035, [%rd168+178]; + prmt.b32 %r4036, %r4035, %r4034, 30212; + ld.local.u8 %r4037, [%rd168+179]; + prmt.b32 %r4038, %r4037, %r4036, 28756; + ld.local.u8 %r4039, [%rd168+180]; + prmt.b32 %r4040, %r4039, %r4038, 1620; + ld.local.u8 %r4041, [%rd168+181]; + ld.local.u8 %r4042, [%rd168+182]; + prmt.b32 %r4043, %r4042, %r4041, 30212; + ld.local.u8 %r4044, [%rd168+183]; + prmt.b32 %r4045, %r4044, %r4043, 28756; + ld.local.u8 %r4046, [%rd168+184]; + prmt.b32 %r4047, %r4046, %r4045, 1620; + ld.local.u8 %r4048, [%rd168+185]; + ld.local.u8 %r4049, [%rd168+186]; + prmt.b32 %r4050, %r4049, %r4048, 30212; + ld.local.u8 %r4051, [%rd168+187]; + prmt.b32 %r4052, %r4051, %r4050, 28756; + ld.local.u8 %r4053, [%rd168+188]; + prmt.b32 %r4054, %r4053, %r4052, 1620; + ld.local.u8 %r4055, [%rd168+189]; + ld.local.u8 %r4056, [%rd168+190]; + prmt.b32 %r4057, %r4056, %r4055, 30212; + ld.local.u8 %r4058, [%rd168+191]; + prmt.b32 %r4059, %r4058, %r4057, 28756; + ld.local.u8 %r4060, [%rd168+192]; + prmt.b32 %r4061, %r4060, 
%r4059, 1620; + ld.local.u8 %r4062, [%rd168+193]; + ld.local.u8 %r4063, [%rd168+194]; + prmt.b32 %r4064, %r4063, %r4062, 30212; + ld.local.u8 %r4065, [%rd168+195]; + prmt.b32 %r4066, %r4065, %r4064, 28756; + ld.local.u8 %r4067, [%rd168+196]; + prmt.b32 %r4068, %r4067, %r4066, 1620; + ld.local.u8 %r4069, [%rd168+197]; + ld.local.u8 %r4070, [%rd168+198]; + prmt.b32 %r4071, %r4070, %r4069, 30212; + ld.local.u8 %r4072, [%rd168+199]; + prmt.b32 %r4073, %r4072, %r4071, 28756; + ld.local.u8 %r4074, [%rd168+200]; + prmt.b32 %r4075, %r4074, %r4073, 1620; + ld.local.u8 %r4076, [%rd168+201]; + ld.local.u8 %r4077, [%rd168+202]; + prmt.b32 %r4078, %r4077, %r4076, 30212; + ld.local.u8 %r4079, [%rd168+203]; + prmt.b32 %r4080, %r4079, %r4078, 28756; + ld.local.u8 %r4081, [%rd168+204]; + prmt.b32 %r4082, %r4081, %r4080, 1620; + ld.local.u8 %r4083, [%rd168+205]; + ld.local.u8 %r4084, [%rd168+206]; + prmt.b32 %r4085, %r4084, %r4083, 30212; + ld.local.u8 %r4086, [%rd168+207]; + prmt.b32 %r4087, %r4086, %r4085, 28756; + ld.local.u8 %r4088, [%rd168+208]; + prmt.b32 %r4089, %r4088, %r4087, 1620; + or.b32 %r4090, %r3977, 4; + ld.local.u8 %r4091, [%rd3+-120]; + ld.local.u8 %r4092, [%rd3+-119]; + prmt.b32 %r4093, %r4092, %r4091, 30212; + ld.local.u8 %r4094, [%rd3+-118]; + ld.local.u8 %r4095, [%rd3+-117]; + prmt.b32 %r4096, %r4095, %r4094, 30212; + prmt.b32 %r4097, %r4096, %r4093, 4180; + ld.local.u8 %r4098, [%rd3+-136]; + ld.local.u8 %r4099, [%rd3+-135]; + prmt.b32 %r4100, %r4099, %r4098, 30212; + ld.local.u8 %r4101, [%rd3+-134]; + ld.local.u8 %r4102, [%rd3+-133]; + prmt.b32 %r4103, %r4102, %r4101, 30212; + prmt.b32 %r4104, %r4103, %r4100, 4180; + add.s32 %r4105, %r4097, %r4104; + add.s32 %r4106, %r4105, %r3984; + shf.l.wrap.b32 %r4107, %r4106, %r4106, 16; + add.s32 %r4108, %r4107, 1779033703; + xor.b32 %r4109, %r4108, %r4097; + shf.l.wrap.b32 %r4110, %r4109, %r4109, 20; + add.s32 %r4111, %r3991, %r4106; + add.s32 %r4112, %r4111, %r4110; + xor.b32 %r4113, %r4112, %r4107; + shf.l.wrap.b32 %r4114, %r4113, %r4113, 24; + add.s32 %r4115, %r4114, %r4108; + xor.b32 %r4116, %r4115, %r4110; + shf.l.wrap.b32 %r4117, %r4116, %r4116, 25; + ld.local.u8 %r4118, [%rd3+-116]; + ld.local.u8 %r4119, [%rd3+-115]; + prmt.b32 %r4120, %r4119, %r4118, 30212; + ld.local.u8 %r4121, [%rd3+-114]; + ld.local.u8 %r4122, [%rd3+-113]; + prmt.b32 %r4123, %r4122, %r4121, 30212; + prmt.b32 %r4124, %r4123, %r4120, 4180; + ld.local.u8 %r4125, [%rd3+-132]; + ld.local.u8 %r4126, [%rd3+-131]; + prmt.b32 %r4127, %r4126, %r4125, 30212; + ld.local.u8 %r4128, [%rd3+-130]; + ld.local.u8 %r4129, [%rd3+-129]; + prmt.b32 %r4130, %r4129, %r4128, 30212; + prmt.b32 %r4131, %r4130, %r4127, 4180; + add.s32 %r4132, %r4124, %r4131; + add.s32 %r4133, %r4132, %r3998; + shf.l.wrap.b32 %r4134, %r4133, %r4133, 16; + add.s32 %r4135, %r4134, -1150833019; + xor.b32 %r4136, %r4135, %r4124; + shf.l.wrap.b32 %r4137, %r4136, %r4136, 20; + add.s32 %r4138, %r4005, %r4133; + add.s32 %r4139, %r4138, %r4137; + xor.b32 %r4140, %r4139, %r4134; + shf.l.wrap.b32 %r4141, %r4140, %r4140, 24; + add.s32 %r4142, %r4141, %r4135; + xor.b32 %r4143, %r4142, %r4137; + shf.l.wrap.b32 %r4144, %r4143, %r4143, 25; + ld.local.u8 %r4145, [%rd3+-112]; + ld.local.u8 %r4146, [%rd3+-111]; + prmt.b32 %r4147, %r4146, %r4145, 30212; + ld.local.u8 %r4148, [%rd3+-110]; + ld.local.u8 %r4149, [%rd3+-109]; + prmt.b32 %r4150, %r4149, %r4148, 30212; + prmt.b32 %r4151, %r4150, %r4147, 4180; + ld.local.u8 %r4152, [%rd3+-128]; + ld.local.u8 %r4153, [%rd3+-127]; + prmt.b32 %r4154, %r4153, %r4152, 30212; + ld.local.u8 
%r4155, [%rd3+-126]; + ld.local.u8 %r4156, [%rd3+-125]; + prmt.b32 %r4157, %r4156, %r4155, 30212; + prmt.b32 %r4158, %r4157, %r4154, 4180; + add.s32 %r4159, %r4151, %r4158; + add.s32 %r4160, %r4159, %r4012; + shr.u32 %r4161, %r4160, 16; + shl.b32 %r4162, %r4160, 16; + xor.b32 %r4163, %r4162, 4194304; + or.b32 %r4164, %r4163, %r4161; + add.s32 %r4165, %r4164, 1013904242; + xor.b32 %r4166, %r4165, %r4151; + shf.l.wrap.b32 %r4167, %r4166, %r4166, 20; + add.s32 %r4168, %r4019, %r4160; + add.s32 %r4169, %r4168, %r4167; + xor.b32 %r4170, %r4169, %r4164; + shf.l.wrap.b32 %r4171, %r4170, %r4170, 24; + add.s32 %r4172, %r4171, %r4165; + xor.b32 %r4173, %r4172, %r4167; + shf.l.wrap.b32 %r4174, %r4173, %r4173, 25; + ld.local.u8 %r4175, [%rd3+-108]; + ld.local.u8 %r4176, [%rd3+-107]; + prmt.b32 %r4177, %r4176, %r4175, 30212; + ld.local.u8 %r4178, [%rd3+-106]; + ld.local.u8 %r4179, [%rd3+-105]; + prmt.b32 %r4180, %r4179, %r4178, 30212; + prmt.b32 %r4181, %r4180, %r4177, 4180; + ld.local.u8 %r4182, [%rd3+-124]; + ld.local.u8 %r4183, [%rd3+-123]; + prmt.b32 %r4184, %r4183, %r4182, 30212; + ld.local.u8 %r4185, [%rd3+-122]; + ld.local.u8 %r4186, [%rd3+-121]; + prmt.b32 %r4187, %r4186, %r4185, 30212; + prmt.b32 %r4188, %r4187, %r4184, 4180; + add.s32 %r4189, %r4181, %r4188; + add.s32 %r4190, %r4189, %r4026; + xor.b32 %r4191, %r4190, %r4090; + shr.u32 %r4192, %r4190, 16; + shl.b32 %r4193, %r4191, 16; + or.b32 %r4194, %r4193, %r4192; + add.s32 %r4195, %r4194, -1521486534; + xor.b32 %r4196, %r4195, %r4181; + shf.l.wrap.b32 %r4197, %r4196, %r4196, 20; + add.s32 %r4198, %r4033, %r4190; + add.s32 %r4199, %r4198, %r4197; + xor.b32 %r4200, %r4199, %r4194; + shf.l.wrap.b32 %r4201, %r4200, %r4200, 24; + add.s32 %r4202, %r4201, %r4195; + xor.b32 %r4203, %r4202, %r4197; + shf.l.wrap.b32 %r4204, %r4203, %r4203, 25; + add.s32 %r4205, %r4144, %r4112; + add.s32 %r4206, %r4205, %r4040; + xor.b32 %r4207, %r4201, %r4206; + shf.l.wrap.b32 %r4208, %r4207, %r4207, 16; + add.s32 %r4209, %r4208, %r4172; + xor.b32 %r4210, %r4209, %r4144; + shf.l.wrap.b32 %r4211, %r4210, %r4210, 20; + add.s32 %r4212, %r4047, %r4206; + add.s32 %r4213, %r4212, %r4211; + xor.b32 %r4214, %r4213, %r4208; + shf.l.wrap.b32 %r4215, %r4214, %r4214, 24; + add.s32 %r4216, %r4215, %r4209; + xor.b32 %r4217, %r4216, %r4211; + shf.l.wrap.b32 %r4218, %r4217, %r4217, 25; + add.s32 %r4219, %r4174, %r4139; + add.s32 %r4220, %r4219, %r4054; + xor.b32 %r4221, %r4220, %r4114; + shf.l.wrap.b32 %r4222, %r4221, %r4221, 16; + add.s32 %r4223, %r4222, %r4202; + xor.b32 %r4224, %r4223, %r4174; + shf.l.wrap.b32 %r4225, %r4224, %r4224, 20; + add.s32 %r4226, %r4061, %r4220; + add.s32 %r4227, %r4226, %r4225; + xor.b32 %r4228, %r4227, %r4222; + shf.l.wrap.b32 %r4229, %r4228, %r4228, 24; + add.s32 %r4230, %r4229, %r4223; + xor.b32 %r4231, %r4230, %r4225; + shf.l.wrap.b32 %r4232, %r4231, %r4231, 25; + add.s32 %r4233, %r4204, %r4169; + add.s32 %r4234, %r4233, %r4068; + xor.b32 %r4235, %r4234, %r4141; + shf.l.wrap.b32 %r4236, %r4235, %r4235, 16; + add.s32 %r4237, %r4236, %r4115; + xor.b32 %r4238, %r4237, %r4204; + shf.l.wrap.b32 %r4239, %r4238, %r4238, 20; + add.s32 %r4240, %r4075, %r4234; + add.s32 %r4241, %r4240, %r4239; + xor.b32 %r4242, %r4241, %r4236; + shf.l.wrap.b32 %r4243, %r4242, %r4242, 24; + add.s32 %r4244, %r4243, %r4237; + xor.b32 %r4245, %r4244, %r4239; + shf.l.wrap.b32 %r4246, %r4245, %r4245, 25; + add.s32 %r4247, %r4199, %r4117; + add.s32 %r4248, %r4247, %r4082; + xor.b32 %r4249, %r4248, %r4171; + shf.l.wrap.b32 %r4250, %r4249, %r4249, 16; + add.s32 %r4251, %r4250, 
%r4142; + xor.b32 %r4252, %r4251, %r4117; + shf.l.wrap.b32 %r4253, %r4252, %r4252, 20; + add.s32 %r4254, %r4089, %r4248; + add.s32 %r4255, %r4254, %r4253; + xor.b32 %r4256, %r4255, %r4250; + shf.l.wrap.b32 %r4257, %r4256, %r4256, 24; + add.s32 %r4258, %r4257, %r4251; + xor.b32 %r4259, %r4258, %r4253; + shf.l.wrap.b32 %r4260, %r4259, %r4259, 25; + add.s32 %r4261, %r4213, %r3998; + add.s32 %r4262, %r4261, %r4260; + xor.b32 %r4263, %r4262, %r4229; + shf.l.wrap.b32 %r4264, %r4263, %r4263, 16; + add.s32 %r4265, %r4264, %r4244; + xor.b32 %r4266, %r4265, %r4260; + shf.l.wrap.b32 %r4267, %r4266, %r4266, 20; + add.s32 %r4268, %r4262, %r4026; + add.s32 %r4269, %r4268, %r4267; + xor.b32 %r4270, %r4269, %r4264; + shf.l.wrap.b32 %r4271, %r4270, %r4270, 24; + add.s32 %r4272, %r4271, %r4265; + xor.b32 %r4273, %r4272, %r4267; + shf.l.wrap.b32 %r4274, %r4273, %r4273, 25; + add.s32 %r4275, %r4227, %r4005; + add.s32 %r4276, %r4275, %r4218; + xor.b32 %r4277, %r4243, %r4276; + shf.l.wrap.b32 %r4278, %r4277, %r4277, 16; + add.s32 %r4279, %r4258, %r4278; + xor.b32 %r4280, %r4279, %r4218; + shf.l.wrap.b32 %r4281, %r4280, %r4280, 20; + add.s32 %r4282, %r4276, %r4054; + add.s32 %r4283, %r4282, %r4281; + xor.b32 %r4284, %r4283, %r4278; + shf.l.wrap.b32 %r4285, %r4284, %r4284, 24; + add.s32 %r4286, %r4285, %r4279; + xor.b32 %r4287, %r4286, %r4281; + shf.l.wrap.b32 %r4288, %r4287, %r4287, 25; + add.s32 %r4289, %r4232, %r4033; + add.s32 %r4290, %r4289, %r4241; + xor.b32 %r4291, %r4257, %r4290; + shf.l.wrap.b32 %r4292, %r4291, %r4291, 16; + add.s32 %r4293, %r4292, %r4216; + xor.b32 %r4294, %r4293, %r4232; + shf.l.wrap.b32 %r4295, %r4294, %r4294, 20; + add.s32 %r4296, %r4290, %r3984; + add.s32 %r4297, %r4296, %r4295; + xor.b32 %r4298, %r4297, %r4292; + shf.l.wrap.b32 %r4299, %r4298, %r4298, 24; + add.s32 %r4300, %r4299, %r4293; + xor.b32 %r4301, %r4300, %r4295; + shf.l.wrap.b32 %r4302, %r4301, %r4301, 25; + add.s32 %r4303, %r4246, %r4012; + add.s32 %r4304, %r4303, %r4255; + xor.b32 %r4305, %r4304, %r4215; + shf.l.wrap.b32 %r4306, %r4305, %r4305, 16; + add.s32 %r4307, %r4306, %r4230; + xor.b32 %r4308, %r4307, %r4246; + shf.l.wrap.b32 %r4309, %r4308, %r4308, 20; + add.s32 %r4310, %r4304, %r4075; + add.s32 %r4311, %r4310, %r4309; + xor.b32 %r4312, %r4311, %r4306; + shf.l.wrap.b32 %r4313, %r4312, %r4312, 24; + add.s32 %r4314, %r4313, %r4307; + xor.b32 %r4315, %r4314, %r4309; + shf.l.wrap.b32 %r4316, %r4315, %r4315, 25; + add.s32 %r4317, %r4288, %r3991; + add.s32 %r4318, %r4317, %r4269; + xor.b32 %r4319, %r4318, %r4313; + shf.l.wrap.b32 %r4320, %r4319, %r4319, 16; + add.s32 %r4321, %r4320, %r4300; + xor.b32 %r4322, %r4321, %r4288; + shf.l.wrap.b32 %r4323, %r4322, %r4322, 20; + add.s32 %r4324, %r4318, %r4061; + add.s32 %r4325, %r4324, %r4323; + xor.b32 %r4326, %r4325, %r4320; + shf.l.wrap.b32 %r4327, %r4326, %r4326, 24; + add.s32 %r4328, %r4327, %r4321; + xor.b32 %r4329, %r4328, %r4323; + shf.l.wrap.b32 %r4330, %r4329, %r4329, 25; + add.s32 %r4331, %r4283, %r4068; + add.s32 %r4332, %r4331, %r4302; + xor.b32 %r4333, %r4271, %r4332; + shf.l.wrap.b32 %r4334, %r4333, %r4333, 16; + add.s32 %r4335, %r4334, %r4314; + xor.b32 %r4336, %r4335, %r4302; + shf.l.wrap.b32 %r4337, %r4336, %r4336, 20; + add.s32 %r4338, %r4332, %r4019; + add.s32 %r4339, %r4338, %r4337; + xor.b32 %r4340, %r4339, %r4334; + shf.l.wrap.b32 %r4341, %r4340, %r4340, 24; + add.s32 %r4342, %r4341, %r4335; + xor.b32 %r4343, %r4342, %r4337; + shf.l.wrap.b32 %r4344, %r4343, %r4343, 25; + add.s32 %r4345, %r4297, %r4047; + add.s32 %r4346, %r4345, %r4316; + xor.b32 %r4347, 
%r4346, %r4285; + shf.l.wrap.b32 %r4348, %r4347, %r4347, 16; + add.s32 %r4349, %r4348, %r4272; + xor.b32 %r4350, %r4349, %r4316; + shf.l.wrap.b32 %r4351, %r4350, %r4350, 20; + add.s32 %r4352, %r4346, %r4082; + add.s32 %r4353, %r4352, %r4351; + xor.b32 %r4354, %r4353, %r4348; + shf.l.wrap.b32 %r4355, %r4354, %r4354, 24; + add.s32 %r4356, %r4355, %r4349; + xor.b32 %r4357, %r4356, %r4351; + shf.l.wrap.b32 %r4358, %r4357, %r4357, 25; + add.s32 %r4359, %r4311, %r4089; + add.s32 %r4360, %r4359, %r4274; + xor.b32 %r4361, %r4360, %r4299; + shf.l.wrap.b32 %r4362, %r4361, %r4361, 16; + add.s32 %r4363, %r4362, %r4286; + xor.b32 %r4364, %r4363, %r4274; + shf.l.wrap.b32 %r4365, %r4364, %r4364, 20; + add.s32 %r4366, %r4360, %r4040; + add.s32 %r4367, %r4366, %r4365; + xor.b32 %r4368, %r4367, %r4362; + shf.l.wrap.b32 %r4369, %r4368, %r4368, 24; + add.s32 %r4370, %r4369, %r4363; + xor.b32 %r4371, %r4370, %r4365; + shf.l.wrap.b32 %r4372, %r4371, %r4371, 25; + add.s32 %r4373, %r4325, %r4005; + add.s32 %r4374, %r4373, %r4372; + xor.b32 %r4375, %r4374, %r4341; + shf.l.wrap.b32 %r4376, %r4375, %r4375, 16; + add.s32 %r4377, %r4376, %r4356; + xor.b32 %r4378, %r4377, %r4372; + shf.l.wrap.b32 %r4379, %r4378, %r4378, 20; + add.s32 %r4380, %r4374, %r4012; + add.s32 %r4381, %r4380, %r4379; + xor.b32 %r4382, %r4381, %r4376; + shf.l.wrap.b32 %r4383, %r4382, %r4382, 24; + add.s32 %r4384, %r4383, %r4377; + xor.b32 %r4385, %r4384, %r4379; + shf.l.wrap.b32 %r4386, %r4385, %r4385, 25; + add.s32 %r4387, %r4339, %r4054; + add.s32 %r4388, %r4387, %r4330; + xor.b32 %r4389, %r4388, %r4355; + shf.l.wrap.b32 %r4390, %r4389, %r4389, 16; + add.s32 %r4391, %r4390, %r4370; + xor.b32 %r4392, %r4391, %r4330; + shf.l.wrap.b32 %r4393, %r4392, %r4392, 20; + add.s32 %r4394, %r4388, %r4068; + add.s32 %r4395, %r4394, %r4393; + xor.b32 %r4396, %r4395, %r4390; + shf.l.wrap.b32 %r4397, %r4396, %r4396, 24; + add.s32 %r4398, %r4397, %r4391; + xor.b32 %r4399, %r4398, %r4393; + shf.l.wrap.b32 %r4400, %r4399, %r4399, 25; + add.s32 %r4401, %r4353, %r4075; + add.s32 %r4402, %r4401, %r4344; + xor.b32 %r4403, %r4369, %r4402; + shf.l.wrap.b32 %r4404, %r4403, %r4403, 16; + add.s32 %r4405, %r4404, %r4328; + xor.b32 %r4406, %r4405, %r4344; + shf.l.wrap.b32 %r4407, %r4406, %r4406, 20; + add.s32 %r4408, %r4402, %r3998; + add.s32 %r4409, %r4408, %r4407; + xor.b32 %r4410, %r4409, %r4404; + shf.l.wrap.b32 %r4411, %r4410, %r4410, 24; + add.s32 %r4412, %r4411, %r4405; + xor.b32 %r4413, %r4412, %r4407; + shf.l.wrap.b32 %r4414, %r4413, %r4413, 25; + add.s32 %r4415, %r4358, %r4033; + add.s32 %r4416, %r4415, %r4367; + xor.b32 %r4417, %r4416, %r4327; + shf.l.wrap.b32 %r4418, %r4417, %r4417, 16; + add.s32 %r4419, %r4418, %r4342; + xor.b32 %r4420, %r4419, %r4358; + shf.l.wrap.b32 %r4421, %r4420, %r4420, 20; + add.s32 %r4422, %r4416, %r4082; + add.s32 %r4423, %r4422, %r4421; + xor.b32 %r4424, %r4423, %r4418; + shf.l.wrap.b32 %r4425, %r4424, %r4424, 24; + add.s32 %r4426, %r4425, %r4419; + xor.b32 %r4427, %r4426, %r4421; + shf.l.wrap.b32 %r4428, %r4427, %r4427, 25; + add.s32 %r4429, %r4381, %r4026; + add.s32 %r4430, %r4429, %r4400; + xor.b32 %r4431, %r4430, %r4425; + shf.l.wrap.b32 %r4432, %r4431, %r4431, 16; + add.s32 %r4433, %r4432, %r4412; + xor.b32 %r4434, %r4433, %r4400; + shf.l.wrap.b32 %r4435, %r4434, %r4434, 20; + add.s32 %r4436, %r4430, %r4019; + add.s32 %r4437, %r4436, %r4435; + xor.b32 %r4438, %r4437, %r4432; + shf.l.wrap.b32 %r4439, %r4438, %r4438, 24; + add.s32 %r4440, %r4439, %r4433; + xor.b32 %r4441, %r4440, %r4435; + shf.l.wrap.b32 %r4442, %r4441, %r4441, 25; 
+ add.s32 %r4443, %r4395, %r4047; + add.s32 %r4444, %r4443, %r4414; + xor.b32 %r4445, %r4383, %r4444; + shf.l.wrap.b32 %r4446, %r4445, %r4445, 16; + add.s32 %r4447, %r4446, %r4426; + xor.b32 %r4448, %r4447, %r4414; + shf.l.wrap.b32 %r4449, %r4448, %r4448, 20; + add.s32 %r4450, %r4444, %r3984; + add.s32 %r4451, %r4450, %r4449; + xor.b32 %r4452, %r4451, %r4446; + shf.l.wrap.b32 %r4453, %r4452, %r4452, 24; + add.s32 %r4454, %r4453, %r4447; + xor.b32 %r4455, %r4454, %r4449; + shf.l.wrap.b32 %r4456, %r4455, %r4455, 25; + add.s32 %r4457, %r4409, %r4061; + add.s32 %r4458, %r4457, %r4428; + xor.b32 %r4459, %r4458, %r4397; + shf.l.wrap.b32 %r4460, %r4459, %r4459, 16; + add.s32 %r4461, %r4460, %r4384; + xor.b32 %r4462, %r4461, %r4428; + shf.l.wrap.b32 %r4463, %r4462, %r4462, 20; + add.s32 %r4464, %r4458, %r4089; + add.s32 %r4465, %r4464, %r4463; + xor.b32 %r4466, %r4465, %r4460; + shf.l.wrap.b32 %r4467, %r4466, %r4466, 24; + add.s32 %r4468, %r4467, %r4461; + xor.b32 %r4469, %r4468, %r4463; + shf.l.wrap.b32 %r4470, %r4469, %r4469, 25; + add.s32 %r4471, %r4423, %r4040; + add.s32 %r4472, %r4471, %r4386; + xor.b32 %r4473, %r4472, %r4411; + shf.l.wrap.b32 %r4474, %r4473, %r4473, 16; + add.s32 %r4475, %r4474, %r4398; + xor.b32 %r4476, %r4475, %r4386; + shf.l.wrap.b32 %r4477, %r4476, %r4476, 20; + add.s32 %r4478, %r4472, %r3991; + add.s32 %r4479, %r4478, %r4477; + xor.b32 %r4480, %r4479, %r4474; + shf.l.wrap.b32 %r4481, %r4480, %r4480, 24; + add.s32 %r4482, %r4481, %r4475; + xor.b32 %r4483, %r4482, %r4477; + shf.l.wrap.b32 %r4484, %r4483, %r4483, 25; + add.s32 %r4485, %r4437, %r4054; + add.s32 %r4486, %r4485, %r4484; + xor.b32 %r4487, %r4486, %r4453; + shf.l.wrap.b32 %r4488, %r4487, %r4487, 16; + add.s32 %r4489, %r4488, %r4468; + xor.b32 %r4490, %r4489, %r4484; + shf.l.wrap.b32 %r4491, %r4490, %r4490, 20; + add.s32 %r4492, %r4486, %r4033; + add.s32 %r4493, %r4492, %r4491; + xor.b32 %r4494, %r4493, %r4488; + shf.l.wrap.b32 %r4495, %r4494, %r4494, 24; + add.s32 %r4496, %r4495, %r4489; + xor.b32 %r4497, %r4496, %r4491; + shf.l.wrap.b32 %r4498, %r4497, %r4497, 25; + add.s32 %r4499, %r4451, %r4068; + add.s32 %r4500, %r4499, %r4442; + xor.b32 %r4501, %r4500, %r4467; + shf.l.wrap.b32 %r4502, %r4501, %r4501, 16; + add.s32 %r4503, %r4502, %r4482; + xor.b32 %r4504, %r4503, %r4442; + shf.l.wrap.b32 %r4505, %r4504, %r4504, 20; + add.s32 %r4506, %r4500, %r4047; + add.s32 %r4507, %r4506, %r4505; + xor.b32 %r4508, %r4507, %r4502; + shf.l.wrap.b32 %r4509, %r4508, %r4508, 24; + add.s32 %r4510, %r4509, %r4503; + xor.b32 %r4511, %r4510, %r4505; + shf.l.wrap.b32 %r4512, %r4511, %r4511, 25; + add.s32 %r4513, %r4465, %r4082; + add.s32 %r4514, %r4513, %r4456; + xor.b32 %r4515, %r4481, %r4514; + shf.l.wrap.b32 %r4516, %r4515, %r4515, 16; + add.s32 %r4517, %r4516, %r4440; + xor.b32 %r4518, %r4517, %r4456; + shf.l.wrap.b32 %r4519, %r4518, %r4518, 20; + add.s32 %r4520, %r4514, %r4005; + add.s32 %r4521, %r4520, %r4519; + xor.b32 %r4522, %r4521, %r4516; + shf.l.wrap.b32 %r4523, %r4522, %r4522, 24; + add.s32 %r4524, %r4523, %r4517; + xor.b32 %r4525, %r4524, %r4519; + shf.l.wrap.b32 %r4526, %r4525, %r4525, 25; + add.s32 %r4527, %r4479, %r4075; + add.s32 %r4528, %r4527, %r4470; + xor.b32 %r4529, %r4528, %r4439; + shf.l.wrap.b32 %r4530, %r4529, %r4529, 16; + add.s32 %r4531, %r4530, %r4454; + xor.b32 %r4532, %r4531, %r4470; + shf.l.wrap.b32 %r4533, %r4532, %r4532, 20; + add.s32 %r4534, %r4528, %r4089; + add.s32 %r4535, %r4534, %r4533; + xor.b32 %r4536, %r4535, %r4530; + shf.l.wrap.b32 %r4537, %r4536, %r4536, 24; + add.s32 %r4538, %r4537, 
%r4531; + xor.b32 %r4539, %r4538, %r4533; + shf.l.wrap.b32 %r4540, %r4539, %r4539, 25; + add.s32 %r4541, %r4493, %r4012; + add.s32 %r4542, %r4541, %r4512; + xor.b32 %r4543, %r4542, %r4537; + shf.l.wrap.b32 %r4544, %r4543, %r4543, 16; + add.s32 %r4545, %r4544, %r4524; + xor.b32 %r4546, %r4545, %r4512; + shf.l.wrap.b32 %r4547, %r4546, %r4546, 20; + add.s32 %r4548, %r4542, %r3984; + add.s32 %r4549, %r4548, %r4547; + xor.b32 %r4550, %r4549, %r4544; + shf.l.wrap.b32 %r4551, %r4550, %r4550, 24; + add.s32 %r4552, %r4551, %r4545; + xor.b32 %r4553, %r4552, %r4547; + shf.l.wrap.b32 %r4554, %r4553, %r4553, 25; + add.s32 %r4555, %r4507, %r4061; + add.s32 %r4556, %r4555, %r4526; + xor.b32 %r4557, %r4495, %r4556; + shf.l.wrap.b32 %r4558, %r4557, %r4557, 16; + add.s32 %r4559, %r4558, %r4538; + xor.b32 %r4560, %r4559, %r4526; + shf.l.wrap.b32 %r4561, %r4560, %r4560, 20; + add.s32 %r4562, %r4556, %r3998; + add.s32 %r4563, %r4562, %r4561; + xor.b32 %r4564, %r4563, %r4558; + shf.l.wrap.b32 %r4565, %r4564, %r4564, 24; + add.s32 %r4566, %r4565, %r4559; + xor.b32 %r4567, %r4566, %r4561; + shf.l.wrap.b32 %r4568, %r4567, %r4567, 25; + add.s32 %r4569, %r4521, %r4019; + add.s32 %r4570, %r4569, %r4540; + xor.b32 %r4571, %r4570, %r4509; + shf.l.wrap.b32 %r4572, %r4571, %r4571, 16; + add.s32 %r4573, %r4572, %r4496; + xor.b32 %r4574, %r4573, %r4540; + shf.l.wrap.b32 %r4575, %r4574, %r4574, 20; + add.s32 %r4576, %r4570, %r4040; + add.s32 %r4577, %r4576, %r4575; + xor.b32 %r4578, %r4577, %r4572; + shf.l.wrap.b32 %r4579, %r4578, %r4578, 24; + add.s32 %r4580, %r4579, %r4573; + xor.b32 %r4581, %r4580, %r4575; + shf.l.wrap.b32 %r4582, %r4581, %r4581, 25; + add.s32 %r4583, %r4535, %r3991; + add.s32 %r4584, %r4583, %r4498; + xor.b32 %r4585, %r4584, %r4523; + shf.l.wrap.b32 %r4586, %r4585, %r4585, 16; + add.s32 %r4587, %r4586, %r4510; + xor.b32 %r4588, %r4587, %r4498; + shf.l.wrap.b32 %r4589, %r4588, %r4588, 20; + add.s32 %r4590, %r4584, %r4026; + add.s32 %r4591, %r4590, %r4589; + xor.b32 %r4592, %r4591, %r4586; + shf.l.wrap.b32 %r4593, %r4592, %r4592, 24; + add.s32 %r4594, %r4593, %r4587; + xor.b32 %r4595, %r4594, %r4589; + shf.l.wrap.b32 %r4596, %r4595, %r4595, 25; + add.s32 %r4597, %r4549, %r4068; + add.s32 %r4598, %r4597, %r4596; + xor.b32 %r4599, %r4598, %r4565; + shf.l.wrap.b32 %r4600, %r4599, %r4599, 16; + add.s32 %r4601, %r4600, %r4580; + xor.b32 %r4602, %r4601, %r4596; + shf.l.wrap.b32 %r4603, %r4602, %r4602, 20; + add.s32 %r4604, %r4598, %r4075; + add.s32 %r4605, %r4604, %r4603; + xor.b32 %r4606, %r4605, %r4600; + shf.l.wrap.b32 %r4607, %r4606, %r4606, 24; + add.s32 %r4608, %r4607, %r4601; + xor.b32 %r4609, %r4608, %r4603; + shf.l.wrap.b32 %r4610, %r4609, %r4609, 25; + add.s32 %r4611, %r4563, %r4047; + add.s32 %r4612, %r4611, %r4554; + xor.b32 %r4613, %r4612, %r4579; + shf.l.wrap.b32 %r4614, %r4613, %r4613, 16; + add.s32 %r4615, %r4614, %r4594; + xor.b32 %r4616, %r4615, %r4554; + shf.l.wrap.b32 %r4617, %r4616, %r4616, 20; + add.s32 %r4618, %r4612, %r4061; + add.s32 %r4619, %r4618, %r4617; + xor.b32 %r4620, %r4619, %r4614; + shf.l.wrap.b32 %r4621, %r4620, %r4620, 24; + add.s32 %r4622, %r4621, %r4615; + xor.b32 %r4623, %r4622, %r4617; + shf.l.wrap.b32 %r4624, %r4623, %r4623, 25; + add.s32 %r4625, %r4577, %r4089; + add.s32 %r4626, %r4625, %r4568; + xor.b32 %r4627, %r4593, %r4626; + shf.l.wrap.b32 %r4628, %r4627, %r4627, 16; + add.s32 %r4629, %r4628, %r4552; + xor.b32 %r4630, %r4629, %r4568; + shf.l.wrap.b32 %r4631, %r4630, %r4630, 20; + add.s32 %r4632, %r4626, %r4054; + add.s32 %r4633, %r4632, %r4631; + xor.b32 %r4634, 
%r4633, %r4628; + shf.l.wrap.b32 %r4635, %r4634, %r4634, 24; + add.s32 %r4636, %r4635, %r4629; + xor.b32 %r4637, %r4636, %r4631; + shf.l.wrap.b32 %r4638, %r4637, %r4637, 25; + add.s32 %r4639, %r4591, %r4082; + add.s32 %r4640, %r4639, %r4582; + xor.b32 %r4641, %r4640, %r4551; + shf.l.wrap.b32 %r4642, %r4641, %r4641, 16; + add.s32 %r4643, %r4642, %r4566; + xor.b32 %r4644, %r4643, %r4582; + shf.l.wrap.b32 %r4645, %r4644, %r4644, 20; + add.s32 %r4646, %r4640, %r4040; + add.s32 %r4647, %r4646, %r4645; + xor.b32 %r4648, %r4647, %r4642; + shf.l.wrap.b32 %r4649, %r4648, %r4648, 24; + add.s32 %r4650, %r4649, %r4643; + xor.b32 %r4651, %r4650, %r4645; + shf.l.wrap.b32 %r4652, %r4651, %r4651, 25; + add.s32 %r4653, %r4605, %r4033; + add.s32 %r4654, %r4653, %r4624; + xor.b32 %r4655, %r4654, %r4649; + shf.l.wrap.b32 %r4656, %r4655, %r4655, 16; + add.s32 %r4657, %r4656, %r4636; + xor.b32 %r4658, %r4657, %r4624; + shf.l.wrap.b32 %r4659, %r4658, %r4658, 20; + add.s32 %r4660, %r4654, %r3998; + add.s32 %r4661, %r4660, %r4659; + xor.b32 %r4662, %r4661, %r4656; + shf.l.wrap.b32 %r4663, %r4662, %r4662, 24; + add.s32 %r4664, %r4663, %r4657; + xor.b32 %r4665, %r4664, %r4659; + shf.l.wrap.b32 %r4666, %r4665, %r4665, 25; + add.s32 %r4667, %r4619, %r4019; + add.s32 %r4668, %r4667, %r4638; + xor.b32 %r4669, %r4607, %r4668; + shf.l.wrap.b32 %r4670, %r4669, %r4669, 16; + add.s32 %r4671, %r4670, %r4650; + xor.b32 %r4672, %r4671, %r4638; + shf.l.wrap.b32 %r4673, %r4672, %r4672, 20; + add.s32 %r4674, %r4668, %r4005; + add.s32 %r4675, %r4674, %r4673; + xor.b32 %r4676, %r4675, %r4670; + shf.l.wrap.b32 %r4677, %r4676, %r4676, 24; + add.s32 %r4678, %r4677, %r4671; + xor.b32 %r4679, %r4678, %r4673; + shf.l.wrap.b32 %r4680, %r4679, %r4679, 25; + add.s32 %r4681, %r4633, %r3984; + add.s32 %r4682, %r4681, %r4652; + xor.b32 %r4683, %r4682, %r4621; + shf.l.wrap.b32 %r4684, %r4683, %r4683, 16; + add.s32 %r4685, %r4684, %r4608; + xor.b32 %r4686, %r4685, %r4652; + shf.l.wrap.b32 %r4687, %r4686, %r4686, 20; + add.s32 %r4688, %r4682, %r3991; + add.s32 %r4689, %r4688, %r4687; + xor.b32 %r4690, %r4689, %r4684; + shf.l.wrap.b32 %r4691, %r4690, %r4690, 24; + add.s32 %r4692, %r4691, %r4685; + xor.b32 %r4693, %r4692, %r4687; + shf.l.wrap.b32 %r4694, %r4693, %r4693, 25; + add.s32 %r4695, %r4647, %r4026; + add.s32 %r4696, %r4695, %r4610; + xor.b32 %r4697, %r4696, %r4635; + shf.l.wrap.b32 %r4698, %r4697, %r4697, 16; + add.s32 %r4699, %r4698, %r4622; + xor.b32 %r4700, %r4699, %r4610; + shf.l.wrap.b32 %r4701, %r4700, %r4700, 20; + add.s32 %r4702, %r4696, %r4012; + add.s32 %r4703, %r4702, %r4701; + xor.b32 %r4704, %r4703, %r4698; + shf.l.wrap.b32 %r4705, %r4704, %r4704, 24; + add.s32 %r4706, %r4705, %r4699; + xor.b32 %r4707, %r4706, %r4701; + shf.l.wrap.b32 %r4708, %r4707, %r4707, 25; + add.s32 %r4709, %r4661, %r4047; + add.s32 %r4710, %r4709, %r4708; + xor.b32 %r4711, %r4710, %r4677; + shf.l.wrap.b32 %r4712, %r4711, %r4711, 16; + add.s32 %r4713, %r4712, %r4692; + xor.b32 %r4714, %r4713, %r4708; + shf.l.wrap.b32 %r4715, %r4714, %r4714, 20; + add.s32 %r4716, %r4710, %r4082; + add.s32 %r4717, %r4716, %r4715; + xor.b32 %r4718, %r4717, %r4712; + shf.l.wrap.b32 %r4719, %r4718, %r4718, 24; + add.s32 %r4720, %r4719, %r4713; + xor.b32 %r4721, %r4720, %r4715; + shf.l.wrap.b32 %r4722, %r4721, %r4721, 25; + add.s32 %r4723, %r4675, %r4061; + add.s32 %r4724, %r4723, %r4666; + xor.b32 %r4725, %r4724, %r4691; + shf.l.wrap.b32 %r4726, %r4725, %r4725, 16; + add.s32 %r4727, %r4726, %r4706; + xor.b32 %r4728, %r4727, %r4666; + shf.l.wrap.b32 %r4729, %r4728, %r4728, 20; 
+ add.s32 %r4730, %r4724, %r4019; + add.s32 %r4731, %r4730, %r4729; + xor.b32 %r4732, %r4731, %r4726; + shf.l.wrap.b32 %r4733, %r4732, %r4732, 24; + add.s32 %r4734, %r4733, %r4727; + xor.b32 %r4735, %r4734, %r4729; + shf.l.wrap.b32 %r4736, %r4735, %r4735, 25; + add.s32 %r4737, %r4689, %r4040; + add.s32 %r4738, %r4737, %r4680; + xor.b32 %r4739, %r4705, %r4738; + shf.l.wrap.b32 %r4740, %r4739, %r4739, 16; + add.s32 %r4741, %r4740, %r4664; + xor.b32 %r4742, %r4741, %r4680; + shf.l.wrap.b32 %r4743, %r4742, %r4742, 20; + add.s32 %r4744, %r4738, %r4068; + add.s32 %r4745, %r4744, %r4743; + xor.b32 %r4746, %r4745, %r4740; + shf.l.wrap.b32 %r4747, %r4746, %r4746, 24; + add.s32 %r4748, %r4747, %r4741; + xor.b32 %r4749, %r4748, %r4743; + shf.l.wrap.b32 %r4750, %r4749, %r4749, 25; + add.s32 %r4751, %r4703, %r4089; + add.s32 %r4752, %r4751, %r4694; + xor.b32 %r4753, %r4752, %r4663; + shf.l.wrap.b32 %r4754, %r4753, %r4753, 16; + add.s32 %r4755, %r4754, %r4678; + xor.b32 %r4756, %r4755, %r4694; + shf.l.wrap.b32 %r4757, %r4756, %r4756, 20; + add.s32 %r4758, %r4752, %r3991; + add.s32 %r4759, %r4758, %r4757; + xor.b32 %r4760, %r4759, %r4754; + shf.l.wrap.b32 %r4761, %r4760, %r4760, 24; + add.s32 %r4762, %r4761, %r4755; + xor.b32 %r4763, %r4762, %r4757; + shf.l.wrap.b32 %r4764, %r4763, %r4763, 25; + add.s32 %r4765, %r4717, %r4075; + add.s32 %r4766, %r4765, %r4736; + xor.b32 %r4767, %r4766, %r4761; + shf.l.wrap.b32 %r4768, %r4767, %r4767, 16; + add.s32 %r4769, %r4768, %r4748; + xor.b32 %r4770, %r4769, %r4736; + shf.l.wrap.b32 %r4771, %r4770, %r4770, 20; + add.s32 %r4772, %r4766, %r4005; + add.s32 %r4773, %r4772, %r4771; + xor.b32 %r4774, %r4773, %r4768; + shf.l.wrap.b32 %r4775, %r4774, %r4774, 24; + add.s32 %r4776, %r4775, %r4769; + xor.b32 %r4777, %r4776, %r4771; + shf.l.wrap.b32 %r4778, %r4777, %r4777, 25; + add.s32 %r4779, %r4731, %r3984; + add.s32 %r4780, %r4779, %r4750; + xor.b32 %r4781, %r4719, %r4780; + shf.l.wrap.b32 %r4782, %r4781, %r4781, 16; + add.s32 %r4783, %r4782, %r4762; + xor.b32 %r4784, %r4783, %r4750; + shf.l.wrap.b32 %r4785, %r4784, %r4784, 20; + add.s32 %r4786, %r4780, %r4054; + add.s32 %r4787, %r4786, %r4785; + xor.b32 %r4788, %r4787, %r4782; + shf.l.wrap.b32 %r4789, %r4788, %r4788, 24; + add.s32 %r4790, %r4789, %r4783; + xor.b32 %r4791, %r4790, %r4785; + shf.l.wrap.b32 %r4792, %r4791, %r4791, 25; + add.s32 %r4793, %r4745, %r3998; + add.s32 %r4794, %r4793, %r4764; + xor.b32 %r4795, %r4794, %r4733; + shf.l.wrap.b32 %r4796, %r4795, %r4795, 16; + add.s32 %r4797, %r4796, %r4720; + xor.b32 %r4798, %r4797, %r4764; + shf.l.wrap.b32 %r4799, %r4798, %r4798, 20; + add.s32 %r4800, %r4794, %r4026; + add.s32 %r4801, %r4800, %r4799; + xor.b32 %r4802, %r4801, %r4796; + shf.l.wrap.b32 %r4803, %r4802, %r4802, 24; + add.s32 %r4804, %r4803, %r4797; + xor.b32 %r4805, %r4804, %r4799; + shf.l.wrap.b32 %r4806, %r4805, %r4805, 25; + add.s32 %r4807, %r4759, %r4012; + add.s32 %r4808, %r4807, %r4722; + xor.b32 %r4809, %r4808, %r4747; + shf.l.wrap.b32 %r4810, %r4809, %r4809, 16; + add.s32 %r4811, %r4810, %r4734; + xor.b32 %r4812, %r4811, %r4722; + shf.l.wrap.b32 %r4813, %r4812, %r4812, 20; + add.s32 %r4814, %r4808, %r4033; + add.s32 %r4815, %r4814, %r4813; + xor.b32 %r4816, %r4815, %r4810; + shf.l.wrap.b32 %r4817, %r4816, %r4816, 24; + add.s32 %r4818, %r4817, %r4811; + xor.b32 %r4819, %r4818, %r4813; + shf.l.wrap.b32 %r4820, %r4819, %r4819, 25; + add.s32 %r4821, %r4773, %r4061; + add.s32 %r4822, %r4821, %r4820; + xor.b32 %r4823, %r4822, %r4789; + shf.l.wrap.b32 %r4824, %r4823, %r4823, 16; + add.s32 %r4825, %r4824, 
%r4804; + xor.b32 %r4826, %r4825, %r4820; + shf.l.wrap.b32 %r4827, %r4826, %r4826, 20; + add.s32 %r4828, %r4822, %r4089; + add.s32 %r4829, %r4828, %r4827; + xor.b32 %r4830, %r4829, %r4824; + shf.l.wrap.b32 %r4831, %r4830, %r4830, 24; + add.s32 %r4832, %r4831, %r4825; + xor.b32 %r4833, %r4832, %r4827; + shf.l.wrap.b32 %r4834, %r4833, %r4833, 25; + add.s32 %r4835, %r4787, %r4019; + add.s32 %r4836, %r4835, %r4778; + xor.b32 %r4837, %r4836, %r4803; + shf.l.wrap.b32 %r4838, %r4837, %r4837, 16; + add.s32 %r4839, %r4838, %r4818; + xor.b32 %r4840, %r4839, %r4778; + shf.l.wrap.b32 %r4841, %r4840, %r4840, 20; + add.s32 %r4842, %r4836, %r3984; + add.s32 %r4843, %r4842, %r4841; + xor.b32 %r4844, %r4843, %r4838; + shf.l.wrap.b32 %r4845, %r4844, %r4844, 24; + add.s32 %r4846, %r4845, %r4839; + xor.b32 %r4847, %r4846, %r4841; + shf.l.wrap.b32 %r4848, %r4847, %r4847, 25; + add.s32 %r4849, %r4801, %r3991; + add.s32 %r4850, %r4849, %r4792; + xor.b32 %r4851, %r4817, %r4850; + shf.l.wrap.b32 %r4852, %r4851, %r4851, 16; + add.s32 %r4853, %r4852, %r4776; + xor.b32 %r4854, %r4853, %r4792; + shf.l.wrap.b32 %r4855, %r4854, %r4854, 20; + add.s32 %r4856, %r4850, %r4047; + add.s32 %r4857, %r4856, %r4855; + xor.b32 %r4858, %r4857, %r4852; + shf.l.wrap.b32 %r4859, %r4858, %r4858, 24; + add.s32 %r4860, %r4859, %r4853; + xor.b32 %r4861, %r4860, %r4855; + shf.l.wrap.b32 %r4862, %r4861, %r4861, 25; + add.s32 %r4863, %r4815, %r4040; + add.s32 %r4864, %r4863, %r4806; + xor.b32 %r4865, %r4864, %r4775; + shf.l.wrap.b32 %r4866, %r4865, %r4865, 16; + add.s32 %r4867, %r4866, %r4790; + xor.b32 %r4868, %r4867, %r4806; + shf.l.wrap.b32 %r4869, %r4868, %r4868, 20; + add.s32 %r4870, %r4864, %r4026; + add.s32 %r4871, %r4870, %r4869; + xor.b32 %r4872, %r4871, %r4866; + shf.l.wrap.b32 %r4873, %r4872, %r4872, 24; + add.s32 %r4874, %r4873, %r4867; + xor.b32 %r4875, %r4874, %r4869; + shf.l.wrap.b32 %r4876, %r4875, %r4875, 25; + add.s32 %r4877, %r4829, %r4082; + add.s32 %r4878, %r4877, %r4848; + xor.b32 %r4879, %r4878, %r4873; + shf.l.wrap.b32 %r4880, %r4879, %r4879, 16; + add.s32 %r4881, %r4880, %r4860; + xor.b32 %r4882, %r4881, %r4848; + shf.l.wrap.b32 %r4883, %r4882, %r4882, 20; + add.s32 %r4884, %r4878, %r4054; + add.s32 %r4885, %r4884, %r4883; + xor.b32 %r4886, %r4885, %r4880; + shf.l.wrap.b32 %r4887, %r4886, %r4886, 24; + add.s32 %r4888, %r4887, %r4881; + xor.b32 %r4889, %r4888, %r4883; + shf.l.wrap.b32 %r4890, %r4889, %r4889, 25; + add.s32 %r4891, %r4843, %r3998; + add.s32 %r4892, %r4891, %r4862; + xor.b32 %r4893, %r4831, %r4892; + shf.l.wrap.b32 %r4894, %r4893, %r4893, 16; + add.s32 %r4895, %r4894, %r4874; + xor.b32 %r4896, %r4895, %r4862; + shf.l.wrap.b32 %r4897, %r4896, %r4896, 20; + add.s32 %r4898, %r4892, %r4068; + add.s32 %r4899, %r4898, %r4897; + xor.b32 %r4900, %r4899, %r4894; + shf.l.wrap.b32 %r4901, %r4900, %r4900, 24; + add.s32 %r4902, %r4901, %r4895; + xor.b32 %r4903, %r4902, %r4897; + shf.l.wrap.b32 %r4904, %r4903, %r4903, 25; + add.s32 %r4905, %r4857, %r4005; + add.s32 %r4906, %r4905, %r4876; + xor.b32 %r4907, %r4906, %r4845; + shf.l.wrap.b32 %r4908, %r4907, %r4907, 16; + add.s32 %r4909, %r4908, %r4832; + xor.b32 %r4910, %r4909, %r4876; + shf.l.wrap.b32 %r4911, %r4910, %r4910, 20; + add.s32 %r4912, %r4906, %r4012; + add.s32 %r4913, %r4912, %r4911; + xor.b32 %r4914, %r4913, %r4908; + shf.l.wrap.b32 %r4915, %r4914, %r4914, 24; + add.s32 %r4916, %r4915, %r4909; + xor.b32 %r4917, %r4916, %r4911; + shf.l.wrap.b32 %r4918, %r4917, %r4917, 25; + add.s32 %r4919, %r4871, %r4033; + add.s32 %r4920, %r4919, %r4834; + xor.b32 %r4921, 
%r4920, %r4859; + shf.l.wrap.b32 %r4922, %r4921, %r4921, 16; + add.s32 %r4923, %r4922, %r4846; + xor.b32 %r4924, %r4923, %r4834; + shf.l.wrap.b32 %r4925, %r4924, %r4924, 20; + add.s32 %r4926, %r4920, %r4075; + add.s32 %r4927, %r4926, %r4925; + xor.b32 %r4928, %r4927, %r4922; + shf.l.wrap.b32 %r4929, %r4928, %r4928, 24; + add.s32 %r4930, %r4929, %r4923; + xor.b32 %r4931, %r4930, %r4925; + shf.l.wrap.b32 %r4932, %r4931, %r4931, 25; + xor.b32 %r4933, %r4916, %r4885; + xor.b32 %r4934, %r4930, %r4899; + xor.b32 %r4935, %r4888, %r4913; + xor.b32 %r4936, %r4927, %r4902; + xor.b32 %r4937, %r4932, %r4901; + xor.b32 %r4938, %r4890, %r4915; + xor.b32 %r4939, %r4929, %r4904; + xor.b32 %r4940, %r4918, %r4887; + st.local.u8 [%rd168+145], %r4933; + shr.u32 %r4941, %r4933, 8; + st.local.u8 [%rd168+146], %r4941; + shr.u32 %r4942, %r4933, 16; + st.local.u8 [%rd168+147], %r4942; + shr.u32 %r4943, %r4933, 24; + st.local.u8 [%rd168+148], %r4943; + st.local.u8 [%rd168+149], %r4934; + shr.u32 %r4944, %r4934, 8; + st.local.u8 [%rd168+150], %r4944; + shr.u32 %r4945, %r4934, 16; + st.local.u8 [%rd168+151], %r4945; + shr.u32 %r4946, %r4934, 24; + st.local.u8 [%rd168+152], %r4946; + st.local.u8 [%rd168+153], %r4935; + shr.u32 %r4947, %r4935, 8; + st.local.u8 [%rd168+154], %r4947; + shr.u32 %r4948, %r4935, 16; + st.local.u8 [%rd168+155], %r4948; + shr.u32 %r4949, %r4935, 24; + st.local.u8 [%rd168+156], %r4949; + st.local.u8 [%rd168+157], %r4936; + shr.u32 %r4950, %r4936, 8; + st.local.u8 [%rd168+158], %r4950; + shr.u32 %r4951, %r4936, 16; + st.local.u8 [%rd168+159], %r4951; + shr.u32 %r4952, %r4936, 24; + st.local.u8 [%rd168+160], %r4952; + st.local.u8 [%rd168+161], %r4937; + shr.u32 %r4953, %r4937, 8; + st.local.u8 [%rd168+162], %r4953; + shr.u32 %r4954, %r4937, 16; + st.local.u8 [%rd168+163], %r4954; + shr.u32 %r4955, %r4937, 24; + st.local.u8 [%rd168+164], %r4955; + st.local.u8 [%rd168+165], %r4938; + shr.u32 %r4956, %r4938, 8; + st.local.u8 [%rd168+166], %r4956; + shr.u32 %r4957, %r4938, 16; + st.local.u8 [%rd168+167], %r4957; + shr.u32 %r4958, %r4938, 24; + st.local.u8 [%rd168+168], %r4958; + st.local.u8 [%rd168+169], %r4939; + shr.u32 %r4959, %r4939, 8; + st.local.u8 [%rd168+170], %r4959; + shr.u32 %r4960, %r4939, 16; + st.local.u8 [%rd168+171], %r4960; + shr.u32 %r4961, %r4939, 24; + st.local.u8 [%rd168+172], %r4961; + st.local.u8 [%rd168+173], %r4940; + shr.u32 %r4962, %r4940, 8; + st.local.u8 [%rd168+174], %r4962; + shr.u32 %r4963, %r4940, 16; + st.local.u8 [%rd168+175], %r4963; + shr.u32 %r4964, %r4940, 24; + st.local.u8 [%rd168+176], %r4964; + ld.local.u8 %rs138, [%rd3+8]; + add.s16 %rs139, %rs138, -1; + st.local.u8 [%rd3+8], %rs139; + cvt.u64.u16 %rd169, %rs139; + and.b64 %rd170, %rd169, 255; + setp.lt.u64 %p28, %rd226, %rd170; + and.b16 %rs140, %rs139, 255; + mul.wide.u16 %r11661, %rs140, 32; + @%p28 bra $L__BB1_31; + +$L__BB1_32: + cvt.s64.s32 %rd171, %r11661; + add.s64 %rd172, %rd2, %rd171; + mov.b32 {%rs141, %rs142}, %r3959; + st.local.u8 [%rd172+145], %rs141; + shr.u16 %rs143, %rs141, 8; + st.local.u8 [%rd172+146], %rs143; + st.local.u8 [%rd172+147], %rs142; + shr.u16 %rs144, %rs142, 8; + st.local.u8 [%rd172+148], %rs144; + mov.b32 {%rs145, %rs146}, %r3960; + st.local.u8 [%rd172+149], %rs145; + shr.u16 %rs147, %rs145, 8; + st.local.u8 [%rd172+150], %rs147; + st.local.u8 [%rd172+151], %rs146; + shr.u16 %rs148, %rs146, 8; + st.local.u8 [%rd172+152], %rs148; + mov.b32 {%rs149, %rs150}, %r3961; + st.local.u8 [%rd172+153], %rs149; + shr.u16 %rs151, %rs149, 8; + st.local.u8 [%rd172+154], %rs151; + 
st.local.u8 [%rd172+155], %rs150; + shr.u16 %rs152, %rs150, 8; + st.local.u8 [%rd172+156], %rs152; + mov.b32 {%rs153, %rs154}, %r3962; + st.local.u8 [%rd172+157], %rs153; + shr.u16 %rs155, %rs153, 8; + st.local.u8 [%rd172+158], %rs155; + st.local.u8 [%rd172+159], %rs154; + shr.u16 %rs156, %rs154, 8; + st.local.u8 [%rd172+160], %rs156; + mov.b32 {%rs157, %rs158}, %r3963; + st.local.u8 [%rd172+161], %rs157; + shr.u16 %rs159, %rs157, 8; + st.local.u8 [%rd172+162], %rs159; + st.local.u8 [%rd172+163], %rs158; + shr.u16 %rs160, %rs158, 8; + st.local.u8 [%rd172+164], %rs160; + mov.b32 {%rs161, %rs162}, %r3964; + st.local.u8 [%rd172+165], %rs161; + shr.u16 %rs163, %rs161, 8; + st.local.u8 [%rd172+166], %rs163; + st.local.u8 [%rd172+167], %rs162; + shr.u16 %rs164, %rs162, 8; + st.local.u8 [%rd172+168], %rs164; + mov.b32 {%rs165, %rs166}, %r3965; + st.local.u8 [%rd172+169], %rs165; + shr.u16 %rs167, %rs165, 8; + st.local.u8 [%rd172+170], %rs167; + st.local.u8 [%rd172+171], %rs166; + shr.u16 %rs168, %rs166, 8; + st.local.u8 [%rd172+172], %rs168; + mov.b32 {%rs169, %rs170}, %r3966; + st.local.u8 [%rd172+173], %rs169; + shr.u16 %rs171, %rs169, 8; + st.local.u8 [%rd172+174], %rs171; + st.local.u8 [%rd172+175], %rs170; + shr.u16 %rs172, %rs170, 8; + st.local.u8 [%rd172+176], %rs172; + ld.local.u8 %rs173, [%rd3+8]; + add.s16 %rs174, %rs173, 1; + st.local.u8 [%rd3+8], %rs174; + shr.u64 %rd173, %rd49, 11; + ld.local.u64 %rd174, [%rd3+-72]; + add.s64 %rd175, %rd174, %rd173; + popc.b64 %r4965, %rd175; + cvt.u64.u32 %rd52, %r4965; + cvt.u64.u16 %rd176, %rs174; + and.b64 %rd177, %rd176, 255; + setp.ge.u64 %p29, %rd52, %rd177; + and.b16 %rs175, %rs174, 255; + mul.wide.u16 %r11663, %rs175, 32; + @%p29 bra $L__BB1_35; + +$L__BB1_34: + shr.u64 %rd229, %rd49, 11; + add.s64 %rd228, %rd174, %rd229; + popc.b64 %r11648, %rd228; + cvt.u64.u32 %rd227, %r11648; + add.s32 %r4966, %r11663, -64; + cvt.s64.s32 %rd178, %r4966; + add.s64 %rd179, %rd2, %rd178; + ld.local.u8 %r4967, [%rd3+2]; + ld.local.u8 %r4968, [%rd179+145]; + ld.local.u8 %r4969, [%rd179+146]; + prmt.b32 %r4970, %r4969, %r4968, 30212; + ld.local.u8 %r4971, [%rd179+147]; + prmt.b32 %r4972, %r4971, %r4970, 28756; + ld.local.u8 %r4973, [%rd179+148]; + prmt.b32 %r4974, %r4973, %r4972, 1620; + ld.local.u8 %r4975, [%rd179+149]; + ld.local.u8 %r4976, [%rd179+150]; + prmt.b32 %r4977, %r4976, %r4975, 30212; + ld.local.u8 %r4978, [%rd179+151]; + prmt.b32 %r4979, %r4978, %r4977, 28756; + ld.local.u8 %r4980, [%rd179+152]; + prmt.b32 %r4981, %r4980, %r4979, 1620; + ld.local.u8 %r4982, [%rd179+153]; + ld.local.u8 %r4983, [%rd179+154]; + prmt.b32 %r4984, %r4983, %r4982, 30212; + ld.local.u8 %r4985, [%rd179+155]; + prmt.b32 %r4986, %r4985, %r4984, 28756; + ld.local.u8 %r4987, [%rd179+156]; + prmt.b32 %r4988, %r4987, %r4986, 1620; + ld.local.u8 %r4989, [%rd179+157]; + ld.local.u8 %r4990, [%rd179+158]; + prmt.b32 %r4991, %r4990, %r4989, 30212; + ld.local.u8 %r4992, [%rd179+159]; + prmt.b32 %r4993, %r4992, %r4991, 28756; + ld.local.u8 %r4994, [%rd179+160]; + prmt.b32 %r4995, %r4994, %r4993, 1620; + ld.local.u8 %r4996, [%rd179+161]; + ld.local.u8 %r4997, [%rd179+162]; + prmt.b32 %r4998, %r4997, %r4996, 30212; + ld.local.u8 %r4999, [%rd179+163]; + prmt.b32 %r5000, %r4999, %r4998, 28756; + ld.local.u8 %r5001, [%rd179+164]; + prmt.b32 %r5002, %r5001, %r5000, 1620; + ld.local.u8 %r5003, [%rd179+165]; + ld.local.u8 %r5004, [%rd179+166]; + prmt.b32 %r5005, %r5004, %r5003, 30212; + ld.local.u8 %r5006, [%rd179+167]; + prmt.b32 %r5007, %r5006, %r5005, 28756; + ld.local.u8 %r5008, 
[%rd179+168]; + prmt.b32 %r5009, %r5008, %r5007, 1620; + ld.local.u8 %r5010, [%rd179+169]; + ld.local.u8 %r5011, [%rd179+170]; + prmt.b32 %r5012, %r5011, %r5010, 30212; + ld.local.u8 %r5013, [%rd179+171]; + prmt.b32 %r5014, %r5013, %r5012, 28756; + ld.local.u8 %r5015, [%rd179+172]; + prmt.b32 %r5016, %r5015, %r5014, 1620; + ld.local.u8 %r5017, [%rd179+173]; + ld.local.u8 %r5018, [%rd179+174]; + prmt.b32 %r5019, %r5018, %r5017, 30212; + ld.local.u8 %r5020, [%rd179+175]; + prmt.b32 %r5021, %r5020, %r5019, 28756; + ld.local.u8 %r5022, [%rd179+176]; + prmt.b32 %r5023, %r5022, %r5021, 1620; + ld.local.u8 %r5024, [%rd179+177]; + ld.local.u8 %r5025, [%rd179+178]; + prmt.b32 %r5026, %r5025, %r5024, 30212; + ld.local.u8 %r5027, [%rd179+179]; + prmt.b32 %r5028, %r5027, %r5026, 28756; + ld.local.u8 %r5029, [%rd179+180]; + prmt.b32 %r5030, %r5029, %r5028, 1620; + ld.local.u8 %r5031, [%rd179+181]; + ld.local.u8 %r5032, [%rd179+182]; + prmt.b32 %r5033, %r5032, %r5031, 30212; + ld.local.u8 %r5034, [%rd179+183]; + prmt.b32 %r5035, %r5034, %r5033, 28756; + ld.local.u8 %r5036, [%rd179+184]; + prmt.b32 %r5037, %r5036, %r5035, 1620; + ld.local.u8 %r5038, [%rd179+185]; + ld.local.u8 %r5039, [%rd179+186]; + prmt.b32 %r5040, %r5039, %r5038, 30212; + ld.local.u8 %r5041, [%rd179+187]; + prmt.b32 %r5042, %r5041, %r5040, 28756; + ld.local.u8 %r5043, [%rd179+188]; + prmt.b32 %r5044, %r5043, %r5042, 1620; + ld.local.u8 %r5045, [%rd179+189]; + ld.local.u8 %r5046, [%rd179+190]; + prmt.b32 %r5047, %r5046, %r5045, 30212; + ld.local.u8 %r5048, [%rd179+191]; + prmt.b32 %r5049, %r5048, %r5047, 28756; + ld.local.u8 %r5050, [%rd179+192]; + prmt.b32 %r5051, %r5050, %r5049, 1620; + ld.local.u8 %r5052, [%rd179+193]; + ld.local.u8 %r5053, [%rd179+194]; + prmt.b32 %r5054, %r5053, %r5052, 30212; + ld.local.u8 %r5055, [%rd179+195]; + prmt.b32 %r5056, %r5055, %r5054, 28756; + ld.local.u8 %r5057, [%rd179+196]; + prmt.b32 %r5058, %r5057, %r5056, 1620; + ld.local.u8 %r5059, [%rd179+197]; + ld.local.u8 %r5060, [%rd179+198]; + prmt.b32 %r5061, %r5060, %r5059, 30212; + ld.local.u8 %r5062, [%rd179+199]; + prmt.b32 %r5063, %r5062, %r5061, 28756; + ld.local.u8 %r5064, [%rd179+200]; + prmt.b32 %r5065, %r5064, %r5063, 1620; + ld.local.u8 %r5066, [%rd179+201]; + ld.local.u8 %r5067, [%rd179+202]; + prmt.b32 %r5068, %r5067, %r5066, 30212; + ld.local.u8 %r5069, [%rd179+203]; + prmt.b32 %r5070, %r5069, %r5068, 28756; + ld.local.u8 %r5071, [%rd179+204]; + prmt.b32 %r5072, %r5071, %r5070, 1620; + ld.local.u8 %r5073, [%rd179+205]; + ld.local.u8 %r5074, [%rd179+206]; + prmt.b32 %r5075, %r5074, %r5073, 30212; + ld.local.u8 %r5076, [%rd179+207]; + prmt.b32 %r5077, %r5076, %r5075, 28756; + ld.local.u8 %r5078, [%rd179+208]; + prmt.b32 %r5079, %r5078, %r5077, 1620; + or.b32 %r5080, %r4967, 4; + ld.local.u8 %r5081, [%rd3+-120]; + ld.local.u8 %r5082, [%rd3+-119]; + prmt.b32 %r5083, %r5082, %r5081, 30212; + ld.local.u8 %r5084, [%rd3+-118]; + ld.local.u8 %r5085, [%rd3+-117]; + prmt.b32 %r5086, %r5085, %r5084, 30212; + prmt.b32 %r5087, %r5086, %r5083, 4180; + ld.local.u8 %r5088, [%rd3+-136]; + ld.local.u8 %r5089, [%rd3+-135]; + prmt.b32 %r5090, %r5089, %r5088, 30212; + ld.local.u8 %r5091, [%rd3+-134]; + ld.local.u8 %r5092, [%rd3+-133]; + prmt.b32 %r5093, %r5092, %r5091, 30212; + prmt.b32 %r5094, %r5093, %r5090, 4180; + add.s32 %r5095, %r5087, %r5094; + add.s32 %r5096, %r5095, %r4974; + shf.l.wrap.b32 %r5097, %r5096, %r5096, 16; + add.s32 %r5098, %r5097, 1779033703; + xor.b32 %r5099, %r5098, %r5087; + shf.l.wrap.b32 %r5100, %r5099, %r5099, 20; + add.s32 %r5101, 
%r4981, %r5096; + add.s32 %r5102, %r5101, %r5100; + xor.b32 %r5103, %r5102, %r5097; + shf.l.wrap.b32 %r5104, %r5103, %r5103, 24; + add.s32 %r5105, %r5104, %r5098; + xor.b32 %r5106, %r5105, %r5100; + shf.l.wrap.b32 %r5107, %r5106, %r5106, 25; + ld.local.u8 %r5108, [%rd3+-116]; + ld.local.u8 %r5109, [%rd3+-115]; + prmt.b32 %r5110, %r5109, %r5108, 30212; + ld.local.u8 %r5111, [%rd3+-114]; + ld.local.u8 %r5112, [%rd3+-113]; + prmt.b32 %r5113, %r5112, %r5111, 30212; + prmt.b32 %r5114, %r5113, %r5110, 4180; + ld.local.u8 %r5115, [%rd3+-132]; + ld.local.u8 %r5116, [%rd3+-131]; + prmt.b32 %r5117, %r5116, %r5115, 30212; + ld.local.u8 %r5118, [%rd3+-130]; + ld.local.u8 %r5119, [%rd3+-129]; + prmt.b32 %r5120, %r5119, %r5118, 30212; + prmt.b32 %r5121, %r5120, %r5117, 4180; + add.s32 %r5122, %r5114, %r5121; + add.s32 %r5123, %r5122, %r4988; + shf.l.wrap.b32 %r5124, %r5123, %r5123, 16; + add.s32 %r5125, %r5124, -1150833019; + xor.b32 %r5126, %r5125, %r5114; + shf.l.wrap.b32 %r5127, %r5126, %r5126, 20; + add.s32 %r5128, %r4995, %r5123; + add.s32 %r5129, %r5128, %r5127; + xor.b32 %r5130, %r5129, %r5124; + shf.l.wrap.b32 %r5131, %r5130, %r5130, 24; + add.s32 %r5132, %r5131, %r5125; + xor.b32 %r5133, %r5132, %r5127; + shf.l.wrap.b32 %r5134, %r5133, %r5133, 25; + ld.local.u8 %r5135, [%rd3+-112]; + ld.local.u8 %r5136, [%rd3+-111]; + prmt.b32 %r5137, %r5136, %r5135, 30212; + ld.local.u8 %r5138, [%rd3+-110]; + ld.local.u8 %r5139, [%rd3+-109]; + prmt.b32 %r5140, %r5139, %r5138, 30212; + prmt.b32 %r5141, %r5140, %r5137, 4180; + ld.local.u8 %r5142, [%rd3+-128]; + ld.local.u8 %r5143, [%rd3+-127]; + prmt.b32 %r5144, %r5143, %r5142, 30212; + ld.local.u8 %r5145, [%rd3+-126]; + ld.local.u8 %r5146, [%rd3+-125]; + prmt.b32 %r5147, %r5146, %r5145, 30212; + prmt.b32 %r5148, %r5147, %r5144, 4180; + add.s32 %r5149, %r5141, %r5148; + add.s32 %r5150, %r5149, %r5002; + shr.u32 %r5151, %r5150, 16; + shl.b32 %r5152, %r5150, 16; + xor.b32 %r5153, %r5152, 4194304; + or.b32 %r5154, %r5153, %r5151; + add.s32 %r5155, %r5154, 1013904242; + xor.b32 %r5156, %r5155, %r5141; + shf.l.wrap.b32 %r5157, %r5156, %r5156, 20; + add.s32 %r5158, %r5009, %r5150; + add.s32 %r5159, %r5158, %r5157; + xor.b32 %r5160, %r5159, %r5154; + shf.l.wrap.b32 %r5161, %r5160, %r5160, 24; + add.s32 %r5162, %r5161, %r5155; + xor.b32 %r5163, %r5162, %r5157; + shf.l.wrap.b32 %r5164, %r5163, %r5163, 25; + ld.local.u8 %r5165, [%rd3+-108]; + ld.local.u8 %r5166, [%rd3+-107]; + prmt.b32 %r5167, %r5166, %r5165, 30212; + ld.local.u8 %r5168, [%rd3+-106]; + ld.local.u8 %r5169, [%rd3+-105]; + prmt.b32 %r5170, %r5169, %r5168, 30212; + prmt.b32 %r5171, %r5170, %r5167, 4180; + ld.local.u8 %r5172, [%rd3+-124]; + ld.local.u8 %r5173, [%rd3+-123]; + prmt.b32 %r5174, %r5173, %r5172, 30212; + ld.local.u8 %r5175, [%rd3+-122]; + ld.local.u8 %r5176, [%rd3+-121]; + prmt.b32 %r5177, %r5176, %r5175, 30212; + prmt.b32 %r5178, %r5177, %r5174, 4180; + add.s32 %r5179, %r5171, %r5178; + add.s32 %r5180, %r5179, %r5016; + xor.b32 %r5181, %r5180, %r5080; + shr.u32 %r5182, %r5180, 16; + shl.b32 %r5183, %r5181, 16; + or.b32 %r5184, %r5183, %r5182; + add.s32 %r5185, %r5184, -1521486534; + xor.b32 %r5186, %r5185, %r5171; + shf.l.wrap.b32 %r5187, %r5186, %r5186, 20; + add.s32 %r5188, %r5023, %r5180; + add.s32 %r5189, %r5188, %r5187; + xor.b32 %r5190, %r5189, %r5184; + shf.l.wrap.b32 %r5191, %r5190, %r5190, 24; + add.s32 %r5192, %r5191, %r5185; + xor.b32 %r5193, %r5192, %r5187; + shf.l.wrap.b32 %r5194, %r5193, %r5193, 25; + add.s32 %r5195, %r5134, %r5102; + add.s32 %r5196, %r5195, %r5030; + xor.b32 %r5197, 
%r5191, %r5196; + shf.l.wrap.b32 %r5198, %r5197, %r5197, 16; + add.s32 %r5199, %r5198, %r5162; + xor.b32 %r5200, %r5199, %r5134; + shf.l.wrap.b32 %r5201, %r5200, %r5200, 20; + add.s32 %r5202, %r5037, %r5196; + add.s32 %r5203, %r5202, %r5201; + xor.b32 %r5204, %r5203, %r5198; + shf.l.wrap.b32 %r5205, %r5204, %r5204, 24; + add.s32 %r5206, %r5205, %r5199; + xor.b32 %r5207, %r5206, %r5201; + shf.l.wrap.b32 %r5208, %r5207, %r5207, 25; + add.s32 %r5209, %r5164, %r5129; + add.s32 %r5210, %r5209, %r5044; + xor.b32 %r5211, %r5210, %r5104; + shf.l.wrap.b32 %r5212, %r5211, %r5211, 16; + add.s32 %r5213, %r5212, %r5192; + xor.b32 %r5214, %r5213, %r5164; + shf.l.wrap.b32 %r5215, %r5214, %r5214, 20; + add.s32 %r5216, %r5051, %r5210; + add.s32 %r5217, %r5216, %r5215; + xor.b32 %r5218, %r5217, %r5212; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 24; + add.s32 %r5220, %r5219, %r5213; + xor.b32 %r5221, %r5220, %r5215; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 25; + add.s32 %r5223, %r5194, %r5159; + add.s32 %r5224, %r5223, %r5058; + xor.b32 %r5225, %r5224, %r5131; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 16; + add.s32 %r5227, %r5226, %r5105; + xor.b32 %r5228, %r5227, %r5194; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 20; + add.s32 %r5230, %r5065, %r5224; + add.s32 %r5231, %r5230, %r5229; + xor.b32 %r5232, %r5231, %r5226; + shf.l.wrap.b32 %r5233, %r5232, %r5232, 24; + add.s32 %r5234, %r5233, %r5227; + xor.b32 %r5235, %r5234, %r5229; + shf.l.wrap.b32 %r5236, %r5235, %r5235, 25; + add.s32 %r5237, %r5189, %r5107; + add.s32 %r5238, %r5237, %r5072; + xor.b32 %r5239, %r5238, %r5161; + shf.l.wrap.b32 %r5240, %r5239, %r5239, 16; + add.s32 %r5241, %r5240, %r5132; + xor.b32 %r5242, %r5241, %r5107; + shf.l.wrap.b32 %r5243, %r5242, %r5242, 20; + add.s32 %r5244, %r5079, %r5238; + add.s32 %r5245, %r5244, %r5243; + xor.b32 %r5246, %r5245, %r5240; + shf.l.wrap.b32 %r5247, %r5246, %r5246, 24; + add.s32 %r5248, %r5247, %r5241; + xor.b32 %r5249, %r5248, %r5243; + shf.l.wrap.b32 %r5250, %r5249, %r5249, 25; + add.s32 %r5251, %r5203, %r4988; + add.s32 %r5252, %r5251, %r5250; + xor.b32 %r5253, %r5252, %r5219; + shf.l.wrap.b32 %r5254, %r5253, %r5253, 16; + add.s32 %r5255, %r5254, %r5234; + xor.b32 %r5256, %r5255, %r5250; + shf.l.wrap.b32 %r5257, %r5256, %r5256, 20; + add.s32 %r5258, %r5252, %r5016; + add.s32 %r5259, %r5258, %r5257; + xor.b32 %r5260, %r5259, %r5254; + shf.l.wrap.b32 %r5261, %r5260, %r5260, 24; + add.s32 %r5262, %r5261, %r5255; + xor.b32 %r5263, %r5262, %r5257; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 25; + add.s32 %r5265, %r5217, %r4995; + add.s32 %r5266, %r5265, %r5208; + xor.b32 %r5267, %r5233, %r5266; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 16; + add.s32 %r5269, %r5248, %r5268; + xor.b32 %r5270, %r5269, %r5208; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 20; + add.s32 %r5272, %r5266, %r5044; + add.s32 %r5273, %r5272, %r5271; + xor.b32 %r5274, %r5273, %r5268; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 24; + add.s32 %r5276, %r5275, %r5269; + xor.b32 %r5277, %r5276, %r5271; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 25; + add.s32 %r5279, %r5222, %r5023; + add.s32 %r5280, %r5279, %r5231; + xor.b32 %r5281, %r5247, %r5280; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 16; + add.s32 %r5283, %r5282, %r5206; + xor.b32 %r5284, %r5283, %r5222; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 20; + add.s32 %r5286, %r5280, %r4974; + add.s32 %r5287, %r5286, %r5285; + xor.b32 %r5288, %r5287, %r5282; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 24; + add.s32 %r5290, %r5289, %r5283; + xor.b32 %r5291, %r5290, %r5285; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 25; 
+ add.s32 %r5293, %r5236, %r5002; + add.s32 %r5294, %r5293, %r5245; + xor.b32 %r5295, %r5294, %r5205; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 16; + add.s32 %r5297, %r5296, %r5220; + xor.b32 %r5298, %r5297, %r5236; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 20; + add.s32 %r5300, %r5294, %r5065; + add.s32 %r5301, %r5300, %r5299; + xor.b32 %r5302, %r5301, %r5296; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 24; + add.s32 %r5304, %r5303, %r5297; + xor.b32 %r5305, %r5304, %r5299; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 25; + add.s32 %r5307, %r5278, %r4981; + add.s32 %r5308, %r5307, %r5259; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 16; + add.s32 %r5311, %r5310, %r5290; + xor.b32 %r5312, %r5311, %r5278; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 20; + add.s32 %r5314, %r5308, %r5051; + add.s32 %r5315, %r5314, %r5313; + xor.b32 %r5316, %r5315, %r5310; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 24; + add.s32 %r5318, %r5317, %r5311; + xor.b32 %r5319, %r5318, %r5313; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 25; + add.s32 %r5321, %r5273, %r5058; + add.s32 %r5322, %r5321, %r5292; + xor.b32 %r5323, %r5261, %r5322; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 16; + add.s32 %r5325, %r5324, %r5304; + xor.b32 %r5326, %r5325, %r5292; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 20; + add.s32 %r5328, %r5322, %r5009; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5324; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 24; + add.s32 %r5332, %r5331, %r5325; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 25; + add.s32 %r5335, %r5287, %r5037; + add.s32 %r5336, %r5335, %r5306; + xor.b32 %r5337, %r5336, %r5275; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 16; + add.s32 %r5339, %r5338, %r5262; + xor.b32 %r5340, %r5339, %r5306; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 20; + add.s32 %r5342, %r5336, %r5072; + add.s32 %r5343, %r5342, %r5341; + xor.b32 %r5344, %r5343, %r5338; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 24; + add.s32 %r5346, %r5345, %r5339; + xor.b32 %r5347, %r5346, %r5341; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 25; + add.s32 %r5349, %r5301, %r5079; + add.s32 %r5350, %r5349, %r5264; + xor.b32 %r5351, %r5350, %r5289; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 16; + add.s32 %r5353, %r5352, %r5276; + xor.b32 %r5354, %r5353, %r5264; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 20; + add.s32 %r5356, %r5350, %r5030; + add.s32 %r5357, %r5356, %r5355; + xor.b32 %r5358, %r5357, %r5352; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 24; + add.s32 %r5360, %r5359, %r5353; + xor.b32 %r5361, %r5360, %r5355; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 25; + add.s32 %r5363, %r5315, %r4995; + add.s32 %r5364, %r5363, %r5362; + xor.b32 %r5365, %r5364, %r5331; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 16; + add.s32 %r5367, %r5366, %r5346; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 20; + add.s32 %r5370, %r5364, %r5002; + add.s32 %r5371, %r5370, %r5369; + xor.b32 %r5372, %r5371, %r5366; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 24; + add.s32 %r5374, %r5373, %r5367; + xor.b32 %r5375, %r5374, %r5369; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 25; + add.s32 %r5377, %r5329, %r5044; + add.s32 %r5378, %r5377, %r5320; + xor.b32 %r5379, %r5378, %r5345; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 16; + add.s32 %r5381, %r5380, %r5360; + xor.b32 %r5382, %r5381, %r5320; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 20; + add.s32 %r5384, %r5378, %r5058; + add.s32 %r5385, %r5384, %r5383; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 24; + add.s32 %r5388, %r5387, 
%r5381; + xor.b32 %r5389, %r5388, %r5383; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 25; + add.s32 %r5391, %r5343, %r5065; + add.s32 %r5392, %r5391, %r5334; + xor.b32 %r5393, %r5359, %r5392; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 16; + add.s32 %r5395, %r5394, %r5318; + xor.b32 %r5396, %r5395, %r5334; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 20; + add.s32 %r5398, %r5392, %r4988; + add.s32 %r5399, %r5398, %r5397; + xor.b32 %r5400, %r5399, %r5394; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 24; + add.s32 %r5402, %r5401, %r5395; + xor.b32 %r5403, %r5402, %r5397; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 25; + add.s32 %r5405, %r5348, %r5023; + add.s32 %r5406, %r5405, %r5357; + xor.b32 %r5407, %r5406, %r5317; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 16; + add.s32 %r5409, %r5408, %r5332; + xor.b32 %r5410, %r5409, %r5348; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 20; + add.s32 %r5412, %r5406, %r5072; + add.s32 %r5413, %r5412, %r5411; + xor.b32 %r5414, %r5413, %r5408; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 24; + add.s32 %r5416, %r5415, %r5409; + xor.b32 %r5417, %r5416, %r5411; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 25; + add.s32 %r5419, %r5371, %r5016; + add.s32 %r5420, %r5419, %r5390; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 16; + add.s32 %r5423, %r5422, %r5402; + xor.b32 %r5424, %r5423, %r5390; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 20; + add.s32 %r5426, %r5420, %r5009; + add.s32 %r5427, %r5426, %r5425; + xor.b32 %r5428, %r5427, %r5422; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 24; + add.s32 %r5430, %r5429, %r5423; + xor.b32 %r5431, %r5430, %r5425; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 25; + add.s32 %r5433, %r5385, %r5037; + add.s32 %r5434, %r5433, %r5404; + xor.b32 %r5435, %r5373, %r5434; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 16; + add.s32 %r5437, %r5436, %r5416; + xor.b32 %r5438, %r5437, %r5404; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 20; + add.s32 %r5440, %r5434, %r4974; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5436; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 24; + add.s32 %r5444, %r5443, %r5437; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 25; + add.s32 %r5447, %r5399, %r5051; + add.s32 %r5448, %r5447, %r5418; + xor.b32 %r5449, %r5448, %r5387; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 16; + add.s32 %r5451, %r5450, %r5374; + xor.b32 %r5452, %r5451, %r5418; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 20; + add.s32 %r5454, %r5448, %r5079; + add.s32 %r5455, %r5454, %r5453; + xor.b32 %r5456, %r5455, %r5450; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 24; + add.s32 %r5458, %r5457, %r5451; + xor.b32 %r5459, %r5458, %r5453; + shf.l.wrap.b32 %r5460, %r5459, %r5459, 25; + add.s32 %r5461, %r5413, %r5030; + add.s32 %r5462, %r5461, %r5376; + xor.b32 %r5463, %r5462, %r5401; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 16; + add.s32 %r5465, %r5464, %r5388; + xor.b32 %r5466, %r5465, %r5376; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 20; + add.s32 %r5468, %r5462, %r4981; + add.s32 %r5469, %r5468, %r5467; + xor.b32 %r5470, %r5469, %r5464; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 24; + add.s32 %r5472, %r5471, %r5465; + xor.b32 %r5473, %r5472, %r5467; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 25; + add.s32 %r5475, %r5427, %r5044; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5443; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 16; + add.s32 %r5479, %r5478, %r5458; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 20; + add.s32 %r5482, %r5476, %r5023; + add.s32 %r5483, %r5482, %r5481; + xor.b32 %r5484, 
%r5483, %r5478; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 24; + add.s32 %r5486, %r5485, %r5479; + xor.b32 %r5487, %r5486, %r5481; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 25; + add.s32 %r5489, %r5441, %r5058; + add.s32 %r5490, %r5489, %r5432; + xor.b32 %r5491, %r5490, %r5457; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 16; + add.s32 %r5493, %r5492, %r5472; + xor.b32 %r5494, %r5493, %r5432; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 20; + add.s32 %r5496, %r5490, %r5037; + add.s32 %r5497, %r5496, %r5495; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 24; + add.s32 %r5500, %r5499, %r5493; + xor.b32 %r5501, %r5500, %r5495; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 25; + add.s32 %r5503, %r5455, %r5072; + add.s32 %r5504, %r5503, %r5446; + xor.b32 %r5505, %r5471, %r5504; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 16; + add.s32 %r5507, %r5506, %r5430; + xor.b32 %r5508, %r5507, %r5446; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 20; + add.s32 %r5510, %r5504, %r4995; + add.s32 %r5511, %r5510, %r5509; + xor.b32 %r5512, %r5511, %r5506; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 24; + add.s32 %r5514, %r5513, %r5507; + xor.b32 %r5515, %r5514, %r5509; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 25; + add.s32 %r5517, %r5469, %r5065; + add.s32 %r5518, %r5517, %r5460; + xor.b32 %r5519, %r5518, %r5429; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 16; + add.s32 %r5521, %r5520, %r5444; + xor.b32 %r5522, %r5521, %r5460; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 20; + add.s32 %r5524, %r5518, %r5079; + add.s32 %r5525, %r5524, %r5523; + xor.b32 %r5526, %r5525, %r5520; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 24; + add.s32 %r5528, %r5527, %r5521; + xor.b32 %r5529, %r5528, %r5523; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 25; + add.s32 %r5531, %r5483, %r5002; + add.s32 %r5532, %r5531, %r5502; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 16; + add.s32 %r5535, %r5534, %r5514; + xor.b32 %r5536, %r5535, %r5502; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 20; + add.s32 %r5538, %r5532, %r4974; + add.s32 %r5539, %r5538, %r5537; + xor.b32 %r5540, %r5539, %r5534; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 24; + add.s32 %r5542, %r5541, %r5535; + xor.b32 %r5543, %r5542, %r5537; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 25; + add.s32 %r5545, %r5497, %r5051; + add.s32 %r5546, %r5545, %r5516; + xor.b32 %r5547, %r5485, %r5546; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 16; + add.s32 %r5549, %r5548, %r5528; + xor.b32 %r5550, %r5549, %r5516; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 20; + add.s32 %r5552, %r5546, %r4988; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5548; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 24; + add.s32 %r5556, %r5555, %r5549; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 25; + add.s32 %r5559, %r5511, %r5009; + add.s32 %r5560, %r5559, %r5530; + xor.b32 %r5561, %r5560, %r5499; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 16; + add.s32 %r5563, %r5562, %r5486; + xor.b32 %r5564, %r5563, %r5530; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 20; + add.s32 %r5566, %r5560, %r5030; + add.s32 %r5567, %r5566, %r5565; + xor.b32 %r5568, %r5567, %r5562; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 24; + add.s32 %r5570, %r5569, %r5563; + xor.b32 %r5571, %r5570, %r5565; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 25; + add.s32 %r5573, %r5525, %r4981; + add.s32 %r5574, %r5573, %r5488; + xor.b32 %r5575, %r5574, %r5513; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 16; + add.s32 %r5577, %r5576, %r5500; + xor.b32 %r5578, %r5577, %r5488; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 20; 
+ add.s32 %r5580, %r5574, %r5016; + add.s32 %r5581, %r5580, %r5579; + xor.b32 %r5582, %r5581, %r5576; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 24; + add.s32 %r5584, %r5583, %r5577; + xor.b32 %r5585, %r5584, %r5579; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 25; + add.s32 %r5587, %r5539, %r5058; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5555; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 16; + add.s32 %r5591, %r5590, %r5570; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 20; + add.s32 %r5594, %r5588, %r5065; + add.s32 %r5595, %r5594, %r5593; + xor.b32 %r5596, %r5595, %r5590; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 24; + add.s32 %r5598, %r5597, %r5591; + xor.b32 %r5599, %r5598, %r5593; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 25; + add.s32 %r5601, %r5553, %r5037; + add.s32 %r5602, %r5601, %r5544; + xor.b32 %r5603, %r5602, %r5569; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 16; + add.s32 %r5605, %r5604, %r5584; + xor.b32 %r5606, %r5605, %r5544; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 20; + add.s32 %r5608, %r5602, %r5051; + add.s32 %r5609, %r5608, %r5607; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 24; + add.s32 %r5612, %r5611, %r5605; + xor.b32 %r5613, %r5612, %r5607; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 25; + add.s32 %r5615, %r5567, %r5079; + add.s32 %r5616, %r5615, %r5558; + xor.b32 %r5617, %r5583, %r5616; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 16; + add.s32 %r5619, %r5618, %r5542; + xor.b32 %r5620, %r5619, %r5558; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 20; + add.s32 %r5622, %r5616, %r5044; + add.s32 %r5623, %r5622, %r5621; + xor.b32 %r5624, %r5623, %r5618; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 24; + add.s32 %r5626, %r5625, %r5619; + xor.b32 %r5627, %r5626, %r5621; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 25; + add.s32 %r5629, %r5581, %r5072; + add.s32 %r5630, %r5629, %r5572; + xor.b32 %r5631, %r5630, %r5541; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 16; + add.s32 %r5633, %r5632, %r5556; + xor.b32 %r5634, %r5633, %r5572; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 20; + add.s32 %r5636, %r5630, %r5030; + add.s32 %r5637, %r5636, %r5635; + xor.b32 %r5638, %r5637, %r5632; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 24; + add.s32 %r5640, %r5639, %r5633; + xor.b32 %r5641, %r5640, %r5635; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 25; + add.s32 %r5643, %r5595, %r5023; + add.s32 %r5644, %r5643, %r5614; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 16; + add.s32 %r5647, %r5646, %r5626; + xor.b32 %r5648, %r5647, %r5614; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 20; + add.s32 %r5650, %r5644, %r4988; + add.s32 %r5651, %r5650, %r5649; + xor.b32 %r5652, %r5651, %r5646; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 24; + add.s32 %r5654, %r5653, %r5647; + xor.b32 %r5655, %r5654, %r5649; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 25; + add.s32 %r5657, %r5609, %r5009; + add.s32 %r5658, %r5657, %r5628; + xor.b32 %r5659, %r5597, %r5658; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 16; + add.s32 %r5661, %r5660, %r5640; + xor.b32 %r5662, %r5661, %r5628; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 20; + add.s32 %r5664, %r5658, %r4995; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5660; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 24; + add.s32 %r5668, %r5667, %r5661; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 25; + add.s32 %r5671, %r5623, %r4974; + add.s32 %r5672, %r5671, %r5642; + xor.b32 %r5673, %r5672, %r5611; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 16; + add.s32 %r5675, %r5674, 
%r5598; + xor.b32 %r5676, %r5675, %r5642; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 20; + add.s32 %r5678, %r5672, %r4981; + add.s32 %r5679, %r5678, %r5677; + xor.b32 %r5680, %r5679, %r5674; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 24; + add.s32 %r5682, %r5681, %r5675; + xor.b32 %r5683, %r5682, %r5677; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 25; + add.s32 %r5685, %r5637, %r5016; + add.s32 %r5686, %r5685, %r5600; + xor.b32 %r5687, %r5686, %r5625; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 16; + add.s32 %r5689, %r5688, %r5612; + xor.b32 %r5690, %r5689, %r5600; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 20; + add.s32 %r5692, %r5686, %r5002; + add.s32 %r5693, %r5692, %r5691; + xor.b32 %r5694, %r5693, %r5688; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 24; + add.s32 %r5696, %r5695, %r5689; + xor.b32 %r5697, %r5696, %r5691; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 25; + add.s32 %r5699, %r5651, %r5037; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5667; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 16; + add.s32 %r5703, %r5702, %r5682; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 20; + add.s32 %r5706, %r5700, %r5072; + add.s32 %r5707, %r5706, %r5705; + xor.b32 %r5708, %r5707, %r5702; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 24; + add.s32 %r5710, %r5709, %r5703; + xor.b32 %r5711, %r5710, %r5705; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 25; + add.s32 %r5713, %r5665, %r5051; + add.s32 %r5714, %r5713, %r5656; + xor.b32 %r5715, %r5714, %r5681; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 16; + add.s32 %r5717, %r5716, %r5696; + xor.b32 %r5718, %r5717, %r5656; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 20; + add.s32 %r5720, %r5714, %r5009; + add.s32 %r5721, %r5720, %r5719; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 24; + add.s32 %r5724, %r5723, %r5717; + xor.b32 %r5725, %r5724, %r5719; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 25; + add.s32 %r5727, %r5679, %r5030; + add.s32 %r5728, %r5727, %r5670; + xor.b32 %r5729, %r5695, %r5728; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 16; + add.s32 %r5731, %r5730, %r5654; + xor.b32 %r5732, %r5731, %r5670; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 20; + add.s32 %r5734, %r5728, %r5058; + add.s32 %r5735, %r5734, %r5733; + xor.b32 %r5736, %r5735, %r5730; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 24; + add.s32 %r5738, %r5737, %r5731; + xor.b32 %r5739, %r5738, %r5733; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 25; + add.s32 %r5741, %r5693, %r5079; + add.s32 %r5742, %r5741, %r5684; + xor.b32 %r5743, %r5742, %r5653; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 16; + add.s32 %r5745, %r5744, %r5668; + xor.b32 %r5746, %r5745, %r5684; + shf.l.wrap.b32 %r5747, %r5746, %r5746, 20; + add.s32 %r5748, %r5742, %r4981; + add.s32 %r5749, %r5748, %r5747; + xor.b32 %r5750, %r5749, %r5744; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 24; + add.s32 %r5752, %r5751, %r5745; + xor.b32 %r5753, %r5752, %r5747; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 25; + add.s32 %r5755, %r5707, %r5065; + add.s32 %r5756, %r5755, %r5726; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 16; + add.s32 %r5759, %r5758, %r5738; + xor.b32 %r5760, %r5759, %r5726; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 20; + add.s32 %r5762, %r5756, %r4995; + add.s32 %r5763, %r5762, %r5761; + xor.b32 %r5764, %r5763, %r5758; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 24; + add.s32 %r5766, %r5765, %r5759; + xor.b32 %r5767, %r5766, %r5761; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 25; + add.s32 %r5769, %r5721, %r4974; + add.s32 %r5770, %r5769, %r5740; + xor.b32 %r5771, 
%r5709, %r5770; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 16; + add.s32 %r5773, %r5772, %r5752; + xor.b32 %r5774, %r5773, %r5740; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 20; + add.s32 %r5776, %r5770, %r5044; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5772; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 24; + add.s32 %r5780, %r5779, %r5773; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 25; + add.s32 %r5783, %r5735, %r4988; + add.s32 %r5784, %r5783, %r5754; + xor.b32 %r5785, %r5784, %r5723; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 16; + add.s32 %r5787, %r5786, %r5710; + xor.b32 %r5788, %r5787, %r5754; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 20; + add.s32 %r5790, %r5784, %r5016; + add.s32 %r5791, %r5790, %r5789; + xor.b32 %r5792, %r5791, %r5786; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 24; + add.s32 %r5794, %r5793, %r5787; + xor.b32 %r5795, %r5794, %r5789; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 25; + add.s32 %r5797, %r5749, %r5002; + add.s32 %r5798, %r5797, %r5712; + xor.b32 %r5799, %r5798, %r5737; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 16; + add.s32 %r5801, %r5800, %r5724; + xor.b32 %r5802, %r5801, %r5712; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 20; + add.s32 %r5804, %r5798, %r5023; + add.s32 %r5805, %r5804, %r5803; + xor.b32 %r5806, %r5805, %r5800; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 24; + add.s32 %r5808, %r5807, %r5801; + xor.b32 %r5809, %r5808, %r5803; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 25; + add.s32 %r5811, %r5763, %r5051; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5779; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 16; + add.s32 %r5815, %r5814, %r5794; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 20; + add.s32 %r5818, %r5812, %r5079; + add.s32 %r5819, %r5818, %r5817; + xor.b32 %r5820, %r5819, %r5814; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 24; + add.s32 %r5822, %r5821, %r5815; + xor.b32 %r5823, %r5822, %r5817; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 25; + add.s32 %r5825, %r5777, %r5009; + add.s32 %r5826, %r5825, %r5768; + xor.b32 %r5827, %r5826, %r5793; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 16; + add.s32 %r5829, %r5828, %r5808; + xor.b32 %r5830, %r5829, %r5768; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 20; + add.s32 %r5832, %r5826, %r4974; + add.s32 %r5833, %r5832, %r5831; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 24; + add.s32 %r5836, %r5835, %r5829; + xor.b32 %r5837, %r5836, %r5831; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 25; + add.s32 %r5839, %r5791, %r4981; + add.s32 %r5840, %r5839, %r5782; + xor.b32 %r5841, %r5807, %r5840; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 16; + add.s32 %r5843, %r5842, %r5766; + xor.b32 %r5844, %r5843, %r5782; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 20; + add.s32 %r5846, %r5840, %r5037; + add.s32 %r5847, %r5846, %r5845; + xor.b32 %r5848, %r5847, %r5842; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 24; + add.s32 %r5850, %r5849, %r5843; + xor.b32 %r5851, %r5850, %r5845; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 25; + add.s32 %r5853, %r5805, %r5030; + add.s32 %r5854, %r5853, %r5796; + xor.b32 %r5855, %r5854, %r5765; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 16; + add.s32 %r5857, %r5856, %r5780; + xor.b32 %r5858, %r5857, %r5796; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 20; + add.s32 %r5860, %r5854, %r5016; + add.s32 %r5861, %r5860, %r5859; + xor.b32 %r5862, %r5861, %r5856; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 24; + add.s32 %r5864, %r5863, %r5857; + xor.b32 %r5865, %r5864, %r5859; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 25; 
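+ // note (editorial annotation, not compiler output): once the rounds finish, the two
+ // halves of the 16-word state are folded together (v[i] ^= v[i+8]) to form the new
+ // 8-word chaining value, which is then spilled byte-wise into the local CV stack
+ // via the st.local.u8 stores at [%rd179+145..176] below.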
+ add.s32 %r5867, %r5819, %r5072; + add.s32 %r5868, %r5867, %r5838; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 16; + add.s32 %r5871, %r5870, %r5850; + xor.b32 %r5872, %r5871, %r5838; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 20; + add.s32 %r5874, %r5868, %r5044; + add.s32 %r5875, %r5874, %r5873; + xor.b32 %r5876, %r5875, %r5870; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 24; + add.s32 %r5878, %r5877, %r5871; + xor.b32 %r5879, %r5878, %r5873; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 25; + add.s32 %r5881, %r5833, %r4988; + add.s32 %r5882, %r5881, %r5852; + xor.b32 %r5883, %r5821, %r5882; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 16; + add.s32 %r5885, %r5884, %r5864; + xor.b32 %r5886, %r5885, %r5852; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 20; + add.s32 %r5888, %r5882, %r5058; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5884; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 24; + add.s32 %r5892, %r5891, %r5885; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 25; + add.s32 %r5895, %r5847, %r4995; + add.s32 %r5896, %r5895, %r5866; + xor.b32 %r5897, %r5896, %r5835; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 16; + add.s32 %r5899, %r5898, %r5822; + xor.b32 %r5900, %r5899, %r5866; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 20; + add.s32 %r5902, %r5896, %r5002; + add.s32 %r5903, %r5902, %r5901; + xor.b32 %r5904, %r5903, %r5898; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 24; + add.s32 %r5906, %r5905, %r5899; + xor.b32 %r5907, %r5906, %r5901; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 25; + add.s32 %r5909, %r5861, %r5023; + add.s32 %r5910, %r5909, %r5824; + xor.b32 %r5911, %r5910, %r5849; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 16; + add.s32 %r5913, %r5912, %r5836; + xor.b32 %r5914, %r5913, %r5824; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 20; + add.s32 %r5916, %r5910, %r5065; + add.s32 %r5917, %r5916, %r5915; + xor.b32 %r5918, %r5917, %r5912; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 24; + add.s32 %r5920, %r5919, %r5913; + xor.b32 %r5921, %r5920, %r5915; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 25; + xor.b32 %r5923, %r5906, %r5875; + xor.b32 %r5924, %r5920, %r5889; + xor.b32 %r5925, %r5878, %r5903; + xor.b32 %r5926, %r5917, %r5892; + xor.b32 %r5927, %r5922, %r5891; + xor.b32 %r5928, %r5880, %r5905; + xor.b32 %r5929, %r5919, %r5894; + xor.b32 %r5930, %r5908, %r5877; + st.local.u8 [%rd179+145], %r5923; + shr.u32 %r5931, %r5923, 8; + st.local.u8 [%rd179+146], %r5931; + shr.u32 %r5932, %r5923, 16; + st.local.u8 [%rd179+147], %r5932; + shr.u32 %r5933, %r5923, 24; + st.local.u8 [%rd179+148], %r5933; + st.local.u8 [%rd179+149], %r5924; + shr.u32 %r5934, %r5924, 8; + st.local.u8 [%rd179+150], %r5934; + shr.u32 %r5935, %r5924, 16; + st.local.u8 [%rd179+151], %r5935; + shr.u32 %r5936, %r5924, 24; + st.local.u8 [%rd179+152], %r5936; + st.local.u8 [%rd179+153], %r5925; + shr.u32 %r5937, %r5925, 8; + st.local.u8 [%rd179+154], %r5937; + shr.u32 %r5938, %r5925, 16; + st.local.u8 [%rd179+155], %r5938; + shr.u32 %r5939, %r5925, 24; + st.local.u8 [%rd179+156], %r5939; + st.local.u8 [%rd179+157], %r5926; + shr.u32 %r5940, %r5926, 8; + st.local.u8 [%rd179+158], %r5940; + shr.u32 %r5941, %r5926, 16; + st.local.u8 [%rd179+159], %r5941; + shr.u32 %r5942, %r5926, 24; + st.local.u8 [%rd179+160], %r5942; + st.local.u8 [%rd179+161], %r5927; + shr.u32 %r5943, %r5927, 8; + st.local.u8 [%rd179+162], %r5943; + shr.u32 %r5944, %r5927, 16; + st.local.u8 [%rd179+163], %r5944; + shr.u32 %r5945, %r5927, 24; + st.local.u8 [%rd179+164], %r5945; + st.local.u8 [%rd179+165], %r5928; + 
shr.u32 %r5946, %r5928, 8; + st.local.u8 [%rd179+166], %r5946; + shr.u32 %r5947, %r5928, 16; + st.local.u8 [%rd179+167], %r5947; + shr.u32 %r5948, %r5928, 24; + st.local.u8 [%rd179+168], %r5948; + st.local.u8 [%rd179+169], %r5929; + shr.u32 %r5949, %r5929, 8; + st.local.u8 [%rd179+170], %r5949; + shr.u32 %r5950, %r5929, 16; + st.local.u8 [%rd179+171], %r5950; + shr.u32 %r5951, %r5929, 24; + st.local.u8 [%rd179+172], %r5951; + st.local.u8 [%rd179+173], %r5930; + shr.u32 %r5952, %r5930, 8; + st.local.u8 [%rd179+174], %r5952; + shr.u32 %r5953, %r5930, 16; + st.local.u8 [%rd179+175], %r5953; + shr.u32 %r5954, %r5930, 24; + st.local.u8 [%rd179+176], %r5954; + ld.local.u8 %rs176, [%rd3+8]; + add.s16 %rs177, %rs176, -1; + st.local.u8 [%rd3+8], %rs177; + cvt.u64.u16 %rd180, %rs177; + and.b64 %rd181, %rd180, 255; + setp.lt.u64 %p30, %rd227, %rd181; + and.b16 %rs178, %rs177, 255; + mul.wide.u16 %r11663, %rs178, 32; + @%p30 bra $L__BB1_34; + +$L__BB1_35: + cvt.s64.s32 %rd182, %r11663; + add.s64 %rd183, %rd2, %rd182; + mov.b32 {%rs179, %rs180}, %r3967; + st.local.u8 [%rd183+145], %rs179; + shr.u16 %rs181, %rs179, 8; + st.local.u8 [%rd183+146], %rs181; + st.local.u8 [%rd183+147], %rs180; + shr.u16 %rs182, %rs180, 8; + st.local.u8 [%rd183+148], %rs182; + mov.b32 {%rs183, %rs184}, %r3968; + st.local.u8 [%rd183+149], %rs183; + shr.u16 %rs185, %rs183, 8; + st.local.u8 [%rd183+150], %rs185; + st.local.u8 [%rd183+151], %rs184; + shr.u16 %rs186, %rs184, 8; + st.local.u8 [%rd183+152], %rs186; + mov.b32 {%rs187, %rs188}, %r3969; + st.local.u8 [%rd183+153], %rs187; + shr.u16 %rs189, %rs187, 8; + st.local.u8 [%rd183+154], %rs189; + st.local.u8 [%rd183+155], %rs188; + shr.u16 %rs190, %rs188, 8; + st.local.u8 [%rd183+156], %rs190; + mov.b32 {%rs191, %rs192}, %r3970; + st.local.u8 [%rd183+157], %rs191; + shr.u16 %rs193, %rs191, 8; + st.local.u8 [%rd183+158], %rs193; + st.local.u8 [%rd183+159], %rs192; + shr.u16 %rs194, %rs192, 8; + st.local.u8 [%rd183+160], %rs194; + mov.b32 {%rs195, %rs196}, %r3971; + st.local.u8 [%rd183+161], %rs195; + shr.u16 %rs197, %rs195, 8; + st.local.u8 [%rd183+162], %rs197; + st.local.u8 [%rd183+163], %rs196; + shr.u16 %rs198, %rs196, 8; + st.local.u8 [%rd183+164], %rs198; + mov.b32 {%rs199, %rs200}, %r3972; + st.local.u8 [%rd183+165], %rs199; + shr.u16 %rs201, %rs199, 8; + st.local.u8 [%rd183+166], %rs201; + st.local.u8 [%rd183+167], %rs200; + shr.u16 %rs202, %rs200, 8; + st.local.u8 [%rd183+168], %rs202; + mov.b32 {%rs203, %rs204}, %r3973; + st.local.u8 [%rd183+169], %rs203; + shr.u16 %rs205, %rs203, 8; + st.local.u8 [%rd183+170], %rs205; + st.local.u8 [%rd183+171], %rs204; + shr.u16 %rs206, %rs204, 8; + st.local.u8 [%rd183+172], %rs206; + mov.b32 {%rs207, %rs208}, %r3974; + st.local.u8 [%rd183+173], %rs207; + shr.u16 %rs209, %rs207, 8; + st.local.u8 [%rd183+174], %rs209; + st.local.u8 [%rd183+175], %rs208; + shr.u16 %rs210, %rs208, 8; + st.local.u8 [%rd183+176], %rs210; + ld.local.u8 %rs388, [%rd3+8]; + +$L__BB1_47: + add.s16 %rs331, %rs388, 1; + st.local.u8 [%rd3+8], %rs331; + ld.local.u64 %rd196, [%rd3+-72]; + shr.u64 %rd197, %rd49, 10; + add.s64 %rd251, %rd196, %rd197; + st.local.u64 [%rd3+-72], %rd251; + add.s64 %rd261, %rd261, %rd49; + add.s64 %rd254, %rd254, %rd49; + sub.s64 %rd262, %rd262, %rd49; + setp.gt.u64 %p39, %rd262, 1024; + @%p39 bra $L__BB1_26; + +$L__BB1_48: + setp.eq.s64 %p40, %rd262, 0; + @%p40 bra $L__BB1_68; + + ld.local.u8 %rs389, [%rd3]; + cvt.u64.u16 %rd71, %rs389; + setp.eq.s16 %p41, %rs389, 0; + mov.u16 %rs390, 0; + mov.u64 %rd271, %rd262; + @%p41 bra 
$L__BB1_57; + + mov.u64 %rd198, 64; + sub.s64 %rd199, %rd198, %rd71; + min.u64 %rd72, %rd199, %rd262; + setp.eq.s64 %p42, %rd72, 0; + @%p42 bra $L__BB1_54; + + add.s64 %rd201, %rd2, %rd71; + add.s64 %rd73, %rd201, 72; + mov.u64 %rd263, 0; + +$L__BB1_52: + add.s64 %rd202, %rd261, %rd263; + ld.local.u8 %rs333, [%rd202]; + add.s64 %rd203, %rd73, %rd263; + st.local.u8 [%rd203], %rs333; + add.s64 %rd263, %rd263, 1; + setp.lt.u64 %p43, %rd263, %rd72; + @%p43 bra $L__BB1_52; + + ld.local.u8 %rs389, [%rd3]; + +$L__BB1_54: + cvt.u16.u64 %rs334, %rd72; + add.s16 %rs390, %rs389, %rs334; + mov.u64 %rd271, 0; + st.local.u8 [%rd3], %rs390; + add.s64 %rd261, %rd261, %rd72; + sub.s64 %rd77, %rd262, %rd72; + setp.eq.s64 %p44, %rd77, 0; + @%p44 bra $L__BB1_57; + + add.s64 %rd78, %rd2, 72; + ld.local.u8 %rs335, [%rd3+1]; + mov.u64 %rd264, 0; + setp.eq.s16 %p45, %rs335, 0; + mov.u16 %rs390, 0; + selp.u16 %rs337, 1, 0, %p45; + ld.local.u8 %rs338, [%rd3+2]; + or.b16 %rs339, %rs338, %rs337; + ld.local.u8 %r8843, [%rd3+-64]; + ld.local.u8 %r8844, [%rd3+-63]; + prmt.b32 %r8845, %r8844, %r8843, 30212; + ld.local.u8 %r8846, [%rd3+-62]; + prmt.b32 %r8847, %r8846, %r8845, 28756; + ld.local.u8 %r8848, [%rd3+-61]; + prmt.b32 %r8849, %r8848, %r8847, 1620; + ld.local.u8 %r8850, [%rd3+-60]; + ld.local.u8 %r8851, [%rd3+-59]; + prmt.b32 %r8852, %r8851, %r8850, 30212; + ld.local.u8 %r8853, [%rd3+-58]; + prmt.b32 %r8854, %r8853, %r8852, 28756; + ld.local.u8 %r8855, [%rd3+-57]; + prmt.b32 %r8856, %r8855, %r8854, 1620; + ld.local.u8 %r8857, [%rd3+-56]; + ld.local.u8 %r8858, [%rd3+-55]; + prmt.b32 %r8859, %r8858, %r8857, 30212; + ld.local.u8 %r8860, [%rd3+-54]; + prmt.b32 %r8861, %r8860, %r8859, 28756; + ld.local.u8 %r8862, [%rd3+-53]; + prmt.b32 %r8863, %r8862, %r8861, 1620; + ld.local.u8 %r8864, [%rd3+-52]; + ld.local.u8 %r8865, [%rd3+-51]; + prmt.b32 %r8866, %r8865, %r8864, 30212; + ld.local.u8 %r8867, [%rd3+-50]; + prmt.b32 %r8868, %r8867, %r8866, 28756; + ld.local.u8 %r8869, [%rd3+-49]; + prmt.b32 %r8870, %r8869, %r8868, 1620; + ld.local.u8 %r8871, [%rd3+-48]; + ld.local.u8 %r8872, [%rd3+-47]; + prmt.b32 %r8873, %r8872, %r8871, 30212; + ld.local.u8 %r8874, [%rd3+-46]; + prmt.b32 %r8875, %r8874, %r8873, 28756; + ld.local.u8 %r8876, [%rd3+-45]; + prmt.b32 %r8877, %r8876, %r8875, 1620; + ld.local.u8 %r8878, [%rd3+-44]; + ld.local.u8 %r8879, [%rd3+-43]; + prmt.b32 %r8880, %r8879, %r8878, 30212; + ld.local.u8 %r8881, [%rd3+-42]; + prmt.b32 %r8882, %r8881, %r8880, 28756; + ld.local.u8 %r8883, [%rd3+-41]; + prmt.b32 %r8884, %r8883, %r8882, 1620; + ld.local.u8 %r8885, [%rd3+-40]; + ld.local.u8 %r8886, [%rd3+-39]; + prmt.b32 %r8887, %r8886, %r8885, 30212; + ld.local.u8 %r8888, [%rd3+-38]; + prmt.b32 %r8889, %r8888, %r8887, 28756; + ld.local.u8 %r8890, [%rd3+-37]; + prmt.b32 %r8891, %r8890, %r8889, 1620; + ld.local.u8 %r8892, [%rd3+-36]; + ld.local.u8 %r8893, [%rd3+-35]; + prmt.b32 %r8894, %r8893, %r8892, 30212; + ld.local.u8 %r8895, [%rd3+-34]; + prmt.b32 %r8896, %r8895, %r8894, 28756; + ld.local.u8 %r8897, [%rd3+-33]; + prmt.b32 %r8898, %r8897, %r8896, 1620; + ld.local.u8 %r8899, [%rd3+-32]; + ld.local.u8 %r8900, [%rd3+-31]; + prmt.b32 %r8901, %r8900, %r8899, 30212; + ld.local.u8 %r8902, [%rd3+-30]; + prmt.b32 %r8903, %r8902, %r8901, 28756; + ld.local.u8 %r8904, [%rd3+-29]; + prmt.b32 %r8905, %r8904, %r8903, 1620; + ld.local.u8 %r8906, [%rd3+-28]; + ld.local.u8 %r8907, [%rd3+-27]; + prmt.b32 %r8908, %r8907, %r8906, 30212; + ld.local.u8 %r8909, [%rd3+-26]; + prmt.b32 %r8910, %r8909, %r8908, 28756; + ld.local.u8 %r8911, [%rd3+-25]; + 
prmt.b32 %r8912, %r8911, %r8910, 1620; + ld.local.u8 %r8913, [%rd3+-24]; + ld.local.u8 %r8914, [%rd3+-23]; + prmt.b32 %r8915, %r8914, %r8913, 30212; + ld.local.u8 %r8916, [%rd3+-22]; + prmt.b32 %r8917, %r8916, %r8915, 28756; + ld.local.u8 %r8918, [%rd3+-21]; + prmt.b32 %r8919, %r8918, %r8917, 1620; + ld.local.u8 %r8920, [%rd3+-20]; + ld.local.u8 %r8921, [%rd3+-19]; + prmt.b32 %r8922, %r8921, %r8920, 30212; + ld.local.u8 %r8923, [%rd3+-18]; + prmt.b32 %r8924, %r8923, %r8922, 28756; + ld.local.u8 %r8925, [%rd3+-17]; + prmt.b32 %r8926, %r8925, %r8924, 1620; + ld.local.u8 %r8927, [%rd3+-16]; + ld.local.u8 %r8928, [%rd3+-15]; + prmt.b32 %r8929, %r8928, %r8927, 30212; + ld.local.u8 %r8930, [%rd3+-14]; + prmt.b32 %r8931, %r8930, %r8929, 28756; + ld.local.u8 %r8932, [%rd3+-13]; + prmt.b32 %r8933, %r8932, %r8931, 1620; + ld.local.u8 %r8934, [%rd3+-12]; + ld.local.u8 %r8935, [%rd3+-11]; + prmt.b32 %r8936, %r8935, %r8934, 30212; + ld.local.u8 %r8937, [%rd3+-10]; + prmt.b32 %r8938, %r8937, %r8936, 28756; + ld.local.u8 %r8939, [%rd3+-9]; + prmt.b32 %r8940, %r8939, %r8938, 1620; + ld.local.u8 %r8941, [%rd3+-8]; + ld.local.u8 %r8942, [%rd3+-7]; + prmt.b32 %r8943, %r8942, %r8941, 30212; + ld.local.u8 %r8944, [%rd3+-6]; + prmt.b32 %r8945, %r8944, %r8943, 28756; + ld.local.u8 %r8946, [%rd3+-5]; + prmt.b32 %r8947, %r8946, %r8945, 1620; + ld.local.u8 %r8948, [%rd3+-4]; + ld.local.u8 %r8949, [%rd3+-3]; + prmt.b32 %r8950, %r8949, %r8948, 30212; + ld.local.u8 %r8951, [%rd3+-2]; + prmt.b32 %r8952, %r8951, %r8950, 28756; + ld.local.u8 %r8953, [%rd3+-1]; + prmt.b32 %r8954, %r8953, %r8952, 1620; + ld.local.u64 %rd206, [%rd3+-72]; + cvt.u32.u64 %r8955, %rd206; + shr.u64 %rd207, %rd206, 32; + cvt.u32.u64 %r8956, %rd207; + cvt.u32.u16 %r8957, %rs339; + and.b32 %r8958, %r8957, 255; + ld.local.u32 %r8959, [%rd3+-104]; + add.s32 %r8960, %r8959, %r8849; + ld.local.u32 %r8961, [%rd3+-88]; + add.s32 %r8962, %r8960, %r8961; + xor.b32 %r8963, %r8962, %r8955; + shf.l.wrap.b32 %r8964, %r8963, %r8963, 16; + add.s32 %r8965, %r8964, 1779033703; + xor.b32 %r8966, %r8965, %r8961; + shf.l.wrap.b32 %r8967, %r8966, %r8966, 20; + add.s32 %r8968, %r8962, %r8856; + add.s32 %r8969, %r8968, %r8967; + xor.b32 %r8970, %r8969, %r8964; + shf.l.wrap.b32 %r8971, %r8970, %r8970, 24; + add.s32 %r8972, %r8971, %r8965; + xor.b32 %r8973, %r8972, %r8967; + shf.l.wrap.b32 %r8974, %r8973, %r8973, 25; + ld.local.u32 %r8975, [%rd3+-100]; + add.s32 %r8976, %r8975, %r8863; + ld.local.u32 %r8977, [%rd3+-84]; + add.s32 %r8978, %r8976, %r8977; + xor.b32 %r8979, %r8978, %r8956; + shf.l.wrap.b32 %r8980, %r8979, %r8979, 16; + add.s32 %r8981, %r8980, -1150833019; + xor.b32 %r8982, %r8981, %r8977; + shf.l.wrap.b32 %r8983, %r8982, %r8982, 20; + add.s32 %r8984, %r8978, %r8870; + add.s32 %r8985, %r8984, %r8983; + xor.b32 %r8986, %r8985, %r8980; + shf.l.wrap.b32 %r8987, %r8986, %r8986, 24; + add.s32 %r8988, %r8987, %r8981; + xor.b32 %r8989, %r8988, %r8983; + shf.l.wrap.b32 %r8990, %r8989, %r8989, 25; + ld.local.u32 %r8991, [%rd3+-96]; + add.s32 %r8992, %r8991, %r8877; + ld.local.u32 %r8993, [%rd3+-80]; + add.s32 %r8994, %r8992, %r8993; + shr.u32 %r8995, %r8994, 16; + shl.b32 %r8996, %r8994, 16; + xor.b32 %r8997, %r8996, 4194304; + or.b32 %r8998, %r8997, %r8995; + add.s32 %r8999, %r8998, 1013904242; + xor.b32 %r9000, %r8999, %r8993; + shf.l.wrap.b32 %r9001, %r9000, %r9000, 20; + add.s32 %r9002, %r8994, %r8884; + add.s32 %r9003, %r9002, %r9001; + xor.b32 %r9004, %r9003, %r8998; + shf.l.wrap.b32 %r9005, %r9004, %r9004, 24; + add.s32 %r9006, %r9005, %r8999; + xor.b32 %r9007, 
%r9006, %r9001; + shf.l.wrap.b32 %r9008, %r9007, %r9007, 25; + ld.local.u32 %r9009, [%rd3+-92]; + add.s32 %r9010, %r9009, %r8891; + ld.local.u32 %r9011, [%rd3+-76]; + add.s32 %r9012, %r9010, %r9011; + xor.b32 %r9013, %r9012, %r8958; + shr.u32 %r9014, %r9012, 16; + shl.b32 %r9015, %r9013, 16; + or.b32 %r9016, %r9015, %r9014; + add.s32 %r9017, %r9016, -1521486534; + xor.b32 %r9018, %r9017, %r9011; + shf.l.wrap.b32 %r9019, %r9018, %r9018, 20; + add.s32 %r9020, %r9012, %r8898; + add.s32 %r9021, %r9020, %r9019; + xor.b32 %r9022, %r9021, %r9016; + shf.l.wrap.b32 %r9023, %r9022, %r9022, 24; + add.s32 %r9024, %r9023, %r9017; + xor.b32 %r9025, %r9024, %r9019; + shf.l.wrap.b32 %r9026, %r9025, %r9025, 25; + add.s32 %r9027, %r8969, %r8905; + add.s32 %r9028, %r9027, %r8990; + xor.b32 %r9029, %r9028, %r9023; + shf.l.wrap.b32 %r9030, %r9029, %r9029, 16; + add.s32 %r9031, %r9030, %r9006; + xor.b32 %r9032, %r9031, %r8990; + shf.l.wrap.b32 %r9033, %r9032, %r9032, 20; + add.s32 %r9034, %r9028, %r8912; + add.s32 %r9035, %r9034, %r9033; + xor.b32 %r9036, %r9035, %r9030; + shf.l.wrap.b32 %r9037, %r9036, %r9036, 24; + add.s32 %r9038, %r9037, %r9031; + xor.b32 %r9039, %r9038, %r9033; + shf.l.wrap.b32 %r9040, %r9039, %r9039, 25; + add.s32 %r9041, %r8985, %r8919; + add.s32 %r9042, %r9041, %r9008; + xor.b32 %r9043, %r9042, %r8971; + shf.l.wrap.b32 %r9044, %r9043, %r9043, 16; + add.s32 %r9045, %r9044, %r9024; + xor.b32 %r9046, %r9045, %r9008; + shf.l.wrap.b32 %r9047, %r9046, %r9046, 20; + add.s32 %r9048, %r9042, %r8926; + add.s32 %r9049, %r9048, %r9047; + xor.b32 %r9050, %r9049, %r9044; + shf.l.wrap.b32 %r9051, %r9050, %r9050, 24; + add.s32 %r9052, %r9051, %r9045; + xor.b32 %r9053, %r9052, %r9047; + shf.l.wrap.b32 %r9054, %r9053, %r9053, 25; + add.s32 %r9055, %r9003, %r8933; + add.s32 %r9056, %r9055, %r9026; + xor.b32 %r9057, %r9056, %r8987; + shf.l.wrap.b32 %r9058, %r9057, %r9057, 16; + add.s32 %r9059, %r9058, %r8972; + xor.b32 %r9060, %r9059, %r9026; + shf.l.wrap.b32 %r9061, %r9060, %r9060, 20; + add.s32 %r9062, %r9056, %r8940; + add.s32 %r9063, %r9062, %r9061; + xor.b32 %r9064, %r9063, %r9058; + shf.l.wrap.b32 %r9065, %r9064, %r9064, 24; + add.s32 %r9066, %r9065, %r9059; + xor.b32 %r9067, %r9066, %r9061; + shf.l.wrap.b32 %r9068, %r9067, %r9067, 25; + add.s32 %r9069, %r9021, %r8947; + add.s32 %r9070, %r9069, %r8974; + xor.b32 %r9071, %r9070, %r9005; + shf.l.wrap.b32 %r9072, %r9071, %r9071, 16; + add.s32 %r9073, %r9072, %r8988; + xor.b32 %r9074, %r9073, %r8974; + shf.l.wrap.b32 %r9075, %r9074, %r9074, 20; + add.s32 %r9076, %r9070, %r8954; + add.s32 %r9077, %r9076, %r9075; + xor.b32 %r9078, %r9077, %r9072; + shf.l.wrap.b32 %r9079, %r9078, %r9078, 24; + add.s32 %r9080, %r9079, %r9073; + xor.b32 %r9081, %r9080, %r9075; + shf.l.wrap.b32 %r9082, %r9081, %r9081, 25; + add.s32 %r9083, %r9035, %r8863; + add.s32 %r9084, %r9083, %r9082; + xor.b32 %r9085, %r9084, %r9051; + shf.l.wrap.b32 %r9086, %r9085, %r9085, 16; + add.s32 %r9087, %r9086, %r9066; + xor.b32 %r9088, %r9087, %r9082; + shf.l.wrap.b32 %r9089, %r9088, %r9088, 20; + add.s32 %r9090, %r9084, %r8891; + add.s32 %r9091, %r9090, %r9089; + xor.b32 %r9092, %r9091, %r9086; + shf.l.wrap.b32 %r9093, %r9092, %r9092, 24; + add.s32 %r9094, %r9093, %r9087; + xor.b32 %r9095, %r9094, %r9089; + shf.l.wrap.b32 %r9096, %r9095, %r9095, 25; + add.s32 %r9097, %r9049, %r8870; + add.s32 %r9098, %r9097, %r9040; + xor.b32 %r9099, %r9098, %r9065; + shf.l.wrap.b32 %r9100, %r9099, %r9099, 16; + add.s32 %r9101, %r9100, %r9080; + xor.b32 %r9102, %r9101, %r9040; + shf.l.wrap.b32 %r9103, %r9102, 
%r9102, 20; + add.s32 %r9104, %r9098, %r8919; + add.s32 %r9105, %r9104, %r9103; + xor.b32 %r9106, %r9105, %r9100; + shf.l.wrap.b32 %r9107, %r9106, %r9106, 24; + add.s32 %r9108, %r9107, %r9101; + xor.b32 %r9109, %r9108, %r9103; + shf.l.wrap.b32 %r9110, %r9109, %r9109, 25; + add.s32 %r9111, %r9063, %r8898; + add.s32 %r9112, %r9111, %r9054; + xor.b32 %r9113, %r9112, %r9079; + shf.l.wrap.b32 %r9114, %r9113, %r9113, 16; + add.s32 %r9115, %r9114, %r9038; + xor.b32 %r9116, %r9115, %r9054; + shf.l.wrap.b32 %r9117, %r9116, %r9116, 20; + add.s32 %r9118, %r9112, %r8849; + add.s32 %r9119, %r9118, %r9117; + xor.b32 %r9120, %r9119, %r9114; + shf.l.wrap.b32 %r9121, %r9120, %r9120, 24; + add.s32 %r9122, %r9121, %r9115; + xor.b32 %r9123, %r9122, %r9117; + shf.l.wrap.b32 %r9124, %r9123, %r9123, 25; + add.s32 %r9125, %r9077, %r8877; + add.s32 %r9126, %r9125, %r9068; + xor.b32 %r9127, %r9126, %r9037; + shf.l.wrap.b32 %r9128, %r9127, %r9127, 16; + add.s32 %r9129, %r9128, %r9052; + xor.b32 %r9130, %r9129, %r9068; + shf.l.wrap.b32 %r9131, %r9130, %r9130, 20; + add.s32 %r9132, %r9126, %r8940; + add.s32 %r9133, %r9132, %r9131; + xor.b32 %r9134, %r9133, %r9128; + shf.l.wrap.b32 %r9135, %r9134, %r9134, 24; + add.s32 %r9136, %r9135, %r9129; + xor.b32 %r9137, %r9136, %r9131; + shf.l.wrap.b32 %r9138, %r9137, %r9137, 25; + add.s32 %r9139, %r9091, %r8856; + add.s32 %r9140, %r9139, %r9110; + xor.b32 %r9141, %r9140, %r9135; + shf.l.wrap.b32 %r9142, %r9141, %r9141, 16; + add.s32 %r9143, %r9142, %r9122; + xor.b32 %r9144, %r9143, %r9110; + shf.l.wrap.b32 %r9145, %r9144, %r9144, 20; + add.s32 %r9146, %r9140, %r8926; + add.s32 %r9147, %r9146, %r9145; + xor.b32 %r9148, %r9147, %r9142; + shf.l.wrap.b32 %r9149, %r9148, %r9148, 24; + add.s32 %r9150, %r9149, %r9143; + xor.b32 %r9151, %r9150, %r9145; + shf.l.wrap.b32 %r9152, %r9151, %r9151, 25; + add.s32 %r9153, %r9105, %r8933; + add.s32 %r9154, %r9153, %r9124; + xor.b32 %r9155, %r9154, %r9093; + shf.l.wrap.b32 %r9156, %r9155, %r9155, 16; + add.s32 %r9157, %r9156, %r9136; + xor.b32 %r9158, %r9157, %r9124; + shf.l.wrap.b32 %r9159, %r9158, %r9158, 20; + add.s32 %r9160, %r9154, %r8884; + add.s32 %r9161, %r9160, %r9159; + xor.b32 %r9162, %r9161, %r9156; + shf.l.wrap.b32 %r9163, %r9162, %r9162, 24; + add.s32 %r9164, %r9163, %r9157; + xor.b32 %r9165, %r9164, %r9159; + shf.l.wrap.b32 %r9166, %r9165, %r9165, 25; + add.s32 %r9167, %r9119, %r8912; + add.s32 %r9168, %r9167, %r9138; + xor.b32 %r9169, %r9168, %r9107; + shf.l.wrap.b32 %r9170, %r9169, %r9169, 16; + add.s32 %r9171, %r9170, %r9094; + xor.b32 %r9172, %r9171, %r9138; + shf.l.wrap.b32 %r9173, %r9172, %r9172, 20; + add.s32 %r9174, %r9168, %r8947; + add.s32 %r9175, %r9174, %r9173; + xor.b32 %r9176, %r9175, %r9170; + shf.l.wrap.b32 %r9177, %r9176, %r9176, 24; + add.s32 %r9178, %r9177, %r9171; + xor.b32 %r9179, %r9178, %r9173; + shf.l.wrap.b32 %r9180, %r9179, %r9179, 25; + add.s32 %r9181, %r9133, %r8954; + add.s32 %r9182, %r9181, %r9096; + xor.b32 %r9183, %r9182, %r9121; + shf.l.wrap.b32 %r9184, %r9183, %r9183, 16; + add.s32 %r9185, %r9184, %r9108; + xor.b32 %r9186, %r9185, %r9096; + shf.l.wrap.b32 %r9187, %r9186, %r9186, 20; + add.s32 %r9188, %r9182, %r8905; + add.s32 %r9189, %r9188, %r9187; + xor.b32 %r9190, %r9189, %r9184; + shf.l.wrap.b32 %r9191, %r9190, %r9190, 24; + add.s32 %r9192, %r9191, %r9185; + xor.b32 %r9193, %r9192, %r9187; + shf.l.wrap.b32 %r9194, %r9193, %r9193, 25; + add.s32 %r9195, %r9147, %r8870; + add.s32 %r9196, %r9195, %r9194; + xor.b32 %r9197, %r9196, %r9163; + shf.l.wrap.b32 %r9198, %r9197, %r9197, 16; + add.s32 
%r9199, %r9198, %r9178; + xor.b32 %r9200, %r9199, %r9194; + shf.l.wrap.b32 %r9201, %r9200, %r9200, 20; + add.s32 %r9202, %r9196, %r8877; + add.s32 %r9203, %r9202, %r9201; + xor.b32 %r9204, %r9203, %r9198; + shf.l.wrap.b32 %r9205, %r9204, %r9204, 24; + add.s32 %r9206, %r9205, %r9199; + xor.b32 %r9207, %r9206, %r9201; + shf.l.wrap.b32 %r9208, %r9207, %r9207, 25; + add.s32 %r9209, %r9161, %r8919; + add.s32 %r9210, %r9209, %r9152; + xor.b32 %r9211, %r9210, %r9177; + shf.l.wrap.b32 %r9212, %r9211, %r9211, 16; + add.s32 %r9213, %r9212, %r9192; + xor.b32 %r9214, %r9213, %r9152; + shf.l.wrap.b32 %r9215, %r9214, %r9214, 20; + add.s32 %r9216, %r9210, %r8933; + add.s32 %r9217, %r9216, %r9215; + xor.b32 %r9218, %r9217, %r9212; + shf.l.wrap.b32 %r9219, %r9218, %r9218, 24; + add.s32 %r9220, %r9219, %r9213; + xor.b32 %r9221, %r9220, %r9215; + shf.l.wrap.b32 %r9222, %r9221, %r9221, 25; + add.s32 %r9223, %r9175, %r8940; + add.s32 %r9224, %r9223, %r9166; + xor.b32 %r9225, %r9224, %r9191; + shf.l.wrap.b32 %r9226, %r9225, %r9225, 16; + add.s32 %r9227, %r9226, %r9150; + xor.b32 %r9228, %r9227, %r9166; + shf.l.wrap.b32 %r9229, %r9228, %r9228, 20; + add.s32 %r9230, %r9224, %r8863; + add.s32 %r9231, %r9230, %r9229; + xor.b32 %r9232, %r9231, %r9226; + shf.l.wrap.b32 %r9233, %r9232, %r9232, 24; + add.s32 %r9234, %r9233, %r9227; + xor.b32 %r9235, %r9234, %r9229; + shf.l.wrap.b32 %r9236, %r9235, %r9235, 25; + add.s32 %r9237, %r9189, %r8898; + add.s32 %r9238, %r9237, %r9180; + xor.b32 %r9239, %r9238, %r9149; + shf.l.wrap.b32 %r9240, %r9239, %r9239, 16; + add.s32 %r9241, %r9240, %r9164; + xor.b32 %r9242, %r9241, %r9180; + shf.l.wrap.b32 %r9243, %r9242, %r9242, 20; + add.s32 %r9244, %r9238, %r8947; + add.s32 %r9245, %r9244, %r9243; + xor.b32 %r9246, %r9245, %r9240; + shf.l.wrap.b32 %r9247, %r9246, %r9246, 24; + add.s32 %r9248, %r9247, %r9241; + xor.b32 %r9249, %r9248, %r9243; + shf.l.wrap.b32 %r9250, %r9249, %r9249, 25; + add.s32 %r9251, %r9203, %r8891; + add.s32 %r9252, %r9251, %r9222; + xor.b32 %r9253, %r9252, %r9247; + shf.l.wrap.b32 %r9254, %r9253, %r9253, 16; + add.s32 %r9255, %r9254, %r9234; + xor.b32 %r9256, %r9255, %r9222; + shf.l.wrap.b32 %r9257, %r9256, %r9256, 20; + add.s32 %r9258, %r9252, %r8884; + add.s32 %r9259, %r9258, %r9257; + xor.b32 %r9260, %r9259, %r9254; + shf.l.wrap.b32 %r9261, %r9260, %r9260, 24; + add.s32 %r9262, %r9261, %r9255; + xor.b32 %r9263, %r9262, %r9257; + shf.l.wrap.b32 %r9264, %r9263, %r9263, 25; + add.s32 %r9265, %r9217, %r8912; + add.s32 %r9266, %r9265, %r9236; + xor.b32 %r9267, %r9266, %r9205; + shf.l.wrap.b32 %r9268, %r9267, %r9267, 16; + add.s32 %r9269, %r9268, %r9248; + xor.b32 %r9270, %r9269, %r9236; + shf.l.wrap.b32 %r9271, %r9270, %r9270, 20; + add.s32 %r9272, %r9266, %r8849; + add.s32 %r9273, %r9272, %r9271; + xor.b32 %r9274, %r9273, %r9268; + shf.l.wrap.b32 %r9275, %r9274, %r9274, 24; + add.s32 %r9276, %r9275, %r9269; + xor.b32 %r9277, %r9276, %r9271; + shf.l.wrap.b32 %r9278, %r9277, %r9277, 25; + add.s32 %r9279, %r9231, %r8926; + add.s32 %r9280, %r9279, %r9250; + xor.b32 %r9281, %r9280, %r9219; + shf.l.wrap.b32 %r9282, %r9281, %r9281, 16; + add.s32 %r9283, %r9282, %r9206; + xor.b32 %r9284, %r9283, %r9250; + shf.l.wrap.b32 %r9285, %r9284, %r9284, 20; + add.s32 %r9286, %r9280, %r8954; + add.s32 %r9287, %r9286, %r9285; + xor.b32 %r9288, %r9287, %r9282; + shf.l.wrap.b32 %r9289, %r9288, %r9288, 24; + add.s32 %r9290, %r9289, %r9283; + xor.b32 %r9291, %r9290, %r9285; + shf.l.wrap.b32 %r9292, %r9291, %r9291, 25; + add.s32 %r9293, %r9245, %r8905; + add.s32 %r9294, %r9293, %r9208; + 
xor.b32 %r9295, %r9294, %r9233; + shf.l.wrap.b32 %r9296, %r9295, %r9295, 16; + add.s32 %r9297, %r9296, %r9220; + xor.b32 %r9298, %r9297, %r9208; + shf.l.wrap.b32 %r9299, %r9298, %r9298, 20; + add.s32 %r9300, %r9294, %r8856; + add.s32 %r9301, %r9300, %r9299; + xor.b32 %r9302, %r9301, %r9296; + shf.l.wrap.b32 %r9303, %r9302, %r9302, 24; + add.s32 %r9304, %r9303, %r9297; + xor.b32 %r9305, %r9304, %r9299; + shf.l.wrap.b32 %r9306, %r9305, %r9305, 25; + add.s32 %r9307, %r9259, %r8919; + add.s32 %r9308, %r9307, %r9306; + xor.b32 %r9309, %r9308, %r9275; + shf.l.wrap.b32 %r9310, %r9309, %r9309, 16; + add.s32 %r9311, %r9310, %r9290; + xor.b32 %r9312, %r9311, %r9306; + shf.l.wrap.b32 %r9313, %r9312, %r9312, 20; + add.s32 %r9314, %r9308, %r8898; + add.s32 %r9315, %r9314, %r9313; + xor.b32 %r9316, %r9315, %r9310; + shf.l.wrap.b32 %r9317, %r9316, %r9316, 24; + add.s32 %r9318, %r9317, %r9311; + xor.b32 %r9319, %r9318, %r9313; + shf.l.wrap.b32 %r9320, %r9319, %r9319, 25; + add.s32 %r9321, %r9273, %r8933; + add.s32 %r9322, %r9321, %r9264; + xor.b32 %r9323, %r9322, %r9289; + shf.l.wrap.b32 %r9324, %r9323, %r9323, 16; + add.s32 %r9325, %r9324, %r9304; + xor.b32 %r9326, %r9325, %r9264; + shf.l.wrap.b32 %r9327, %r9326, %r9326, 20; + add.s32 %r9328, %r9322, %r8912; + add.s32 %r9329, %r9328, %r9327; + xor.b32 %r9330, %r9329, %r9324; + shf.l.wrap.b32 %r9331, %r9330, %r9330, 24; + add.s32 %r9332, %r9331, %r9325; + xor.b32 %r9333, %r9332, %r9327; + shf.l.wrap.b32 %r9334, %r9333, %r9333, 25; + add.s32 %r9335, %r9287, %r8947; + add.s32 %r9336, %r9335, %r9278; + xor.b32 %r9337, %r9336, %r9303; + shf.l.wrap.b32 %r9338, %r9337, %r9337, 16; + add.s32 %r9339, %r9338, %r9262; + xor.b32 %r9340, %r9339, %r9278; + shf.l.wrap.b32 %r9341, %r9340, %r9340, 20; + add.s32 %r9342, %r9336, %r8870; + add.s32 %r9343, %r9342, %r9341; + xor.b32 %r9344, %r9343, %r9338; + shf.l.wrap.b32 %r9345, %r9344, %r9344, 24; + add.s32 %r9346, %r9345, %r9339; + xor.b32 %r9347, %r9346, %r9341; + shf.l.wrap.b32 %r9348, %r9347, %r9347, 25; + add.s32 %r9349, %r9301, %r8940; + add.s32 %r9350, %r9349, %r9292; + xor.b32 %r9351, %r9350, %r9261; + shf.l.wrap.b32 %r9352, %r9351, %r9351, 16; + add.s32 %r9353, %r9352, %r9276; + xor.b32 %r9354, %r9353, %r9292; + shf.l.wrap.b32 %r9355, %r9354, %r9354, 20; + add.s32 %r9356, %r9350, %r8954; + add.s32 %r9357, %r9356, %r9355; + xor.b32 %r9358, %r9357, %r9352; + shf.l.wrap.b32 %r9359, %r9358, %r9358, 24; + add.s32 %r9360, %r9359, %r9353; + xor.b32 %r9361, %r9360, %r9355; + shf.l.wrap.b32 %r9362, %r9361, %r9361, 25; + add.s32 %r9363, %r9315, %r8877; + add.s32 %r9364, %r9363, %r9334; + xor.b32 %r9365, %r9364, %r9359; + shf.l.wrap.b32 %r9366, %r9365, %r9365, 16; + add.s32 %r9367, %r9366, %r9346; + xor.b32 %r9368, %r9367, %r9334; + shf.l.wrap.b32 %r9369, %r9368, %r9368, 20; + add.s32 %r9370, %r9364, %r8849; + add.s32 %r9371, %r9370, %r9369; + xor.b32 %r9372, %r9371, %r9366; + shf.l.wrap.b32 %r9373, %r9372, %r9372, 24; + add.s32 %r9374, %r9373, %r9367; + xor.b32 %r9375, %r9374, %r9369; + shf.l.wrap.b32 %r9376, %r9375, %r9375, 25; + add.s32 %r9377, %r9329, %r8926; + add.s32 %r9378, %r9377, %r9348; + xor.b32 %r9379, %r9378, %r9317; + shf.l.wrap.b32 %r9380, %r9379, %r9379, 16; + add.s32 %r9381, %r9380, %r9360; + xor.b32 %r9382, %r9381, %r9348; + shf.l.wrap.b32 %r9383, %r9382, %r9382, 20; + add.s32 %r9384, %r9378, %r8863; + add.s32 %r9385, %r9384, %r9383; + xor.b32 %r9386, %r9385, %r9380; + shf.l.wrap.b32 %r9387, %r9386, %r9386, 24; + add.s32 %r9388, %r9387, %r9381; + xor.b32 %r9389, %r9388, %r9383; + shf.l.wrap.b32 %r9390, 
%r9389, %r9389, 25; + add.s32 %r9391, %r9343, %r8884; + add.s32 %r9392, %r9391, %r9362; + xor.b32 %r9393, %r9392, %r9331; + shf.l.wrap.b32 %r9394, %r9393, %r9393, 16; + add.s32 %r9395, %r9394, %r9318; + xor.b32 %r9396, %r9395, %r9362; + shf.l.wrap.b32 %r9397, %r9396, %r9396, 20; + add.s32 %r9398, %r9392, %r8905; + add.s32 %r9399, %r9398, %r9397; + xor.b32 %r9400, %r9399, %r9394; + shf.l.wrap.b32 %r9401, %r9400, %r9400, 24; + add.s32 %r9402, %r9401, %r9395; + xor.b32 %r9403, %r9402, %r9397; + shf.l.wrap.b32 %r9404, %r9403, %r9403, 25; + add.s32 %r9405, %r9357, %r8856; + add.s32 %r9406, %r9405, %r9320; + xor.b32 %r9407, %r9406, %r9345; + shf.l.wrap.b32 %r9408, %r9407, %r9407, 16; + add.s32 %r9409, %r9408, %r9332; + xor.b32 %r9410, %r9409, %r9320; + shf.l.wrap.b32 %r9411, %r9410, %r9410, 20; + add.s32 %r9412, %r9406, %r8891; + add.s32 %r9413, %r9412, %r9411; + xor.b32 %r9414, %r9413, %r9408; + shf.l.wrap.b32 %r9415, %r9414, %r9414, 24; + add.s32 %r9416, %r9415, %r9409; + xor.b32 %r9417, %r9416, %r9411; + shf.l.wrap.b32 %r9418, %r9417, %r9417, 25; + add.s32 %r9419, %r9371, %r8933; + add.s32 %r9420, %r9419, %r9418; + xor.b32 %r9421, %r9420, %r9387; + shf.l.wrap.b32 %r9422, %r9421, %r9421, 16; + add.s32 %r9423, %r9422, %r9402; + xor.b32 %r9424, %r9423, %r9418; + shf.l.wrap.b32 %r9425, %r9424, %r9424, 20; + add.s32 %r9426, %r9420, %r8940; + add.s32 %r9427, %r9426, %r9425; + xor.b32 %r9428, %r9427, %r9422; + shf.l.wrap.b32 %r9429, %r9428, %r9428, 24; + add.s32 %r9430, %r9429, %r9423; + xor.b32 %r9431, %r9430, %r9425; + shf.l.wrap.b32 %r9432, %r9431, %r9431, 25; + add.s32 %r9433, %r9385, %r8912; + add.s32 %r9434, %r9433, %r9376; + xor.b32 %r9435, %r9434, %r9401; + shf.l.wrap.b32 %r9436, %r9435, %r9435, 16; + add.s32 %r9437, %r9436, %r9416; + xor.b32 %r9438, %r9437, %r9376; + shf.l.wrap.b32 %r9439, %r9438, %r9438, 20; + add.s32 %r9440, %r9434, %r8926; + add.s32 %r9441, %r9440, %r9439; + xor.b32 %r9442, %r9441, %r9436; + shf.l.wrap.b32 %r9443, %r9442, %r9442, 24; + add.s32 %r9444, %r9443, %r9437; + xor.b32 %r9445, %r9444, %r9439; + shf.l.wrap.b32 %r9446, %r9445, %r9445, 25; + add.s32 %r9447, %r9399, %r8954; + add.s32 %r9448, %r9447, %r9390; + xor.b32 %r9449, %r9448, %r9415; + shf.l.wrap.b32 %r9450, %r9449, %r9449, 16; + add.s32 %r9451, %r9450, %r9374; + xor.b32 %r9452, %r9451, %r9390; + shf.l.wrap.b32 %r9453, %r9452, %r9452, 20; + add.s32 %r9454, %r9448, %r8919; + add.s32 %r9455, %r9454, %r9453; + xor.b32 %r9456, %r9455, %r9450; + shf.l.wrap.b32 %r9457, %r9456, %r9456, 24; + add.s32 %r9458, %r9457, %r9451; + xor.b32 %r9459, %r9458, %r9453; + shf.l.wrap.b32 %r9460, %r9459, %r9459, 25; + add.s32 %r9461, %r9413, %r8947; + add.s32 %r9462, %r9461, %r9404; + xor.b32 %r9463, %r9462, %r9373; + shf.l.wrap.b32 %r9464, %r9463, %r9463, 16; + add.s32 %r9465, %r9464, %r9388; + xor.b32 %r9466, %r9465, %r9404; + shf.l.wrap.b32 %r9467, %r9466, %r9466, 20; + add.s32 %r9468, %r9462, %r8905; + add.s32 %r9469, %r9468, %r9467; + xor.b32 %r9470, %r9469, %r9464; + shf.l.wrap.b32 %r9471, %r9470, %r9470, 24; + add.s32 %r9472, %r9471, %r9465; + xor.b32 %r9473, %r9472, %r9467; + shf.l.wrap.b32 %r9474, %r9473, %r9473, 25; + add.s32 %r9475, %r9427, %r8898; + add.s32 %r9476, %r9475, %r9446; + xor.b32 %r9477, %r9476, %r9471; + shf.l.wrap.b32 %r9478, %r9477, %r9477, 16; + add.s32 %r9479, %r9478, %r9458; + xor.b32 %r9480, %r9479, %r9446; + shf.l.wrap.b32 %r9481, %r9480, %r9480, 20; + add.s32 %r9482, %r9476, %r8863; + add.s32 %r9483, %r9482, %r9481; + xor.b32 %r9484, %r9483, %r9478; + shf.l.wrap.b32 %r9485, %r9484, %r9484, 24; + 
add.s32 %r9486, %r9485, %r9479; + xor.b32 %r9487, %r9486, %r9481; + shf.l.wrap.b32 %r9488, %r9487, %r9487, 25; + add.s32 %r9489, %r9441, %r8884; + add.s32 %r9490, %r9489, %r9460; + xor.b32 %r9491, %r9490, %r9429; + shf.l.wrap.b32 %r9492, %r9491, %r9491, 16; + add.s32 %r9493, %r9492, %r9472; + xor.b32 %r9494, %r9493, %r9460; + shf.l.wrap.b32 %r9495, %r9494, %r9494, 20; + add.s32 %r9496, %r9490, %r8870; + add.s32 %r9497, %r9496, %r9495; + xor.b32 %r9498, %r9497, %r9492; + shf.l.wrap.b32 %r9499, %r9498, %r9498, 24; + add.s32 %r9500, %r9499, %r9493; + xor.b32 %r9501, %r9500, %r9495; + shf.l.wrap.b32 %r9502, %r9501, %r9501, 25; + add.s32 %r9503, %r9455, %r8849; + add.s32 %r9504, %r9503, %r9474; + xor.b32 %r9505, %r9504, %r9443; + shf.l.wrap.b32 %r9506, %r9505, %r9505, 16; + add.s32 %r9507, %r9506, %r9430; + xor.b32 %r9508, %r9507, %r9474; + shf.l.wrap.b32 %r9509, %r9508, %r9508, 20; + add.s32 %r9510, %r9504, %r8856; + add.s32 %r9511, %r9510, %r9509; + xor.b32 %r9512, %r9511, %r9506; + shf.l.wrap.b32 %r9513, %r9512, %r9512, 24; + add.s32 %r9514, %r9513, %r9507; + xor.b32 %r9515, %r9514, %r9509; + shf.l.wrap.b32 %r9516, %r9515, %r9515, 25; + add.s32 %r9517, %r9469, %r8891; + add.s32 %r9518, %r9517, %r9432; + xor.b32 %r9519, %r9518, %r9457; + shf.l.wrap.b32 %r9520, %r9519, %r9519, 16; + add.s32 %r9521, %r9520, %r9444; + xor.b32 %r9522, %r9521, %r9432; + shf.l.wrap.b32 %r9523, %r9522, %r9522, 20; + add.s32 %r9524, %r9518, %r8877; + add.s32 %r9525, %r9524, %r9523; + xor.b32 %r9526, %r9525, %r9520; + shf.l.wrap.b32 %r9527, %r9526, %r9526, 24; + add.s32 %r9528, %r9527, %r9521; + xor.b32 %r9529, %r9528, %r9523; + shf.l.wrap.b32 %r9530, %r9529, %r9529, 25; + add.s32 %r9531, %r9483, %r8912; + add.s32 %r9532, %r9531, %r9530; + xor.b32 %r9533, %r9532, %r9499; + shf.l.wrap.b32 %r9534, %r9533, %r9533, 16; + add.s32 %r9535, %r9534, %r9514; + xor.b32 %r9536, %r9535, %r9530; + shf.l.wrap.b32 %r9537, %r9536, %r9536, 20; + add.s32 %r9538, %r9532, %r8947; + add.s32 %r9539, %r9538, %r9537; + xor.b32 %r9540, %r9539, %r9534; + shf.l.wrap.b32 %r9541, %r9540, %r9540, 24; + add.s32 %r9542, %r9541, %r9535; + xor.b32 %r9543, %r9542, %r9537; + shf.l.wrap.b32 %r9544, %r9543, %r9543, 25; + add.s32 %r9545, %r9497, %r8926; + add.s32 %r9546, %r9545, %r9488; + xor.b32 %r9547, %r9546, %r9513; + shf.l.wrap.b32 %r9548, %r9547, %r9547, 16; + add.s32 %r9549, %r9548, %r9528; + xor.b32 %r9550, %r9549, %r9488; + shf.l.wrap.b32 %r9551, %r9550, %r9550, 20; + add.s32 %r9552, %r9546, %r8884; + add.s32 %r9553, %r9552, %r9551; + xor.b32 %r9554, %r9553, %r9548; + shf.l.wrap.b32 %r9555, %r9554, %r9554, 24; + add.s32 %r9556, %r9555, %r9549; + xor.b32 %r9557, %r9556, %r9551; + shf.l.wrap.b32 %r9558, %r9557, %r9557, 25; + add.s32 %r9559, %r9511, %r8905; + add.s32 %r9560, %r9559, %r9502; + xor.b32 %r9561, %r9560, %r9527; + shf.l.wrap.b32 %r9562, %r9561, %r9561, 16; + add.s32 %r9563, %r9562, %r9486; + xor.b32 %r9564, %r9563, %r9502; + shf.l.wrap.b32 %r9565, %r9564, %r9564, 20; + add.s32 %r9566, %r9560, %r8933; + add.s32 %r9567, %r9566, %r9565; + xor.b32 %r9568, %r9567, %r9562; + shf.l.wrap.b32 %r9569, %r9568, %r9568, 24; + add.s32 %r9570, %r9569, %r9563; + xor.b32 %r9571, %r9570, %r9565; + shf.l.wrap.b32 %r9572, %r9571, %r9571, 25; + add.s32 %r9573, %r9525, %r8954; + add.s32 %r9574, %r9573, %r9516; + xor.b32 %r9575, %r9574, %r9485; + shf.l.wrap.b32 %r9576, %r9575, %r9575, 16; + add.s32 %r9577, %r9576, %r9500; + xor.b32 %r9578, %r9577, %r9516; + shf.l.wrap.b32 %r9579, %r9578, %r9578, 20; + add.s32 %r9580, %r9574, %r8856; + add.s32 %r9581, %r9580, 
%r9579; + xor.b32 %r9582, %r9581, %r9576; + shf.l.wrap.b32 %r9583, %r9582, %r9582, 24; + add.s32 %r9584, %r9583, %r9577; + xor.b32 %r9585, %r9584, %r9579; + shf.l.wrap.b32 %r9586, %r9585, %r9585, 25; + add.s32 %r9587, %r9539, %r8940; + add.s32 %r9588, %r9587, %r9558; + xor.b32 %r9589, %r9588, %r9583; + shf.l.wrap.b32 %r9590, %r9589, %r9589, 16; + add.s32 %r9591, %r9590, %r9570; + xor.b32 %r9592, %r9591, %r9558; + shf.l.wrap.b32 %r9593, %r9592, %r9592, 20; + add.s32 %r9594, %r9588, %r8870; + add.s32 %r9595, %r9594, %r9593; + xor.b32 %r9596, %r9595, %r9590; + shf.l.wrap.b32 %r9597, %r9596, %r9596, 24; + add.s32 %r9598, %r9597, %r9591; + xor.b32 %r9599, %r9598, %r9593; + shf.l.wrap.b32 %r9600, %r9599, %r9599, 25; + add.s32 %r9601, %r9553, %r8849; + add.s32 %r9602, %r9601, %r9572; + xor.b32 %r9603, %r9602, %r9541; + shf.l.wrap.b32 %r9604, %r9603, %r9603, 16; + add.s32 %r9605, %r9604, %r9584; + xor.b32 %r9606, %r9605, %r9572; + shf.l.wrap.b32 %r9607, %r9606, %r9606, 20; + add.s32 %r9608, %r9602, %r8919; + add.s32 %r9609, %r9608, %r9607; + xor.b32 %r9610, %r9609, %r9604; + shf.l.wrap.b32 %r9611, %r9610, %r9610, 24; + add.s32 %r9612, %r9611, %r9605; + xor.b32 %r9613, %r9612, %r9607; + shf.l.wrap.b32 %r9614, %r9613, %r9613, 25; + add.s32 %r9615, %r9567, %r8863; + add.s32 %r9616, %r9615, %r9586; + xor.b32 %r9617, %r9616, %r9555; + shf.l.wrap.b32 %r9618, %r9617, %r9617, 16; + add.s32 %r9619, %r9618, %r9542; + xor.b32 %r9620, %r9619, %r9586; + shf.l.wrap.b32 %r9621, %r9620, %r9620, 20; + add.s32 %r9622, %r9616, %r8891; + add.s32 %r9623, %r9622, %r9621; + xor.b32 %r9624, %r9623, %r9618; + shf.l.wrap.b32 %r9625, %r9624, %r9624, 24; + add.s32 %r9626, %r9625, %r9619; + xor.b32 %r9627, %r9626, %r9621; + shf.l.wrap.b32 %r9628, %r9627, %r9627, 25; + add.s32 %r9629, %r9581, %r8877; + add.s32 %r9630, %r9629, %r9544; + xor.b32 %r9631, %r9630, %r9569; + shf.l.wrap.b32 %r9632, %r9631, %r9631, 16; + add.s32 %r9633, %r9632, %r9556; + xor.b32 %r9634, %r9633, %r9544; + shf.l.wrap.b32 %r9635, %r9634, %r9634, 20; + add.s32 %r9636, %r9630, %r8898; + add.s32 %r9637, %r9636, %r9635; + xor.b32 %r9638, %r9637, %r9632; + shf.l.wrap.b32 %r9639, %r9638, %r9638, 24; + add.s32 %r9640, %r9639, %r9633; + xor.b32 %r9641, %r9640, %r9635; + shf.l.wrap.b32 %r9642, %r9641, %r9641, 25; + add.s32 %r9643, %r9595, %r8926; + add.s32 %r9644, %r9643, %r9642; + xor.b32 %r9645, %r9644, %r9611; + shf.l.wrap.b32 %r9646, %r9645, %r9645, 16; + add.s32 %r9647, %r9646, %r9626; + xor.b32 %r9648, %r9647, %r9642; + shf.l.wrap.b32 %r9649, %r9648, %r9648, 20; + add.s32 %r9650, %r9644, %r8954; + add.s32 %r9651, %r9650, %r9649; + xor.b32 %r9652, %r9651, %r9646; + shf.l.wrap.b32 %r9653, %r9652, %r9652, 24; + add.s32 %r9654, %r9653, %r9647; + xor.b32 %r9655, %r9654, %r9649; + shf.l.wrap.b32 %r9656, %r9655, %r9655, 25; + add.s32 %r9657, %r9609, %r8884; + add.s32 %r9658, %r9657, %r9600; + xor.b32 %r9659, %r9658, %r9625; + shf.l.wrap.b32 %r9660, %r9659, %r9659, 16; + add.s32 %r9661, %r9660, %r9640; + xor.b32 %r9662, %r9661, %r9600; + shf.l.wrap.b32 %r9663, %r9662, %r9662, 20; + add.s32 %r9664, %r9658, %r8849; + add.s32 %r9665, %r9664, %r9663; + xor.b32 %r9666, %r9665, %r9660; + shf.l.wrap.b32 %r9667, %r9666, %r9666, 24; + add.s32 %r9668, %r9667, %r9661; + xor.b32 %r9669, %r9668, %r9663; + shf.l.wrap.b32 %r9670, %r9669, %r9669, 25; + add.s32 %r9671, %r9623, %r8856; + add.s32 %r9672, %r9671, %r9614; + xor.b32 %r9673, %r9672, %r9639; + shf.l.wrap.b32 %r9674, %r9673, %r9673, 16; + add.s32 %r9675, %r9674, %r9598; + xor.b32 %r9676, %r9675, %r9614; + shf.l.wrap.b32 
%r9677, %r9676, %r9676, 20; + add.s32 %r9678, %r9672, %r8912; + add.s32 %r9679, %r9678, %r9677; + xor.b32 %r9680, %r9679, %r9674; + shf.l.wrap.b32 %r9681, %r9680, %r9680, 24; + add.s32 %r9682, %r9681, %r9675; + xor.b32 %r9683, %r9682, %r9677; + shf.l.wrap.b32 %r9684, %r9683, %r9683, 25; + add.s32 %r9685, %r9637, %r8905; + add.s32 %r9686, %r9685, %r9628; + xor.b32 %r9687, %r9686, %r9597; + shf.l.wrap.b32 %r9688, %r9687, %r9687, 16; + add.s32 %r9689, %r9688, %r9612; + xor.b32 %r9690, %r9689, %r9628; + shf.l.wrap.b32 %r9691, %r9690, %r9690, 20; + add.s32 %r9692, %r9686, %r8891; + add.s32 %r9693, %r9692, %r9691; + xor.b32 %r9694, %r9693, %r9688; + shf.l.wrap.b32 %r9695, %r9694, %r9694, 24; + add.s32 %r9696, %r9695, %r9689; + xor.b32 %r9697, %r9696, %r9691; + shf.l.wrap.b32 %r9698, %r9697, %r9697, 25; + add.s32 %r9699, %r9651, %r8947; + add.s32 %r9700, %r9699, %r9670; + xor.b32 %r9701, %r9700, %r9695; + shf.l.wrap.b32 %r9702, %r9701, %r9701, 16; + add.s32 %r9703, %r9702, %r9682; + xor.b32 %r9704, %r9703, %r9670; + shf.l.wrap.b32 %r9705, %r9704, %r9704, 20; + add.s32 %r9706, %r9700, %r8919; + add.s32 %r9707, %r9706, %r9705; + xor.b32 %r9708, %r9707, %r9702; + shf.l.wrap.b32 %r9709, %r9708, %r9708, 24; + add.s32 %r9710, %r9709, %r9703; + xor.b32 %r9711, %r9710, %r9705; + shf.l.wrap.b32 %r9712, %r9711, %r9711, 25; + add.s32 %r9713, %r9665, %r8863; + add.s32 %r9714, %r9713, %r9684; + xor.b32 %r9715, %r9714, %r9653; + shf.l.wrap.b32 %r9716, %r9715, %r9715, 16; + add.s32 %r9717, %r9716, %r9696; + xor.b32 %r9718, %r9717, %r9684; + shf.l.wrap.b32 %r9719, %r9718, %r9718, 20; + add.s32 %r9720, %r9714, %r8933; + add.s32 %r9721, %r9720, %r9719; + xor.b32 %r9722, %r9721, %r9716; + shf.l.wrap.b32 %r9723, %r9722, %r9722, 24; + add.s32 %r9724, %r9723, %r9717; + xor.b32 %r9725, %r9724, %r9719; + shf.l.wrap.b32 %r9726, %r9725, %r9725, 25; + add.s32 %r9727, %r9679, %r8870; + add.s32 %r9728, %r9727, %r9698; + xor.b32 %r9729, %r9728, %r9667; + shf.l.wrap.b32 %r9730, %r9729, %r9729, 16; + add.s32 %r9731, %r9730, %r9654; + xor.b32 %r9732, %r9731, %r9698; + shf.l.wrap.b32 %r9733, %r9732, %r9732, 20; + add.s32 %r9734, %r9728, %r8877; + add.s32 %r9735, %r9734, %r9733; + xor.b32 %r9736, %r9735, %r9730; + shf.l.wrap.b32 %r9737, %r9736, %r9736, 24; + add.s32 %r9738, %r9737, %r9731; + xor.b32 %r9739, %r9738, %r9733; + shf.l.wrap.b32 %r9740, %r9739, %r9739, 25; + add.s32 %r9741, %r9693, %r8898; + add.s32 %r9742, %r9741, %r9656; + xor.b32 %r9743, %r9742, %r9681; + shf.l.wrap.b32 %r9744, %r9743, %r9743, 16; + add.s32 %r9745, %r9744, %r9668; + xor.b32 %r9746, %r9745, %r9656; + shf.l.wrap.b32 %r9747, %r9746, %r9746, 20; + add.s32 %r9748, %r9742, %r8940; + add.s32 %r9749, %r9748, %r9747; + xor.b32 %r9750, %r9749, %r9744; + shf.l.wrap.b32 %r9751, %r9750, %r9750, 24; + add.s32 %r9752, %r9751, %r9745; + xor.b32 %r9753, %r9752, %r9747; + shf.l.wrap.b32 %r9754, %r9753, %r9753, 25; + xor.b32 %r9755, %r9738, %r9707; + st.local.u32 [%rd3+-104], %r9755; + xor.b32 %r9756, %r9752, %r9721; + st.local.u32 [%rd3+-100], %r9756; + xor.b32 %r9757, %r9710, %r9735; + st.local.u32 [%rd3+-96], %r9757; + xor.b32 %r9758, %r9724, %r9749; + st.local.u32 [%rd3+-92], %r9758; + xor.b32 %r9759, %r9754, %r9723; + st.local.u32 [%rd3+-88], %r9759; + xor.b32 %r9760, %r9712, %r9737; + st.local.u32 [%rd3+-84], %r9760; + xor.b32 %r9761, %r9726, %r9751; + st.local.u32 [%rd3+-80], %r9761; + xor.b32 %r9762, %r9740, %r9709; + st.local.u32 [%rd3+-76], %r9762; + add.s16 %rs340, %rs335, 1; + st.local.v2.u8 [%rd3], {%rs390, %rs340}; + +$L__BB1_56: + add.s64 %rd208, %rd78, 
%rd264; + st.local.u8 [%rd208], %rs390; + add.s64 %rd264, %rd264, 1; + setp.lt.u64 %p46, %rd264, 64; + mov.u64 %rd271, %rd77; + @%p46 bra $L__BB1_56; + +$L__BB1_57: + setp.gt.u64 %p47, %rd271, 64; + @%p47 bra $L__BB1_59; + bra.uni $L__BB1_58; + +$L__BB1_59: + ld.local.u8 %rs95, [%rd3+2]; + ld.local.u8 %rs391, [%rd3+1]; + ld.local.u32 %r11689, [%rd3+-104]; + ld.local.u32 %r11688, [%rd3+-100]; + ld.local.u32 %r11687, [%rd3+-96]; + ld.local.u32 %r11686, [%rd3+-92]; + ld.local.u32 %r11685, [%rd3+-88]; + ld.local.u32 %r11684, [%rd3+-84]; + ld.local.u32 %r11683, [%rd3+-80]; + ld.local.u32 %r11682, [%rd3+-76]; + ld.local.u64 %rd269, [%rd3+-72]; + cvt.u32.u64 %r117, %rd269; + shr.u64 %rd209, %rd269, 32; + cvt.u32.u64 %r118, %rd209; + +$L__BB1_60: + and.b16 %rs342, %rs391, 255; + setp.eq.s16 %p48, %rs342, 0; + selp.u16 %rs343, 1, 0, %p48; + or.b16 %rs344, %rs95, %rs343; + ld.local.u8 %r9763, [%rd261]; + ld.local.u8 %r9764, [%rd261+1]; + prmt.b32 %r9765, %r9764, %r9763, 30212; + ld.local.u8 %r9766, [%rd261+2]; + prmt.b32 %r9767, %r9766, %r9765, 28756; + ld.local.u8 %r9768, [%rd261+3]; + prmt.b32 %r9769, %r9768, %r9767, 1620; + ld.local.u8 %r9770, [%rd261+4]; + ld.local.u8 %r9771, [%rd261+5]; + prmt.b32 %r9772, %r9771, %r9770, 30212; + ld.local.u8 %r9773, [%rd261+6]; + prmt.b32 %r9774, %r9773, %r9772, 28756; + ld.local.u8 %r9775, [%rd261+7]; + prmt.b32 %r9776, %r9775, %r9774, 1620; + ld.local.u8 %r9777, [%rd261+8]; + ld.local.u8 %r9778, [%rd261+9]; + prmt.b32 %r9779, %r9778, %r9777, 30212; + ld.local.u8 %r9780, [%rd261+10]; + prmt.b32 %r9781, %r9780, %r9779, 28756; + ld.local.u8 %r9782, [%rd261+11]; + prmt.b32 %r9783, %r9782, %r9781, 1620; + ld.local.u8 %r9784, [%rd261+12]; + ld.local.u8 %r9785, [%rd261+13]; + prmt.b32 %r9786, %r9785, %r9784, 30212; + ld.local.u8 %r9787, [%rd261+14]; + prmt.b32 %r9788, %r9787, %r9786, 28756; + ld.local.u8 %r9789, [%rd261+15]; + prmt.b32 %r9790, %r9789, %r9788, 1620; + ld.local.u8 %r9791, [%rd261+16]; + ld.local.u8 %r9792, [%rd261+17]; + prmt.b32 %r9793, %r9792, %r9791, 30212; + ld.local.u8 %r9794, [%rd261+18]; + prmt.b32 %r9795, %r9794, %r9793, 28756; + ld.local.u8 %r9796, [%rd261+19]; + prmt.b32 %r9797, %r9796, %r9795, 1620; + ld.local.u8 %r9798, [%rd261+20]; + ld.local.u8 %r9799, [%rd261+21]; + prmt.b32 %r9800, %r9799, %r9798, 30212; + ld.local.u8 %r9801, [%rd261+22]; + prmt.b32 %r9802, %r9801, %r9800, 28756; + ld.local.u8 %r9803, [%rd261+23]; + prmt.b32 %r9804, %r9803, %r9802, 1620; + ld.local.u8 %r9805, [%rd261+24]; + ld.local.u8 %r9806, [%rd261+25]; + prmt.b32 %r9807, %r9806, %r9805, 30212; + ld.local.u8 %r9808, [%rd261+26]; + prmt.b32 %r9809, %r9808, %r9807, 28756; + ld.local.u8 %r9810, [%rd261+27]; + prmt.b32 %r9811, %r9810, %r9809, 1620; + ld.local.u8 %r9812, [%rd261+28]; + ld.local.u8 %r9813, [%rd261+29]; + prmt.b32 %r9814, %r9813, %r9812, 30212; + ld.local.u8 %r9815, [%rd261+30]; + prmt.b32 %r9816, %r9815, %r9814, 28756; + ld.local.u8 %r9817, [%rd261+31]; + prmt.b32 %r9818, %r9817, %r9816, 1620; + ld.local.u8 %r9819, [%rd261+32]; + ld.local.u8 %r9820, [%rd261+33]; + prmt.b32 %r9821, %r9820, %r9819, 30212; + ld.local.u8 %r9822, [%rd261+34]; + prmt.b32 %r9823, %r9822, %r9821, 28756; + ld.local.u8 %r9824, [%rd261+35]; + prmt.b32 %r9825, %r9824, %r9823, 1620; + ld.local.u8 %r9826, [%rd261+36]; + ld.local.u8 %r9827, [%rd261+37]; + prmt.b32 %r9828, %r9827, %r9826, 30212; + ld.local.u8 %r9829, [%rd261+38]; + prmt.b32 %r9830, %r9829, %r9828, 28756; + ld.local.u8 %r9831, [%rd261+39]; + prmt.b32 %r9832, %r9831, %r9830, 1620; + ld.local.u8 %r9833, [%rd261+40]; + 
ld.local.u8 %r9834, [%rd261+41]; + prmt.b32 %r9835, %r9834, %r9833, 30212; + ld.local.u8 %r9836, [%rd261+42]; + prmt.b32 %r9837, %r9836, %r9835, 28756; + ld.local.u8 %r9838, [%rd261+43]; + prmt.b32 %r9839, %r9838, %r9837, 1620; + ld.local.u8 %r9840, [%rd261+44]; + ld.local.u8 %r9841, [%rd261+45]; + prmt.b32 %r9842, %r9841, %r9840, 30212; + ld.local.u8 %r9843, [%rd261+46]; + prmt.b32 %r9844, %r9843, %r9842, 28756; + ld.local.u8 %r9845, [%rd261+47]; + prmt.b32 %r9846, %r9845, %r9844, 1620; + ld.local.u8 %r9847, [%rd261+48]; + ld.local.u8 %r9848, [%rd261+49]; + prmt.b32 %r9849, %r9848, %r9847, 30212; + ld.local.u8 %r9850, [%rd261+50]; + prmt.b32 %r9851, %r9850, %r9849, 28756; + ld.local.u8 %r9852, [%rd261+51]; + prmt.b32 %r9853, %r9852, %r9851, 1620; + ld.local.u8 %r9854, [%rd261+52]; + ld.local.u8 %r9855, [%rd261+53]; + prmt.b32 %r9856, %r9855, %r9854, 30212; + ld.local.u8 %r9857, [%rd261+54]; + prmt.b32 %r9858, %r9857, %r9856, 28756; + ld.local.u8 %r9859, [%rd261+55]; + prmt.b32 %r9860, %r9859, %r9858, 1620; + ld.local.u8 %r9861, [%rd261+56]; + ld.local.u8 %r9862, [%rd261+57]; + prmt.b32 %r9863, %r9862, %r9861, 30212; + ld.local.u8 %r9864, [%rd261+58]; + prmt.b32 %r9865, %r9864, %r9863, 28756; + ld.local.u8 %r9866, [%rd261+59]; + prmt.b32 %r9867, %r9866, %r9865, 1620; + ld.local.u8 %r9868, [%rd261+60]; + ld.local.u8 %r9869, [%rd261+61]; + prmt.b32 %r9870, %r9869, %r9868, 30212; + ld.local.u8 %r9871, [%rd261+62]; + prmt.b32 %r9872, %r9871, %r9870, 28756; + ld.local.u8 %r9873, [%rd261+63]; + prmt.b32 %r9874, %r9873, %r9872, 1620; + cvt.u32.u16 %r9875, %rs344; + and.b32 %r9876, %r9875, 255; + add.s32 %r9877, %r11689, %r11685; + add.s32 %r9878, %r9877, %r9769; + xor.b32 %r9879, %r9878, %r117; + shf.l.wrap.b32 %r9880, %r9879, %r9879, 16; + add.s32 %r9881, %r9880, 1779033703; + xor.b32 %r9882, %r9881, %r11685; + shf.l.wrap.b32 %r9883, %r9882, %r9882, 20; + add.s32 %r9884, %r9776, %r9878; + add.s32 %r9885, %r9884, %r9883; + xor.b32 %r9886, %r9885, %r9880; + shf.l.wrap.b32 %r9887, %r9886, %r9886, 24; + add.s32 %r9888, %r9887, %r9881; + xor.b32 %r9889, %r9888, %r9883; + shf.l.wrap.b32 %r9890, %r9889, %r9889, 25; + add.s32 %r9891, %r11688, %r11684; + add.s32 %r9892, %r9891, %r9783; + xor.b32 %r9893, %r9892, %r118; + shf.l.wrap.b32 %r9894, %r9893, %r9893, 16; + add.s32 %r9895, %r9894, -1150833019; + xor.b32 %r9896, %r9895, %r11684; + shf.l.wrap.b32 %r9897, %r9896, %r9896, 20; + add.s32 %r9898, %r9790, %r9892; + add.s32 %r9899, %r9898, %r9897; + xor.b32 %r9900, %r9899, %r9894; + shf.l.wrap.b32 %r9901, %r9900, %r9900, 24; + add.s32 %r9902, %r9901, %r9895; + xor.b32 %r9903, %r9902, %r9897; + shf.l.wrap.b32 %r9904, %r9903, %r9903, 25; + add.s32 %r9905, %r11687, %r11683; + add.s32 %r9906, %r9905, %r9797; + shr.u32 %r9907, %r9906, 16; + shl.b32 %r9908, %r9906, 16; + xor.b32 %r9909, %r9908, 4194304; + or.b32 %r9910, %r9909, %r9907; + add.s32 %r9911, %r9910, 1013904242; + xor.b32 %r9912, %r9911, %r11683; + shf.l.wrap.b32 %r9913, %r9912, %r9912, 20; + add.s32 %r9914, %r9804, %r9906; + add.s32 %r9915, %r9914, %r9913; + xor.b32 %r9916, %r9915, %r9910; + shf.l.wrap.b32 %r9917, %r9916, %r9916, 24; + add.s32 %r9918, %r9917, %r9911; + xor.b32 %r9919, %r9918, %r9913; + shf.l.wrap.b32 %r9920, %r9919, %r9919, 25; + add.s32 %r9921, %r11686, %r11682; + add.s32 %r9922, %r9921, %r9811; + xor.b32 %r9923, %r9922, %r9876; + shr.u32 %r9924, %r9922, 16; + shl.b32 %r9925, %r9923, 16; + or.b32 %r9926, %r9925, %r9924; + add.s32 %r9927, %r9926, -1521486534; + xor.b32 %r9928, %r9927, %r11682; + shf.l.wrap.b32 %r9929, %r9928, %r9928, 
20; + add.s32 %r9930, %r9818, %r9922; + add.s32 %r9931, %r9930, %r9929; + xor.b32 %r9932, %r9931, %r9926; + shf.l.wrap.b32 %r9933, %r9932, %r9932, 24; + add.s32 %r9934, %r9933, %r9927; + xor.b32 %r9935, %r9934, %r9929; + shf.l.wrap.b32 %r9936, %r9935, %r9935, 25; + add.s32 %r9937, %r9904, %r9885; + add.s32 %r9938, %r9937, %r9825; + xor.b32 %r9939, %r9933, %r9938; + shf.l.wrap.b32 %r9940, %r9939, %r9939, 16; + add.s32 %r9941, %r9940, %r9918; + xor.b32 %r9942, %r9941, %r9904; + shf.l.wrap.b32 %r9943, %r9942, %r9942, 20; + add.s32 %r9944, %r9832, %r9938; + add.s32 %r9945, %r9944, %r9943; + xor.b32 %r9946, %r9945, %r9940; + shf.l.wrap.b32 %r9947, %r9946, %r9946, 24; + add.s32 %r9948, %r9947, %r9941; + xor.b32 %r9949, %r9948, %r9943; + shf.l.wrap.b32 %r9950, %r9949, %r9949, 25; + add.s32 %r9951, %r9920, %r9899; + add.s32 %r9952, %r9951, %r9839; + xor.b32 %r9953, %r9952, %r9887; + shf.l.wrap.b32 %r9954, %r9953, %r9953, 16; + add.s32 %r9955, %r9954, %r9934; + xor.b32 %r9956, %r9955, %r9920; + shf.l.wrap.b32 %r9957, %r9956, %r9956, 20; + add.s32 %r9958, %r9846, %r9952; + add.s32 %r9959, %r9958, %r9957; + xor.b32 %r9960, %r9959, %r9954; + shf.l.wrap.b32 %r9961, %r9960, %r9960, 24; + add.s32 %r9962, %r9961, %r9955; + xor.b32 %r9963, %r9962, %r9957; + shf.l.wrap.b32 %r9964, %r9963, %r9963, 25; + add.s32 %r9965, %r9936, %r9915; + add.s32 %r9966, %r9965, %r9853; + xor.b32 %r9967, %r9966, %r9901; + shf.l.wrap.b32 %r9968, %r9967, %r9967, 16; + add.s32 %r9969, %r9968, %r9888; + xor.b32 %r9970, %r9969, %r9936; + shf.l.wrap.b32 %r9971, %r9970, %r9970, 20; + add.s32 %r9972, %r9860, %r9966; + add.s32 %r9973, %r9972, %r9971; + xor.b32 %r9974, %r9973, %r9968; + shf.l.wrap.b32 %r9975, %r9974, %r9974, 24; + add.s32 %r9976, %r9975, %r9969; + xor.b32 %r9977, %r9976, %r9971; + shf.l.wrap.b32 %r9978, %r9977, %r9977, 25; + add.s32 %r9979, %r9931, %r9890; + add.s32 %r9980, %r9979, %r9867; + xor.b32 %r9981, %r9980, %r9917; + shf.l.wrap.b32 %r9982, %r9981, %r9981, 16; + add.s32 %r9983, %r9982, %r9902; + xor.b32 %r9984, %r9983, %r9890; + shf.l.wrap.b32 %r9985, %r9984, %r9984, 20; + add.s32 %r9986, %r9874, %r9980; + add.s32 %r9987, %r9986, %r9985; + xor.b32 %r9988, %r9987, %r9982; + shf.l.wrap.b32 %r9989, %r9988, %r9988, 24; + add.s32 %r9990, %r9989, %r9983; + xor.b32 %r9991, %r9990, %r9985; + shf.l.wrap.b32 %r9992, %r9991, %r9991, 25; + add.s32 %r9993, %r9945, %r9783; + add.s32 %r9994, %r9993, %r9992; + xor.b32 %r9995, %r9994, %r9961; + shf.l.wrap.b32 %r9996, %r9995, %r9995, 16; + add.s32 %r9997, %r9996, %r9976; + xor.b32 %r9998, %r9997, %r9992; + shf.l.wrap.b32 %r9999, %r9998, %r9998, 20; + add.s32 %r10000, %r9994, %r9811; + add.s32 %r10001, %r10000, %r9999; + xor.b32 %r10002, %r10001, %r9996; + shf.l.wrap.b32 %r10003, %r10002, %r10002, 24; + add.s32 %r10004, %r10003, %r9997; + xor.b32 %r10005, %r10004, %r9999; + shf.l.wrap.b32 %r10006, %r10005, %r10005, 25; + add.s32 %r10007, %r9959, %r9790; + add.s32 %r10008, %r10007, %r9950; + xor.b32 %r10009, %r9975, %r10008; + shf.l.wrap.b32 %r10010, %r10009, %r10009, 16; + add.s32 %r10011, %r9990, %r10010; + xor.b32 %r10012, %r10011, %r9950; + shf.l.wrap.b32 %r10013, %r10012, %r10012, 20; + add.s32 %r10014, %r10008, %r9839; + add.s32 %r10015, %r10014, %r10013; + xor.b32 %r10016, %r10015, %r10010; + shf.l.wrap.b32 %r10017, %r10016, %r10016, 24; + add.s32 %r10018, %r10017, %r10011; + xor.b32 %r10019, %r10018, %r10013; + shf.l.wrap.b32 %r10020, %r10019, %r10019, 25; + add.s32 %r10021, %r9964, %r9818; + add.s32 %r10022, %r10021, %r9973; + xor.b32 %r10023, %r9989, %r10022; + 
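+ // Annotation (inferred reading of the generated code): the ld.local.u8/prmt.b32
+ // runs above assemble bytes 40..63 of the 64-byte block buffer into little-endian
+ // 32-bit message words, and %r11682..%r11689 carry the eight chaining-value words.
+ // The add/xor/shf.l.wrap sequences implement the mixing function G of a
+ // BLAKE3-style compression:
+ //   a += b + m[2i];   d = rotr32(d ^ a, 16);  c += d;  b = rotr32(b ^ c, 12);
+ //   a += b + m[2i+1]; d = rotr32(d ^ a, 8);   c += d;  b = rotr32(b ^ c, 7);
+ // rotr by 16/12/8/7 is emitted as shf.l.wrap (rotate-left) by 16/20/24/25.
+ // The constants 1779033703, -1150833019, 1013904242 and -1521486534 are the
+ // first four BLAKE2s/BLAKE3 IV words (0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+ // 0xA54FF53A), combined here with the counter words (%r117/%r118), the block
+ // length (4194304 = 64 << 16, pre-rotated) and the flags byte (%rs344).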
shf.l.wrap.b32 %r10024, %r10023, %r10023, 16; + add.s32 %r10025, %r10024, %r9948; + xor.b32 %r10026, %r10025, %r9964; + shf.l.wrap.b32 %r10027, %r10026, %r10026, 20; + add.s32 %r10028, %r10022, %r9769; + add.s32 %r10029, %r10028, %r10027; + xor.b32 %r10030, %r10029, %r10024; + shf.l.wrap.b32 %r10031, %r10030, %r10030, 24; + add.s32 %r10032, %r10031, %r10025; + xor.b32 %r10033, %r10032, %r10027; + shf.l.wrap.b32 %r10034, %r10033, %r10033, 25; + add.s32 %r10035, %r9978, %r9797; + add.s32 %r10036, %r10035, %r9987; + xor.b32 %r10037, %r10036, %r9947; + shf.l.wrap.b32 %r10038, %r10037, %r10037, 16; + add.s32 %r10039, %r10038, %r9962; + xor.b32 %r10040, %r10039, %r9978; + shf.l.wrap.b32 %r10041, %r10040, %r10040, 20; + add.s32 %r10042, %r10036, %r9860; + add.s32 %r10043, %r10042, %r10041; + xor.b32 %r10044, %r10043, %r10038; + shf.l.wrap.b32 %r10045, %r10044, %r10044, 24; + add.s32 %r10046, %r10045, %r10039; + xor.b32 %r10047, %r10046, %r10041; + shf.l.wrap.b32 %r10048, %r10047, %r10047, 25; + add.s32 %r10049, %r10001, %r9776; + add.s32 %r10050, %r10049, %r10020; + xor.b32 %r10051, %r10050, %r10045; + shf.l.wrap.b32 %r10052, %r10051, %r10051, 16; + add.s32 %r10053, %r10052, %r10032; + xor.b32 %r10054, %r10053, %r10020; + shf.l.wrap.b32 %r10055, %r10054, %r10054, 20; + add.s32 %r10056, %r10050, %r9846; + add.s32 %r10057, %r10056, %r10055; + xor.b32 %r10058, %r10057, %r10052; + shf.l.wrap.b32 %r10059, %r10058, %r10058, 24; + add.s32 %r10060, %r10059, %r10053; + xor.b32 %r10061, %r10060, %r10055; + shf.l.wrap.b32 %r10062, %r10061, %r10061, 25; + add.s32 %r10063, %r10015, %r9853; + add.s32 %r10064, %r10063, %r10034; + xor.b32 %r10065, %r10064, %r10003; + shf.l.wrap.b32 %r10066, %r10065, %r10065, 16; + add.s32 %r10067, %r10066, %r10046; + xor.b32 %r10068, %r10067, %r10034; + shf.l.wrap.b32 %r10069, %r10068, %r10068, 20; + add.s32 %r10070, %r10064, %r9804; + add.s32 %r10071, %r10070, %r10069; + xor.b32 %r10072, %r10071, %r10066; + shf.l.wrap.b32 %r10073, %r10072, %r10072, 24; + add.s32 %r10074, %r10073, %r10067; + xor.b32 %r10075, %r10074, %r10069; + shf.l.wrap.b32 %r10076, %r10075, %r10075, 25; + add.s32 %r10077, %r10029, %r9832; + add.s32 %r10078, %r10077, %r10048; + xor.b32 %r10079, %r10078, %r10017; + shf.l.wrap.b32 %r10080, %r10079, %r10079, 16; + add.s32 %r10081, %r10080, %r10004; + xor.b32 %r10082, %r10081, %r10048; + shf.l.wrap.b32 %r10083, %r10082, %r10082, 20; + add.s32 %r10084, %r10078, %r9867; + add.s32 %r10085, %r10084, %r10083; + xor.b32 %r10086, %r10085, %r10080; + shf.l.wrap.b32 %r10087, %r10086, %r10086, 24; + add.s32 %r10088, %r10087, %r10081; + xor.b32 %r10089, %r10088, %r10083; + shf.l.wrap.b32 %r10090, %r10089, %r10089, 25; + add.s32 %r10091, %r10043, %r9874; + add.s32 %r10092, %r10091, %r10006; + xor.b32 %r10093, %r10092, %r10031; + shf.l.wrap.b32 %r10094, %r10093, %r10093, 16; + add.s32 %r10095, %r10094, %r10018; + xor.b32 %r10096, %r10095, %r10006; + shf.l.wrap.b32 %r10097, %r10096, %r10096, 20; + add.s32 %r10098, %r10092, %r9825; + add.s32 %r10099, %r10098, %r10097; + xor.b32 %r10100, %r10099, %r10094; + shf.l.wrap.b32 %r10101, %r10100, %r10100, 24; + add.s32 %r10102, %r10101, %r10095; + xor.b32 %r10103, %r10102, %r10097; + shf.l.wrap.b32 %r10104, %r10103, %r10103, 25; + add.s32 %r10105, %r10057, %r9790; + add.s32 %r10106, %r10105, %r10104; + xor.b32 %r10107, %r10106, %r10073; + shf.l.wrap.b32 %r10108, %r10107, %r10107, 16; + add.s32 %r10109, %r10108, %r10088; + xor.b32 %r10110, %r10109, %r10104; + shf.l.wrap.b32 %r10111, %r10110, %r10110, 20; + add.s32 %r10112, %r10106, 
%r9797; + add.s32 %r10113, %r10112, %r10111; + xor.b32 %r10114, %r10113, %r10108; + shf.l.wrap.b32 %r10115, %r10114, %r10114, 24; + add.s32 %r10116, %r10115, %r10109; + xor.b32 %r10117, %r10116, %r10111; + shf.l.wrap.b32 %r10118, %r10117, %r10117, 25; + add.s32 %r10119, %r10071, %r9839; + add.s32 %r10120, %r10119, %r10062; + xor.b32 %r10121, %r10120, %r10087; + shf.l.wrap.b32 %r10122, %r10121, %r10121, 16; + add.s32 %r10123, %r10122, %r10102; + xor.b32 %r10124, %r10123, %r10062; + shf.l.wrap.b32 %r10125, %r10124, %r10124, 20; + add.s32 %r10126, %r10120, %r9853; + add.s32 %r10127, %r10126, %r10125; + xor.b32 %r10128, %r10127, %r10122; + shf.l.wrap.b32 %r10129, %r10128, %r10128, 24; + add.s32 %r10130, %r10129, %r10123; + xor.b32 %r10131, %r10130, %r10125; + shf.l.wrap.b32 %r10132, %r10131, %r10131, 25; + add.s32 %r10133, %r10085, %r9860; + add.s32 %r10134, %r10133, %r10076; + xor.b32 %r10135, %r10134, %r10101; + shf.l.wrap.b32 %r10136, %r10135, %r10135, 16; + add.s32 %r10137, %r10136, %r10060; + xor.b32 %r10138, %r10137, %r10076; + shf.l.wrap.b32 %r10139, %r10138, %r10138, 20; + add.s32 %r10140, %r10134, %r9783; + add.s32 %r10141, %r10140, %r10139; + xor.b32 %r10142, %r10141, %r10136; + shf.l.wrap.b32 %r10143, %r10142, %r10142, 24; + add.s32 %r10144, %r10143, %r10137; + xor.b32 %r10145, %r10144, %r10139; + shf.l.wrap.b32 %r10146, %r10145, %r10145, 25; + add.s32 %r10147, %r10099, %r9818; + add.s32 %r10148, %r10147, %r10090; + xor.b32 %r10149, %r10148, %r10059; + shf.l.wrap.b32 %r10150, %r10149, %r10149, 16; + add.s32 %r10151, %r10150, %r10074; + xor.b32 %r10152, %r10151, %r10090; + shf.l.wrap.b32 %r10153, %r10152, %r10152, 20; + add.s32 %r10154, %r10148, %r9867; + add.s32 %r10155, %r10154, %r10153; + xor.b32 %r10156, %r10155, %r10150; + shf.l.wrap.b32 %r10157, %r10156, %r10156, 24; + add.s32 %r10158, %r10157, %r10151; + xor.b32 %r10159, %r10158, %r10153; + shf.l.wrap.b32 %r10160, %r10159, %r10159, 25; + add.s32 %r10161, %r10113, %r9811; + add.s32 %r10162, %r10161, %r10132; + xor.b32 %r10163, %r10162, %r10157; + shf.l.wrap.b32 %r10164, %r10163, %r10163, 16; + add.s32 %r10165, %r10164, %r10144; + xor.b32 %r10166, %r10165, %r10132; + shf.l.wrap.b32 %r10167, %r10166, %r10166, 20; + add.s32 %r10168, %r10162, %r9804; + add.s32 %r10169, %r10168, %r10167; + xor.b32 %r10170, %r10169, %r10164; + shf.l.wrap.b32 %r10171, %r10170, %r10170, 24; + add.s32 %r10172, %r10171, %r10165; + xor.b32 %r10173, %r10172, %r10167; + shf.l.wrap.b32 %r10174, %r10173, %r10173, 25; + add.s32 %r10175, %r10127, %r9832; + add.s32 %r10176, %r10175, %r10146; + xor.b32 %r10177, %r10176, %r10115; + shf.l.wrap.b32 %r10178, %r10177, %r10177, 16; + add.s32 %r10179, %r10178, %r10158; + xor.b32 %r10180, %r10179, %r10146; + shf.l.wrap.b32 %r10181, %r10180, %r10180, 20; + add.s32 %r10182, %r10176, %r9769; + add.s32 %r10183, %r10182, %r10181; + xor.b32 %r10184, %r10183, %r10178; + shf.l.wrap.b32 %r10185, %r10184, %r10184, 24; + add.s32 %r10186, %r10185, %r10179; + xor.b32 %r10187, %r10186, %r10181; + shf.l.wrap.b32 %r10188, %r10187, %r10187, 25; + add.s32 %r10189, %r10141, %r9846; + add.s32 %r10190, %r10189, %r10160; + xor.b32 %r10191, %r10190, %r10129; + shf.l.wrap.b32 %r10192, %r10191, %r10191, 16; + add.s32 %r10193, %r10192, %r10116; + xor.b32 %r10194, %r10193, %r10160; + shf.l.wrap.b32 %r10195, %r10194, %r10194, 20; + add.s32 %r10196, %r10190, %r9874; + add.s32 %r10197, %r10196, %r10195; + xor.b32 %r10198, %r10197, %r10192; + shf.l.wrap.b32 %r10199, %r10198, %r10198, 24; + add.s32 %r10200, %r10199, %r10193; + xor.b32 %r10201, %r10200, 
%r10195; + shf.l.wrap.b32 %r10202, %r10201, %r10201, 25; + add.s32 %r10203, %r10155, %r9825; + add.s32 %r10204, %r10203, %r10118; + xor.b32 %r10205, %r10204, %r10143; + shf.l.wrap.b32 %r10206, %r10205, %r10205, 16; + add.s32 %r10207, %r10206, %r10130; + xor.b32 %r10208, %r10207, %r10118; + shf.l.wrap.b32 %r10209, %r10208, %r10208, 20; + add.s32 %r10210, %r10204, %r9776; + add.s32 %r10211, %r10210, %r10209; + xor.b32 %r10212, %r10211, %r10206; + shf.l.wrap.b32 %r10213, %r10212, %r10212, 24; + add.s32 %r10214, %r10213, %r10207; + xor.b32 %r10215, %r10214, %r10209; + shf.l.wrap.b32 %r10216, %r10215, %r10215, 25; + add.s32 %r10217, %r10169, %r9839; + add.s32 %r10218, %r10217, %r10216; + xor.b32 %r10219, %r10218, %r10185; + shf.l.wrap.b32 %r10220, %r10219, %r10219, 16; + add.s32 %r10221, %r10220, %r10200; + xor.b32 %r10222, %r10221, %r10216; + shf.l.wrap.b32 %r10223, %r10222, %r10222, 20; + add.s32 %r10224, %r10218, %r9818; + add.s32 %r10225, %r10224, %r10223; + xor.b32 %r10226, %r10225, %r10220; + shf.l.wrap.b32 %r10227, %r10226, %r10226, 24; + add.s32 %r10228, %r10227, %r10221; + xor.b32 %r10229, %r10228, %r10223; + shf.l.wrap.b32 %r10230, %r10229, %r10229, 25; + add.s32 %r10231, %r10183, %r9853; + add.s32 %r10232, %r10231, %r10174; + xor.b32 %r10233, %r10232, %r10199; + shf.l.wrap.b32 %r10234, %r10233, %r10233, 16; + add.s32 %r10235, %r10234, %r10214; + xor.b32 %r10236, %r10235, %r10174; + shf.l.wrap.b32 %r10237, %r10236, %r10236, 20; + add.s32 %r10238, %r10232, %r9832; + add.s32 %r10239, %r10238, %r10237; + xor.b32 %r10240, %r10239, %r10234; + shf.l.wrap.b32 %r10241, %r10240, %r10240, 24; + add.s32 %r10242, %r10241, %r10235; + xor.b32 %r10243, %r10242, %r10237; + shf.l.wrap.b32 %r10244, %r10243, %r10243, 25; + add.s32 %r10245, %r10197, %r9867; + add.s32 %r10246, %r10245, %r10188; + xor.b32 %r10247, %r10246, %r10213; + shf.l.wrap.b32 %r10248, %r10247, %r10247, 16; + add.s32 %r10249, %r10248, %r10172; + xor.b32 %r10250, %r10249, %r10188; + shf.l.wrap.b32 %r10251, %r10250, %r10250, 20; + add.s32 %r10252, %r10246, %r9790; + add.s32 %r10253, %r10252, %r10251; + xor.b32 %r10254, %r10253, %r10248; + shf.l.wrap.b32 %r10255, %r10254, %r10254, 24; + add.s32 %r10256, %r10255, %r10249; + xor.b32 %r10257, %r10256, %r10251; + shf.l.wrap.b32 %r10258, %r10257, %r10257, 25; + add.s32 %r10259, %r10211, %r9860; + add.s32 %r10260, %r10259, %r10202; + xor.b32 %r10261, %r10260, %r10171; + shf.l.wrap.b32 %r10262, %r10261, %r10261, 16; + add.s32 %r10263, %r10262, %r10186; + xor.b32 %r10264, %r10263, %r10202; + shf.l.wrap.b32 %r10265, %r10264, %r10264, 20; + add.s32 %r10266, %r10260, %r9874; + add.s32 %r10267, %r10266, %r10265; + xor.b32 %r10268, %r10267, %r10262; + shf.l.wrap.b32 %r10269, %r10268, %r10268, 24; + add.s32 %r10270, %r10269, %r10263; + xor.b32 %r10271, %r10270, %r10265; + shf.l.wrap.b32 %r10272, %r10271, %r10271, 25; + add.s32 %r10273, %r10225, %r9797; + add.s32 %r10274, %r10273, %r10244; + xor.b32 %r10275, %r10274, %r10269; + shf.l.wrap.b32 %r10276, %r10275, %r10275, 16; + add.s32 %r10277, %r10276, %r10256; + xor.b32 %r10278, %r10277, %r10244; + shf.l.wrap.b32 %r10279, %r10278, %r10278, 20; + add.s32 %r10280, %r10274, %r9769; + add.s32 %r10281, %r10280, %r10279; + xor.b32 %r10282, %r10281, %r10276; + shf.l.wrap.b32 %r10283, %r10282, %r10282, 24; + add.s32 %r10284, %r10283, %r10277; + xor.b32 %r10285, %r10284, %r10279; + shf.l.wrap.b32 %r10286, %r10285, %r10285, 25; + add.s32 %r10287, %r10239, %r9846; + add.s32 %r10288, %r10287, %r10258; + xor.b32 %r10289, %r10288, %r10227; + shf.l.wrap.b32 %r10290, 
%r10289, %r10289, 16; + add.s32 %r10291, %r10290, %r10270; + xor.b32 %r10292, %r10291, %r10258; + shf.l.wrap.b32 %r10293, %r10292, %r10292, 20; + add.s32 %r10294, %r10288, %r9783; + add.s32 %r10295, %r10294, %r10293; + xor.b32 %r10296, %r10295, %r10290; + shf.l.wrap.b32 %r10297, %r10296, %r10296, 24; + add.s32 %r10298, %r10297, %r10291; + xor.b32 %r10299, %r10298, %r10293; + shf.l.wrap.b32 %r10300, %r10299, %r10299, 25; + add.s32 %r10301, %r10253, %r9804; + add.s32 %r10302, %r10301, %r10272; + xor.b32 %r10303, %r10302, %r10241; + shf.l.wrap.b32 %r10304, %r10303, %r10303, 16; + add.s32 %r10305, %r10304, %r10228; + xor.b32 %r10306, %r10305, %r10272; + shf.l.wrap.b32 %r10307, %r10306, %r10306, 20; + add.s32 %r10308, %r10302, %r9825; + add.s32 %r10309, %r10308, %r10307; + xor.b32 %r10310, %r10309, %r10304; + shf.l.wrap.b32 %r10311, %r10310, %r10310, 24; + add.s32 %r10312, %r10311, %r10305; + xor.b32 %r10313, %r10312, %r10307; + shf.l.wrap.b32 %r10314, %r10313, %r10313, 25; + add.s32 %r10315, %r10267, %r9776; + add.s32 %r10316, %r10315, %r10230; + xor.b32 %r10317, %r10316, %r10255; + shf.l.wrap.b32 %r10318, %r10317, %r10317, 16; + add.s32 %r10319, %r10318, %r10242; + xor.b32 %r10320, %r10319, %r10230; + shf.l.wrap.b32 %r10321, %r10320, %r10320, 20; + add.s32 %r10322, %r10316, %r9811; + add.s32 %r10323, %r10322, %r10321; + xor.b32 %r10324, %r10323, %r10318; + shf.l.wrap.b32 %r10325, %r10324, %r10324, 24; + add.s32 %r10326, %r10325, %r10319; + xor.b32 %r10327, %r10326, %r10321; + shf.l.wrap.b32 %r10328, %r10327, %r10327, 25; + add.s32 %r10329, %r10281, %r9853; + add.s32 %r10330, %r10329, %r10328; + xor.b32 %r10331, %r10330, %r10297; + shf.l.wrap.b32 %r10332, %r10331, %r10331, 16; + add.s32 %r10333, %r10332, %r10312; + xor.b32 %r10334, %r10333, %r10328; + shf.l.wrap.b32 %r10335, %r10334, %r10334, 20; + add.s32 %r10336, %r10330, %r9860; + add.s32 %r10337, %r10336, %r10335; + xor.b32 %r10338, %r10337, %r10332; + shf.l.wrap.b32 %r10339, %r10338, %r10338, 24; + add.s32 %r10340, %r10339, %r10333; + xor.b32 %r10341, %r10340, %r10335; + shf.l.wrap.b32 %r10342, %r10341, %r10341, 25; + add.s32 %r10343, %r10295, %r9832; + add.s32 %r10344, %r10343, %r10286; + xor.b32 %r10345, %r10344, %r10311; + shf.l.wrap.b32 %r10346, %r10345, %r10345, 16; + add.s32 %r10347, %r10346, %r10326; + xor.b32 %r10348, %r10347, %r10286; + shf.l.wrap.b32 %r10349, %r10348, %r10348, 20; + add.s32 %r10350, %r10344, %r9846; + add.s32 %r10351, %r10350, %r10349; + xor.b32 %r10352, %r10351, %r10346; + shf.l.wrap.b32 %r10353, %r10352, %r10352, 24; + add.s32 %r10354, %r10353, %r10347; + xor.b32 %r10355, %r10354, %r10349; + shf.l.wrap.b32 %r10356, %r10355, %r10355, 25; + add.s32 %r10357, %r10309, %r9874; + add.s32 %r10358, %r10357, %r10300; + xor.b32 %r10359, %r10358, %r10325; + shf.l.wrap.b32 %r10360, %r10359, %r10359, 16; + add.s32 %r10361, %r10360, %r10284; + xor.b32 %r10362, %r10361, %r10300; + shf.l.wrap.b32 %r10363, %r10362, %r10362, 20; + add.s32 %r10364, %r10358, %r9839; + add.s32 %r10365, %r10364, %r10363; + xor.b32 %r10366, %r10365, %r10360; + shf.l.wrap.b32 %r10367, %r10366, %r10366, 24; + add.s32 %r10368, %r10367, %r10361; + xor.b32 %r10369, %r10368, %r10363; + shf.l.wrap.b32 %r10370, %r10369, %r10369, 25; + add.s32 %r10371, %r10323, %r9867; + add.s32 %r10372, %r10371, %r10314; + xor.b32 %r10373, %r10372, %r10283; + shf.l.wrap.b32 %r10374, %r10373, %r10373, 16; + add.s32 %r10375, %r10374, %r10298; + xor.b32 %r10376, %r10375, %r10314; + shf.l.wrap.b32 %r10377, %r10376, %r10376, 20; + add.s32 %r10378, %r10372, %r9825; + add.s32 
%r10379, %r10378, %r10377; + xor.b32 %r10380, %r10379, %r10374; + shf.l.wrap.b32 %r10381, %r10380, %r10380, 24; + add.s32 %r10382, %r10381, %r10375; + xor.b32 %r10383, %r10382, %r10377; + shf.l.wrap.b32 %r10384, %r10383, %r10383, 25; + add.s32 %r10385, %r10337, %r9818; + add.s32 %r10386, %r10385, %r10356; + xor.b32 %r10387, %r10386, %r10381; + shf.l.wrap.b32 %r10388, %r10387, %r10387, 16; + add.s32 %r10389, %r10388, %r10368; + xor.b32 %r10390, %r10389, %r10356; + shf.l.wrap.b32 %r10391, %r10390, %r10390, 20; + add.s32 %r10392, %r10386, %r9783; + add.s32 %r10393, %r10392, %r10391; + xor.b32 %r10394, %r10393, %r10388; + shf.l.wrap.b32 %r10395, %r10394, %r10394, 24; + add.s32 %r10396, %r10395, %r10389; + xor.b32 %r10397, %r10396, %r10391; + shf.l.wrap.b32 %r10398, %r10397, %r10397, 25; + add.s32 %r10399, %r10351, %r9804; + add.s32 %r10400, %r10399, %r10370; + xor.b32 %r10401, %r10400, %r10339; + shf.l.wrap.b32 %r10402, %r10401, %r10401, 16; + add.s32 %r10403, %r10402, %r10382; + xor.b32 %r10404, %r10403, %r10370; + shf.l.wrap.b32 %r10405, %r10404, %r10404, 20; + add.s32 %r10406, %r10400, %r9790; + add.s32 %r10407, %r10406, %r10405; + xor.b32 %r10408, %r10407, %r10402; + shf.l.wrap.b32 %r10409, %r10408, %r10408, 24; + add.s32 %r10410, %r10409, %r10403; + xor.b32 %r10411, %r10410, %r10405; + shf.l.wrap.b32 %r10412, %r10411, %r10411, 25; + add.s32 %r10413, %r10365, %r9769; + add.s32 %r10414, %r10413, %r10384; + xor.b32 %r10415, %r10414, %r10353; + shf.l.wrap.b32 %r10416, %r10415, %r10415, 16; + add.s32 %r10417, %r10416, %r10340; + xor.b32 %r10418, %r10417, %r10384; + shf.l.wrap.b32 %r10419, %r10418, %r10418, 20; + add.s32 %r10420, %r10414, %r9776; + add.s32 %r10421, %r10420, %r10419; + xor.b32 %r10422, %r10421, %r10416; + shf.l.wrap.b32 %r10423, %r10422, %r10422, 24; + add.s32 %r10424, %r10423, %r10417; + xor.b32 %r10425, %r10424, %r10419; + shf.l.wrap.b32 %r10426, %r10425, %r10425, 25; + add.s32 %r10427, %r10379, %r9811; + add.s32 %r10428, %r10427, %r10342; + xor.b32 %r10429, %r10428, %r10367; + shf.l.wrap.b32 %r10430, %r10429, %r10429, 16; + add.s32 %r10431, %r10430, %r10354; + xor.b32 %r10432, %r10431, %r10342; + shf.l.wrap.b32 %r10433, %r10432, %r10432, 20; + add.s32 %r10434, %r10428, %r9797; + add.s32 %r10435, %r10434, %r10433; + xor.b32 %r10436, %r10435, %r10430; + shf.l.wrap.b32 %r10437, %r10436, %r10436, 24; + add.s32 %r10438, %r10437, %r10431; + xor.b32 %r10439, %r10438, %r10433; + shf.l.wrap.b32 %r10440, %r10439, %r10439, 25; + add.s32 %r10441, %r10393, %r9832; + add.s32 %r10442, %r10441, %r10440; + xor.b32 %r10443, %r10442, %r10409; + shf.l.wrap.b32 %r10444, %r10443, %r10443, 16; + add.s32 %r10445, %r10444, %r10424; + xor.b32 %r10446, %r10445, %r10440; + shf.l.wrap.b32 %r10447, %r10446, %r10446, 20; + add.s32 %r10448, %r10442, %r9867; + add.s32 %r10449, %r10448, %r10447; + xor.b32 %r10450, %r10449, %r10444; + shf.l.wrap.b32 %r10451, %r10450, %r10450, 24; + add.s32 %r10452, %r10451, %r10445; + xor.b32 %r10453, %r10452, %r10447; + shf.l.wrap.b32 %r10454, %r10453, %r10453, 25; + add.s32 %r10455, %r10407, %r9846; + add.s32 %r10456, %r10455, %r10398; + xor.b32 %r10457, %r10456, %r10423; + shf.l.wrap.b32 %r10458, %r10457, %r10457, 16; + add.s32 %r10459, %r10458, %r10438; + xor.b32 %r10460, %r10459, %r10398; + shf.l.wrap.b32 %r10461, %r10460, %r10460, 20; + add.s32 %r10462, %r10456, %r9804; + add.s32 %r10463, %r10462, %r10461; + xor.b32 %r10464, %r10463, %r10458; + shf.l.wrap.b32 %r10465, %r10464, %r10464, 24; + add.s32 %r10466, %r10465, %r10459; + xor.b32 %r10467, %r10466, %r10461; + 
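+ // Annotation (inferred): the compression is fully unrolled. Each round applies
+ // G to the four columns and then the four diagonals of the 4x4 word state, and
+ // the rotating message-word operands (%r9769..%r9874) trace the per-round
+ // message permutation; the number of repetitions before the feed-forward below
+ // is consistent with a seven-round BLAKE3-style schedule.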
shf.l.wrap.b32 %r10468, %r10467, %r10467, 25; + add.s32 %r10469, %r10421, %r9825; + add.s32 %r10470, %r10469, %r10412; + xor.b32 %r10471, %r10470, %r10437; + shf.l.wrap.b32 %r10472, %r10471, %r10471, 16; + add.s32 %r10473, %r10472, %r10396; + xor.b32 %r10474, %r10473, %r10412; + shf.l.wrap.b32 %r10475, %r10474, %r10474, 20; + add.s32 %r10476, %r10470, %r9853; + add.s32 %r10477, %r10476, %r10475; + xor.b32 %r10478, %r10477, %r10472; + shf.l.wrap.b32 %r10479, %r10478, %r10478, 24; + add.s32 %r10480, %r10479, %r10473; + xor.b32 %r10481, %r10480, %r10475; + shf.l.wrap.b32 %r10482, %r10481, %r10481, 25; + add.s32 %r10483, %r10435, %r9874; + add.s32 %r10484, %r10483, %r10426; + xor.b32 %r10485, %r10484, %r10395; + shf.l.wrap.b32 %r10486, %r10485, %r10485, 16; + add.s32 %r10487, %r10486, %r10410; + xor.b32 %r10488, %r10487, %r10426; + shf.l.wrap.b32 %r10489, %r10488, %r10488, 20; + add.s32 %r10490, %r10484, %r9776; + add.s32 %r10491, %r10490, %r10489; + xor.b32 %r10492, %r10491, %r10486; + shf.l.wrap.b32 %r10493, %r10492, %r10492, 24; + add.s32 %r10494, %r10493, %r10487; + xor.b32 %r10495, %r10494, %r10489; + shf.l.wrap.b32 %r10496, %r10495, %r10495, 25; + add.s32 %r10497, %r10449, %r9860; + add.s32 %r10498, %r10497, %r10468; + xor.b32 %r10499, %r10498, %r10493; + shf.l.wrap.b32 %r10500, %r10499, %r10499, 16; + add.s32 %r10501, %r10500, %r10480; + xor.b32 %r10502, %r10501, %r10468; + shf.l.wrap.b32 %r10503, %r10502, %r10502, 20; + add.s32 %r10504, %r10498, %r9790; + add.s32 %r10505, %r10504, %r10503; + xor.b32 %r10506, %r10505, %r10500; + shf.l.wrap.b32 %r10507, %r10506, %r10506, 24; + add.s32 %r10508, %r10507, %r10501; + xor.b32 %r10509, %r10508, %r10503; + shf.l.wrap.b32 %r10510, %r10509, %r10509, 25; + add.s32 %r10511, %r10463, %r9769; + add.s32 %r10512, %r10511, %r10482; + xor.b32 %r10513, %r10512, %r10451; + shf.l.wrap.b32 %r10514, %r10513, %r10513, 16; + add.s32 %r10515, %r10514, %r10494; + xor.b32 %r10516, %r10515, %r10482; + shf.l.wrap.b32 %r10517, %r10516, %r10516, 20; + add.s32 %r10518, %r10512, %r9839; + add.s32 %r10519, %r10518, %r10517; + xor.b32 %r10520, %r10519, %r10514; + shf.l.wrap.b32 %r10521, %r10520, %r10520, 24; + add.s32 %r10522, %r10521, %r10515; + xor.b32 %r10523, %r10522, %r10517; + shf.l.wrap.b32 %r10524, %r10523, %r10523, 25; + add.s32 %r10525, %r10477, %r9783; + add.s32 %r10526, %r10525, %r10496; + xor.b32 %r10527, %r10526, %r10465; + shf.l.wrap.b32 %r10528, %r10527, %r10527, 16; + add.s32 %r10529, %r10528, %r10452; + xor.b32 %r10530, %r10529, %r10496; + shf.l.wrap.b32 %r10531, %r10530, %r10530, 20; + add.s32 %r10532, %r10526, %r9811; + add.s32 %r10533, %r10532, %r10531; + xor.b32 %r10534, %r10533, %r10528; + shf.l.wrap.b32 %r10535, %r10534, %r10534, 24; + add.s32 %r10536, %r10535, %r10529; + xor.b32 %r10537, %r10536, %r10531; + shf.l.wrap.b32 %r10538, %r10537, %r10537, 25; + add.s32 %r10539, %r10491, %r9797; + add.s32 %r10540, %r10539, %r10454; + xor.b32 %r10541, %r10540, %r10479; + shf.l.wrap.b32 %r10542, %r10541, %r10541, 16; + add.s32 %r10543, %r10542, %r10466; + xor.b32 %r10544, %r10543, %r10454; + shf.l.wrap.b32 %r10545, %r10544, %r10544, 20; + add.s32 %r10546, %r10540, %r9818; + add.s32 %r10547, %r10546, %r10545; + xor.b32 %r10548, %r10547, %r10542; + shf.l.wrap.b32 %r10549, %r10548, %r10548, 24; + add.s32 %r10550, %r10549, %r10543; + xor.b32 %r10551, %r10550, %r10545; + shf.l.wrap.b32 %r10552, %r10551, %r10551, 25; + add.s32 %r10553, %r10505, %r9846; + add.s32 %r10554, %r10553, %r10552; + xor.b32 %r10555, %r10554, %r10521; + shf.l.wrap.b32 %r10556, %r10555, 
%r10555, 16; + add.s32 %r10557, %r10556, %r10536; + xor.b32 %r10558, %r10557, %r10552; + shf.l.wrap.b32 %r10559, %r10558, %r10558, 20; + add.s32 %r10560, %r10554, %r9874; + add.s32 %r10561, %r10560, %r10559; + xor.b32 %r10562, %r10561, %r10556; + shf.l.wrap.b32 %r10563, %r10562, %r10562, 24; + add.s32 %r10564, %r10563, %r10557; + xor.b32 %r10565, %r10564, %r10559; + shf.l.wrap.b32 %r10566, %r10565, %r10565, 25; + add.s32 %r10567, %r10519, %r9804; + add.s32 %r10568, %r10567, %r10510; + xor.b32 %r10569, %r10568, %r10535; + shf.l.wrap.b32 %r10570, %r10569, %r10569, 16; + add.s32 %r10571, %r10570, %r10550; + xor.b32 %r10572, %r10571, %r10510; + shf.l.wrap.b32 %r10573, %r10572, %r10572, 20; + add.s32 %r10574, %r10568, %r9769; + add.s32 %r10575, %r10574, %r10573; + xor.b32 %r10576, %r10575, %r10570; + shf.l.wrap.b32 %r10577, %r10576, %r10576, 24; + add.s32 %r10578, %r10577, %r10571; + xor.b32 %r10579, %r10578, %r10573; + shf.l.wrap.b32 %r10580, %r10579, %r10579, 25; + add.s32 %r10581, %r10533, %r9776; + add.s32 %r10582, %r10581, %r10524; + xor.b32 %r10583, %r10582, %r10549; + shf.l.wrap.b32 %r10584, %r10583, %r10583, 16; + add.s32 %r10585, %r10584, %r10508; + xor.b32 %r10586, %r10585, %r10524; + shf.l.wrap.b32 %r10587, %r10586, %r10586, 20; + add.s32 %r10588, %r10582, %r9832; + add.s32 %r10589, %r10588, %r10587; + xor.b32 %r10590, %r10589, %r10584; + shf.l.wrap.b32 %r10591, %r10590, %r10590, 24; + add.s32 %r10592, %r10591, %r10585; + xor.b32 %r10593, %r10592, %r10587; + shf.l.wrap.b32 %r10594, %r10593, %r10593, 25; + add.s32 %r10595, %r10547, %r9825; + add.s32 %r10596, %r10595, %r10538; + xor.b32 %r10597, %r10596, %r10507; + shf.l.wrap.b32 %r10598, %r10597, %r10597, 16; + add.s32 %r10599, %r10598, %r10522; + xor.b32 %r10600, %r10599, %r10538; + shf.l.wrap.b32 %r10601, %r10600, %r10600, 20; + add.s32 %r10602, %r10596, %r9811; + add.s32 %r10603, %r10602, %r10601; + xor.b32 %r10604, %r10603, %r10598; + shf.l.wrap.b32 %r10605, %r10604, %r10604, 24; + add.s32 %r10606, %r10605, %r10599; + xor.b32 %r10607, %r10606, %r10601; + shf.l.wrap.b32 %r10608, %r10607, %r10607, 25; + add.s32 %r10609, %r10561, %r9867; + add.s32 %r10610, %r10609, %r10580; + xor.b32 %r10611, %r10610, %r10605; + shf.l.wrap.b32 %r10612, %r10611, %r10611, 16; + add.s32 %r10613, %r10612, %r10592; + xor.b32 %r10614, %r10613, %r10580; + shf.l.wrap.b32 %r10615, %r10614, %r10614, 20; + add.s32 %r10616, %r10610, %r9839; + add.s32 %r10617, %r10616, %r10615; + xor.b32 %r10618, %r10617, %r10612; + shf.l.wrap.b32 %r10619, %r10618, %r10618, 24; + add.s32 %r10620, %r10619, %r10613; + xor.b32 %r10621, %r10620, %r10615; + shf.l.wrap.b32 %r10622, %r10621, %r10621, 25; + add.s32 %r10623, %r10575, %r9783; + add.s32 %r10624, %r10623, %r10594; + xor.b32 %r10625, %r10624, %r10563; + shf.l.wrap.b32 %r10626, %r10625, %r10625, 16; + add.s32 %r10627, %r10626, %r10606; + xor.b32 %r10628, %r10627, %r10594; + shf.l.wrap.b32 %r10629, %r10628, %r10628, 20; + add.s32 %r10630, %r10624, %r9853; + add.s32 %r10631, %r10630, %r10629; + xor.b32 %r10632, %r10631, %r10626; + shf.l.wrap.b32 %r10633, %r10632, %r10632, 24; + add.s32 %r10634, %r10633, %r10627; + xor.b32 %r10635, %r10634, %r10629; + shf.l.wrap.b32 %r10636, %r10635, %r10635, 25; + add.s32 %r10637, %r10589, %r9790; + add.s32 %r10638, %r10637, %r10608; + xor.b32 %r10639, %r10638, %r10577; + shf.l.wrap.b32 %r10640, %r10639, %r10639, 16; + add.s32 %r10641, %r10640, %r10564; + xor.b32 %r10642, %r10641, %r10608; + shf.l.wrap.b32 %r10643, %r10642, %r10642, 20; + add.s32 %r10644, %r10638, %r9797; + add.s32 %r10645, 
%r10644, %r10643; + xor.b32 %r10646, %r10645, %r10640; + shf.l.wrap.b32 %r10647, %r10646, %r10646, 24; + add.s32 %r10648, %r10647, %r10641; + xor.b32 %r10649, %r10648, %r10643; + shf.l.wrap.b32 %r10650, %r10649, %r10649, 25; + add.s32 %r10651, %r10603, %r9818; + add.s32 %r10652, %r10651, %r10566; + xor.b32 %r10653, %r10652, %r10591; + shf.l.wrap.b32 %r10654, %r10653, %r10653, 16; + add.s32 %r10655, %r10654, %r10578; + xor.b32 %r10656, %r10655, %r10566; + shf.l.wrap.b32 %r10657, %r10656, %r10656, 20; + add.s32 %r10658, %r10652, %r9860; + add.s32 %r10659, %r10658, %r10657; + xor.b32 %r10660, %r10659, %r10654; + shf.l.wrap.b32 %r10661, %r10660, %r10660, 24; + add.s32 %r10662, %r10661, %r10655; + xor.b32 %r10663, %r10662, %r10657; + shf.l.wrap.b32 %r10664, %r10663, %r10663, 25; + xor.b32 %r11689, %r10648, %r10617; + st.local.u32 [%rd3+-104], %r11689; + xor.b32 %r11688, %r10662, %r10631; + st.local.u32 [%rd3+-100], %r11688; + xor.b32 %r11687, %r10620, %r10645; + st.local.u32 [%rd3+-96], %r11687; + xor.b32 %r11686, %r10659, %r10634; + st.local.u32 [%rd3+-92], %r11686; + xor.b32 %r11685, %r10664, %r10633; + st.local.u32 [%rd3+-88], %r11685; + xor.b32 %r11684, %r10622, %r10647; + st.local.u32 [%rd3+-84], %r11684; + xor.b32 %r11683, %r10661, %r10636; + st.local.u32 [%rd3+-80], %r11683; + xor.b32 %r11682, %r10650, %r10619; + st.local.u32 [%rd3+-76], %r11682; + add.s16 %rs391, %rs391, 1; + st.local.u8 [%rd3+1], %rs391; + add.s64 %rd261, %rd261, 64; + add.s64 %rd271, %rd271, -64; + setp.gt.u64 %p49, %rd271, 64; + @%p49 bra $L__BB1_60; + bra.uni $L__BB1_61; + +$L__BB1_58: + ld.local.u64 %rd269, [%rd3+-72]; + +$L__BB1_61: + cvt.u64.u16 %rd210, %rs390; + and.b64 %rd92, %rd210, 255; + mov.u64 %rd211, 64; + sub.s64 %rd212, %rd211, %rd92; + min.u64 %rd93, %rd212, %rd271; + setp.eq.s64 %p50, %rd93, 0; + @%p50 bra $L__BB1_64; + + add.s64 %rd214, %rd2, %rd92; + add.s64 %rd94, %rd214, 72; + mov.u64 %rd272, 0; + +$L__BB1_63: + add.s64 %rd215, %rd261, %rd272; + ld.local.u8 %rs345, [%rd215]; + add.s64 %rd216, %rd94, %rd272; + st.local.u8 [%rd216], %rs345; + add.s64 %rd272, %rd272, 1; + setp.lt.u64 %p51, %rd272, %rd93; + @%p51 bra $L__BB1_63; + +$L__BB1_64: + cvt.u16.u64 %rs346, %rd93; + ld.local.u8 %rs347, [%rd3]; + add.s16 %rs348, %rs347, %rs346; + st.local.u8 [%rd3], %rs348; + ld.local.u8 %rs392, [%rd3+8]; + cvt.u64.u16 %rd217, %rs392; + and.b64 %rd218, %rd217, 255; + popc.b64 %r10665, %rd269; + cvt.u64.u32 %rd97, %r10665; + setp.ge.u64 %p52, %rd97, %rd218; + @%p52 bra $L__BB1_68; + + ld.local.u8 %r10666, [%rd3+2]; + or.b32 %r135, %r10666, 4; + ld.local.u8 %r10667, [%rd3+-120]; + ld.local.u8 %r10668, [%rd3+-119]; + prmt.b32 %r10669, %r10668, %r10667, 30212; + ld.local.u8 %r10670, [%rd3+-118]; + ld.local.u8 %r10671, [%rd3+-117]; + prmt.b32 %r10672, %r10671, %r10670, 30212; + prmt.b32 %r136, %r10672, %r10669, 4180; + ld.local.u8 %r10673, [%rd3+-136]; + ld.local.u8 %r10674, [%rd3+-135]; + prmt.b32 %r10675, %r10674, %r10673, 30212; + ld.local.u8 %r10676, [%rd3+-134]; + ld.local.u8 %r10677, [%rd3+-133]; + prmt.b32 %r10678, %r10677, %r10676, 30212; + prmt.b32 %r10679, %r10678, %r10675, 4180; + add.s32 %r137, %r136, %r10679; + ld.local.u8 %r10680, [%rd3+-116]; + ld.local.u8 %r10681, [%rd3+-115]; + prmt.b32 %r10682, %r10681, %r10680, 30212; + ld.local.u8 %r10683, [%rd3+-114]; + ld.local.u8 %r10684, [%rd3+-113]; + prmt.b32 %r10685, %r10684, %r10683, 30212; + prmt.b32 %r138, %r10685, %r10682, 4180; + ld.local.u8 %r10686, [%rd3+-132]; + ld.local.u8 %r10687, [%rd3+-131]; + prmt.b32 %r10688, %r10687, %r10686, 30212; + 
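+ // Annotation (inferred): above, the xor.b32/st.local.u32 pairs are the
+ // feed-forward h'[i] = v[i] ^ v[i+8], writing the new eight-word chaining value
+ // back to local state; the block counter byte (%rs391) is incremented, the input
+ // pointer advances 64 bytes, and the loop repeats while more than 64 bytes
+ // remain ($L__BB1_60). $L__BB1_61..64 copy the tail into the block buffer and
+ // update its length byte. The popc.b64 test then compares the number of buffered
+ // chaining values against popcount of the chunk counter (the BLAKE3 rule for how
+ // far the CV stack must be merged), and the or.b32 with 4 sets what in BLAKE3
+ // terms is the PARENT flag. The surrounding ld.local.u8/prmt.b32 pairs reload
+ // the current chaining value and pre-add the (h[i], h[i+4]) pairs
+ // (%r137, %r139, %r141, %r143) consumed by the first mixing step of each
+ // parent compression.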
ld.local.u8 %r10689, [%rd3+-130]; + ld.local.u8 %r10690, [%rd3+-129]; + prmt.b32 %r10691, %r10690, %r10689, 30212; + prmt.b32 %r10692, %r10691, %r10688, 4180; + add.s32 %r139, %r138, %r10692; + ld.local.u8 %r10693, [%rd3+-112]; + ld.local.u8 %r10694, [%rd3+-111]; + prmt.b32 %r10695, %r10694, %r10693, 30212; + ld.local.u8 %r10696, [%rd3+-110]; + ld.local.u8 %r10697, [%rd3+-109]; + prmt.b32 %r10698, %r10697, %r10696, 30212; + prmt.b32 %r140, %r10698, %r10695, 4180; + ld.local.u8 %r10699, [%rd3+-128]; + ld.local.u8 %r10700, [%rd3+-127]; + prmt.b32 %r10701, %r10700, %r10699, 30212; + ld.local.u8 %r10702, [%rd3+-126]; + ld.local.u8 %r10703, [%rd3+-125]; + prmt.b32 %r10704, %r10703, %r10702, 30212; + prmt.b32 %r10705, %r10704, %r10701, 4180; + add.s32 %r141, %r140, %r10705; + ld.local.u8 %r10706, [%rd3+-108]; + ld.local.u8 %r10707, [%rd3+-107]; + prmt.b32 %r10708, %r10707, %r10706, 30212; + ld.local.u8 %r10709, [%rd3+-106]; + ld.local.u8 %r10710, [%rd3+-105]; + prmt.b32 %r10711, %r10710, %r10709, 30212; + prmt.b32 %r142, %r10711, %r10708, 4180; + ld.local.u8 %r10712, [%rd3+-124]; + ld.local.u8 %r10713, [%rd3+-123]; + prmt.b32 %r10714, %r10713, %r10712, 30212; + ld.local.u8 %r10715, [%rd3+-122]; + ld.local.u8 %r10716, [%rd3+-121]; + prmt.b32 %r10717, %r10716, %r10715, 30212; + prmt.b32 %r10718, %r10717, %r10714, 4180; + add.s32 %r143, %r142, %r10718; + +$L__BB1_66: + and.b16 %rs349, %rs392, 255; + mul.wide.u16 %r10719, %rs349, 32; + add.s32 %r10720, %r10719, -64; + cvt.s64.s32 %rd219, %r10720; + add.s64 %rd220, %rd2, %rd219; + ld.local.u8 %r10721, [%rd220+145]; + ld.local.u8 %r10722, [%rd220+146]; + prmt.b32 %r10723, %r10722, %r10721, 30212; + ld.local.u8 %r10724, [%rd220+147]; + prmt.b32 %r10725, %r10724, %r10723, 28756; + ld.local.u8 %r10726, [%rd220+148]; + prmt.b32 %r10727, %r10726, %r10725, 1620; + ld.local.u8 %r10728, [%rd220+149]; + ld.local.u8 %r10729, [%rd220+150]; + prmt.b32 %r10730, %r10729, %r10728, 30212; + ld.local.u8 %r10731, [%rd220+151]; + prmt.b32 %r10732, %r10731, %r10730, 28756; + ld.local.u8 %r10733, [%rd220+152]; + prmt.b32 %r10734, %r10733, %r10732, 1620; + ld.local.u8 %r10735, [%rd220+153]; + ld.local.u8 %r10736, [%rd220+154]; + prmt.b32 %r10737, %r10736, %r10735, 30212; + ld.local.u8 %r10738, [%rd220+155]; + prmt.b32 %r10739, %r10738, %r10737, 28756; + ld.local.u8 %r10740, [%rd220+156]; + prmt.b32 %r10741, %r10740, %r10739, 1620; + ld.local.u8 %r10742, [%rd220+157]; + ld.local.u8 %r10743, [%rd220+158]; + prmt.b32 %r10744, %r10743, %r10742, 30212; + ld.local.u8 %r10745, [%rd220+159]; + prmt.b32 %r10746, %r10745, %r10744, 28756; + ld.local.u8 %r10747, [%rd220+160]; + prmt.b32 %r10748, %r10747, %r10746, 1620; + ld.local.u8 %r10749, [%rd220+161]; + ld.local.u8 %r10750, [%rd220+162]; + prmt.b32 %r10751, %r10750, %r10749, 30212; + ld.local.u8 %r10752, [%rd220+163]; + prmt.b32 %r10753, %r10752, %r10751, 28756; + ld.local.u8 %r10754, [%rd220+164]; + prmt.b32 %r10755, %r10754, %r10753, 1620; + ld.local.u8 %r10756, [%rd220+165]; + ld.local.u8 %r10757, [%rd220+166]; + prmt.b32 %r10758, %r10757, %r10756, 30212; + ld.local.u8 %r10759, [%rd220+167]; + prmt.b32 %r10760, %r10759, %r10758, 28756; + ld.local.u8 %r10761, [%rd220+168]; + prmt.b32 %r10762, %r10761, %r10760, 1620; + ld.local.u8 %r10763, [%rd220+169]; + ld.local.u8 %r10764, [%rd220+170]; + prmt.b32 %r10765, %r10764, %r10763, 30212; + ld.local.u8 %r10766, [%rd220+171]; + prmt.b32 %r10767, %r10766, %r10765, 28756; + ld.local.u8 %r10768, [%rd220+172]; + prmt.b32 %r10769, %r10768, %r10767, 1620; + ld.local.u8 %r10770, [%rd220+173]; 
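+ // Annotation (inferred): $L__BB1_66 merges two stacked 32-byte chaining values
+ // into a parent node. The stack index (%rs392) is scaled by 32, and the byte
+ // loads around this note ([%rd220+145..208]) gather the resulting 64-byte
+ // parent block, prmt-packing it into sixteen message words for the compression
+ // that follows.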
+ ld.local.u8 %r10771, [%rd220+174]; + prmt.b32 %r10772, %r10771, %r10770, 30212; + ld.local.u8 %r10773, [%rd220+175]; + prmt.b32 %r10774, %r10773, %r10772, 28756; + ld.local.u8 %r10775, [%rd220+176]; + prmt.b32 %r10776, %r10775, %r10774, 1620; + ld.local.u8 %r10777, [%rd220+177]; + ld.local.u8 %r10778, [%rd220+178]; + prmt.b32 %r10779, %r10778, %r10777, 30212; + ld.local.u8 %r10780, [%rd220+179]; + prmt.b32 %r10781, %r10780, %r10779, 28756; + ld.local.u8 %r10782, [%rd220+180]; + prmt.b32 %r10783, %r10782, %r10781, 1620; + ld.local.u8 %r10784, [%rd220+181]; + ld.local.u8 %r10785, [%rd220+182]; + prmt.b32 %r10786, %r10785, %r10784, 30212; + ld.local.u8 %r10787, [%rd220+183]; + prmt.b32 %r10788, %r10787, %r10786, 28756; + ld.local.u8 %r10789, [%rd220+184]; + prmt.b32 %r10790, %r10789, %r10788, 1620; + ld.local.u8 %r10791, [%rd220+185]; + ld.local.u8 %r10792, [%rd220+186]; + prmt.b32 %r10793, %r10792, %r10791, 30212; + ld.local.u8 %r10794, [%rd220+187]; + prmt.b32 %r10795, %r10794, %r10793, 28756; + ld.local.u8 %r10796, [%rd220+188]; + prmt.b32 %r10797, %r10796, %r10795, 1620; + ld.local.u8 %r10798, [%rd220+189]; + ld.local.u8 %r10799, [%rd220+190]; + prmt.b32 %r10800, %r10799, %r10798, 30212; + ld.local.u8 %r10801, [%rd220+191]; + prmt.b32 %r10802, %r10801, %r10800, 28756; + ld.local.u8 %r10803, [%rd220+192]; + prmt.b32 %r10804, %r10803, %r10802, 1620; + ld.local.u8 %r10805, [%rd220+193]; + ld.local.u8 %r10806, [%rd220+194]; + prmt.b32 %r10807, %r10806, %r10805, 30212; + ld.local.u8 %r10808, [%rd220+195]; + prmt.b32 %r10809, %r10808, %r10807, 28756; + ld.local.u8 %r10810, [%rd220+196]; + prmt.b32 %r10811, %r10810, %r10809, 1620; + ld.local.u8 %r10812, [%rd220+197]; + ld.local.u8 %r10813, [%rd220+198]; + prmt.b32 %r10814, %r10813, %r10812, 30212; + ld.local.u8 %r10815, [%rd220+199]; + prmt.b32 %r10816, %r10815, %r10814, 28756; + ld.local.u8 %r10817, [%rd220+200]; + prmt.b32 %r10818, %r10817, %r10816, 1620; + ld.local.u8 %r10819, [%rd220+201]; + ld.local.u8 %r10820, [%rd220+202]; + prmt.b32 %r10821, %r10820, %r10819, 30212; + ld.local.u8 %r10822, [%rd220+203]; + prmt.b32 %r10823, %r10822, %r10821, 28756; + ld.local.u8 %r10824, [%rd220+204]; + prmt.b32 %r10825, %r10824, %r10823, 1620; + ld.local.u8 %r10826, [%rd220+205]; + ld.local.u8 %r10827, [%rd220+206]; + prmt.b32 %r10828, %r10827, %r10826, 30212; + ld.local.u8 %r10829, [%rd220+207]; + prmt.b32 %r10830, %r10829, %r10828, 28756; + ld.local.u8 %r10831, [%rd220+208]; + prmt.b32 %r10832, %r10831, %r10830, 1620; + add.s32 %r10833, %r137, %r10727; + shf.l.wrap.b32 %r10834, %r10833, %r10833, 16; + add.s32 %r10835, %r10834, 1779033703; + xor.b32 %r10836, %r10835, %r136; + shf.l.wrap.b32 %r10837, %r10836, %r10836, 20; + add.s32 %r10838, %r10734, %r10833; + add.s32 %r10839, %r10838, %r10837; + xor.b32 %r10840, %r10839, %r10834; + shf.l.wrap.b32 %r10841, %r10840, %r10840, 24; + add.s32 %r10842, %r10841, %r10835; + xor.b32 %r10843, %r10842, %r10837; + shf.l.wrap.b32 %r10844, %r10843, %r10843, 25; + add.s32 %r10845, %r139, %r10741; + shf.l.wrap.b32 %r10846, %r10845, %r10845, 16; + add.s32 %r10847, %r10846, -1150833019; + xor.b32 %r10848, %r10847, %r138; + shf.l.wrap.b32 %r10849, %r10848, %r10848, 20; + add.s32 %r10850, %r10748, %r10845; + add.s32 %r10851, %r10850, %r10849; + xor.b32 %r10852, %r10851, %r10846; + shf.l.wrap.b32 %r10853, %r10852, %r10852, 24; + add.s32 %r10854, %r10853, %r10847; + xor.b32 %r10855, %r10854, %r10849; + shf.l.wrap.b32 %r10856, %r10855, %r10855, 25; + add.s32 %r10857, %r141, %r10755; + shr.u32 %r10858, %r10857, 16; + shl.b32 
%r10859, %r10857, 16; + xor.b32 %r10860, %r10859, 4194304; + or.b32 %r10861, %r10860, %r10858; + add.s32 %r10862, %r10861, 1013904242; + xor.b32 %r10863, %r10862, %r140; + shf.l.wrap.b32 %r10864, %r10863, %r10863, 20; + add.s32 %r10865, %r10762, %r10857; + add.s32 %r10866, %r10865, %r10864; + xor.b32 %r10867, %r10866, %r10861; + shf.l.wrap.b32 %r10868, %r10867, %r10867, 24; + add.s32 %r10869, %r10868, %r10862; + xor.b32 %r10870, %r10869, %r10864; + shf.l.wrap.b32 %r10871, %r10870, %r10870, 25; + add.s32 %r10872, %r143, %r10769; + xor.b32 %r10873, %r10872, %r135; + shr.u32 %r10874, %r10872, 16; + shl.b32 %r10875, %r10873, 16; + or.b32 %r10876, %r10875, %r10874; + add.s32 %r10877, %r10876, -1521486534; + xor.b32 %r10878, %r10877, %r142; + shf.l.wrap.b32 %r10879, %r10878, %r10878, 20; + add.s32 %r10880, %r10776, %r10872; + add.s32 %r10881, %r10880, %r10879; + xor.b32 %r10882, %r10881, %r10876; + shf.l.wrap.b32 %r10883, %r10882, %r10882, 24; + add.s32 %r10884, %r10883, %r10877; + xor.b32 %r10885, %r10884, %r10879; + shf.l.wrap.b32 %r10886, %r10885, %r10885, 25; + add.s32 %r10887, %r10856, %r10839; + add.s32 %r10888, %r10887, %r10783; + xor.b32 %r10889, %r10883, %r10888; + shf.l.wrap.b32 %r10890, %r10889, %r10889, 16; + add.s32 %r10891, %r10890, %r10869; + xor.b32 %r10892, %r10891, %r10856; + shf.l.wrap.b32 %r10893, %r10892, %r10892, 20; + add.s32 %r10894, %r10790, %r10888; + add.s32 %r10895, %r10894, %r10893; + xor.b32 %r10896, %r10895, %r10890; + shf.l.wrap.b32 %r10897, %r10896, %r10896, 24; + add.s32 %r10898, %r10897, %r10891; + xor.b32 %r10899, %r10898, %r10893; + shf.l.wrap.b32 %r10900, %r10899, %r10899, 25; + add.s32 %r10901, %r10871, %r10851; + add.s32 %r10902, %r10901, %r10797; + xor.b32 %r10903, %r10902, %r10841; + shf.l.wrap.b32 %r10904, %r10903, %r10903, 16; + add.s32 %r10905, %r10904, %r10884; + xor.b32 %r10906, %r10905, %r10871; + shf.l.wrap.b32 %r10907, %r10906, %r10906, 20; + add.s32 %r10908, %r10804, %r10902; + add.s32 %r10909, %r10908, %r10907; + xor.b32 %r10910, %r10909, %r10904; + shf.l.wrap.b32 %r10911, %r10910, %r10910, 24; + add.s32 %r10912, %r10911, %r10905; + xor.b32 %r10913, %r10912, %r10907; + shf.l.wrap.b32 %r10914, %r10913, %r10913, 25; + add.s32 %r10915, %r10886, %r10866; + add.s32 %r10916, %r10915, %r10811; + xor.b32 %r10917, %r10916, %r10853; + shf.l.wrap.b32 %r10918, %r10917, %r10917, 16; + add.s32 %r10919, %r10918, %r10842; + xor.b32 %r10920, %r10919, %r10886; + shf.l.wrap.b32 %r10921, %r10920, %r10920, 20; + add.s32 %r10922, %r10818, %r10916; + add.s32 %r10923, %r10922, %r10921; + xor.b32 %r10924, %r10923, %r10918; + shf.l.wrap.b32 %r10925, %r10924, %r10924, 24; + add.s32 %r10926, %r10925, %r10919; + xor.b32 %r10927, %r10926, %r10921; + shf.l.wrap.b32 %r10928, %r10927, %r10927, 25; + add.s32 %r10929, %r10881, %r10844; + add.s32 %r10930, %r10929, %r10825; + xor.b32 %r10931, %r10930, %r10868; + shf.l.wrap.b32 %r10932, %r10931, %r10931, 16; + add.s32 %r10933, %r10932, %r10854; + xor.b32 %r10934, %r10933, %r10844; + shf.l.wrap.b32 %r10935, %r10934, %r10934, 20; + add.s32 %r10936, %r10832, %r10930; + add.s32 %r10937, %r10936, %r10935; + xor.b32 %r10938, %r10937, %r10932; + shf.l.wrap.b32 %r10939, %r10938, %r10938, 24; + add.s32 %r10940, %r10939, %r10933; + xor.b32 %r10941, %r10940, %r10935; + shf.l.wrap.b32 %r10942, %r10941, %r10941, 25; + add.s32 %r10943, %r10895, %r10741; + add.s32 %r10944, %r10943, %r10942; + xor.b32 %r10945, %r10944, %r10911; + shf.l.wrap.b32 %r10946, %r10945, %r10945, 16; + add.s32 %r10947, %r10946, %r10926; + xor.b32 %r10948, %r10947, %r10942; 
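+ // Annotation (inferred): first round of the parent compression. The state is
+ // seeded with the same IV constants as the chunk loop, but no counter word is
+ // xored in before the first rotate (i.e. the counter is zero for parent
+ // nodes), v15 is xored with %r135 (the flags byte with bit 2, PARENT, set),
+ // and the message words are the two child chaining values loaded above.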
+ shf.l.wrap.b32 %r10949, %r10948, %r10948, 20; + add.s32 %r10950, %r10944, %r10769; + add.s32 %r10951, %r10950, %r10949; + xor.b32 %r10952, %r10951, %r10946; + shf.l.wrap.b32 %r10953, %r10952, %r10952, 24; + add.s32 %r10954, %r10953, %r10947; + xor.b32 %r10955, %r10954, %r10949; + shf.l.wrap.b32 %r10956, %r10955, %r10955, 25; + add.s32 %r10957, %r10909, %r10748; + add.s32 %r10958, %r10957, %r10900; + xor.b32 %r10959, %r10925, %r10958; + shf.l.wrap.b32 %r10960, %r10959, %r10959, 16; + add.s32 %r10961, %r10940, %r10960; + xor.b32 %r10962, %r10961, %r10900; + shf.l.wrap.b32 %r10963, %r10962, %r10962, 20; + add.s32 %r10964, %r10958, %r10797; + add.s32 %r10965, %r10964, %r10963; + xor.b32 %r10966, %r10965, %r10960; + shf.l.wrap.b32 %r10967, %r10966, %r10966, 24; + add.s32 %r10968, %r10967, %r10961; + xor.b32 %r10969, %r10968, %r10963; + shf.l.wrap.b32 %r10970, %r10969, %r10969, 25; + add.s32 %r10971, %r10914, %r10776; + add.s32 %r10972, %r10971, %r10923; + xor.b32 %r10973, %r10939, %r10972; + shf.l.wrap.b32 %r10974, %r10973, %r10973, 16; + add.s32 %r10975, %r10974, %r10898; + xor.b32 %r10976, %r10975, %r10914; + shf.l.wrap.b32 %r10977, %r10976, %r10976, 20; + add.s32 %r10978, %r10972, %r10727; + add.s32 %r10979, %r10978, %r10977; + xor.b32 %r10980, %r10979, %r10974; + shf.l.wrap.b32 %r10981, %r10980, %r10980, 24; + add.s32 %r10982, %r10981, %r10975; + xor.b32 %r10983, %r10982, %r10977; + shf.l.wrap.b32 %r10984, %r10983, %r10983, 25; + add.s32 %r10985, %r10928, %r10755; + add.s32 %r10986, %r10985, %r10937; + xor.b32 %r10987, %r10986, %r10897; + shf.l.wrap.b32 %r10988, %r10987, %r10987, 16; + add.s32 %r10989, %r10988, %r10912; + xor.b32 %r10990, %r10989, %r10928; + shf.l.wrap.b32 %r10991, %r10990, %r10990, 20; + add.s32 %r10992, %r10986, %r10818; + add.s32 %r10993, %r10992, %r10991; + xor.b32 %r10994, %r10993, %r10988; + shf.l.wrap.b32 %r10995, %r10994, %r10994, 24; + add.s32 %r10996, %r10995, %r10989; + xor.b32 %r10997, %r10996, %r10991; + shf.l.wrap.b32 %r10998, %r10997, %r10997, 25; + add.s32 %r10999, %r10970, %r10734; + add.s32 %r11000, %r10999, %r10951; + xor.b32 %r11001, %r11000, %r10995; + shf.l.wrap.b32 %r11002, %r11001, %r11001, 16; + add.s32 %r11003, %r11002, %r10982; + xor.b32 %r11004, %r11003, %r10970; + shf.l.wrap.b32 %r11005, %r11004, %r11004, 20; + add.s32 %r11006, %r11000, %r10804; + add.s32 %r11007, %r11006, %r11005; + xor.b32 %r11008, %r11007, %r11002; + shf.l.wrap.b32 %r11009, %r11008, %r11008, 24; + add.s32 %r11010, %r11009, %r11003; + xor.b32 %r11011, %r11010, %r11005; + shf.l.wrap.b32 %r11012, %r11011, %r11011, 25; + add.s32 %r11013, %r10965, %r10811; + add.s32 %r11014, %r11013, %r10984; + xor.b32 %r11015, %r10953, %r11014; + shf.l.wrap.b32 %r11016, %r11015, %r11015, 16; + add.s32 %r11017, %r11016, %r10996; + xor.b32 %r11018, %r11017, %r10984; + shf.l.wrap.b32 %r11019, %r11018, %r11018, 20; + add.s32 %r11020, %r11014, %r10762; + add.s32 %r11021, %r11020, %r11019; + xor.b32 %r11022, %r11021, %r11016; + shf.l.wrap.b32 %r11023, %r11022, %r11022, 24; + add.s32 %r11024, %r11023, %r11017; + xor.b32 %r11025, %r11024, %r11019; + shf.l.wrap.b32 %r11026, %r11025, %r11025, 25; + add.s32 %r11027, %r10979, %r10790; + add.s32 %r11028, %r11027, %r10998; + xor.b32 %r11029, %r11028, %r10967; + shf.l.wrap.b32 %r11030, %r11029, %r11029, 16; + add.s32 %r11031, %r11030, %r10954; + xor.b32 %r11032, %r11031, %r10998; + shf.l.wrap.b32 %r11033, %r11032, %r11032, 20; + add.s32 %r11034, %r11028, %r10825; + add.s32 %r11035, %r11034, %r11033; + xor.b32 %r11036, %r11035, %r11030; + shf.l.wrap.b32 
%r11037, %r11036, %r11036, 24; + add.s32 %r11038, %r11037, %r11031; + xor.b32 %r11039, %r11038, %r11033; + shf.l.wrap.b32 %r11040, %r11039, %r11039, 25; + add.s32 %r11041, %r10993, %r10832; + add.s32 %r11042, %r11041, %r10956; + xor.b32 %r11043, %r11042, %r10981; + shf.l.wrap.b32 %r11044, %r11043, %r11043, 16; + add.s32 %r11045, %r11044, %r10968; + xor.b32 %r11046, %r11045, %r10956; + shf.l.wrap.b32 %r11047, %r11046, %r11046, 20; + add.s32 %r11048, %r11042, %r10783; + add.s32 %r11049, %r11048, %r11047; + xor.b32 %r11050, %r11049, %r11044; + shf.l.wrap.b32 %r11051, %r11050, %r11050, 24; + add.s32 %r11052, %r11051, %r11045; + xor.b32 %r11053, %r11052, %r11047; + shf.l.wrap.b32 %r11054, %r11053, %r11053, 25; + add.s32 %r11055, %r11007, %r10748; + add.s32 %r11056, %r11055, %r11054; + xor.b32 %r11057, %r11056, %r11023; + shf.l.wrap.b32 %r11058, %r11057, %r11057, 16; + add.s32 %r11059, %r11058, %r11038; + xor.b32 %r11060, %r11059, %r11054; + shf.l.wrap.b32 %r11061, %r11060, %r11060, 20; + add.s32 %r11062, %r11056, %r10755; + add.s32 %r11063, %r11062, %r11061; + xor.b32 %r11064, %r11063, %r11058; + shf.l.wrap.b32 %r11065, %r11064, %r11064, 24; + add.s32 %r11066, %r11065, %r11059; + xor.b32 %r11067, %r11066, %r11061; + shf.l.wrap.b32 %r11068, %r11067, %r11067, 25; + add.s32 %r11069, %r11021, %r10797; + add.s32 %r11070, %r11069, %r11012; + xor.b32 %r11071, %r11070, %r11037; + shf.l.wrap.b32 %r11072, %r11071, %r11071, 16; + add.s32 %r11073, %r11072, %r11052; + xor.b32 %r11074, %r11073, %r11012; + shf.l.wrap.b32 %r11075, %r11074, %r11074, 20; + add.s32 %r11076, %r11070, %r10811; + add.s32 %r11077, %r11076, %r11075; + xor.b32 %r11078, %r11077, %r11072; + shf.l.wrap.b32 %r11079, %r11078, %r11078, 24; + add.s32 %r11080, %r11079, %r11073; + xor.b32 %r11081, %r11080, %r11075; + shf.l.wrap.b32 %r11082, %r11081, %r11081, 25; + add.s32 %r11083, %r11035, %r10818; + add.s32 %r11084, %r11083, %r11026; + xor.b32 %r11085, %r11051, %r11084; + shf.l.wrap.b32 %r11086, %r11085, %r11085, 16; + add.s32 %r11087, %r11086, %r11010; + xor.b32 %r11088, %r11087, %r11026; + shf.l.wrap.b32 %r11089, %r11088, %r11088, 20; + add.s32 %r11090, %r11084, %r10741; + add.s32 %r11091, %r11090, %r11089; + xor.b32 %r11092, %r11091, %r11086; + shf.l.wrap.b32 %r11093, %r11092, %r11092, 24; + add.s32 %r11094, %r11093, %r11087; + xor.b32 %r11095, %r11094, %r11089; + shf.l.wrap.b32 %r11096, %r11095, %r11095, 25; + add.s32 %r11097, %r11040, %r10776; + add.s32 %r11098, %r11097, %r11049; + xor.b32 %r11099, %r11098, %r11009; + shf.l.wrap.b32 %r11100, %r11099, %r11099, 16; + add.s32 %r11101, %r11100, %r11024; + xor.b32 %r11102, %r11101, %r11040; + shf.l.wrap.b32 %r11103, %r11102, %r11102, 20; + add.s32 %r11104, %r11098, %r10825; + add.s32 %r11105, %r11104, %r11103; + xor.b32 %r11106, %r11105, %r11100; + shf.l.wrap.b32 %r11107, %r11106, %r11106, 24; + add.s32 %r11108, %r11107, %r11101; + xor.b32 %r11109, %r11108, %r11103; + shf.l.wrap.b32 %r11110, %r11109, %r11109, 25; + add.s32 %r11111, %r11082, %r10769; + add.s32 %r11112, %r11111, %r11063; + xor.b32 %r11113, %r11112, %r11107; + shf.l.wrap.b32 %r11114, %r11113, %r11113, 16; + add.s32 %r11115, %r11114, %r11094; + xor.b32 %r11116, %r11115, %r11082; + shf.l.wrap.b32 %r11117, %r11116, %r11116, 20; + add.s32 %r11118, %r11112, %r10762; + add.s32 %r11119, %r11118, %r11117; + xor.b32 %r11120, %r11119, %r11114; + shf.l.wrap.b32 %r11121, %r11120, %r11120, 24; + add.s32 %r11122, %r11121, %r11115; + xor.b32 %r11123, %r11122, %r11117; + shf.l.wrap.b32 %r11124, %r11123, %r11123, 25; + add.s32 %r11125, %r11077, 
%r10790; + add.s32 %r11126, %r11125, %r11096; + xor.b32 %r11127, %r11065, %r11126; + shf.l.wrap.b32 %r11128, %r11127, %r11127, 16; + add.s32 %r11129, %r11128, %r11108; + xor.b32 %r11130, %r11129, %r11096; + shf.l.wrap.b32 %r11131, %r11130, %r11130, 20; + add.s32 %r11132, %r11126, %r10727; + add.s32 %r11133, %r11132, %r11131; + xor.b32 %r11134, %r11133, %r11128; + shf.l.wrap.b32 %r11135, %r11134, %r11134, 24; + add.s32 %r11136, %r11135, %r11129; + xor.b32 %r11137, %r11136, %r11131; + shf.l.wrap.b32 %r11138, %r11137, %r11137, 25; + add.s32 %r11139, %r11091, %r10804; + add.s32 %r11140, %r11139, %r11110; + xor.b32 %r11141, %r11140, %r11079; + shf.l.wrap.b32 %r11142, %r11141, %r11141, 16; + add.s32 %r11143, %r11142, %r11066; + xor.b32 %r11144, %r11143, %r11110; + shf.l.wrap.b32 %r11145, %r11144, %r11144, 20; + add.s32 %r11146, %r11140, %r10832; + add.s32 %r11147, %r11146, %r11145; + xor.b32 %r11148, %r11147, %r11142; + shf.l.wrap.b32 %r11149, %r11148, %r11148, 24; + add.s32 %r11150, %r11149, %r11143; + xor.b32 %r11151, %r11150, %r11145; + shf.l.wrap.b32 %r11152, %r11151, %r11151, 25; + add.s32 %r11153, %r11105, %r10783; + add.s32 %r11154, %r11153, %r11068; + xor.b32 %r11155, %r11154, %r11093; + shf.l.wrap.b32 %r11156, %r11155, %r11155, 16; + add.s32 %r11157, %r11156, %r11080; + xor.b32 %r11158, %r11157, %r11068; + shf.l.wrap.b32 %r11159, %r11158, %r11158, 20; + add.s32 %r11160, %r11154, %r10734; + add.s32 %r11161, %r11160, %r11159; + xor.b32 %r11162, %r11161, %r11156; + shf.l.wrap.b32 %r11163, %r11162, %r11162, 24; + add.s32 %r11164, %r11163, %r11157; + xor.b32 %r11165, %r11164, %r11159; + shf.l.wrap.b32 %r11166, %r11165, %r11165, 25; + add.s32 %r11167, %r11119, %r10797; + add.s32 %r11168, %r11167, %r11166; + xor.b32 %r11169, %r11168, %r11135; + shf.l.wrap.b32 %r11170, %r11169, %r11169, 16; + add.s32 %r11171, %r11170, %r11150; + xor.b32 %r11172, %r11171, %r11166; + shf.l.wrap.b32 %r11173, %r11172, %r11172, 20; + add.s32 %r11174, %r11168, %r10776; + add.s32 %r11175, %r11174, %r11173; + xor.b32 %r11176, %r11175, %r11170; + shf.l.wrap.b32 %r11177, %r11176, %r11176, 24; + add.s32 %r11178, %r11177, %r11171; + xor.b32 %r11179, %r11178, %r11173; + shf.l.wrap.b32 %r11180, %r11179, %r11179, 25; + add.s32 %r11181, %r11133, %r10811; + add.s32 %r11182, %r11181, %r11124; + xor.b32 %r11183, %r11182, %r11149; + shf.l.wrap.b32 %r11184, %r11183, %r11183, 16; + add.s32 %r11185, %r11184, %r11164; + xor.b32 %r11186, %r11185, %r11124; + shf.l.wrap.b32 %r11187, %r11186, %r11186, 20; + add.s32 %r11188, %r11182, %r10790; + add.s32 %r11189, %r11188, %r11187; + xor.b32 %r11190, %r11189, %r11184; + shf.l.wrap.b32 %r11191, %r11190, %r11190, 24; + add.s32 %r11192, %r11191, %r11185; + xor.b32 %r11193, %r11192, %r11187; + shf.l.wrap.b32 %r11194, %r11193, %r11193, 25; + add.s32 %r11195, %r11147, %r10825; + add.s32 %r11196, %r11195, %r11138; + xor.b32 %r11197, %r11163, %r11196; + shf.l.wrap.b32 %r11198, %r11197, %r11197, 16; + add.s32 %r11199, %r11198, %r11122; + xor.b32 %r11200, %r11199, %r11138; + shf.l.wrap.b32 %r11201, %r11200, %r11200, 20; + add.s32 %r11202, %r11196, %r10748; + add.s32 %r11203, %r11202, %r11201; + xor.b32 %r11204, %r11203, %r11198; + shf.l.wrap.b32 %r11205, %r11204, %r11204, 24; + add.s32 %r11206, %r11205, %r11199; + xor.b32 %r11207, %r11206, %r11201; + shf.l.wrap.b32 %r11208, %r11207, %r11207, 25; + add.s32 %r11209, %r11152, %r10818; + add.s32 %r11210, %r11209, %r11161; + xor.b32 %r11211, %r11210, %r11121; + shf.l.wrap.b32 %r11212, %r11211, %r11211, 16; + add.s32 %r11213, %r11212, %r11136; + xor.b32 
%r11214, %r11213, %r11152; + shf.l.wrap.b32 %r11215, %r11214, %r11214, 20; + add.s32 %r11216, %r11210, %r10832; + add.s32 %r11217, %r11216, %r11215; + xor.b32 %r11218, %r11217, %r11212; + shf.l.wrap.b32 %r11219, %r11218, %r11218, 24; + add.s32 %r11220, %r11219, %r11213; + xor.b32 %r11221, %r11220, %r11215; + shf.l.wrap.b32 %r11222, %r11221, %r11221, 25; + add.s32 %r11223, %r11194, %r10755; + add.s32 %r11224, %r11223, %r11175; + xor.b32 %r11225, %r11224, %r11219; + shf.l.wrap.b32 %r11226, %r11225, %r11225, 16; + add.s32 %r11227, %r11226, %r11206; + xor.b32 %r11228, %r11227, %r11194; + shf.l.wrap.b32 %r11229, %r11228, %r11228, 20; + add.s32 %r11230, %r11224, %r10727; + add.s32 %r11231, %r11230, %r11229; + xor.b32 %r11232, %r11231, %r11226; + shf.l.wrap.b32 %r11233, %r11232, %r11232, 24; + add.s32 %r11234, %r11233, %r11227; + xor.b32 %r11235, %r11234, %r11229; + shf.l.wrap.b32 %r11236, %r11235, %r11235, 25; + add.s32 %r11237, %r11189, %r10804; + add.s32 %r11238, %r11237, %r11208; + xor.b32 %r11239, %r11177, %r11238; + shf.l.wrap.b32 %r11240, %r11239, %r11239, 16; + add.s32 %r11241, %r11240, %r11220; + xor.b32 %r11242, %r11241, %r11208; + shf.l.wrap.b32 %r11243, %r11242, %r11242, 20; + add.s32 %r11244, %r11238, %r10741; + add.s32 %r11245, %r11244, %r11243; + xor.b32 %r11246, %r11245, %r11240; + shf.l.wrap.b32 %r11247, %r11246, %r11246, 24; + add.s32 %r11248, %r11247, %r11241; + xor.b32 %r11249, %r11248, %r11243; + shf.l.wrap.b32 %r11250, %r11249, %r11249, 25; + add.s32 %r11251, %r11203, %r10762; + add.s32 %r11252, %r11251, %r11222; + xor.b32 %r11253, %r11252, %r11191; + shf.l.wrap.b32 %r11254, %r11253, %r11253, 16; + add.s32 %r11255, %r11254, %r11178; + xor.b32 %r11256, %r11255, %r11222; + shf.l.wrap.b32 %r11257, %r11256, %r11256, 20; + add.s32 %r11258, %r11252, %r10783; + add.s32 %r11259, %r11258, %r11257; + xor.b32 %r11260, %r11259, %r11254; + shf.l.wrap.b32 %r11261, %r11260, %r11260, 24; + add.s32 %r11262, %r11261, %r11255; + xor.b32 %r11263, %r11262, %r11257; + shf.l.wrap.b32 %r11264, %r11263, %r11263, 25; + add.s32 %r11265, %r11217, %r10734; + add.s32 %r11266, %r11265, %r11180; + xor.b32 %r11267, %r11266, %r11205; + shf.l.wrap.b32 %r11268, %r11267, %r11267, 16; + add.s32 %r11269, %r11268, %r11192; + xor.b32 %r11270, %r11269, %r11180; + shf.l.wrap.b32 %r11271, %r11270, %r11270, 20; + add.s32 %r11272, %r11266, %r10769; + add.s32 %r11273, %r11272, %r11271; + xor.b32 %r11274, %r11273, %r11268; + shf.l.wrap.b32 %r11275, %r11274, %r11274, 24; + add.s32 %r11276, %r11275, %r11269; + xor.b32 %r11277, %r11276, %r11271; + shf.l.wrap.b32 %r11278, %r11277, %r11277, 25; + add.s32 %r11279, %r11231, %r10811; + add.s32 %r11280, %r11279, %r11278; + xor.b32 %r11281, %r11280, %r11247; + shf.l.wrap.b32 %r11282, %r11281, %r11281, 16; + add.s32 %r11283, %r11282, %r11262; + xor.b32 %r11284, %r11283, %r11278; + shf.l.wrap.b32 %r11285, %r11284, %r11284, 20; + add.s32 %r11286, %r11280, %r10818; + add.s32 %r11287, %r11286, %r11285; + xor.b32 %r11288, %r11287, %r11282; + shf.l.wrap.b32 %r11289, %r11288, %r11288, 24; + add.s32 %r11290, %r11289, %r11283; + xor.b32 %r11291, %r11290, %r11285; + shf.l.wrap.b32 %r11292, %r11291, %r11291, 25; + add.s32 %r11293, %r11245, %r10790; + add.s32 %r11294, %r11293, %r11236; + xor.b32 %r11295, %r11294, %r11261; + shf.l.wrap.b32 %r11296, %r11295, %r11295, 16; + add.s32 %r11297, %r11296, %r11276; + xor.b32 %r11298, %r11297, %r11236; + shf.l.wrap.b32 %r11299, %r11298, %r11298, 20; + add.s32 %r11300, %r11294, %r10804; + add.s32 %r11301, %r11300, %r11299; + xor.b32 %r11302, %r11301, 
%r11296; + shf.l.wrap.b32 %r11303, %r11302, %r11302, 24; + add.s32 %r11304, %r11303, %r11297; + xor.b32 %r11305, %r11304, %r11299; + shf.l.wrap.b32 %r11306, %r11305, %r11305, 25; + add.s32 %r11307, %r11259, %r10832; + add.s32 %r11308, %r11307, %r11250; + xor.b32 %r11309, %r11275, %r11308; + shf.l.wrap.b32 %r11310, %r11309, %r11309, 16; + add.s32 %r11311, %r11310, %r11234; + xor.b32 %r11312, %r11311, %r11250; + shf.l.wrap.b32 %r11313, %r11312, %r11312, 20; + add.s32 %r11314, %r11308, %r10797; + add.s32 %r11315, %r11314, %r11313; + xor.b32 %r11316, %r11315, %r11310; + shf.l.wrap.b32 %r11317, %r11316, %r11316, 24; + add.s32 %r11318, %r11317, %r11311; + xor.b32 %r11319, %r11318, %r11313; + shf.l.wrap.b32 %r11320, %r11319, %r11319, 25; + add.s32 %r11321, %r11264, %r10825; + add.s32 %r11322, %r11321, %r11273; + xor.b32 %r11323, %r11322, %r11233; + shf.l.wrap.b32 %r11324, %r11323, %r11323, 16; + add.s32 %r11325, %r11324, %r11248; + xor.b32 %r11326, %r11325, %r11264; + shf.l.wrap.b32 %r11327, %r11326, %r11326, 20; + add.s32 %r11328, %r11322, %r10783; + add.s32 %r11329, %r11328, %r11327; + xor.b32 %r11330, %r11329, %r11324; + shf.l.wrap.b32 %r11331, %r11330, %r11330, 24; + add.s32 %r11332, %r11331, %r11325; + xor.b32 %r11333, %r11332, %r11327; + shf.l.wrap.b32 %r11334, %r11333, %r11333, 25; + add.s32 %r11335, %r11306, %r10776; + add.s32 %r11336, %r11335, %r11287; + xor.b32 %r11337, %r11336, %r11331; + shf.l.wrap.b32 %r11338, %r11337, %r11337, 16; + add.s32 %r11339, %r11338, %r11318; + xor.b32 %r11340, %r11339, %r11306; + shf.l.wrap.b32 %r11341, %r11340, %r11340, 20; + add.s32 %r11342, %r11336, %r10741; + add.s32 %r11343, %r11342, %r11341; + xor.b32 %r11344, %r11343, %r11338; + shf.l.wrap.b32 %r11345, %r11344, %r11344, 24; + add.s32 %r11346, %r11345, %r11339; + xor.b32 %r11347, %r11346, %r11341; + shf.l.wrap.b32 %r11348, %r11347, %r11347, 25; + add.s32 %r11349, %r11301, %r10762; + add.s32 %r11350, %r11349, %r11320; + xor.b32 %r11351, %r11289, %r11350; + shf.l.wrap.b32 %r11352, %r11351, %r11351, 16; + add.s32 %r11353, %r11352, %r11332; + xor.b32 %r11354, %r11353, %r11320; + shf.l.wrap.b32 %r11355, %r11354, %r11354, 20; + add.s32 %r11356, %r11350, %r10748; + add.s32 %r11357, %r11356, %r11355; + xor.b32 %r11358, %r11357, %r11352; + shf.l.wrap.b32 %r11359, %r11358, %r11358, 24; + add.s32 %r11360, %r11359, %r11353; + xor.b32 %r11361, %r11360, %r11355; + shf.l.wrap.b32 %r11362, %r11361, %r11361, 25; + add.s32 %r11363, %r11315, %r10727; + add.s32 %r11364, %r11363, %r11334; + xor.b32 %r11365, %r11364, %r11303; + shf.l.wrap.b32 %r11366, %r11365, %r11365, 16; + add.s32 %r11367, %r11366, %r11290; + xor.b32 %r11368, %r11367, %r11334; + shf.l.wrap.b32 %r11369, %r11368, %r11368, 20; + add.s32 %r11370, %r11364, %r10734; + add.s32 %r11371, %r11370, %r11369; + xor.b32 %r11372, %r11371, %r11366; + shf.l.wrap.b32 %r11373, %r11372, %r11372, 24; + add.s32 %r11374, %r11373, %r11367; + xor.b32 %r11375, %r11374, %r11369; + shf.l.wrap.b32 %r11376, %r11375, %r11375, 25; + add.s32 %r11377, %r11329, %r10769; + add.s32 %r11378, %r11377, %r11292; + xor.b32 %r11379, %r11378, %r11317; + shf.l.wrap.b32 %r11380, %r11379, %r11379, 16; + add.s32 %r11381, %r11380, %r11304; + xor.b32 %r11382, %r11381, %r11292; + shf.l.wrap.b32 %r11383, %r11382, %r11382, 20; + add.s32 %r11384, %r11378, %r10755; + add.s32 %r11385, %r11384, %r11383; + xor.b32 %r11386, %r11385, %r11380; + shf.l.wrap.b32 %r11387, %r11386, %r11386, 24; + add.s32 %r11388, %r11387, %r11381; + xor.b32 %r11389, %r11388, %r11383; + shf.l.wrap.b32 %r11390, %r11389, %r11389, 25; + 
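+ // Annotation (inferred): the remaining unrolled rounds of the parent
+ // compression continue below, cycling the message-word registers
+ // (%r10727..%r10832) through the same per-round permutation as the chunk
+ // loop above.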
add.s32 %r11391, %r11343, %r10790; + add.s32 %r11392, %r11391, %r11390; + xor.b32 %r11393, %r11392, %r11359; + shf.l.wrap.b32 %r11394, %r11393, %r11393, 16; + add.s32 %r11395, %r11394, %r11374; + xor.b32 %r11396, %r11395, %r11390; + shf.l.wrap.b32 %r11397, %r11396, %r11396, 20; + add.s32 %r11398, %r11392, %r10825; + add.s32 %r11399, %r11398, %r11397; + xor.b32 %r11400, %r11399, %r11394; + shf.l.wrap.b32 %r11401, %r11400, %r11400, 24; + add.s32 %r11402, %r11401, %r11395; + xor.b32 %r11403, %r11402, %r11397; + shf.l.wrap.b32 %r11404, %r11403, %r11403, 25; + add.s32 %r11405, %r11357, %r10804; + add.s32 %r11406, %r11405, %r11348; + xor.b32 %r11407, %r11406, %r11373; + shf.l.wrap.b32 %r11408, %r11407, %r11407, 16; + add.s32 %r11409, %r11408, %r11388; + xor.b32 %r11410, %r11409, %r11348; + shf.l.wrap.b32 %r11411, %r11410, %r11410, 20; + add.s32 %r11412, %r11406, %r10762; + add.s32 %r11413, %r11412, %r11411; + xor.b32 %r11414, %r11413, %r11408; + shf.l.wrap.b32 %r11415, %r11414, %r11414, 24; + add.s32 %r11416, %r11415, %r11409; + xor.b32 %r11417, %r11416, %r11411; + shf.l.wrap.b32 %r11418, %r11417, %r11417, 25; + add.s32 %r11419, %r11371, %r10783; + add.s32 %r11420, %r11419, %r11362; + xor.b32 %r11421, %r11387, %r11420; + shf.l.wrap.b32 %r11422, %r11421, %r11421, 16; + add.s32 %r11423, %r11422, %r11346; + xor.b32 %r11424, %r11423, %r11362; + shf.l.wrap.b32 %r11425, %r11424, %r11424, 20; + add.s32 %r11426, %r11420, %r10811; + add.s32 %r11427, %r11426, %r11425; + xor.b32 %r11428, %r11427, %r11422; + shf.l.wrap.b32 %r11429, %r11428, %r11428, 24; + add.s32 %r11430, %r11429, %r11423; + xor.b32 %r11431, %r11430, %r11425; + shf.l.wrap.b32 %r11432, %r11431, %r11431, 25; + add.s32 %r11433, %r11376, %r10832; + add.s32 %r11434, %r11433, %r11385; + xor.b32 %r11435, %r11434, %r11345; + shf.l.wrap.b32 %r11436, %r11435, %r11435, 16; + add.s32 %r11437, %r11436, %r11360; + xor.b32 %r11438, %r11437, %r11376; + shf.l.wrap.b32 %r11439, %r11438, %r11438, 20; + add.s32 %r11440, %r11434, %r10734; + add.s32 %r11441, %r11440, %r11439; + xor.b32 %r11442, %r11441, %r11436; + shf.l.wrap.b32 %r11443, %r11442, %r11442, 24; + add.s32 %r11444, %r11443, %r11437; + xor.b32 %r11445, %r11444, %r11439; + shf.l.wrap.b32 %r11446, %r11445, %r11445, 25; + add.s32 %r11447, %r11418, %r10818; + add.s32 %r11448, %r11447, %r11399; + xor.b32 %r11449, %r11448, %r11443; + shf.l.wrap.b32 %r11450, %r11449, %r11449, 16; + add.s32 %r11451, %r11450, %r11430; + xor.b32 %r11452, %r11451, %r11418; + shf.l.wrap.b32 %r11453, %r11452, %r11452, 20; + add.s32 %r11454, %r11448, %r10748; + add.s32 %r11455, %r11454, %r11453; + xor.b32 %r11456, %r11455, %r11450; + shf.l.wrap.b32 %r11457, %r11456, %r11456, 24; + add.s32 %r11458, %r11457, %r11451; + xor.b32 %r11459, %r11458, %r11453; + shf.l.wrap.b32 %r11460, %r11459, %r11459, 25; + add.s32 %r11461, %r11413, %r10727; + add.s32 %r11462, %r11461, %r11432; + xor.b32 %r11463, %r11401, %r11462; + shf.l.wrap.b32 %r11464, %r11463, %r11463, 16; + add.s32 %r11465, %r11464, %r11444; + xor.b32 %r11466, %r11465, %r11432; + shf.l.wrap.b32 %r11467, %r11466, %r11466, 20; + add.s32 %r11468, %r11462, %r10797; + add.s32 %r11469, %r11468, %r11467; + xor.b32 %r11470, %r11469, %r11464; + shf.l.wrap.b32 %r11471, %r11470, %r11470, 24; + add.s32 %r11472, %r11471, %r11465; + xor.b32 %r11473, %r11472, %r11467; + shf.l.wrap.b32 %r11474, %r11473, %r11473, 25; + add.s32 %r11475, %r11427, %r10741; + add.s32 %r11476, %r11475, %r11446; + xor.b32 %r11477, %r11476, %r11415; + shf.l.wrap.b32 %r11478, %r11477, %r11477, 16; + add.s32 %r11479, 
%r11478, %r11402; + xor.b32 %r11480, %r11479, %r11446; + shf.l.wrap.b32 %r11481, %r11480, %r11480, 20; + add.s32 %r11482, %r11476, %r10769; + add.s32 %r11483, %r11482, %r11481; + xor.b32 %r11484, %r11483, %r11478; + shf.l.wrap.b32 %r11485, %r11484, %r11484, 24; + add.s32 %r11486, %r11485, %r11479; + xor.b32 %r11487, %r11486, %r11481; + shf.l.wrap.b32 %r11488, %r11487, %r11487, 25; + add.s32 %r11489, %r11441, %r10755; + add.s32 %r11490, %r11489, %r11404; + xor.b32 %r11491, %r11490, %r11429; + shf.l.wrap.b32 %r11492, %r11491, %r11491, 16; + add.s32 %r11493, %r11492, %r11416; + xor.b32 %r11494, %r11493, %r11404; + shf.l.wrap.b32 %r11495, %r11494, %r11494, 20; + add.s32 %r11496, %r11490, %r10776; + add.s32 %r11497, %r11496, %r11495; + xor.b32 %r11498, %r11497, %r11492; + shf.l.wrap.b32 %r11499, %r11498, %r11498, 24; + add.s32 %r11500, %r11499, %r11493; + xor.b32 %r11501, %r11500, %r11495; + shf.l.wrap.b32 %r11502, %r11501, %r11501, 25; + add.s32 %r11503, %r11455, %r10804; + add.s32 %r11504, %r11503, %r11502; + xor.b32 %r11505, %r11504, %r11471; + shf.l.wrap.b32 %r11506, %r11505, %r11505, 16; + add.s32 %r11507, %r11506, %r11486; + xor.b32 %r11508, %r11507, %r11502; + shf.l.wrap.b32 %r11509, %r11508, %r11508, 20; + add.s32 %r11510, %r11504, %r10832; + add.s32 %r11511, %r11510, %r11509; + xor.b32 %r11512, %r11511, %r11506; + shf.l.wrap.b32 %r11513, %r11512, %r11512, 24; + add.s32 %r11514, %r11513, %r11507; + xor.b32 %r11515, %r11514, %r11509; + shf.l.wrap.b32 %r11516, %r11515, %r11515, 25; + add.s32 %r11517, %r11469, %r10762; + add.s32 %r11518, %r11517, %r11460; + xor.b32 %r11519, %r11518, %r11485; + shf.l.wrap.b32 %r11520, %r11519, %r11519, 16; + add.s32 %r11521, %r11520, %r11500; + xor.b32 %r11522, %r11521, %r11460; + shf.l.wrap.b32 %r11523, %r11522, %r11522, 20; + add.s32 %r11524, %r11518, %r10727; + add.s32 %r11525, %r11524, %r11523; + xor.b32 %r11526, %r11525, %r11520; + shf.l.wrap.b32 %r11527, %r11526, %r11526, 24; + add.s32 %r11528, %r11527, %r11521; + xor.b32 %r11529, %r11528, %r11523; + shf.l.wrap.b32 %r11530, %r11529, %r11529, 25; + add.s32 %r11531, %r11483, %r10734; + add.s32 %r11532, %r11531, %r11474; + xor.b32 %r11533, %r11499, %r11532; + shf.l.wrap.b32 %r11534, %r11533, %r11533, 16; + add.s32 %r11535, %r11534, %r11458; + xor.b32 %r11536, %r11535, %r11474; + shf.l.wrap.b32 %r11537, %r11536, %r11536, 20; + add.s32 %r11538, %r11532, %r10790; + add.s32 %r11539, %r11538, %r11537; + xor.b32 %r11540, %r11539, %r11534; + shf.l.wrap.b32 %r11541, %r11540, %r11540, 24; + add.s32 %r11542, %r11541, %r11535; + xor.b32 %r11543, %r11542, %r11537; + shf.l.wrap.b32 %r11544, %r11543, %r11543, 25; + add.s32 %r11545, %r11488, %r10783; + add.s32 %r11546, %r11545, %r11497; + xor.b32 %r11547, %r11546, %r11457; + shf.l.wrap.b32 %r11548, %r11547, %r11547, 16; + add.s32 %r11549, %r11548, %r11472; + xor.b32 %r11550, %r11549, %r11488; + shf.l.wrap.b32 %r11551, %r11550, %r11550, 20; + add.s32 %r11552, %r11546, %r10769; + add.s32 %r11553, %r11552, %r11551; + xor.b32 %r11554, %r11553, %r11548; + shf.l.wrap.b32 %r11555, %r11554, %r11554, 24; + add.s32 %r11556, %r11555, %r11549; + xor.b32 %r11557, %r11556, %r11551; + shf.l.wrap.b32 %r11558, %r11557, %r11557, 25; + add.s32 %r11559, %r11530, %r10825; + add.s32 %r11560, %r11559, %r11511; + xor.b32 %r11561, %r11560, %r11555; + shf.l.wrap.b32 %r11562, %r11561, %r11561, 16; + add.s32 %r11563, %r11562, %r11542; + xor.b32 %r11564, %r11563, %r11530; + shf.l.wrap.b32 %r11565, %r11564, %r11564, 20; + add.s32 %r11566, %r11560, %r10797; + add.s32 %r11567, %r11566, %r11565; + 
xor.b32 %r11568, %r11567, %r11562; + shf.l.wrap.b32 %r11569, %r11568, %r11568, 24; + add.s32 %r11570, %r11569, %r11563; + xor.b32 %r11571, %r11570, %r11565; + shf.l.wrap.b32 %r11572, %r11571, %r11571, 25; + add.s32 %r11573, %r11525, %r10741; + add.s32 %r11574, %r11573, %r11544; + xor.b32 %r11575, %r11513, %r11574; + shf.l.wrap.b32 %r11576, %r11575, %r11575, 16; + add.s32 %r11577, %r11576, %r11556; + xor.b32 %r11578, %r11577, %r11544; + shf.l.wrap.b32 %r11579, %r11578, %r11578, 20; + add.s32 %r11580, %r11574, %r10811; + add.s32 %r11581, %r11580, %r11579; + xor.b32 %r11582, %r11581, %r11576; + shf.l.wrap.b32 %r11583, %r11582, %r11582, 24; + add.s32 %r11584, %r11583, %r11577; + xor.b32 %r11585, %r11584, %r11579; + shf.l.wrap.b32 %r11586, %r11585, %r11585, 25; + add.s32 %r11587, %r11539, %r10748; + add.s32 %r11588, %r11587, %r11558; + xor.b32 %r11589, %r11588, %r11527; + shf.l.wrap.b32 %r11590, %r11589, %r11589, 16; + add.s32 %r11591, %r11590, %r11514; + xor.b32 %r11592, %r11591, %r11558; + shf.l.wrap.b32 %r11593, %r11592, %r11592, 20; + add.s32 %r11594, %r11588, %r10755; + add.s32 %r11595, %r11594, %r11593; + xor.b32 %r11596, %r11595, %r11590; + shf.l.wrap.b32 %r11597, %r11596, %r11596, 24; + add.s32 %r11598, %r11597, %r11591; + xor.b32 %r11599, %r11598, %r11593; + shf.l.wrap.b32 %r11600, %r11599, %r11599, 25; + add.s32 %r11601, %r11553, %r10776; + add.s32 %r11602, %r11601, %r11516; + xor.b32 %r11603, %r11602, %r11541; + shf.l.wrap.b32 %r11604, %r11603, %r11603, 16; + add.s32 %r11605, %r11604, %r11528; + xor.b32 %r11606, %r11605, %r11516; + shf.l.wrap.b32 %r11607, %r11606, %r11606, 20; + add.s32 %r11608, %r11602, %r10818; + add.s32 %r11609, %r11608, %r11607; + xor.b32 %r11610, %r11609, %r11604; + shf.l.wrap.b32 %r11611, %r11610, %r11610, 24; + add.s32 %r11612, %r11611, %r11605; + xor.b32 %r11613, %r11612, %r11607; + shf.l.wrap.b32 %r11614, %r11613, %r11613, 25; + xor.b32 %r11615, %r11598, %r11567; + xor.b32 %r11616, %r11612, %r11581; + xor.b32 %r11617, %r11570, %r11595; + xor.b32 %r11618, %r11609, %r11584; + xor.b32 %r11619, %r11614, %r11583; + xor.b32 %r11620, %r11572, %r11597; + xor.b32 %r11621, %r11611, %r11586; + xor.b32 %r11622, %r11600, %r11569; + st.local.u8 [%rd220+145], %r11615; + shr.u32 %r11623, %r11615, 8; + st.local.u8 [%rd220+146], %r11623; + shr.u32 %r11624, %r11615, 16; + st.local.u8 [%rd220+147], %r11624; + shr.u32 %r11625, %r11615, 24; + st.local.u8 [%rd220+148], %r11625; + st.local.u8 [%rd220+149], %r11616; + shr.u32 %r11626, %r11616, 8; + st.local.u8 [%rd220+150], %r11626; + shr.u32 %r11627, %r11616, 16; + st.local.u8 [%rd220+151], %r11627; + shr.u32 %r11628, %r11616, 24; + st.local.u8 [%rd220+152], %r11628; + st.local.u8 [%rd220+153], %r11617; + shr.u32 %r11629, %r11617, 8; + st.local.u8 [%rd220+154], %r11629; + shr.u32 %r11630, %r11617, 16; + st.local.u8 [%rd220+155], %r11630; + shr.u32 %r11631, %r11617, 24; + st.local.u8 [%rd220+156], %r11631; + st.local.u8 [%rd220+157], %r11618; + shr.u32 %r11632, %r11618, 8; + st.local.u8 [%rd220+158], %r11632; + shr.u32 %r11633, %r11618, 16; + st.local.u8 [%rd220+159], %r11633; + shr.u32 %r11634, %r11618, 24; + st.local.u8 [%rd220+160], %r11634; + st.local.u8 [%rd220+161], %r11619; + shr.u32 %r11635, %r11619, 8; + st.local.u8 [%rd220+162], %r11635; + shr.u32 %r11636, %r11619, 16; + st.local.u8 [%rd220+163], %r11636; + shr.u32 %r11637, %r11619, 24; + st.local.u8 [%rd220+164], %r11637; + st.local.u8 [%rd220+165], %r11620; + shr.u32 %r11638, %r11620, 8; + st.local.u8 [%rd220+166], %r11638; + shr.u32 %r11639, %r11620, 16; + st.local.u8 
[%rd220+167], %r11639; + shr.u32 %r11640, %r11620, 24; + st.local.u8 [%rd220+168], %r11640; + st.local.u8 [%rd220+169], %r11621; + shr.u32 %r11641, %r11621, 8; + st.local.u8 [%rd220+170], %r11641; + shr.u32 %r11642, %r11621, 16; + st.local.u8 [%rd220+171], %r11642; + shr.u32 %r11643, %r11621, 24; + st.local.u8 [%rd220+172], %r11643; + st.local.u8 [%rd220+173], %r11622; + shr.u32 %r11644, %r11622, 8; + st.local.u8 [%rd220+174], %r11644; + shr.u32 %r11645, %r11622, 16; + st.local.u8 [%rd220+175], %r11645; + shr.u32 %r11646, %r11622, 24; + st.local.u8 [%rd220+176], %r11646; + add.s16 %rs392, %rs392, -1; + cvt.u64.u16 %rd221, %rs392; + and.b64 %rd222, %rd221, 255; + setp.lt.u64 %p53, %rd97, %rd222; + @%p53 bra $L__BB1_66; + + ld.param.u64 %rd233, [_Z20blake3_hasher_updateP13blake3_hasherPKvy_param_0]; + cvta.to.local.u64 %rd232, %rd233; + add.s64 %rd231, %rd232, 136; + st.local.u8 [%rd231+8], %rs392; + +$L__BB1_68: + ret; + +} + // .globl heavy_hash +.visible .entry heavy_hash( + .param .u64 heavy_hash_param_0, + .param .u64 heavy_hash_param_1, + .param .u64 heavy_hash_param_2, + .param .u8 heavy_hash_param_3, + .param .u64 heavy_hash_param_4, + .param .u64 heavy_hash_param_5, + .param .u64 heavy_hash_param_6, + .param .u64 heavy_hash_param_7 +) +{ + .local .align 16 .b8 __local_depot2[2096]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<64>; + .reg .b16 %rs<866>; + .reg .b32 %r<30985>; + .reg .b64 %rd<1282>; + + + mov.u64 %SPL, __local_depot2; + cvta.local.u64 %SP, %SPL; + ld.param.u8 %rs409, [heavy_hash_param_3]; + ld.param.u64 %rd349, [heavy_hash_param_0]; + ld.param.u64 %rd350, [heavy_hash_param_1]; + ld.param.u64 %rd351, [heavy_hash_param_2]; + ld.param.u64 %rd355, [heavy_hash_param_4]; + ld.param.u64 %rd352, [heavy_hash_param_5]; + ld.param.u64 %rd353, [heavy_hash_param_6]; + ld.param.u64 %rd354, [heavy_hash_param_7]; + cvta.to.global.u64 %rd1254, %rd353; + cvta.to.global.u64 %rd2, %rd355; + add.u64 %rd356, %SP, 0; + add.u64 %rd3, %SPL, 0; + add.u64 %rd4, %SPL, 2016; + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %tid.x; + or.b32 %r5023, %r2, %r1; + setp.ne.s32 %p6, %r5023, 0; + @%p6 bra $L__BB2_8; + + add.u64 %rd358, %SP, 2000; + add.u64 %rd359, %SPL, 2000; + mov.u32 %r29535, 0; + mov.u64 %rd360, 0; + st.local.v2.u32 [%rd359], {%r2, %r1}; + mov.u64 %rd361, $str$2; + cvta.global.u64 %rd362, %rd361; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd362; + .param .b64 param1; + st.param.b64 [param1+0], %rd358; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5025, [retval0+0]; + } // callseq 3 + mov.u64 %rd363, $str$3; + cvta.global.u64 %rd364, %rd363; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd364; + .param .b64 param1; + st.param.b64 [param1+0], %rd360; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5026, [retval0+0]; + } // callseq 4 + mov.u64 %rd1252, %rd1254; + +$L__BB2_2: + ld.global.u8 %r5027, [%rd1252+1280]; + st.local.u32 [%rd3], %r5027; + mov.u64 %rd368, $str; + cvta.global.u64 %rd369, %rd368; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5028, [retval0+0]; + } // callseq 5 + add.s64 %rd1252, %rd1252, 1; + add.s32 %r29535, %r29535, 1; + setp.lt.u32 %p7, 
%r29535, 128; + @%p7 bra $L__BB2_2; + + mov.u64 %rd371, $str$1; + cvta.global.u64 %rd372, %rd371; + mov.u32 %r29536, 0; + mov.u64 %rd373, 0; + { // callseq 6, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5030, [retval0+0]; + } // callseq 6 + mov.u64 %rd374, $str$4; + cvta.global.u64 %rd375, %rd374; + { // callseq 7, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd375; + .param .b64 param1; + st.param.b64 [param1+0], %rd373; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5031, [retval0+0]; + } // callseq 7 + mov.u64 %rd1253, %rd1254; + +$L__BB2_4: + ld.global.u8 %r5032, [%rd1253+5376]; + st.local.u32 [%rd3], %r5032; + { // callseq 8, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5033, [retval0+0]; + } // callseq 8 + add.s64 %rd1253, %rd1253, 1; + add.s32 %r29536, %r29536, 1; + setp.lt.u32 %p8, %r29536, 128; + @%p8 bra $L__BB2_4; + + mov.u32 %r29537, 0; + mov.u64 %rd381, 0; + { // callseq 9, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5035, [retval0+0]; + } // callseq 9 + mov.u64 %rd382, $str$5; + cvta.global.u64 %rd383, %rd382; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd383; + .param .b64 param1; + st.param.b64 [param1+0], %rd381; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5036, [retval0+0]; + } // callseq 10 + +$L__BB2_6: + ld.global.u8 %r5037, [%rd1254+1580160]; + st.local.u32 [%rd3], %r5037; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd369; + .param .b64 param1; + st.param.b64 [param1+0], %rd356; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5038, [retval0+0]; + } // callseq 11 + add.s64 %rd1254, %rd1254, 1; + add.s32 %r29537, %r29537, 1; + setp.lt.u32 %p9, %r29537, 128; + @%p9 bra $L__BB2_6; + + mov.u64 %rd389, 0; + { // callseq 12, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd372; + .param .b64 param1; + st.param.b64 [param1+0], %rd389; + .param .b32 retval0; + call.uni (retval0), + vprintf, + ( + param0, + param1 + ); + ld.param.b32 %r5039, [retval0+0]; + } // callseq 12 + +$L__BB2_8: + mov.u32 %r5040, %ntid.x; + mad.lo.s32 %r5041, %r1, %r5040, %r2; + cvt.s64.s32 %rd14, %r5041; + setp.ge.u64 %p10, %rd14, %rd351; + @%p10 bra $L__BB2_113; + + cvt.u32.u64 %r5042, %rd14; + setp.ne.s32 %p11, %r5042, 0; + @%p11 bra $L__BB2_11; + + cvta.to.global.u64 %rd390, %rd352; + mov.u64 %rd391, 0; + st.global.u64 [%rd390], %rd391; + +$L__BB2_11: + setp.eq.s16 %p12, %rs409, 0; + @%p12 bra $L__BB2_13; + + shl.b64 %rd392, %rd14, 5; + add.s64 %rd393, %rd2, %rd392; + ld.global.v2.u64 {%rd394, %rd395}, [%rd393]; + mul.lo.s64 %rd398, %rd395, 5; + { + .reg .b64 %lhs; + .reg .b64 %rhs; + shl.b64 %lhs, %rd398, 7; + shr.b64 %rhs, %rd398, 57; + add.u64 %rd399, %lhs, %rhs; + } + 
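+ // Per-thread nonce derivation: the multiply by 5 and rotate-left by 7 above
+ // (shl.b64 7 / shr.b64 57 / add.u64), followed by the multiply by 9 below, are the
+ // xoshiro256** output scrambler, rotl64(s[1] * 5, 7) * 9; the shl 17, the xor
+ // network and the rotate by 45 (shf.r.wrap 19) that follow are its state update.
+ // This matches the xoshiro256_jump / xoshiro256_long_jump tables declared at the
+ // top of this module.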
mul.lo.s64 %rd1255, %rd399, 9; + shl.b64 %rd400, %rd395, 17; + ld.global.v2.u64 {%rd401, %rd402}, [%rd393+16]; + xor.b64 %rd405, %rd401, %rd394; + xor.b64 %rd406, %rd402, %rd395; + xor.b64 %rd407, %rd395, %rd405; + xor.b64 %rd408, %rd394, %rd406; + st.global.v2.u64 [%rd393], {%rd408, %rd407}; + { + .reg .b32 %dummy; + mov.b64 {%r5043,%dummy}, %rd406; + } + { + .reg .b32 %dummy; + mov.b64 {%dummy,%r5044}, %rd406; + } + shf.r.wrap.b32 %r5045, %r5044, %r5043, 19; + shf.r.wrap.b32 %r5046, %r5043, %r5044, 19; + mov.b64 %rd409, {%r5046, %r5045}; + xor.b64 %rd410, %rd405, %rd400; + st.global.v2.u64 [%rd393+16], {%rd410, %rd409}; + bra.uni $L__BB2_14; + +$L__BB2_13: + ld.global.u64 %rd411, [%rd2]; + xor.b64 %rd1255, %rd411, %rd14; + +$L__BB2_14: + and.b64 %rd413, %rd1255, %rd349; + or.b64 %rd18, %rd413, %rd350; + mov.u64 %rd1256, 0; + mov.u64 %rd414, hash_header; + +$L__BB2_15: + add.s64 %rd415, %rd414, %rd1256; + ld.const.u8 %rs410, [%rd415]; + add.s64 %rd416, %rd4, %rd1256; + st.local.u8 [%rd416], %rs410; + add.s64 %rd1256, %rd1256, 1; + setp.lt.u64 %p13, %rd1256, 72; + @%p13 bra $L__BB2_15; + + mov.u64 %rd417, 0; + st.local.u64 [%rd4+72], %rd18; + mov.u32 %r5047, -1150833019; + mov.u32 %r5048, 1779033703; + st.local.v2.u32 [%rd3], {%r5048, %r5047}; + mov.u32 %r5049, -1521486534; + mov.u32 %r5050, 1013904242; + st.local.v2.u32 [%rd3+8], {%r5050, %r5049}; + mov.u32 %r5051, -1694144372; + mov.u32 %r5052, 1359893119; + st.local.v2.u32 [%rd3+16], {%r5052, %r5051}; + mov.u32 %r5053, 1541459225; + mov.u32 %r5054, 528734635; + st.local.v2.u32 [%rd3+24], {%r5054, %r5053}; + st.local.v2.u32 [%rd3+32], {%r5048, %r5047}; + st.local.v2.u32 [%rd3+40], {%r5050, %r5049}; + st.local.v2.u32 [%rd3+48], {%r5052, %r5051}; + st.local.v2.u32 [%rd3+56], {%r5054, %r5053}; + st.local.u64 [%rd3+64], %rd417; + mov.u32 %r5055, 0; + st.local.v2.u32 [%rd3+72], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+80], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+88], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+96], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+104], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+112], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+120], {%r5055, %r5055}; + st.local.v2.u32 [%rd3+128], {%r5055, %r5055}; + mov.u16 %rs411, 0; + st.local.v2.u8 [%rd3+136], {%rs411, %rs411}; + st.local.u8 [%rd3+138], %rs411; + st.local.u8 [%rd3+144], %rs411; + ld.local.v4.u8 {%rs412, %rs413, %rs414, %rs415}, [%rd3+136]; + setp.eq.s16 %p14, %rs413, 0; + selp.u16 %rs419, 1, 0, %p14; + or.b16 %rs420, %rs414, %rs419; + ld.local.v4.u32 {%r5056, %r5057, %r5058, %r5059}, [%rd4]; + mov.b32 {%rs421, %rs422}, %r5056; + shr.u16 %rs423, %rs421, 8; + shr.u16 %rs424, %rs422, 8; + mov.b32 {%rs425, %rs426}, %r5057; + shr.u16 %rs427, %rs425, 8; + shr.u16 %rs428, %rs426, 8; + mov.b32 {%rs429, %rs430}, %r5058; + shr.u16 %rs431, %rs429, 8; + shr.u16 %rs432, %rs430, 8; + mov.b32 {%rs433, %rs434}, %r5059; + shr.u16 %rs435, %rs433, 8; + shr.u16 %rs436, %rs434, 8; + cvt.u32.u16 %r5064, %rs421; + and.b32 %r5065, %r5064, 255; + cvt.u32.u16 %r5066, %rs423; + prmt.b32 %r5067, %r5066, %r5065, 30212; + cvt.u32.u16 %r5068, %rs422; + prmt.b32 %r5069, %r5068, %r5067, 28756; + cvt.u32.u16 %r5070, %rs424; + prmt.b32 %r5071, %r5070, %r5069, 1620; + cvt.u32.u16 %r5072, %rs425; + and.b32 %r5073, %r5072, 255; + cvt.u32.u16 %r5074, %rs427; + prmt.b32 %r5075, %r5074, %r5073, 30212; + cvt.u32.u16 %r5076, %rs426; + prmt.b32 %r5077, %r5076, %r5075, 28756; + cvt.u32.u16 %r5078, %rs428; + prmt.b32 %r5079, %r5078, %r5077, 1620; + cvt.u32.u16 %r5080, %rs429; + and.b32 %r5081, %r5080, 255; + 
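+ // The constants stored into the local state above (1779033703 = 0x6A09E667,
+ // -1150833019 = 0xBB67AE85, ...) are the BLAKE3 IV words; the 8-byte nonce is
+ // appended at offset 72 after the 72-byte hash_header copy. The cvt.u32.u16 /
+ // prmt.b32 runs below re-pack bytes loaded as u16 halves into little-endian 32-bit
+ // message words; the prmt selectors 30212 / 28756 / 1620 are byte-lane permutation
+ // masks, not data.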
cvt.u32.u16 %r5082, %rs431; + prmt.b32 %r5083, %r5082, %r5081, 30212; + cvt.u32.u16 %r5084, %rs430; + prmt.b32 %r5085, %r5084, %r5083, 28756; + cvt.u32.u16 %r5086, %rs432; + prmt.b32 %r5087, %r5086, %r5085, 1620; + cvt.u32.u16 %r5088, %rs433; + and.b32 %r5089, %r5088, 255; + cvt.u32.u16 %r5090, %rs435; + prmt.b32 %r5091, %r5090, %r5089, 30212; + cvt.u32.u16 %r5092, %rs434; + prmt.b32 %r5093, %r5092, %r5091, 28756; + cvt.u32.u16 %r5094, %rs436; + prmt.b32 %r5095, %r5094, %r5093, 1620; + ld.local.v4.u32 {%r5096, %r5097, %r5098, %r5099}, [%rd4+16]; + mov.b32 {%rs437, %rs438}, %r5096; + shr.u16 %rs439, %rs437, 8; + shr.u16 %rs440, %rs438, 8; + mov.b32 {%rs441, %rs442}, %r5097; + shr.u16 %rs443, %rs441, 8; + shr.u16 %rs444, %rs442, 8; + mov.b32 {%rs445, %rs446}, %r5098; + shr.u16 %rs447, %rs445, 8; + shr.u16 %rs448, %rs446, 8; + mov.b32 {%rs449, %rs450}, %r5099; + shr.u16 %rs451, %rs449, 8; + shr.u16 %rs452, %rs450, 8; + cvt.u32.u16 %r5104, %rs437; + and.b32 %r5105, %r5104, 255; + cvt.u32.u16 %r5106, %rs439; + prmt.b32 %r5107, %r5106, %r5105, 30212; + cvt.u32.u16 %r5108, %rs438; + prmt.b32 %r5109, %r5108, %r5107, 28756; + cvt.u32.u16 %r5110, %rs440; + prmt.b32 %r5111, %r5110, %r5109, 1620; + cvt.u32.u16 %r5112, %rs441; + and.b32 %r5113, %r5112, 255; + cvt.u32.u16 %r5114, %rs443; + prmt.b32 %r5115, %r5114, %r5113, 30212; + cvt.u32.u16 %r5116, %rs442; + prmt.b32 %r5117, %r5116, %r5115, 28756; + cvt.u32.u16 %r5118, %rs444; + prmt.b32 %r5119, %r5118, %r5117, 1620; + cvt.u32.u16 %r5120, %rs445; + and.b32 %r5121, %r5120, 255; + cvt.u32.u16 %r5122, %rs447; + prmt.b32 %r5123, %r5122, %r5121, 30212; + cvt.u32.u16 %r5124, %rs446; + prmt.b32 %r5125, %r5124, %r5123, 28756; + cvt.u32.u16 %r5126, %rs448; + prmt.b32 %r5127, %r5126, %r5125, 1620; + cvt.u32.u16 %r5128, %rs449; + and.b32 %r5129, %r5128, 255; + cvt.u32.u16 %r5130, %rs451; + prmt.b32 %r5131, %r5130, %r5129, 30212; + cvt.u32.u16 %r5132, %rs450; + prmt.b32 %r5133, %r5132, %r5131, 28756; + cvt.u32.u16 %r5134, %rs452; + prmt.b32 %r5135, %r5134, %r5133, 1620; + ld.local.v4.u32 {%r5136, %r5137, %r5138, %r5139}, [%rd4+32]; + mov.b32 {%rs453, %rs454}, %r5136; + shr.u16 %rs455, %rs453, 8; + shr.u16 %rs456, %rs454, 8; + mov.b32 {%rs457, %rs458}, %r5137; + shr.u16 %rs459, %rs457, 8; + shr.u16 %rs460, %rs458, 8; + mov.b32 {%rs461, %rs462}, %r5138; + shr.u16 %rs463, %rs461, 8; + shr.u16 %rs464, %rs462, 8; + mov.b32 {%rs465, %rs466}, %r5139; + shr.u16 %rs467, %rs465, 8; + shr.u16 %rs468, %rs466, 8; + cvt.u32.u16 %r5144, %rs453; + and.b32 %r5145, %r5144, 255; + cvt.u32.u16 %r5146, %rs455; + prmt.b32 %r5147, %r5146, %r5145, 30212; + cvt.u32.u16 %r5148, %rs454; + prmt.b32 %r5149, %r5148, %r5147, 28756; + cvt.u32.u16 %r5150, %rs456; + prmt.b32 %r5151, %r5150, %r5149, 1620; + cvt.u32.u16 %r5152, %rs457; + and.b32 %r5153, %r5152, 255; + cvt.u32.u16 %r5154, %rs459; + prmt.b32 %r5155, %r5154, %r5153, 30212; + cvt.u32.u16 %r5156, %rs458; + prmt.b32 %r5157, %r5156, %r5155, 28756; + cvt.u32.u16 %r5158, %rs460; + prmt.b32 %r5159, %r5158, %r5157, 1620; + cvt.u32.u16 %r5160, %rs461; + and.b32 %r5161, %r5160, 255; + cvt.u32.u16 %r5162, %rs463; + prmt.b32 %r5163, %r5162, %r5161, 30212; + cvt.u32.u16 %r5164, %rs462; + prmt.b32 %r5165, %r5164, %r5163, 28756; + cvt.u32.u16 %r5166, %rs464; + prmt.b32 %r5167, %r5166, %r5165, 1620; + cvt.u32.u16 %r5168, %rs465; + and.b32 %r5169, %r5168, 255; + cvt.u32.u16 %r5170, %rs467; + prmt.b32 %r5171, %r5170, %r5169, 30212; + cvt.u32.u16 %r5172, %rs466; + prmt.b32 %r5173, %r5172, %r5171, 28756; + cvt.u32.u16 %r5174, %rs468; + prmt.b32 %r5175, 
%r5174, %r5173, 1620; + ld.local.v4.u32 {%r5176, %r5177, %r5178, %r5179}, [%rd4+48]; + mov.b32 {%rs469, %rs470}, %r5176; + shr.u16 %rs471, %rs469, 8; + shr.u16 %rs472, %rs470, 8; + mov.b32 {%rs473, %rs474}, %r5177; + shr.u16 %rs475, %rs473, 8; + shr.u16 %rs476, %rs474, 8; + mov.b32 {%rs477, %rs478}, %r5178; + shr.u16 %rs479, %rs477, 8; + shr.u16 %rs480, %rs478, 8; + mov.b32 {%rs481, %rs482}, %r5179; + shr.u16 %rs483, %rs481, 8; + shr.u16 %rs484, %rs482, 8; + cvt.u32.u16 %r5184, %rs469; + and.b32 %r5185, %r5184, 255; + cvt.u32.u16 %r5186, %rs471; + prmt.b32 %r5187, %r5186, %r5185, 30212; + cvt.u32.u16 %r5188, %rs470; + prmt.b32 %r5189, %r5188, %r5187, 28756; + cvt.u32.u16 %r5190, %rs472; + prmt.b32 %r5191, %r5190, %r5189, 1620; + cvt.u32.u16 %r5192, %rs473; + and.b32 %r5193, %r5192, 255; + cvt.u32.u16 %r5194, %rs475; + prmt.b32 %r5195, %r5194, %r5193, 30212; + cvt.u32.u16 %r5196, %rs474; + prmt.b32 %r5197, %r5196, %r5195, 28756; + cvt.u32.u16 %r5198, %rs476; + prmt.b32 %r5199, %r5198, %r5197, 1620; + cvt.u32.u16 %r5200, %rs477; + and.b32 %r5201, %r5200, 255; + cvt.u32.u16 %r5202, %rs479; + prmt.b32 %r5203, %r5202, %r5201, 30212; + cvt.u32.u16 %r5204, %rs478; + prmt.b32 %r5205, %r5204, %r5203, 28756; + cvt.u32.u16 %r5206, %rs480; + prmt.b32 %r5207, %r5206, %r5205, 1620; + cvt.u32.u16 %r5208, %rs481; + and.b32 %r5209, %r5208, 255; + cvt.u32.u16 %r5210, %rs483; + prmt.b32 %r5211, %r5210, %r5209, 30212; + cvt.u32.u16 %r5212, %rs482; + prmt.b32 %r5213, %r5212, %r5211, 28756; + cvt.u32.u16 %r5214, %rs484; + prmt.b32 %r5215, %r5214, %r5213, 1620; + cvt.u32.u16 %r5216, %rs420; + and.b32 %r5217, %r5216, 255; + add.s32 %r5218, %r5071, -1156040474; + shf.l.wrap.b32 %r5219, %r5218, %r5218, 16; + add.s32 %r5220, %r5219, 1779033703; + xor.b32 %r5221, %r5220, 1359893119; + shf.l.wrap.b32 %r5222, %r5221, %r5221, 20; + add.s32 %r5223, %r5079, %r5218; + add.s32 %r5224, %r5223, %r5222; + xor.b32 %r5225, %r5224, %r5219; + shf.l.wrap.b32 %r5226, %r5225, %r5225, 24; + add.s32 %r5227, %r5226, %r5220; + xor.b32 %r5228, %r5227, %r5222; + shf.l.wrap.b32 %r5229, %r5228, %r5228, 25; + add.s32 %r5230, %r5087, 1449989905; + shf.l.wrap.b32 %r5231, %r5230, %r5230, 16; + add.s32 %r5232, %r5231, -1150833019; + xor.b32 %r5233, %r5232, -1694144372; + shf.l.wrap.b32 %r5234, %r5233, %r5233, 20; + add.s32 %r5235, %r5095, %r5230; + add.s32 %r5236, %r5235, %r5234; + xor.b32 %r5237, %r5236, %r5231; + shf.l.wrap.b32 %r5238, %r5237, %r5237, 24; + add.s32 %r5239, %r5238, %r5232; + xor.b32 %r5240, %r5239, %r5234; + shf.l.wrap.b32 %r5241, %r5240, %r5240, 25; + add.s32 %r5242, %r5111, 1542638877; + shr.u32 %r5243, %r5242, 16; + shl.b32 %r5244, %r5242, 16; + xor.b32 %r5245, %r5244, 4194304; + or.b32 %r5246, %r5245, %r5243; + add.s32 %r5247, %r5246, 1013904242; + xor.b32 %r5248, %r5247, 528734635; + shf.l.wrap.b32 %r5249, %r5248, %r5248, 20; + add.s32 %r5250, %r5119, %r5242; + add.s32 %r5251, %r5250, %r5249; + xor.b32 %r5252, %r5251, %r5246; + shf.l.wrap.b32 %r5253, %r5252, %r5252, 24; + add.s32 %r5254, %r5253, %r5247; + xor.b32 %r5255, %r5254, %r5249; + shf.l.wrap.b32 %r5256, %r5255, %r5255, 25; + add.s32 %r5257, %r5127, 19972691; + xor.b32 %r5258, %r5257, %r5217; + shr.u32 %r5259, %r5257, 16; + shl.b32 %r5260, %r5258, 16; + or.b32 %r5261, %r5260, %r5259; + add.s32 %r5262, %r5261, -1521486534; + xor.b32 %r5263, %r5262, 1541459225; + shf.l.wrap.b32 %r5264, %r5263, %r5263, 20; + add.s32 %r5265, %r5135, %r5257; + add.s32 %r5266, %r5265, %r5264; + xor.b32 %r5267, %r5266, %r5261; + shf.l.wrap.b32 %r5268, %r5267, %r5267, 24; + add.s32 %r5269, 
%r5268, %r5262; + xor.b32 %r5270, %r5269, %r5264; + shf.l.wrap.b32 %r5271, %r5270, %r5270, 25; + add.s32 %r5272, %r5241, %r5224; + add.s32 %r5273, %r5272, %r5151; + xor.b32 %r5274, %r5268, %r5273; + shf.l.wrap.b32 %r5275, %r5274, %r5274, 16; + add.s32 %r5276, %r5275, %r5254; + xor.b32 %r5277, %r5276, %r5241; + shf.l.wrap.b32 %r5278, %r5277, %r5277, 20; + add.s32 %r5279, %r5159, %r5273; + add.s32 %r5280, %r5279, %r5278; + xor.b32 %r5281, %r5280, %r5275; + shf.l.wrap.b32 %r5282, %r5281, %r5281, 24; + add.s32 %r5283, %r5282, %r5276; + xor.b32 %r5284, %r5283, %r5278; + shf.l.wrap.b32 %r5285, %r5284, %r5284, 25; + add.s32 %r5286, %r5256, %r5236; + add.s32 %r5287, %r5286, %r5167; + xor.b32 %r5288, %r5287, %r5226; + shf.l.wrap.b32 %r5289, %r5288, %r5288, 16; + add.s32 %r5290, %r5289, %r5269; + xor.b32 %r5291, %r5290, %r5256; + shf.l.wrap.b32 %r5292, %r5291, %r5291, 20; + add.s32 %r5293, %r5175, %r5287; + add.s32 %r5294, %r5293, %r5292; + xor.b32 %r5295, %r5294, %r5289; + shf.l.wrap.b32 %r5296, %r5295, %r5295, 24; + add.s32 %r5297, %r5296, %r5290; + xor.b32 %r5298, %r5297, %r5292; + shf.l.wrap.b32 %r5299, %r5298, %r5298, 25; + add.s32 %r5300, %r5271, %r5251; + add.s32 %r5301, %r5300, %r5191; + xor.b32 %r5302, %r5301, %r5238; + shf.l.wrap.b32 %r5303, %r5302, %r5302, 16; + add.s32 %r5304, %r5303, %r5227; + xor.b32 %r5305, %r5304, %r5271; + shf.l.wrap.b32 %r5306, %r5305, %r5305, 20; + add.s32 %r5307, %r5199, %r5301; + add.s32 %r5308, %r5307, %r5306; + xor.b32 %r5309, %r5308, %r5303; + shf.l.wrap.b32 %r5310, %r5309, %r5309, 24; + add.s32 %r5311, %r5310, %r5304; + xor.b32 %r5312, %r5311, %r5306; + shf.l.wrap.b32 %r5313, %r5312, %r5312, 25; + add.s32 %r5314, %r5266, %r5229; + add.s32 %r5315, %r5314, %r5207; + xor.b32 %r5316, %r5315, %r5253; + shf.l.wrap.b32 %r5317, %r5316, %r5316, 16; + add.s32 %r5318, %r5317, %r5239; + xor.b32 %r5319, %r5318, %r5229; + shf.l.wrap.b32 %r5320, %r5319, %r5319, 20; + add.s32 %r5321, %r5215, %r5315; + add.s32 %r5322, %r5321, %r5320; + xor.b32 %r5323, %r5322, %r5317; + shf.l.wrap.b32 %r5324, %r5323, %r5323, 24; + add.s32 %r5325, %r5324, %r5318; + xor.b32 %r5326, %r5325, %r5320; + shf.l.wrap.b32 %r5327, %r5326, %r5326, 25; + add.s32 %r5328, %r5280, %r5087; + add.s32 %r5329, %r5328, %r5327; + xor.b32 %r5330, %r5329, %r5296; + shf.l.wrap.b32 %r5331, %r5330, %r5330, 16; + add.s32 %r5332, %r5331, %r5311; + xor.b32 %r5333, %r5332, %r5327; + shf.l.wrap.b32 %r5334, %r5333, %r5333, 20; + add.s32 %r5335, %r5329, %r5127; + add.s32 %r5336, %r5335, %r5334; + xor.b32 %r5337, %r5336, %r5331; + shf.l.wrap.b32 %r5338, %r5337, %r5337, 24; + add.s32 %r5339, %r5338, %r5332; + xor.b32 %r5340, %r5339, %r5334; + shf.l.wrap.b32 %r5341, %r5340, %r5340, 25; + add.s32 %r5342, %r5294, %r5095; + add.s32 %r5343, %r5342, %r5285; + xor.b32 %r5344, %r5310, %r5343; + shf.l.wrap.b32 %r5345, %r5344, %r5344, 16; + add.s32 %r5346, %r5325, %r5345; + xor.b32 %r5347, %r5346, %r5285; + shf.l.wrap.b32 %r5348, %r5347, %r5347, 20; + add.s32 %r5349, %r5343, %r5167; + add.s32 %r5350, %r5349, %r5348; + xor.b32 %r5351, %r5350, %r5345; + shf.l.wrap.b32 %r5352, %r5351, %r5351, 24; + add.s32 %r5353, %r5352, %r5346; + xor.b32 %r5354, %r5353, %r5348; + shf.l.wrap.b32 %r5355, %r5354, %r5354, 25; + add.s32 %r5356, %r5299, %r5135; + add.s32 %r5357, %r5356, %r5308; + xor.b32 %r5358, %r5324, %r5357; + shf.l.wrap.b32 %r5359, %r5358, %r5358, 16; + add.s32 %r5360, %r5359, %r5283; + xor.b32 %r5361, %r5360, %r5299; + shf.l.wrap.b32 %r5362, %r5361, %r5361, 20; + add.s32 %r5363, %r5357, %r5071; + add.s32 %r5364, %r5363, %r5362; + xor.b32 
%r5365, %r5364, %r5359; + shf.l.wrap.b32 %r5366, %r5365, %r5365, 24; + add.s32 %r5367, %r5366, %r5360; + xor.b32 %r5368, %r5367, %r5362; + shf.l.wrap.b32 %r5369, %r5368, %r5368, 25; + add.s32 %r5370, %r5313, %r5111; + add.s32 %r5371, %r5370, %r5322; + xor.b32 %r5372, %r5371, %r5282; + shf.l.wrap.b32 %r5373, %r5372, %r5372, 16; + add.s32 %r5374, %r5373, %r5297; + xor.b32 %r5375, %r5374, %r5313; + shf.l.wrap.b32 %r5376, %r5375, %r5375, 20; + add.s32 %r5377, %r5371, %r5199; + add.s32 %r5378, %r5377, %r5376; + xor.b32 %r5379, %r5378, %r5373; + shf.l.wrap.b32 %r5380, %r5379, %r5379, 24; + add.s32 %r5381, %r5380, %r5374; + xor.b32 %r5382, %r5381, %r5376; + shf.l.wrap.b32 %r5383, %r5382, %r5382, 25; + add.s32 %r5384, %r5336, %r5079; + add.s32 %r5385, %r5384, %r5355; + xor.b32 %r5386, %r5385, %r5380; + shf.l.wrap.b32 %r5387, %r5386, %r5386, 16; + add.s32 %r5388, %r5387, %r5367; + xor.b32 %r5389, %r5388, %r5355; + shf.l.wrap.b32 %r5390, %r5389, %r5389, 20; + add.s32 %r5391, %r5385, %r5175; + add.s32 %r5392, %r5391, %r5390; + xor.b32 %r5393, %r5392, %r5387; + shf.l.wrap.b32 %r5394, %r5393, %r5393, 24; + add.s32 %r5395, %r5394, %r5388; + xor.b32 %r5396, %r5395, %r5390; + shf.l.wrap.b32 %r5397, %r5396, %r5396, 25; + add.s32 %r5398, %r5350, %r5191; + add.s32 %r5399, %r5398, %r5369; + xor.b32 %r5400, %r5399, %r5338; + shf.l.wrap.b32 %r5401, %r5400, %r5400, 16; + add.s32 %r5402, %r5401, %r5381; + xor.b32 %r5403, %r5402, %r5369; + shf.l.wrap.b32 %r5404, %r5403, %r5403, 20; + add.s32 %r5405, %r5399, %r5119; + add.s32 %r5406, %r5405, %r5404; + xor.b32 %r5407, %r5406, %r5401; + shf.l.wrap.b32 %r5408, %r5407, %r5407, 24; + add.s32 %r5409, %r5408, %r5402; + xor.b32 %r5410, %r5409, %r5404; + shf.l.wrap.b32 %r5411, %r5410, %r5410, 25; + add.s32 %r5412, %r5364, %r5159; + add.s32 %r5413, %r5412, %r5383; + xor.b32 %r5414, %r5413, %r5352; + shf.l.wrap.b32 %r5415, %r5414, %r5414, 16; + add.s32 %r5416, %r5415, %r5339; + xor.b32 %r5417, %r5416, %r5383; + shf.l.wrap.b32 %r5418, %r5417, %r5417, 20; + add.s32 %r5419, %r5413, %r5207; + add.s32 %r5420, %r5419, %r5418; + xor.b32 %r5421, %r5420, %r5415; + shf.l.wrap.b32 %r5422, %r5421, %r5421, 24; + add.s32 %r5423, %r5422, %r5416; + xor.b32 %r5424, %r5423, %r5418; + shf.l.wrap.b32 %r5425, %r5424, %r5424, 25; + add.s32 %r5426, %r5378, %r5215; + add.s32 %r5427, %r5426, %r5341; + xor.b32 %r5428, %r5427, %r5366; + shf.l.wrap.b32 %r5429, %r5428, %r5428, 16; + add.s32 %r5430, %r5429, %r5353; + xor.b32 %r5431, %r5430, %r5341; + shf.l.wrap.b32 %r5432, %r5431, %r5431, 20; + add.s32 %r5433, %r5427, %r5151; + add.s32 %r5434, %r5433, %r5432; + xor.b32 %r5435, %r5434, %r5429; + shf.l.wrap.b32 %r5436, %r5435, %r5435, 24; + add.s32 %r5437, %r5436, %r5430; + xor.b32 %r5438, %r5437, %r5432; + shf.l.wrap.b32 %r5439, %r5438, %r5438, 25; + add.s32 %r5440, %r5392, %r5095; + add.s32 %r5441, %r5440, %r5439; + xor.b32 %r5442, %r5441, %r5408; + shf.l.wrap.b32 %r5443, %r5442, %r5442, 16; + add.s32 %r5444, %r5443, %r5423; + xor.b32 %r5445, %r5444, %r5439; + shf.l.wrap.b32 %r5446, %r5445, %r5445, 20; + add.s32 %r5447, %r5441, %r5111; + add.s32 %r5448, %r5447, %r5446; + xor.b32 %r5449, %r5448, %r5443; + shf.l.wrap.b32 %r5450, %r5449, %r5449, 24; + add.s32 %r5451, %r5450, %r5444; + xor.b32 %r5452, %r5451, %r5446; + shf.l.wrap.b32 %r5453, %r5452, %r5452, 25; + add.s32 %r5454, %r5406, %r5167; + add.s32 %r5455, %r5454, %r5397; + xor.b32 %r5456, %r5455, %r5422; + shf.l.wrap.b32 %r5457, %r5456, %r5456, 16; + add.s32 %r5458, %r5457, %r5437; + xor.b32 %r5459, %r5458, %r5397; + shf.l.wrap.b32 %r5460, %r5459, 
%r5459, 20; + add.s32 %r5461, %r5455, %r5191; + add.s32 %r5462, %r5461, %r5460; + xor.b32 %r5463, %r5462, %r5457; + shf.l.wrap.b32 %r5464, %r5463, %r5463, 24; + add.s32 %r5465, %r5464, %r5458; + xor.b32 %r5466, %r5465, %r5460; + shf.l.wrap.b32 %r5467, %r5466, %r5466, 25; + add.s32 %r5468, %r5420, %r5199; + add.s32 %r5469, %r5468, %r5411; + xor.b32 %r5470, %r5469, %r5436; + shf.l.wrap.b32 %r5471, %r5470, %r5470, 16; + add.s32 %r5472, %r5471, %r5395; + xor.b32 %r5473, %r5472, %r5411; + shf.l.wrap.b32 %r5474, %r5473, %r5473, 20; + add.s32 %r5475, %r5469, %r5087; + add.s32 %r5476, %r5475, %r5474; + xor.b32 %r5477, %r5476, %r5471; + shf.l.wrap.b32 %r5478, %r5477, %r5477, 24; + add.s32 %r5479, %r5478, %r5472; + xor.b32 %r5480, %r5479, %r5474; + shf.l.wrap.b32 %r5481, %r5480, %r5480, 25; + add.s32 %r5482, %r5434, %r5135; + add.s32 %r5483, %r5482, %r5425; + xor.b32 %r5484, %r5483, %r5394; + shf.l.wrap.b32 %r5485, %r5484, %r5484, 16; + add.s32 %r5486, %r5485, %r5409; + xor.b32 %r5487, %r5486, %r5425; + shf.l.wrap.b32 %r5488, %r5487, %r5487, 20; + add.s32 %r5489, %r5483, %r5207; + add.s32 %r5490, %r5489, %r5488; + xor.b32 %r5491, %r5490, %r5485; + shf.l.wrap.b32 %r5492, %r5491, %r5491, 24; + add.s32 %r5493, %r5492, %r5486; + xor.b32 %r5494, %r5493, %r5488; + shf.l.wrap.b32 %r5495, %r5494, %r5494, 25; + add.s32 %r5496, %r5448, %r5127; + add.s32 %r5497, %r5496, %r5467; + xor.b32 %r5498, %r5497, %r5492; + shf.l.wrap.b32 %r5499, %r5498, %r5498, 16; + add.s32 %r5500, %r5499, %r5479; + xor.b32 %r5501, %r5500, %r5467; + shf.l.wrap.b32 %r5502, %r5501, %r5501, 20; + add.s32 %r5503, %r5497, %r5119; + add.s32 %r5504, %r5503, %r5502; + xor.b32 %r5505, %r5504, %r5499; + shf.l.wrap.b32 %r5506, %r5505, %r5505, 24; + add.s32 %r5507, %r5506, %r5500; + xor.b32 %r5508, %r5507, %r5502; + shf.l.wrap.b32 %r5509, %r5508, %r5508, 25; + add.s32 %r5510, %r5462, %r5159; + add.s32 %r5511, %r5510, %r5481; + xor.b32 %r5512, %r5511, %r5450; + shf.l.wrap.b32 %r5513, %r5512, %r5512, 16; + add.s32 %r5514, %r5513, %r5493; + xor.b32 %r5515, %r5514, %r5481; + shf.l.wrap.b32 %r5516, %r5515, %r5515, 20; + add.s32 %r5517, %r5511, %r5071; + add.s32 %r5518, %r5517, %r5516; + xor.b32 %r5519, %r5518, %r5513; + shf.l.wrap.b32 %r5520, %r5519, %r5519, 24; + add.s32 %r5521, %r5520, %r5514; + xor.b32 %r5522, %r5521, %r5516; + shf.l.wrap.b32 %r5523, %r5522, %r5522, 25; + add.s32 %r5524, %r5476, %r5175; + add.s32 %r5525, %r5524, %r5495; + xor.b32 %r5526, %r5525, %r5464; + shf.l.wrap.b32 %r5527, %r5526, %r5526, 16; + add.s32 %r5528, %r5527, %r5451; + xor.b32 %r5529, %r5528, %r5495; + shf.l.wrap.b32 %r5530, %r5529, %r5529, 20; + add.s32 %r5531, %r5525, %r5215; + add.s32 %r5532, %r5531, %r5530; + xor.b32 %r5533, %r5532, %r5527; + shf.l.wrap.b32 %r5534, %r5533, %r5533, 24; + add.s32 %r5535, %r5534, %r5528; + xor.b32 %r5536, %r5535, %r5530; + shf.l.wrap.b32 %r5537, %r5536, %r5536, 25; + add.s32 %r5538, %r5490, %r5151; + add.s32 %r5539, %r5538, %r5453; + xor.b32 %r5540, %r5539, %r5478; + shf.l.wrap.b32 %r5541, %r5540, %r5540, 16; + add.s32 %r5542, %r5541, %r5465; + xor.b32 %r5543, %r5542, %r5453; + shf.l.wrap.b32 %r5544, %r5543, %r5543, 20; + add.s32 %r5545, %r5539, %r5079; + add.s32 %r5546, %r5545, %r5544; + xor.b32 %r5547, %r5546, %r5541; + shf.l.wrap.b32 %r5548, %r5547, %r5547, 24; + add.s32 %r5549, %r5548, %r5542; + xor.b32 %r5550, %r5549, %r5544; + shf.l.wrap.b32 %r5551, %r5550, %r5550, 25; + add.s32 %r5552, %r5504, %r5167; + add.s32 %r5553, %r5552, %r5551; + xor.b32 %r5554, %r5553, %r5520; + shf.l.wrap.b32 %r5555, %r5554, %r5554, 16; + add.s32 
%r5556, %r5555, %r5535; + xor.b32 %r5557, %r5556, %r5551; + shf.l.wrap.b32 %r5558, %r5557, %r5557, 20; + add.s32 %r5559, %r5553, %r5135; + add.s32 %r5560, %r5559, %r5558; + xor.b32 %r5561, %r5560, %r5555; + shf.l.wrap.b32 %r5562, %r5561, %r5561, 24; + add.s32 %r5563, %r5562, %r5556; + xor.b32 %r5564, %r5563, %r5558; + shf.l.wrap.b32 %r5565, %r5564, %r5564, 25; + add.s32 %r5566, %r5518, %r5191; + add.s32 %r5567, %r5566, %r5509; + xor.b32 %r5568, %r5567, %r5534; + shf.l.wrap.b32 %r5569, %r5568, %r5568, 16; + add.s32 %r5570, %r5569, %r5549; + xor.b32 %r5571, %r5570, %r5509; + shf.l.wrap.b32 %r5572, %r5571, %r5571, 20; + add.s32 %r5573, %r5567, %r5159; + add.s32 %r5574, %r5573, %r5572; + xor.b32 %r5575, %r5574, %r5569; + shf.l.wrap.b32 %r5576, %r5575, %r5575, 24; + add.s32 %r5577, %r5576, %r5570; + xor.b32 %r5578, %r5577, %r5572; + shf.l.wrap.b32 %r5579, %r5578, %r5578, 25; + add.s32 %r5580, %r5532, %r5207; + add.s32 %r5581, %r5580, %r5523; + xor.b32 %r5582, %r5581, %r5548; + shf.l.wrap.b32 %r5583, %r5582, %r5582, 16; + add.s32 %r5584, %r5583, %r5507; + xor.b32 %r5585, %r5584, %r5523; + shf.l.wrap.b32 %r5586, %r5585, %r5585, 20; + add.s32 %r5587, %r5581, %r5095; + add.s32 %r5588, %r5587, %r5586; + xor.b32 %r5589, %r5588, %r5583; + shf.l.wrap.b32 %r5590, %r5589, %r5589, 24; + add.s32 %r5591, %r5590, %r5584; + xor.b32 %r5592, %r5591, %r5586; + shf.l.wrap.b32 %r5593, %r5592, %r5592, 25; + add.s32 %r5594, %r5546, %r5199; + add.s32 %r5595, %r5594, %r5537; + xor.b32 %r5596, %r5595, %r5506; + shf.l.wrap.b32 %r5597, %r5596, %r5596, 16; + add.s32 %r5598, %r5597, %r5521; + xor.b32 %r5599, %r5598, %r5537; + shf.l.wrap.b32 %r5600, %r5599, %r5599, 20; + add.s32 %r5601, %r5595, %r5215; + add.s32 %r5602, %r5601, %r5600; + xor.b32 %r5603, %r5602, %r5597; + shf.l.wrap.b32 %r5604, %r5603, %r5603, 24; + add.s32 %r5605, %r5604, %r5598; + xor.b32 %r5606, %r5605, %r5600; + shf.l.wrap.b32 %r5607, %r5606, %r5606, 25; + add.s32 %r5608, %r5560, %r5111; + add.s32 %r5609, %r5608, %r5579; + xor.b32 %r5610, %r5609, %r5604; + shf.l.wrap.b32 %r5611, %r5610, %r5610, 16; + add.s32 %r5612, %r5611, %r5591; + xor.b32 %r5613, %r5612, %r5579; + shf.l.wrap.b32 %r5614, %r5613, %r5613, 20; + add.s32 %r5615, %r5609, %r5071; + add.s32 %r5616, %r5615, %r5614; + xor.b32 %r5617, %r5616, %r5611; + shf.l.wrap.b32 %r5618, %r5617, %r5617, 24; + add.s32 %r5619, %r5618, %r5612; + xor.b32 %r5620, %r5619, %r5614; + shf.l.wrap.b32 %r5621, %r5620, %r5620, 25; + add.s32 %r5622, %r5574, %r5175; + add.s32 %r5623, %r5622, %r5593; + xor.b32 %r5624, %r5623, %r5562; + shf.l.wrap.b32 %r5625, %r5624, %r5624, 16; + add.s32 %r5626, %r5625, %r5605; + xor.b32 %r5627, %r5626, %r5593; + shf.l.wrap.b32 %r5628, %r5627, %r5627, 20; + add.s32 %r5629, %r5623, %r5087; + add.s32 %r5630, %r5629, %r5628; + xor.b32 %r5631, %r5630, %r5625; + shf.l.wrap.b32 %r5632, %r5631, %r5631, 24; + add.s32 %r5633, %r5632, %r5626; + xor.b32 %r5634, %r5633, %r5628; + shf.l.wrap.b32 %r5635, %r5634, %r5634, 25; + add.s32 %r5636, %r5588, %r5119; + add.s32 %r5637, %r5636, %r5607; + xor.b32 %r5638, %r5637, %r5576; + shf.l.wrap.b32 %r5639, %r5638, %r5638, 16; + add.s32 %r5640, %r5639, %r5563; + xor.b32 %r5641, %r5640, %r5607; + shf.l.wrap.b32 %r5642, %r5641, %r5641, 20; + add.s32 %r5643, %r5637, %r5151; + add.s32 %r5644, %r5643, %r5642; + xor.b32 %r5645, %r5644, %r5639; + shf.l.wrap.b32 %r5646, %r5645, %r5645, 24; + add.s32 %r5647, %r5646, %r5640; + xor.b32 %r5648, %r5647, %r5642; + shf.l.wrap.b32 %r5649, %r5648, %r5648, 25; + add.s32 %r5650, %r5602, %r5079; + add.s32 %r5651, %r5650, %r5565; + 
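+ // These unrolled rounds appear to compress the first 64-byte block of the 80-byte
+ // input (72-byte hash_header followed by the 8-byte nonce stored at offset 72
+ // above); the remaining 16 bytes are handled by the second compression further
+ // down.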
xor.b32 %r5652, %r5651, %r5590; + shf.l.wrap.b32 %r5653, %r5652, %r5652, 16; + add.s32 %r5654, %r5653, %r5577; + xor.b32 %r5655, %r5654, %r5565; + shf.l.wrap.b32 %r5656, %r5655, %r5655, 20; + add.s32 %r5657, %r5651, %r5127; + add.s32 %r5658, %r5657, %r5656; + xor.b32 %r5659, %r5658, %r5653; + shf.l.wrap.b32 %r5660, %r5659, %r5659, 24; + add.s32 %r5661, %r5660, %r5654; + xor.b32 %r5662, %r5661, %r5656; + shf.l.wrap.b32 %r5663, %r5662, %r5662, 25; + add.s32 %r5664, %r5616, %r5191; + add.s32 %r5665, %r5664, %r5663; + xor.b32 %r5666, %r5665, %r5632; + shf.l.wrap.b32 %r5667, %r5666, %r5666, 16; + add.s32 %r5668, %r5667, %r5647; + xor.b32 %r5669, %r5668, %r5663; + shf.l.wrap.b32 %r5670, %r5669, %r5669, 20; + add.s32 %r5671, %r5665, %r5199; + add.s32 %r5672, %r5671, %r5670; + xor.b32 %r5673, %r5672, %r5667; + shf.l.wrap.b32 %r5674, %r5673, %r5673, 24; + add.s32 %r5675, %r5674, %r5668; + xor.b32 %r5676, %r5675, %r5670; + shf.l.wrap.b32 %r5677, %r5676, %r5676, 25; + add.s32 %r5678, %r5630, %r5159; + add.s32 %r5679, %r5678, %r5621; + xor.b32 %r5680, %r5679, %r5646; + shf.l.wrap.b32 %r5681, %r5680, %r5680, 16; + add.s32 %r5682, %r5681, %r5661; + xor.b32 %r5683, %r5682, %r5621; + shf.l.wrap.b32 %r5684, %r5683, %r5683, 20; + add.s32 %r5685, %r5679, %r5175; + add.s32 %r5686, %r5685, %r5684; + xor.b32 %r5687, %r5686, %r5681; + shf.l.wrap.b32 %r5688, %r5687, %r5687, 24; + add.s32 %r5689, %r5688, %r5682; + xor.b32 %r5690, %r5689, %r5684; + shf.l.wrap.b32 %r5691, %r5690, %r5690, 25; + add.s32 %r5692, %r5644, %r5215; + add.s32 %r5693, %r5692, %r5635; + xor.b32 %r5694, %r5693, %r5660; + shf.l.wrap.b32 %r5695, %r5694, %r5694, 16; + add.s32 %r5696, %r5695, %r5619; + xor.b32 %r5697, %r5696, %r5635; + shf.l.wrap.b32 %r5698, %r5697, %r5697, 20; + add.s32 %r5699, %r5693, %r5167; + add.s32 %r5700, %r5699, %r5698; + xor.b32 %r5701, %r5700, %r5695; + shf.l.wrap.b32 %r5702, %r5701, %r5701, 24; + add.s32 %r5703, %r5702, %r5696; + xor.b32 %r5704, %r5703, %r5698; + shf.l.wrap.b32 %r5705, %r5704, %r5704, 25; + add.s32 %r5706, %r5658, %r5207; + add.s32 %r5707, %r5706, %r5649; + xor.b32 %r5708, %r5707, %r5618; + shf.l.wrap.b32 %r5709, %r5708, %r5708, 16; + add.s32 %r5710, %r5709, %r5633; + xor.b32 %r5711, %r5710, %r5649; + shf.l.wrap.b32 %r5712, %r5711, %r5711, 20; + add.s32 %r5713, %r5707, %r5151; + add.s32 %r5714, %r5713, %r5712; + xor.b32 %r5715, %r5714, %r5709; + shf.l.wrap.b32 %r5716, %r5715, %r5715, 24; + add.s32 %r5717, %r5716, %r5710; + xor.b32 %r5718, %r5717, %r5712; + shf.l.wrap.b32 %r5719, %r5718, %r5718, 25; + add.s32 %r5720, %r5672, %r5135; + add.s32 %r5721, %r5720, %r5691; + xor.b32 %r5722, %r5721, %r5716; + shf.l.wrap.b32 %r5723, %r5722, %r5722, 16; + add.s32 %r5724, %r5723, %r5703; + xor.b32 %r5725, %r5724, %r5691; + shf.l.wrap.b32 %r5726, %r5725, %r5725, 20; + add.s32 %r5727, %r5721, %r5087; + add.s32 %r5728, %r5727, %r5726; + xor.b32 %r5729, %r5728, %r5723; + shf.l.wrap.b32 %r5730, %r5729, %r5729, 24; + add.s32 %r5731, %r5730, %r5724; + xor.b32 %r5732, %r5731, %r5726; + shf.l.wrap.b32 %r5733, %r5732, %r5732, 25; + add.s32 %r5734, %r5686, %r5119; + add.s32 %r5735, %r5734, %r5705; + xor.b32 %r5736, %r5735, %r5674; + shf.l.wrap.b32 %r5737, %r5736, %r5736, 16; + add.s32 %r5738, %r5737, %r5717; + xor.b32 %r5739, %r5738, %r5705; + shf.l.wrap.b32 %r5740, %r5739, %r5739, 20; + add.s32 %r5741, %r5735, %r5095; + add.s32 %r5742, %r5741, %r5740; + xor.b32 %r5743, %r5742, %r5737; + shf.l.wrap.b32 %r5744, %r5743, %r5743, 24; + add.s32 %r5745, %r5744, %r5738; + xor.b32 %r5746, %r5745, %r5740; + shf.l.wrap.b32 %r5747, 
%r5746, %r5746, 25; + add.s32 %r5748, %r5700, %r5071; + add.s32 %r5749, %r5748, %r5719; + xor.b32 %r5750, %r5749, %r5688; + shf.l.wrap.b32 %r5751, %r5750, %r5750, 16; + add.s32 %r5752, %r5751, %r5675; + xor.b32 %r5753, %r5752, %r5719; + shf.l.wrap.b32 %r5754, %r5753, %r5753, 20; + add.s32 %r5755, %r5749, %r5079; + add.s32 %r5756, %r5755, %r5754; + xor.b32 %r5757, %r5756, %r5751; + shf.l.wrap.b32 %r5758, %r5757, %r5757, 24; + add.s32 %r5759, %r5758, %r5752; + xor.b32 %r5760, %r5759, %r5754; + shf.l.wrap.b32 %r5761, %r5760, %r5760, 25; + add.s32 %r5762, %r5714, %r5127; + add.s32 %r5763, %r5762, %r5677; + xor.b32 %r5764, %r5763, %r5702; + shf.l.wrap.b32 %r5765, %r5764, %r5764, 16; + add.s32 %r5766, %r5765, %r5689; + xor.b32 %r5767, %r5766, %r5677; + shf.l.wrap.b32 %r5768, %r5767, %r5767, 20; + add.s32 %r5769, %r5763, %r5111; + add.s32 %r5770, %r5769, %r5768; + xor.b32 %r5771, %r5770, %r5765; + shf.l.wrap.b32 %r5772, %r5771, %r5771, 24; + add.s32 %r5773, %r5772, %r5766; + xor.b32 %r5774, %r5773, %r5768; + shf.l.wrap.b32 %r5775, %r5774, %r5774, 25; + add.s32 %r5776, %r5728, %r5159; + add.s32 %r5777, %r5776, %r5775; + xor.b32 %r5778, %r5777, %r5744; + shf.l.wrap.b32 %r5779, %r5778, %r5778, 16; + add.s32 %r5780, %r5779, %r5759; + xor.b32 %r5781, %r5780, %r5775; + shf.l.wrap.b32 %r5782, %r5781, %r5781, 20; + add.s32 %r5783, %r5777, %r5207; + add.s32 %r5784, %r5783, %r5782; + xor.b32 %r5785, %r5784, %r5779; + shf.l.wrap.b32 %r5786, %r5785, %r5785, 24; + add.s32 %r5787, %r5786, %r5780; + xor.b32 %r5788, %r5787, %r5782; + shf.l.wrap.b32 %r5789, %r5788, %r5788, 25; + add.s32 %r5790, %r5742, %r5175; + add.s32 %r5791, %r5790, %r5733; + xor.b32 %r5792, %r5791, %r5758; + shf.l.wrap.b32 %r5793, %r5792, %r5792, 16; + add.s32 %r5794, %r5793, %r5773; + xor.b32 %r5795, %r5794, %r5733; + shf.l.wrap.b32 %r5796, %r5795, %r5795, 20; + add.s32 %r5797, %r5791, %r5119; + add.s32 %r5798, %r5797, %r5796; + xor.b32 %r5799, %r5798, %r5793; + shf.l.wrap.b32 %r5800, %r5799, %r5799, 24; + add.s32 %r5801, %r5800, %r5794; + xor.b32 %r5802, %r5801, %r5796; + shf.l.wrap.b32 %r5803, %r5802, %r5802, 25; + add.s32 %r5804, %r5756, %r5151; + add.s32 %r5805, %r5804, %r5747; + xor.b32 %r5806, %r5805, %r5772; + shf.l.wrap.b32 %r5807, %r5806, %r5806, 16; + add.s32 %r5808, %r5807, %r5731; + xor.b32 %r5809, %r5808, %r5747; + shf.l.wrap.b32 %r5810, %r5809, %r5809, 20; + add.s32 %r5811, %r5805, %r5191; + add.s32 %r5812, %r5811, %r5810; + xor.b32 %r5813, %r5812, %r5807; + shf.l.wrap.b32 %r5814, %r5813, %r5813, 24; + add.s32 %r5815, %r5814, %r5808; + xor.b32 %r5816, %r5815, %r5810; + shf.l.wrap.b32 %r5817, %r5816, %r5816, 25; + add.s32 %r5818, %r5770, %r5215; + add.s32 %r5819, %r5818, %r5761; + xor.b32 %r5820, %r5819, %r5730; + shf.l.wrap.b32 %r5821, %r5820, %r5820, 16; + add.s32 %r5822, %r5821, %r5745; + xor.b32 %r5823, %r5822, %r5761; + shf.l.wrap.b32 %r5824, %r5823, %r5823, 20; + add.s32 %r5825, %r5819, %r5079; + add.s32 %r5826, %r5825, %r5824; + xor.b32 %r5827, %r5826, %r5821; + shf.l.wrap.b32 %r5828, %r5827, %r5827, 24; + add.s32 %r5829, %r5828, %r5822; + xor.b32 %r5830, %r5829, %r5824; + shf.l.wrap.b32 %r5831, %r5830, %r5830, 25; + add.s32 %r5832, %r5784, %r5199; + add.s32 %r5833, %r5832, %r5803; + xor.b32 %r5834, %r5833, %r5828; + shf.l.wrap.b32 %r5835, %r5834, %r5834, 16; + add.s32 %r5836, %r5835, %r5815; + xor.b32 %r5837, %r5836, %r5803; + shf.l.wrap.b32 %r5838, %r5837, %r5837, 20; + add.s32 %r5839, %r5833, %r5095; + add.s32 %r5840, %r5839, %r5838; + xor.b32 %r5841, %r5840, %r5835; + shf.l.wrap.b32 %r5842, %r5841, %r5841, 24; + 
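+ // Each unrolled round re-permutes the sixteen 32-bit message words before the
+ // next; BLAKE3 runs seven such rounds per block, matching the 7 x 16-entry
+ // MSG_SCHEDULE table declared at the top of this module.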
add.s32 %r5843, %r5842, %r5836; + xor.b32 %r5844, %r5843, %r5838; + shf.l.wrap.b32 %r5845, %r5844, %r5844, 25; + add.s32 %r5846, %r5798, %r5071; + add.s32 %r5847, %r5846, %r5817; + xor.b32 %r5848, %r5847, %r5786; + shf.l.wrap.b32 %r5849, %r5848, %r5848, 16; + add.s32 %r5850, %r5849, %r5829; + xor.b32 %r5851, %r5850, %r5817; + shf.l.wrap.b32 %r5852, %r5851, %r5851, 20; + add.s32 %r5853, %r5847, %r5167; + add.s32 %r5854, %r5853, %r5852; + xor.b32 %r5855, %r5854, %r5849; + shf.l.wrap.b32 %r5856, %r5855, %r5855, 24; + add.s32 %r5857, %r5856, %r5850; + xor.b32 %r5858, %r5857, %r5852; + shf.l.wrap.b32 %r5859, %r5858, %r5858, 25; + add.s32 %r5860, %r5812, %r5087; + add.s32 %r5861, %r5860, %r5831; + xor.b32 %r5862, %r5861, %r5800; + shf.l.wrap.b32 %r5863, %r5862, %r5862, 16; + add.s32 %r5864, %r5863, %r5787; + xor.b32 %r5865, %r5864, %r5831; + shf.l.wrap.b32 %r5866, %r5865, %r5865, 20; + add.s32 %r5867, %r5861, %r5127; + add.s32 %r5868, %r5867, %r5866; + xor.b32 %r5869, %r5868, %r5863; + shf.l.wrap.b32 %r5870, %r5869, %r5869, 24; + add.s32 %r5871, %r5870, %r5864; + xor.b32 %r5872, %r5871, %r5866; + shf.l.wrap.b32 %r5873, %r5872, %r5872, 25; + add.s32 %r5874, %r5826, %r5111; + add.s32 %r5875, %r5874, %r5789; + xor.b32 %r5876, %r5875, %r5814; + shf.l.wrap.b32 %r5877, %r5876, %r5876, 16; + add.s32 %r5878, %r5877, %r5801; + xor.b32 %r5879, %r5878, %r5789; + shf.l.wrap.b32 %r5880, %r5879, %r5879, 20; + add.s32 %r5881, %r5875, %r5135; + add.s32 %r5882, %r5881, %r5880; + xor.b32 %r5883, %r5882, %r5877; + shf.l.wrap.b32 %r5884, %r5883, %r5883, 24; + add.s32 %r5885, %r5884, %r5878; + xor.b32 %r5886, %r5885, %r5880; + shf.l.wrap.b32 %r5887, %r5886, %r5886, 25; + add.s32 %r5888, %r5840, %r5175; + add.s32 %r5889, %r5888, %r5887; + xor.b32 %r5890, %r5889, %r5856; + shf.l.wrap.b32 %r5891, %r5890, %r5890, 16; + add.s32 %r5892, %r5891, %r5871; + xor.b32 %r5893, %r5892, %r5887; + shf.l.wrap.b32 %r5894, %r5893, %r5893, 20; + add.s32 %r5895, %r5889, %r5215; + add.s32 %r5896, %r5895, %r5894; + xor.b32 %r5897, %r5896, %r5891; + shf.l.wrap.b32 %r5898, %r5897, %r5897, 24; + add.s32 %r5899, %r5898, %r5892; + xor.b32 %r5900, %r5899, %r5894; + shf.l.wrap.b32 %r5901, %r5900, %r5900, 25; + add.s32 %r5902, %r5854, %r5119; + add.s32 %r5903, %r5902, %r5845; + xor.b32 %r5904, %r5903, %r5870; + shf.l.wrap.b32 %r5905, %r5904, %r5904, 16; + add.s32 %r5906, %r5905, %r5885; + xor.b32 %r5907, %r5906, %r5845; + shf.l.wrap.b32 %r5908, %r5907, %r5907, 20; + add.s32 %r5909, %r5903, %r5071; + add.s32 %r5910, %r5909, %r5908; + xor.b32 %r5911, %r5910, %r5905; + shf.l.wrap.b32 %r5912, %r5911, %r5911, 24; + add.s32 %r5913, %r5912, %r5906; + xor.b32 %r5914, %r5913, %r5908; + shf.l.wrap.b32 %r5915, %r5914, %r5914, 25; + add.s32 %r5916, %r5868, %r5079; + add.s32 %r5917, %r5916, %r5859; + xor.b32 %r5918, %r5917, %r5884; + shf.l.wrap.b32 %r5919, %r5918, %r5918, 16; + add.s32 %r5920, %r5919, %r5843; + xor.b32 %r5921, %r5920, %r5859; + shf.l.wrap.b32 %r5922, %r5921, %r5921, 20; + add.s32 %r5923, %r5917, %r5159; + add.s32 %r5924, %r5923, %r5922; + xor.b32 %r5925, %r5924, %r5919; + shf.l.wrap.b32 %r5926, %r5925, %r5925, 24; + add.s32 %r5927, %r5926, %r5920; + xor.b32 %r5928, %r5927, %r5922; + shf.l.wrap.b32 %r5929, %r5928, %r5928, 25; + add.s32 %r5930, %r5882, %r5151; + add.s32 %r5931, %r5930, %r5873; + xor.b32 %r5932, %r5931, %r5842; + shf.l.wrap.b32 %r5933, %r5932, %r5932, 16; + add.s32 %r5934, %r5933, %r5857; + xor.b32 %r5935, %r5934, %r5873; + shf.l.wrap.b32 %r5936, %r5935, %r5935, 20; + add.s32 %r5937, %r5931, %r5127; + add.s32 %r5938, %r5937, 
%r5936; + xor.b32 %r5939, %r5938, %r5933; + shf.l.wrap.b32 %r5940, %r5939, %r5939, 24; + add.s32 %r5941, %r5940, %r5934; + xor.b32 %r5942, %r5941, %r5936; + shf.l.wrap.b32 %r5943, %r5942, %r5942, 25; + add.s32 %r5944, %r5896, %r5207; + add.s32 %r5945, %r5944, %r5915; + xor.b32 %r5946, %r5945, %r5940; + shf.l.wrap.b32 %r5947, %r5946, %r5946, 16; + add.s32 %r5948, %r5947, %r5927; + xor.b32 %r5949, %r5948, %r5915; + shf.l.wrap.b32 %r5950, %r5949, %r5949, 20; + add.s32 %r5951, %r5945, %r5167; + add.s32 %r5952, %r5951, %r5950; + xor.b32 %r5953, %r5952, %r5947; + shf.l.wrap.b32 %r5954, %r5953, %r5953, 24; + add.s32 %r5955, %r5954, %r5948; + xor.b32 %r5956, %r5955, %r5950; + shf.l.wrap.b32 %r5957, %r5956, %r5956, 25; + add.s32 %r5958, %r5910, %r5087; + add.s32 %r5959, %r5958, %r5929; + xor.b32 %r5960, %r5959, %r5898; + shf.l.wrap.b32 %r5961, %r5960, %r5960, 16; + add.s32 %r5962, %r5961, %r5941; + xor.b32 %r5963, %r5962, %r5929; + shf.l.wrap.b32 %r5964, %r5963, %r5963, 20; + add.s32 %r5965, %r5959, %r5191; + add.s32 %r5966, %r5965, %r5964; + xor.b32 %r5967, %r5966, %r5961; + shf.l.wrap.b32 %r5968, %r5967, %r5967, 24; + add.s32 %r5969, %r5968, %r5962; + xor.b32 %r5970, %r5969, %r5964; + shf.l.wrap.b32 %r5971, %r5970, %r5970, 25; + add.s32 %r5972, %r5924, %r5095; + add.s32 %r5973, %r5972, %r5943; + xor.b32 %r5974, %r5973, %r5912; + shf.l.wrap.b32 %r5975, %r5974, %r5974, 16; + add.s32 %r5976, %r5975, %r5899; + xor.b32 %r5977, %r5976, %r5943; + shf.l.wrap.b32 %r5978, %r5977, %r5977, 20; + add.s32 %r5979, %r5973, %r5111; + add.s32 %r5980, %r5979, %r5978; + xor.b32 %r5981, %r5980, %r5975; + shf.l.wrap.b32 %r5982, %r5981, %r5981, 24; + add.s32 %r5983, %r5982, %r5976; + xor.b32 %r5984, %r5983, %r5978; + shf.l.wrap.b32 %r5985, %r5984, %r5984, 25; + add.s32 %r5986, %r5938, %r5135; + add.s32 %r5987, %r5986, %r5901; + xor.b32 %r5988, %r5987, %r5926; + shf.l.wrap.b32 %r5989, %r5988, %r5988, 16; + add.s32 %r5990, %r5989, %r5913; + xor.b32 %r5991, %r5990, %r5901; + shf.l.wrap.b32 %r5992, %r5991, %r5991, 20; + add.s32 %r5993, %r5987, %r5199; + add.s32 %r5994, %r5993, %r5992; + xor.b32 %r5995, %r5994, %r5989; + shf.l.wrap.b32 %r5996, %r5995, %r5995, 24; + add.s32 %r5997, %r5996, %r5990; + xor.b32 %r5998, %r5997, %r5992; + shf.l.wrap.b32 %r5999, %r5998, %r5998, 25; + xor.b32 %r9, %r5983, %r5952; + xor.b32 %r10, %r5997, %r5966; + st.local.v2.u32 [%rd3+32], {%r9, %r10}; + xor.b32 %r11, %r5955, %r5980; + xor.b32 %r12, %r5994, %r5969; + st.local.v2.u32 [%rd3+40], {%r11, %r12}; + xor.b32 %r13, %r5999, %r5968; + xor.b32 %r14, %r5957, %r5982; + st.local.v2.u32 [%rd3+48], {%r13, %r14}; + xor.b32 %r15, %r5996, %r5971; + xor.b32 %r16, %r5985, %r5954; + st.local.v2.u32 [%rd3+56], {%r15, %r16}; + ld.local.v4.u32 {%r6000, %r6001, %r6002, %r6003}, [%rd4+64]; + st.local.v2.u32 [%rd3+72], {%r6000, %r6001}; + st.local.v2.u32 [%rd3+80], {%r6002, %r6003}; + add.s16 %rs1, %rs412, 16; + and.b16 %rs485, %rs1, 255; + add.s16 %rs486, %rs413, 1; + st.local.v2.u8 [%rd3+136], {%rs1, %rs486}; + cvt.u32.u16 %r6008, %rs486; + cvt.u32.u16 %r6009, %rs485; + prmt.b32 %r6010, %r6008, %r6009, 30212; + cvt.u16.u32 %rs487, %r6010; + shr.u16 %rs2, %rs487, 8; + mov.b32 {%rs5, %rs6}, %r6001; + mov.b32 {%rs3, %rs4}, %r6000; + mov.b32 {%rs9, %rs10}, %r6003; + mov.b32 {%rs7, %rs8}, %r6002; + setp.eq.s16 %p15, %rs2, 0; + selp.u16 %rs488, 1, 0, %p15; + shr.u16 %rs489, %rs3, 8; + shr.u16 %rs490, %rs4, 8; + shr.u16 %rs491, %rs5, 8; + shr.u16 %rs492, %rs6, 8; + shr.u16 %rs493, %rs7, 8; + shr.u16 %rs494, %rs8, 8; + shr.u16 %rs495, %rs9, 8; + shr.u16 %rs496, 
%rs10, 8; + or.b16 %rs497, %rs488, 10; + cvt.u32.u16 %r6011, %rs3; + and.b32 %r6012, %r6011, 255; + cvt.u32.u16 %r6013, %rs489; + prmt.b32 %r6014, %r6013, %r6012, 30212; + cvt.u32.u16 %r6015, %rs4; + prmt.b32 %r6016, %r6015, %r6014, 28756; + cvt.u32.u16 %r6017, %rs490; + prmt.b32 %r6018, %r6017, %r6016, 1620; + cvt.u32.u16 %r6019, %rs5; + and.b32 %r6020, %r6019, 255; + cvt.u32.u16 %r6021, %rs491; + prmt.b32 %r6022, %r6021, %r6020, 30212; + cvt.u32.u16 %r6023, %rs6; + prmt.b32 %r6024, %r6023, %r6022, 28756; + cvt.u32.u16 %r6025, %rs492; + prmt.b32 %r6026, %r6025, %r6024, 1620; + cvt.u32.u16 %r6027, %rs7; + and.b32 %r6028, %r6027, 255; + cvt.u32.u16 %r6029, %rs493; + prmt.b32 %r6030, %r6029, %r6028, 30212; + cvt.u32.u16 %r6031, %rs8; + prmt.b32 %r6032, %r6031, %r6030, 28756; + cvt.u32.u16 %r6033, %rs494; + prmt.b32 %r6034, %r6033, %r6032, 1620; + cvt.u32.u16 %r6035, %rs9; + and.b32 %r6036, %r6035, 255; + cvt.u32.u16 %r6037, %rs495; + prmt.b32 %r6038, %r6037, %r6036, 30212; + cvt.u32.u16 %r6039, %rs10; + prmt.b32 %r6040, %r6039, %r6038, 28756; + cvt.u32.u16 %r6041, %rs496; + prmt.b32 %r6042, %r6041, %r6040, 1620; + cvt.u32.u16 %r6043, %rs497; + add.s32 %r6044, %r13, %r9; + add.s32 %r6045, %r6044, %r6018; + add.s32 %r6046, %r6026, %r6045; + add.s32 %r6047, %r14, %r10; + add.s32 %r6048, %r6047, %r6034; + add.s32 %r6049, %r6042, %r6048; + add.s32 %r6050, %r15, %r11; + cvt.u32.u16 %r6051, %rs1; + and.b32 %r6052, %r6051, 255; + xor.b32 %r6053, %r6050, %r6052; + shr.u32 %r6054, %r6050, 16; + shl.b32 %r6055, %r6053, 16; + or.b32 %r6056, %r6055, %r6054; + add.s32 %r6057, %r6056, 1013904242; + xor.b32 %r6058, %r6057, %r15; + shf.l.wrap.b32 %r6059, %r6058, %r6058, 20; + add.s32 %r6060, %r6050, %r6059; + xor.b32 %r6061, %r6060, %r6056; + shf.l.wrap.b32 %r6062, %r6061, %r6061, 24; + add.s32 %r6063, %r6062, %r6057; + xor.b32 %r6064, %r6063, %r6059; + shf.l.wrap.b32 %r6065, %r6064, %r6064, 25; + add.s32 %r6066, %r16, %r12; + xor.b32 %r6067, %r6066, %r6043; + shr.u32 %r6068, %r6066, 16; + shl.b32 %r6069, %r6067, 16; + or.b32 %r6070, %r6069, %r6068; + add.s32 %r6071, %r6070, -1521486534; + xor.b32 %r6072, %r6071, %r16; + shf.l.wrap.b32 %r6073, %r6072, %r6072, 20; + add.s32 %r6074, %r6066, %r6073; + xor.b32 %r6075, %r6074, %r6070; + shf.l.wrap.b32 %r6076, %r6075, %r6075, 24; + add.s32 %r6077, %r6076, %r6071; + xor.b32 %r6078, %r6077, %r6073; + shf.l.wrap.b32 %r6079, %r6078, %r6078, 25; + add.s32 %r6080, %r6079, %r6060; + shf.l.wrap.b32 %r6081, %r6045, %r6045, 16; + add.s32 %r6082, %r6081, 1779033703; + xor.b32 %r6083, %r6082, %r13; + shf.l.wrap.b32 %r6084, %r6083, %r6083, 20; + add.s32 %r6085, %r6046, %r6084; + xor.b32 %r6086, %r6085, %r6081; + shf.l.wrap.b32 %r6087, %r6086, %r6086, 24; + add.s32 %r6088, %r6087, %r6082; + xor.b32 %r6089, %r6088, %r6084; + shf.l.wrap.b32 %r6090, %r6089, %r6089, 25; + shf.l.wrap.b32 %r6091, %r6048, %r6048, 16; + add.s32 %r6092, %r6091, -1150833019; + xor.b32 %r6093, %r6092, %r14; + shf.l.wrap.b32 %r6094, %r6093, %r6093, 20; + add.s32 %r6095, %r6049, %r6094; + xor.b32 %r6096, %r6095, %r6091; + shf.l.wrap.b32 %r6097, %r6096, %r6096, 24; + add.s32 %r6098, %r6097, %r6092; + xor.b32 %r6099, %r6098, %r6094; + shf.l.wrap.b32 %r6100, %r6099, %r6099, 25; + add.s32 %r6101, %r6085, %r6100; + xor.b32 %r6102, %r6101, %r6076; + shf.l.wrap.b32 %r6103, %r6102, %r6102, 16; + add.s32 %r6104, %r6103, %r6063; + xor.b32 %r6105, %r6104, %r6100; + shf.l.wrap.b32 %r6106, %r6105, %r6105, 20; + add.s32 %r6107, %r6101, %r6106; + xor.b32 %r6108, %r6107, %r6103; + shf.l.wrap.b32 %r6109, %r6108, %r6108, 24; 
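+ // Second compression over the trailing bytes of the input. The or.b16 ..., 10
+ // above appears to set BLAKE3's CHUNK_END (2) and ROOT (8) flags for this final
+ // block, with CHUNK_START folded in via the preceding selp when no block has been
+ // compressed yet; the chaining value produced by the first block was stored at
+ // local offsets +32..+56 above.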
+ add.s32 %r6110, %r6109, %r6104; + xor.b32 %r6111, %r6110, %r6106; + shf.l.wrap.b32 %r6112, %r6111, %r6111, 25; + add.s32 %r6113, %r6065, %r6095; + xor.b32 %r6114, %r6087, %r6113; + shf.l.wrap.b32 %r6115, %r6114, %r6114, 16; + add.s32 %r6116, %r6115, %r6077; + xor.b32 %r6117, %r6116, %r6065; + shf.l.wrap.b32 %r6118, %r6117, %r6117, 20; + add.s32 %r6119, %r6113, %r6118; + xor.b32 %r6120, %r6119, %r6115; + shf.l.wrap.b32 %r6121, %r6120, %r6120, 24; + add.s32 %r6122, %r6121, %r6116; + xor.b32 %r6123, %r6122, %r6118; + shf.l.wrap.b32 %r6124, %r6123, %r6123, 25; + xor.b32 %r6125, %r6097, %r6080; + shf.l.wrap.b32 %r6126, %r6125, %r6125, 16; + add.s32 %r6127, %r6126, %r6088; + xor.b32 %r6128, %r6127, %r6079; + shf.l.wrap.b32 %r6129, %r6128, %r6128, 20; + add.s32 %r6130, %r6080, %r6129; + xor.b32 %r6131, %r6130, %r6126; + shf.l.wrap.b32 %r6132, %r6131, %r6131, 24; + add.s32 %r6133, %r6132, %r6127; + xor.b32 %r6134, %r6133, %r6129; + shf.l.wrap.b32 %r6135, %r6134, %r6134, 25; + add.s32 %r6136, %r6074, %r6090; + xor.b32 %r6137, %r6136, %r6062; + shf.l.wrap.b32 %r6138, %r6137, %r6137, 16; + add.s32 %r6139, %r6138, %r6098; + xor.b32 %r6140, %r6139, %r6090; + shf.l.wrap.b32 %r6141, %r6140, %r6140, 20; + add.s32 %r6142, %r6136, %r6141; + xor.b32 %r6143, %r6142, %r6138; + shf.l.wrap.b32 %r6144, %r6143, %r6143, 24; + add.s32 %r6145, %r6144, %r6139; + xor.b32 %r6146, %r6145, %r6141; + shf.l.wrap.b32 %r6147, %r6146, %r6146, 25; + add.s32 %r6148, %r6107, %r6034; + add.s32 %r6149, %r6148, %r6147; + xor.b32 %r6150, %r6149, %r6121; + shf.l.wrap.b32 %r6151, %r6150, %r6150, 16; + add.s32 %r6152, %r6151, %r6133; + xor.b32 %r6153, %r6152, %r6147; + shf.l.wrap.b32 %r6154, %r6153, %r6153, 20; + add.s32 %r6155, %r6149, %r6154; + xor.b32 %r6156, %r6155, %r6151; + shf.l.wrap.b32 %r6157, %r6156, %r6156, 24; + add.s32 %r6158, %r6157, %r6152; + xor.b32 %r6159, %r6158, %r6154; + shf.l.wrap.b32 %r6160, %r6159, %r6159, 25; + add.s32 %r6161, %r6119, %r6042; + add.s32 %r6162, %r6161, %r6112; + xor.b32 %r6163, %r6162, %r6132; + shf.l.wrap.b32 %r6164, %r6163, %r6163, 16; + add.s32 %r6165, %r6164, %r6145; + xor.b32 %r6166, %r6165, %r6112; + shf.l.wrap.b32 %r6167, %r6166, %r6166, 20; + add.s32 %r6168, %r6162, %r6167; + xor.b32 %r6169, %r6168, %r6164; + shf.l.wrap.b32 %r6170, %r6169, %r6169, 24; + add.s32 %r6171, %r6170, %r6165; + xor.b32 %r6172, %r6171, %r6167; + shf.l.wrap.b32 %r6173, %r6172, %r6172, 25; + add.s32 %r6174, %r6130, %r6124; + xor.b32 %r6175, %r6144, %r6174; + shf.l.wrap.b32 %r6176, %r6175, %r6175, 16; + add.s32 %r6177, %r6176, %r6110; + xor.b32 %r6178, %r6177, %r6124; + shf.l.wrap.b32 %r6179, %r6178, %r6178, 20; + add.s32 %r6180, %r6174, %r6018; + add.s32 %r6181, %r6180, %r6179; + xor.b32 %r6182, %r6181, %r6176; + shf.l.wrap.b32 %r6183, %r6182, %r6182, 24; + add.s32 %r6184, %r6183, %r6177; + xor.b32 %r6185, %r6184, %r6179; + shf.l.wrap.b32 %r6186, %r6185, %r6185, 25; + add.s32 %r6187, %r6142, %r6135; + xor.b32 %r6188, %r6109, %r6187; + shf.l.wrap.b32 %r6189, %r6188, %r6188, 16; + add.s32 %r6190, %r6189, %r6122; + xor.b32 %r6191, %r6190, %r6135; + shf.l.wrap.b32 %r6192, %r6191, %r6191, 20; + add.s32 %r6193, %r6187, %r6192; + xor.b32 %r6194, %r6193, %r6189; + shf.l.wrap.b32 %r6195, %r6194, %r6194, 24; + add.s32 %r6196, %r6195, %r6190; + xor.b32 %r6197, %r6196, %r6192; + shf.l.wrap.b32 %r6198, %r6197, %r6197, 25; + add.s32 %r6199, %r6155, %r6026; + add.s32 %r6200, %r6199, %r6173; + xor.b32 %r6201, %r6200, %r6195; + shf.l.wrap.b32 %r6202, %r6201, %r6201, 16; + add.s32 %r6203, %r6202, %r6184; + xor.b32 %r6204, %r6203, 
%r6173; + shf.l.wrap.b32 %r6205, %r6204, %r6204, 20; + add.s32 %r6206, %r6200, %r6205; + xor.b32 %r6207, %r6206, %r6202; + shf.l.wrap.b32 %r6208, %r6207, %r6207, 24; + add.s32 %r6209, %r6208, %r6203; + xor.b32 %r6210, %r6209, %r6205; + shf.l.wrap.b32 %r6211, %r6210, %r6210, 25; + add.s32 %r6212, %r6186, %r6168; + xor.b32 %r6213, %r6157, %r6212; + shf.l.wrap.b32 %r6214, %r6213, %r6213, 16; + add.s32 %r6215, %r6214, %r6196; + xor.b32 %r6216, %r6215, %r6186; + shf.l.wrap.b32 %r6217, %r6216, %r6216, 20; + add.s32 %r6218, %r6212, %r6217; + xor.b32 %r6219, %r6218, %r6214; + shf.l.wrap.b32 %r6220, %r6219, %r6219, 24; + add.s32 %r6221, %r6220, %r6215; + xor.b32 %r6222, %r6221, %r6217; + shf.l.wrap.b32 %r6223, %r6222, %r6222, 25; + add.s32 %r6224, %r6181, %r6198; + xor.b32 %r6225, %r6170, %r6224; + shf.l.wrap.b32 %r6226, %r6225, %r6225, 16; + add.s32 %r6227, %r6226, %r6158; + xor.b32 %r6228, %r6227, %r6198; + shf.l.wrap.b32 %r6229, %r6228, %r6228, 20; + add.s32 %r6230, %r6224, %r6229; + xor.b32 %r6231, %r6230, %r6226; + shf.l.wrap.b32 %r6232, %r6231, %r6231, 24; + add.s32 %r6233, %r6232, %r6227; + xor.b32 %r6234, %r6233, %r6229; + shf.l.wrap.b32 %r6235, %r6234, %r6234, 25; + add.s32 %r6236, %r6193, %r6160; + xor.b32 %r6237, %r6236, %r6183; + shf.l.wrap.b32 %r6238, %r6237, %r6237, 16; + add.s32 %r6239, %r6238, %r6171; + xor.b32 %r6240, %r6239, %r6160; + shf.l.wrap.b32 %r6241, %r6240, %r6240, 20; + add.s32 %r6242, %r6236, %r6241; + xor.b32 %r6243, %r6242, %r6238; + shf.l.wrap.b32 %r6244, %r6243, %r6243, 24; + add.s32 %r6245, %r6244, %r6239; + xor.b32 %r6246, %r6245, %r6241; + shf.l.wrap.b32 %r6247, %r6246, %r6246, 25; + add.s32 %r6248, %r6206, %r6042; + add.s32 %r6249, %r6248, %r6247; + xor.b32 %r6250, %r6249, %r6220; + shf.l.wrap.b32 %r6251, %r6250, %r6250, 16; + add.s32 %r6252, %r6251, %r6233; + xor.b32 %r6253, %r6252, %r6247; + shf.l.wrap.b32 %r6254, %r6253, %r6253, 20; + add.s32 %r6255, %r6249, %r6254; + xor.b32 %r6256, %r6255, %r6251; + shf.l.wrap.b32 %r6257, %r6256, %r6256, 24; + add.s32 %r6258, %r6257, %r6252; + xor.b32 %r6259, %r6258, %r6254; + shf.l.wrap.b32 %r6260, %r6259, %r6259, 25; + add.s32 %r6261, %r6218, %r6211; + xor.b32 %r6262, %r6261, %r6232; + shf.l.wrap.b32 %r6263, %r6262, %r6262, 16; + add.s32 %r6264, %r6263, %r6245; + xor.b32 %r6265, %r6264, %r6211; + shf.l.wrap.b32 %r6266, %r6265, %r6265, 20; + add.s32 %r6267, %r6261, %r6266; + xor.b32 %r6268, %r6267, %r6263; + shf.l.wrap.b32 %r6269, %r6268, %r6268, 24; + add.s32 %r6270, %r6269, %r6264; + xor.b32 %r6271, %r6270, %r6266; + shf.l.wrap.b32 %r6272, %r6271, %r6271, 25; + add.s32 %r6273, %r6230, %r6223; + xor.b32 %r6274, %r6244, %r6273; + shf.l.wrap.b32 %r6275, %r6274, %r6274, 16; + add.s32 %r6276, %r6275, %r6209; + xor.b32 %r6277, %r6276, %r6223; + shf.l.wrap.b32 %r6278, %r6277, %r6277, 20; + add.s32 %r6279, %r6273, %r6034; + add.s32 %r6280, %r6279, %r6278; + xor.b32 %r6281, %r6280, %r6275; + shf.l.wrap.b32 %r6282, %r6281, %r6281, 24; + add.s32 %r6283, %r6282, %r6276; + xor.b32 %r6284, %r6283, %r6278; + shf.l.wrap.b32 %r6285, %r6284, %r6284, 25; + add.s32 %r6286, %r6242, %r6235; + xor.b32 %r6287, %r6208, %r6286; + shf.l.wrap.b32 %r6288, %r6287, %r6287, 16; + add.s32 %r6289, %r6288, %r6221; + xor.b32 %r6290, %r6289, %r6235; + shf.l.wrap.b32 %r6291, %r6290, %r6290, 20; + add.s32 %r6292, %r6286, %r6291; + xor.b32 %r6293, %r6292, %r6288; + shf.l.wrap.b32 %r6294, %r6293, %r6293, 24; + add.s32 %r6295, %r6294, %r6289; + xor.b32 %r6296, %r6295, %r6291; + shf.l.wrap.b32 %r6297, %r6296, %r6296, 25; + add.s32 %r6298, %r6255, %r6272; + xor.b32 
%r6299, %r6298, %r6294; + shf.l.wrap.b32 %r6300, %r6299, %r6299, 16; + add.s32 %r6301, %r6300, %r6283; + xor.b32 %r6302, %r6301, %r6272; + shf.l.wrap.b32 %r6303, %r6302, %r6302, 20; + add.s32 %r6304, %r6298, %r6303; + xor.b32 %r6305, %r6304, %r6300; + shf.l.wrap.b32 %r6306, %r6305, %r6305, 24; + add.s32 %r6307, %r6306, %r6301; + xor.b32 %r6308, %r6307, %r6303; + shf.l.wrap.b32 %r6309, %r6308, %r6308, 25; + add.s32 %r6310, %r6285, %r6267; + xor.b32 %r6311, %r6257, %r6310; + shf.l.wrap.b32 %r6312, %r6311, %r6311, 16; + add.s32 %r6313, %r6312, %r6295; + xor.b32 %r6314, %r6313, %r6285; + shf.l.wrap.b32 %r6315, %r6314, %r6314, 20; + add.s32 %r6316, %r6310, %r6018; + add.s32 %r6317, %r6316, %r6315; + xor.b32 %r6318, %r6317, %r6312; + shf.l.wrap.b32 %r6319, %r6318, %r6318, 24; + add.s32 %r6320, %r6319, %r6313; + xor.b32 %r6321, %r6320, %r6315; + shf.l.wrap.b32 %r6322, %r6321, %r6321, 25; + add.s32 %r6323, %r6280, %r6297; + xor.b32 %r6324, %r6269, %r6323; + shf.l.wrap.b32 %r6325, %r6324, %r6324, 16; + add.s32 %r6326, %r6325, %r6258; + xor.b32 %r6327, %r6326, %r6297; + shf.l.wrap.b32 %r6328, %r6327, %r6327, 20; + add.s32 %r6329, %r6323, %r6328; + xor.b32 %r6330, %r6329, %r6325; + shf.l.wrap.b32 %r6331, %r6330, %r6330, 24; + add.s32 %r6332, %r6331, %r6326; + xor.b32 %r6333, %r6332, %r6328; + shf.l.wrap.b32 %r6334, %r6333, %r6333, 25; + add.s32 %r6335, %r6292, %r6260; + xor.b32 %r6336, %r6335, %r6282; + shf.l.wrap.b32 %r6337, %r6336, %r6336, 16; + add.s32 %r6338, %r6337, %r6270; + xor.b32 %r6339, %r6338, %r6260; + shf.l.wrap.b32 %r6340, %r6339, %r6339, 20; + add.s32 %r6341, %r6335, %r6026; + add.s32 %r6342, %r6341, %r6340; + xor.b32 %r6343, %r6342, %r6337; + shf.l.wrap.b32 %r6344, %r6343, %r6343, 24; + add.s32 %r6345, %r6344, %r6338; + xor.b32 %r6346, %r6345, %r6340; + shf.l.wrap.b32 %r6347, %r6346, %r6346, 25; + add.s32 %r6348, %r6304, %r6347; + xor.b32 %r6349, %r6348, %r6319; + shf.l.wrap.b32 %r6350, %r6349, %r6349, 16; + add.s32 %r6351, %r6350, %r6332; + xor.b32 %r6352, %r6351, %r6347; + shf.l.wrap.b32 %r6353, %r6352, %r6352, 20; + add.s32 %r6354, %r6348, %r6353; + xor.b32 %r6355, %r6354, %r6350; + shf.l.wrap.b32 %r6356, %r6355, %r6355, 24; + add.s32 %r6357, %r6356, %r6351; + xor.b32 %r6358, %r6357, %r6353; + shf.l.wrap.b32 %r6359, %r6358, %r6358, 25; + add.s32 %r6360, %r6317, %r6309; + xor.b32 %r6361, %r6360, %r6331; + shf.l.wrap.b32 %r6362, %r6361, %r6361, 16; + add.s32 %r6363, %r6362, %r6345; + xor.b32 %r6364, %r6363, %r6309; + shf.l.wrap.b32 %r6365, %r6364, %r6364, 20; + add.s32 %r6366, %r6360, %r6365; + xor.b32 %r6367, %r6366, %r6362; + shf.l.wrap.b32 %r6368, %r6367, %r6367, 24; + add.s32 %r6369, %r6368, %r6363; + xor.b32 %r6370, %r6369, %r6365; + shf.l.wrap.b32 %r6371, %r6370, %r6370, 25; + add.s32 %r6372, %r6329, %r6322; + xor.b32 %r6373, %r6344, %r6372; + shf.l.wrap.b32 %r6374, %r6373, %r6373, 16; + add.s32 %r6375, %r6374, %r6307; + xor.b32 %r6376, %r6375, %r6322; + shf.l.wrap.b32 %r6377, %r6376, %r6376, 20; + add.s32 %r6378, %r6372, %r6042; + add.s32 %r6379, %r6378, %r6377; + xor.b32 %r6380, %r6379, %r6374; + shf.l.wrap.b32 %r6381, %r6380, %r6380, 24; + add.s32 %r6382, %r6381, %r6375; + xor.b32 %r6383, %r6382, %r6377; + shf.l.wrap.b32 %r6384, %r6383, %r6383, 25; + add.s32 %r6385, %r6342, %r6334; + xor.b32 %r6386, %r6306, %r6385; + shf.l.wrap.b32 %r6387, %r6386, %r6386, 16; + add.s32 %r6388, %r6387, %r6320; + xor.b32 %r6389, %r6388, %r6334; + shf.l.wrap.b32 %r6390, %r6389, %r6389, 20; + add.s32 %r6391, %r6385, %r6390; + xor.b32 %r6392, %r6391, %r6387; + shf.l.wrap.b32 %r6393, %r6392, 
%r6392, 24; + add.s32 %r6394, %r6393, %r6388; + xor.b32 %r6395, %r6394, %r6390; + shf.l.wrap.b32 %r6396, %r6395, %r6395, 25; + add.s32 %r6397, %r6354, %r6371; + xor.b32 %r6398, %r6397, %r6393; + shf.l.wrap.b32 %r6399, %r6398, %r6398, 16; + add.s32 %r6400, %r6399, %r6382; + xor.b32 %r6401, %r6400, %r6371; + shf.l.wrap.b32 %r6402, %r6401, %r6401, 20; + add.s32 %r6403, %r6397, %r6018; + add.s32 %r6404, %r6403, %r6402; + xor.b32 %r6405, %r6404, %r6399; + shf.l.wrap.b32 %r6406, %r6405, %r6405, 24; + add.s32 %r6407, %r6406, %r6400; + xor.b32 %r6408, %r6407, %r6402; + shf.l.wrap.b32 %r6409, %r6408, %r6408, 25; + add.s32 %r6410, %r6384, %r6366; + xor.b32 %r6411, %r6356, %r6410; + shf.l.wrap.b32 %r6412, %r6411, %r6411, 16; + add.s32 %r6413, %r6412, %r6394; + xor.b32 %r6414, %r6413, %r6384; + shf.l.wrap.b32 %r6415, %r6414, %r6414, 20; + add.s32 %r6416, %r6410, %r6034; + add.s32 %r6417, %r6416, %r6415; + xor.b32 %r6418, %r6417, %r6412; + shf.l.wrap.b32 %r6419, %r6418, %r6418, 24; + add.s32 %r6420, %r6419, %r6413; + xor.b32 %r6421, %r6420, %r6415; + shf.l.wrap.b32 %r6422, %r6421, %r6421, 25; + add.s32 %r6423, %r6379, %r6396; + xor.b32 %r6424, %r6368, %r6423; + shf.l.wrap.b32 %r6425, %r6424, %r6424, 16; + add.s32 %r6426, %r6425, %r6357; + xor.b32 %r6427, %r6426, %r6396; + shf.l.wrap.b32 %r6428, %r6427, %r6427, 20; + add.s32 %r6429, %r6423, %r6428; + xor.b32 %r6430, %r6429, %r6425; + shf.l.wrap.b32 %r6431, %r6430, %r6430, 24; + add.s32 %r6432, %r6431, %r6426; + xor.b32 %r6433, %r6432, %r6428; + shf.l.wrap.b32 %r6434, %r6433, %r6433, 25; + add.s32 %r6435, %r6391, %r6026; + add.s32 %r6436, %r6435, %r6359; + xor.b32 %r6437, %r6436, %r6381; + shf.l.wrap.b32 %r6438, %r6437, %r6437, 16; + add.s32 %r6439, %r6438, %r6369; + xor.b32 %r6440, %r6439, %r6359; + shf.l.wrap.b32 %r6441, %r6440, %r6440, 20; + add.s32 %r6442, %r6436, %r6441; + xor.b32 %r6443, %r6442, %r6438; + shf.l.wrap.b32 %r6444, %r6443, %r6443, 24; + add.s32 %r6445, %r6444, %r6439; + xor.b32 %r6446, %r6445, %r6441; + shf.l.wrap.b32 %r6447, %r6446, %r6446, 25; + add.s32 %r6448, %r6404, %r6447; + xor.b32 %r6449, %r6448, %r6419; + shf.l.wrap.b32 %r6450, %r6449, %r6449, 16; + add.s32 %r6451, %r6450, %r6432; + xor.b32 %r6452, %r6451, %r6447; + shf.l.wrap.b32 %r6453, %r6452, %r6452, 20; + add.s32 %r6454, %r6448, %r6453; + xor.b32 %r6455, %r6454, %r6450; + shf.l.wrap.b32 %r6456, %r6455, %r6455, 24; + add.s32 %r6457, %r6456, %r6451; + xor.b32 %r6458, %r6457, %r6453; + shf.l.wrap.b32 %r6459, %r6458, %r6458, 25; + add.s32 %r6460, %r6417, %r6409; + xor.b32 %r6461, %r6460, %r6431; + shf.l.wrap.b32 %r6462, %r6461, %r6461, 16; + add.s32 %r6463, %r6462, %r6445; + xor.b32 %r6464, %r6463, %r6409; + shf.l.wrap.b32 %r6465, %r6464, %r6464, 20; + add.s32 %r6466, %r6460, %r6465; + xor.b32 %r6467, %r6466, %r6462; + shf.l.wrap.b32 %r6468, %r6467, %r6467, 24; + add.s32 %r6469, %r6468, %r6463; + xor.b32 %r6470, %r6469, %r6465; + shf.l.wrap.b32 %r6471, %r6470, %r6470, 25; + add.s32 %r6472, %r6429, %r6422; + xor.b32 %r6473, %r6444, %r6472; + shf.l.wrap.b32 %r6474, %r6473, %r6473, 16; + add.s32 %r6475, %r6474, %r6407; + xor.b32 %r6476, %r6475, %r6422; + shf.l.wrap.b32 %r6477, %r6476, %r6476, 20; + add.s32 %r6478, %r6472, %r6477; + xor.b32 %r6479, %r6478, %r6474; + shf.l.wrap.b32 %r6480, %r6479, %r6479, 24; + add.s32 %r6481, %r6480, %r6475; + xor.b32 %r6482, %r6481, %r6477; + shf.l.wrap.b32 %r6483, %r6482, %r6482, 25; + add.s32 %r6484, %r6442, %r6434; + xor.b32 %r6485, %r6406, %r6484; + shf.l.wrap.b32 %r6486, %r6485, %r6485, 16; + add.s32 %r6487, %r6486, %r6420; + xor.b32 
%r6488, %r6487, %r6434; + shf.l.wrap.b32 %r6489, %r6488, %r6488, 20; + add.s32 %r6490, %r6484, %r6489; + xor.b32 %r6491, %r6490, %r6486; + shf.l.wrap.b32 %r6492, %r6491, %r6491, 24; + add.s32 %r6493, %r6492, %r6487; + xor.b32 %r6494, %r6493, %r6489; + shf.l.wrap.b32 %r6495, %r6494, %r6494, 25; + add.s32 %r6496, %r6454, %r6471; + xor.b32 %r6497, %r6496, %r6492; + shf.l.wrap.b32 %r6498, %r6497, %r6497, 16; + add.s32 %r6499, %r6498, %r6481; + xor.b32 %r6500, %r6499, %r6471; + shf.l.wrap.b32 %r6501, %r6500, %r6500, 20; + add.s32 %r6502, %r6496, %r6034; + add.s32 %r6503, %r6502, %r6501; + xor.b32 %r6504, %r6503, %r6498; + shf.l.wrap.b32 %r6505, %r6504, %r6504, 24; + add.s32 %r6506, %r6505, %r6499; + xor.b32 %r6507, %r6506, %r6501; + shf.l.wrap.b32 %r6508, %r6507, %r6507, 25; + add.s32 %r6509, %r6483, %r6466; + xor.b32 %r6510, %r6456, %r6509; + shf.l.wrap.b32 %r6511, %r6510, %r6510, 16; + add.s32 %r6512, %r6511, %r6493; + xor.b32 %r6513, %r6512, %r6483; + shf.l.wrap.b32 %r6514, %r6513, %r6513, 20; + add.s32 %r6515, %r6509, %r6042; + add.s32 %r6516, %r6515, %r6514; + xor.b32 %r6517, %r6516, %r6511; + shf.l.wrap.b32 %r6518, %r6517, %r6517, 24; + add.s32 %r6519, %r6518, %r6512; + xor.b32 %r6520, %r6519, %r6514; + shf.l.wrap.b32 %r6521, %r6520, %r6520, 25; + add.s32 %r6522, %r6478, %r6018; + add.s32 %r6523, %r6522, %r6495; + xor.b32 %r6524, %r6468, %r6523; + shf.l.wrap.b32 %r6525, %r6524, %r6524, 16; + add.s32 %r6526, %r6525, %r6457; + xor.b32 %r6527, %r6526, %r6495; + shf.l.wrap.b32 %r6528, %r6527, %r6527, 20; + add.s32 %r6529, %r6523, %r6026; + add.s32 %r6530, %r6529, %r6528; + xor.b32 %r6531, %r6530, %r6525; + shf.l.wrap.b32 %r6532, %r6531, %r6531, 24; + add.s32 %r6533, %r6532, %r6526; + xor.b32 %r6534, %r6533, %r6528; + shf.l.wrap.b32 %r6535, %r6534, %r6534, 25; + add.s32 %r6536, %r6490, %r6459; + xor.b32 %r6537, %r6536, %r6480; + shf.l.wrap.b32 %r6538, %r6537, %r6537, 16; + add.s32 %r6539, %r6538, %r6469; + xor.b32 %r6540, %r6539, %r6459; + shf.l.wrap.b32 %r6541, %r6540, %r6540, 20; + add.s32 %r6542, %r6536, %r6541; + xor.b32 %r6543, %r6542, %r6538; + shf.l.wrap.b32 %r6544, %r6543, %r6543, 24; + add.s32 %r6545, %r6544, %r6539; + xor.b32 %r6546, %r6545, %r6541; + shf.l.wrap.b32 %r6547, %r6546, %r6546, 25; + add.s32 %r6548, %r6503, %r6547; + xor.b32 %r6549, %r6548, %r6518; + shf.l.wrap.b32 %r6550, %r6549, %r6549, 16; + add.s32 %r6551, %r6550, %r6533; + xor.b32 %r6552, %r6551, %r6547; + shf.l.wrap.b32 %r6553, %r6552, %r6552, 20; + add.s32 %r6554, %r6548, %r6553; + xor.b32 %r6555, %r6554, %r6550; + shf.l.wrap.b32 %r6556, %r6555, %r6555, 24; + add.s32 %r6557, %r6556, %r6551; + xor.b32 %r6558, %r6557, %r6553; + shf.l.wrap.b32 %r6559, %r6558, %r6558, 25; + add.s32 %r6560, %r6516, %r6508; + xor.b32 %r6561, %r6560, %r6532; + shf.l.wrap.b32 %r6562, %r6561, %r6561, 16; + add.s32 %r6563, %r6562, %r6545; + xor.b32 %r6564, %r6563, %r6508; + shf.l.wrap.b32 %r6565, %r6564, %r6564, 20; + add.s32 %r6566, %r6560, %r6565; + xor.b32 %r6567, %r6566, %r6562; + shf.l.wrap.b32 %r6568, %r6567, %r6567, 24; + add.s32 %r6569, %r6568, %r6563; + xor.b32 %r6570, %r6569, %r6565; + shf.l.wrap.b32 %r6571, %r6570, %r6570, 25; + add.s32 %r6572, %r6530, %r6521; + xor.b32 %r6573, %r6544, %r6572; + shf.l.wrap.b32 %r6574, %r6573, %r6573, 16; + add.s32 %r6575, %r6574, %r6506; + xor.b32 %r6576, %r6575, %r6521; + shf.l.wrap.b32 %r6577, %r6576, %r6576, 20; + add.s32 %r6578, %r6572, %r6577; + xor.b32 %r6579, %r6578, %r6574; + shf.l.wrap.b32 %r6580, %r6579, %r6579, 24; + add.s32 %r6581, %r6580, %r6575; + xor.b32 %r6582, %r6581, %r6577; + 
shf.l.wrap.b32 %r6583, %r6582, %r6582, 25; + add.s32 %r6584, %r6542, %r6535; + xor.b32 %r6585, %r6505, %r6584; + shf.l.wrap.b32 %r6586, %r6585, %r6585, 16; + add.s32 %r6587, %r6586, %r6519; + xor.b32 %r6588, %r6587, %r6535; + shf.l.wrap.b32 %r6589, %r6588, %r6588, 20; + add.s32 %r6590, %r6584, %r6026; + add.s32 %r6591, %r6590, %r6589; + xor.b32 %r6592, %r6591, %r6586; + shf.l.wrap.b32 %r6593, %r6592, %r6592, 24; + add.s32 %r6594, %r6593, %r6587; + xor.b32 %r6595, %r6594, %r6589; + shf.l.wrap.b32 %r6596, %r6595, %r6595, 25; + add.s32 %r6597, %r6554, %r6571; + xor.b32 %r6598, %r6597, %r6593; + shf.l.wrap.b32 %r6599, %r6598, %r6598, 16; + add.s32 %r6600, %r6599, %r6581; + xor.b32 %r6601, %r6600, %r6571; + shf.l.wrap.b32 %r6602, %r6601, %r6601, 20; + add.s32 %r6603, %r6597, %r6042; + add.s32 %r6604, %r6603, %r6602; + xor.b32 %r6605, %r6604, %r6599; + shf.l.wrap.b32 %r6606, %r6605, %r6605, 24; + add.s32 %r6607, %r6606, %r6600; + xor.b32 %r6608, %r6607, %r6602; + shf.l.wrap.b32 %r6609, %r6608, %r6608, 25; + add.s32 %r6610, %r6583, %r6018; + add.s32 %r6611, %r6610, %r6566; + xor.b32 %r6612, %r6556, %r6611; + shf.l.wrap.b32 %r6613, %r6612, %r6612, 16; + add.s32 %r6614, %r6613, %r6594; + xor.b32 %r6615, %r6614, %r6583; + shf.l.wrap.b32 %r6616, %r6615, %r6615, 20; + add.s32 %r6617, %r6611, %r6616; + xor.b32 %r6618, %r6617, %r6613; + shf.l.wrap.b32 %r6619, %r6618, %r6618, 24; + add.s32 %r6620, %r6619, %r6614; + xor.b32 %r6621, %r6620, %r6616; + shf.l.wrap.b32 %r6622, %r6621, %r6621, 25; + add.s32 %r6623, %r6578, %r6034; + add.s32 %r6624, %r6623, %r6596; + xor.b32 %r6625, %r6568, %r6624; + shf.l.wrap.b32 %r6626, %r6625, %r6625, 16; + add.s32 %r6627, %r6626, %r6557; + xor.b32 %r6628, %r6627, %r6596; + shf.l.wrap.b32 %r6629, %r6628, %r6628, 20; + add.s32 %r6630, %r6624, %r6629; + xor.b32 %r6631, %r6630, %r6626; + shf.l.wrap.b32 %r6632, %r6631, %r6631, 24; + add.s32 %r6633, %r6632, %r6627; + xor.b32 %r6634, %r6633, %r6629; + shf.l.wrap.b32 %r6635, %r6634, %r6634, 25; + add.s32 %r6636, %r6591, %r6559; + xor.b32 %r6637, %r6636, %r6580; + shf.l.wrap.b32 %r6638, %r6637, %r6637, 16; + add.s32 %r6639, %r6638, %r6569; + xor.b32 %r6640, %r6639, %r6559; + shf.l.wrap.b32 %r6641, %r6640, %r6640, 20; + add.s32 %r6642, %r6636, %r6641; + xor.b32 %r6643, %r6642, %r6638; + shf.l.wrap.b32 %r6644, %r6643, %r6643, 24; + add.s32 %r6645, %r6644, %r6639; + xor.b32 %r6646, %r6645, %r6641; + shf.l.wrap.b32 %r6647, %r6646, %r6646, 25; + add.s32 %r6648, %r6604, %r6647; + xor.b32 %r6649, %r6648, %r6619; + shf.l.wrap.b32 %r6650, %r6649, %r6649, 16; + add.s32 %r6651, %r6650, %r6633; + xor.b32 %r6652, %r6651, %r6647; + shf.l.wrap.b32 %r6653, %r6652, %r6652, 20; + add.s32 %r6654, %r6648, %r6653; + xor.b32 %r6655, %r6654, %r6650; + shf.l.wrap.b32 %r6656, %r6655, %r6655, 24; + add.s32 %r6657, %r6656, %r6651; + xor.b32 %r6658, %r6657, %r6653; + shf.l.wrap.b32 %r6659, %r6658, %r6658, 25; + add.s32 %r6660, %r6617, %r6609; + xor.b32 %r6661, %r6660, %r6632; + shf.l.wrap.b32 %r6662, %r6661, %r6661, 16; + add.s32 %r6663, %r6662, %r6645; + xor.b32 %r6664, %r6663, %r6609; + shf.l.wrap.b32 %r6665, %r6664, %r6664, 20; + add.s32 %r6666, %r6660, %r6018; + add.s32 %r6667, %r6666, %r6665; + xor.b32 %r6668, %r6667, %r6662; + shf.l.wrap.b32 %r6669, %r6668, %r6668, 24; + add.s32 %r6670, %r6669, %r6663; + xor.b32 %r6671, %r6670, %r6665; + shf.l.wrap.b32 %r6672, %r6671, %r6671, 25; + add.s32 %r6673, %r6630, %r6026; + add.s32 %r6674, %r6673, %r6622; + xor.b32 %r6675, %r6644, %r6674; + shf.l.wrap.b32 %r6676, %r6675, %r6675, 16; + add.s32 %r6677, %r6676, 
%r6607; + xor.b32 %r6678, %r6677, %r6622; + shf.l.wrap.b32 %r6679, %r6678, %r6678, 20; + add.s32 %r6680, %r6674, %r6679; + xor.b32 %r6681, %r6680, %r6676; + shf.l.wrap.b32 %r6682, %r6681, %r6681, 24; + add.s32 %r6683, %r6682, %r6677; + xor.b32 %r6684, %r6683, %r6679; + shf.l.wrap.b32 %r6685, %r6684, %r6684, 25; + add.s32 %r6686, %r6642, %r6635; + xor.b32 %r6687, %r6606, %r6686; + shf.l.wrap.b32 %r6688, %r6687, %r6687, 16; + add.s32 %r6689, %r6688, %r6620; + xor.b32 %r6690, %r6689, %r6635; + shf.l.wrap.b32 %r6691, %r6690, %r6690, 20; + add.s32 %r6692, %r6686, %r6691; + xor.b32 %r6693, %r6692, %r6688; + shf.l.wrap.b32 %r6694, %r6693, %r6693, 24; + add.s32 %r6695, %r6694, %r6689; + xor.b32 %r6696, %r6695, %r6691; + shf.l.wrap.b32 %r6697, %r6696, %r6696, 25; + add.s32 %r6698, %r6654, %r6672; + xor.b32 %r6699, %r6698, %r6694; + shf.l.wrap.b32 %r6700, %r6699, %r6699, 16; + add.s32 %r6701, %r6700, %r6683; + xor.b32 %r6702, %r6701, %r6672; + shf.l.wrap.b32 %r6703, %r6702, %r6702, 20; + add.s32 %r6704, %r6698, %r6703; + xor.b32 %r6705, %r6704, %r6700; + shf.l.wrap.b32 %r6706, %r6705, %r6705, 24; + add.s32 %r6707, %r6706, %r6701; + xor.b32 %r6708, %r6707, %r6703; + shf.l.wrap.b32 %r6709, %r6708, %r6708, 25; + add.s32 %r6710, %r6685, %r6034; + add.s32 %r6711, %r6710, %r6667; + xor.b32 %r6712, %r6656, %r6711; + shf.l.wrap.b32 %r6713, %r6712, %r6712, 16; + add.s32 %r6714, %r6713, %r6695; + xor.b32 %r6715, %r6714, %r6685; + shf.l.wrap.b32 %r6716, %r6715, %r6715, 20; + add.s32 %r6717, %r6711, %r6716; + xor.b32 %r6718, %r6717, %r6713; + shf.l.wrap.b32 %r6719, %r6718, %r6718, 24; + add.s32 %r6720, %r6719, %r6714; + xor.b32 %r6721, %r6720, %r6716; + shf.l.wrap.b32 %r6722, %r6721, %r6721, 25; + add.s32 %r6723, %r6680, %r6042; + add.s32 %r6724, %r6723, %r6697; + xor.b32 %r6725, %r6669, %r6724; + shf.l.wrap.b32 %r6726, %r6725, %r6725, 16; + add.s32 %r6727, %r6726, %r6657; + xor.b32 %r6728, %r6727, %r6697; + shf.l.wrap.b32 %r6729, %r6728, %r6728, 20; + add.s32 %r6730, %r6724, %r6729; + xor.b32 %r6731, %r6730, %r6726; + shf.l.wrap.b32 %r6732, %r6731, %r6731, 24; + add.s32 %r6733, %r6732, %r6727; + xor.b32 %r6734, %r6733, %r6729; + shf.l.wrap.b32 %r6735, %r6734, %r6734, 25; + add.s32 %r6736, %r6692, %r6659; + xor.b32 %r6737, %r6736, %r6682; + shf.l.wrap.b32 %r6738, %r6737, %r6737, 16; + add.s32 %r6739, %r6738, %r6670; + xor.b32 %r6740, %r6739, %r6659; + shf.l.wrap.b32 %r6741, %r6740, %r6740, 20; + add.s32 %r6742, %r6736, %r6741; + xor.b32 %r6743, %r6742, %r6738; + shf.l.wrap.b32 %r6744, %r6743, %r6743, 24; + add.s32 %r6745, %r6744, %r6739; + xor.b32 %r6746, %r6745, %r6741; + shf.l.wrap.b32 %r6747, %r6746, %r6746, 25; + xor.b32 %r6748, %r6704, %r6733; + cvt.u64.u32 %rd418, %r6748; + xor.b32 %r6749, %r6745, %r6717; + and.b32 %r6750, %r6749, 255; + cvt.u64.u32 %rd419, %r6750; + bfi.b64 %rd420, %rd419, %rd418, 32, 32; + cvt.u64.u32 %rd421, %r6749; + shl.b64 %rd422, %rd421, 32; + and.b64 %rd423, %rd422, 280375465082880; + or.b64 %rd424, %rd420, %rd423; + and.b64 %rd425, %rd422, 71776119061217280; + shr.u32 %r6751, %r6749, 24; + cvt.u64.u32 %rd426, %r6751; + shl.b64 %rd427, %rd426, 56; + or.b64 %rd428, %rd424, %rd425; + or.b64 %rd22, %rd428, %rd427; + xor.b32 %r6752, %r6707, %r6730; + cvt.u64.u32 %rd429, %r6752; + xor.b32 %r6753, %r6742, %r6720; + and.b32 %r6754, %r6753, 255; + cvt.u64.u32 %rd430, %r6754; + bfi.b64 %rd431, %rd430, %rd429, 32, 32; + cvt.u64.u32 %rd432, %r6753; + shl.b64 %rd433, %rd432, 32; + and.b64 %rd434, %rd433, 280375465082880; + or.b64 %rd435, %rd431, %rd434; + and.b64 %rd436, %rd433, 
71776119061217280; + shr.u32 %r6755, %r6753, 24; + cvt.u64.u32 %rd437, %r6755; + shl.b64 %rd438, %rd437, 56; + or.b64 %rd439, %rd435, %rd436; + or.b64 %rd23, %rd439, %rd438; + xor.b32 %r6756, %r6747, %r6719; + cvt.u64.u32 %rd440, %r6756; + xor.b32 %r6757, %r6709, %r6732; + and.b32 %r6758, %r6757, 255; + cvt.u64.u32 %rd441, %r6758; + bfi.b64 %rd442, %rd441, %rd440, 32, 32; + cvt.u64.u32 %rd443, %r6757; + shl.b64 %rd444, %rd443, 32; + and.b64 %rd445, %rd444, 280375465082880; + or.b64 %rd446, %rd442, %rd445; + and.b64 %rd447, %rd444, 71776119061217280; + shr.u32 %r6759, %r6757, 24; + cvt.u64.u32 %rd448, %r6759; + shl.b64 %rd449, %rd448, 56; + or.b64 %rd450, %rd446, %rd447; + or.b64 %rd24, %rd450, %rd449; + xor.b32 %r6760, %r6744, %r6722; + cvt.u64.u32 %rd451, %r6760; + xor.b32 %r6761, %r6706, %r6735; + and.b32 %r6762, %r6761, 255; + cvt.u64.u32 %rd452, %r6762; + bfi.b64 %rd453, %rd452, %rd451, 32, 32; + cvt.u64.u32 %rd454, %r6761; + shl.b64 %rd455, %rd454, 32; + and.b64 %rd456, %rd455, 280375465082880; + or.b64 %rd457, %rd453, %rd456; + and.b64 %rd458, %rd455, 71776119061217280; + shr.u32 %r6763, %r6761, 24; + cvt.u64.u32 %rd459, %r6763; + shl.b64 %rd460, %rd459, 56; + or.b64 %rd461, %rd457, %rd458; + or.b64 %rd25, %rd461, %rd460; + add.u64 %rd1248, %SPL, 2016; + mov.u32 %r29538, 0; + st.local.v4.u32 [%rd1248+32], {%r29538, %r29538, %r29538, %r29538}; + st.local.v4.u32 [%rd1248+48], {%r29538, %r29538, %r29538, %r29538}; + st.local.v4.u32 [%rd1248+64], {%r29538, %r29538, %r29538, %r29538}; + and.b64 %rd26, %rd22, -256; + st.local.v2.u64 [%rd1248], {%rd22, %rd23}; + st.local.v2.u64 [%rd1248+16], {%rd24, %rd25}; + and.b64 %rd1260, %rd22, 255; + +$L__BB2_19: + mov.b64 {%r20, %r19}, %rd1260; + mul.wide.u32 %rd463, %r20, 1908875315; + shr.u64 %rd464, %rd463, 56; + cvt.u32.u64 %r6765, %rd464; + mul.lo.s32 %r6766, %r6765, 37748717; + sub.s32 %r21, %r20, %r6766; + mov.b64 {%r24, %r23}, %rd1258; + mul.wide.u32 %rd465, %r24, 1908875315; + shr.u64 %rd466, %rd465, 56; + cvt.u32.u64 %r6767, %rd466; + mul.lo.s32 %r6768, %r6767, 37748717; + sub.s32 %r25, %r24, %r6768; + mov.b64 {%r28, %r27}, %rd1257; + mul.wide.u32 %rd467, %r28, 1908875315; + shr.u64 %rd468, %rd467, 56; + cvt.u32.u64 %r6769, %rd468; + mul.lo.s32 %r6770, %r6769, 37748717; + sub.s32 %r29, %r28, %r6770; + shl.b32 %r30, %r21, 1; + mul.wide.u32 %rd469, %r30, -954391867; + shr.u64 %rd470, %rd469, 32; + cvt.u32.u64 %r6771, %rd470; + sub.s32 %r6772, %r30, %r6771; + shr.u32 %r6773, %r6772, 1; + add.s32 %r6774, %r6773, %r6771; + shr.u32 %r6775, %r6774, 20; + mul.lo.s32 %r6776, %r6775, 1179641; + sub.s32 %r6777, %r30, %r6776; + cvta.to.global.u64 %rd471, %rd354; + mul.wide.u32 %rd472, %r6777, 64; + add.s64 %rd34, %rd471, %rd472; + or.b32 %r31, %r30, 1; + mul.wide.u32 %rd473, %r31, -954391867; + shr.u64 %rd474, %rd473, 32; + cvt.u32.u64 %r6778, %rd474; + sub.s32 %r6779, %r31, %r6778; + shr.u32 %r6780, %r6779, 1; + add.s32 %r6781, %r6780, %r6778; + shr.u32 %r6782, %r6781, 20; + mul.lo.s32 %r6783, %r6782, 1179641; + sub.s32 %r6784, %r31, %r6783; + mul.wide.u32 %rd475, %r6784, 64; + add.s64 %rd35, %rd471, %rd475; + setp.eq.s64 %p16, %rd353, 0; + @%p16 bra $L__BB2_33; + + cvta.to.global.u64 %rd476, %rd353; + mul.wide.u32 %rd477, %r21, 128; + add.s64 %rd36, %rd476, %rd477; + ld.global.u64 %rd1261, [%rd36]; + setp.eq.s64 %p17, %rd1261, 0; + @%p17 bra $L__BB2_22; + + ld.global.u64 %rd1264, [%rd36+32]; + ld.global.u64 %rd1263, [%rd36+16]; + ld.global.u64 %rd1262, [%rd36+8]; + bra.uni $L__BB2_44; + +$L__BB2_33: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd579, 
1179641; + st.local.u64 [%rd3+8], %rd579; + st.local.u32 [%rd3+16], %r30; + ld.global.u64 %rd580, [%rd34]; + ld.global.u64 %rd581, [%rd34+8]; + ld.global.u64 %rd582, [%rd34+16]; + ld.global.u64 %rd583, [%rd34+24]; + ld.global.u64 %rd584, [%rd34+32]; + ld.global.u64 %rd585, [%rd34+40]; + ld.global.u64 %rd586, [%rd34+48]; + ld.global.u64 %rd587, [%rd34+56]; + st.local.u64 [%rd3+24], %rd580; + st.local.u64 [%rd3+32], %rd581; + st.local.u64 [%rd3+40], %rd582; + st.local.u64 [%rd3+48], %rd583; + st.local.u64 [%rd3+56], %rd584; + st.local.u64 [%rd3+64], %rd585; + st.local.u64 [%rd3+72], %rd586; + st.local.u64 [%rd3+80], %rd587; + cvt.u32.u64 %r10110, %rd580; + xor.b32 %r10111, %r30, %r10110; + st.local.u32 [%rd3+24], %r10111; + mov.u32 %r29776, 0; + st.local.v2.u32 [%rd3+96], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+104], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+112], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+120], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+128], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+136], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+144], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+152], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+160], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+168], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+176], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+184], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+192], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+200], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+208], {%r29776, %r29776}; + st.local.v2.u32 [%rd3+216], {%r29776, %r29776}; + mov.u32 %r29791, -2147483648; + mov.u32 %r10083, 1; + st.local.v2.u32 [%rd3+88], {%r10083, %r29791}; + ld.local.v2.u32 {%r29812, %r29813}, [%rd3+24]; + mov.b64 {%r29810, %r29811}, %rd585; + shr.u64 %rd588, %rd581, 32; + cvt.u32.u64 %r29824, %rd581; + cvt.u32.u64 %r29825, %rd588; + shr.u64 %rd589, %rd586, 32; + cvt.u32.u64 %r29822, %rd586; + cvt.u32.u64 %r29823, %rd589; + shr.u64 %rd590, %rd582, 32; + cvt.u32.u64 %r29820, %rd582; + cvt.u32.u64 %r29821, %rd590; + shr.u64 %rd591, %rd587, 32; + cvt.u32.u64 %r29818, %rd587; + cvt.u32.u64 %r29819, %rd591; + shr.u64 %rd592, %rd583, 32; + cvt.u32.u64 %r29816, %rd583; + cvt.u32.u64 %r29817, %rd592; + shr.u64 %rd593, %rd584, 32; + cvt.u32.u64 %r29814, %rd584; + cvt.u32.u64 %r29815, %rd593; + mov.u32 %r29777, %r29776; + mov.u32 %r29778, %r29776; + mov.u32 %r29779, %r29776; + mov.u32 %r29780, %r29776; + mov.u32 %r29781, %r29776; + mov.u32 %r29782, %r29776; + mov.u32 %r29783, %r29776; + mov.u32 %r29784, %r29776; + mov.u32 %r29785, %r29776; + mov.u32 %r29786, %r29776; + mov.u32 %r29787, %r29776; + mov.u32 %r29788, %r29776; + mov.u32 %r29789, %r29776; + mov.u32 %r29790, %r10083; + mov.u32 %r29792, %r29776; + mov.u32 %r29793, %r29776; + mov.u32 %r29794, %r29776; + mov.u32 %r29795, %r29776; + mov.u32 %r29796, %r29776; + mov.u32 %r29797, %r29776; + mov.u32 %r29798, %r29776; + mov.u32 %r29799, %r29776; + mov.u32 %r29800, %r29776; + mov.u32 %r29801, %r29776; + mov.u32 %r29802, %r29776; + mov.u32 %r29803, %r29776; + mov.u32 %r29804, %r29776; + mov.u32 %r29805, %r29776; + mov.u32 %r29806, %r29776; + mov.u32 %r29807, %r29776; + mov.u32 %r29808, %r29776; + mov.u32 %r29809, %r29776; + mov.u32 %r29826, %r29776; + +$L__BB2_34: + // begin inline asm + // xor5 + lop3.b32 %r10114, %r29812, %r29810, %r29808, 0x96; + lop3.b32 %r10114, %r10114, %r29806, %r29804, 0x96; + lop3.b32 %r10115, %r29813, %r29811, %r29809, 0x96; + lop3.b32 %r10115, %r10115, %r29807, %r29805, 0x96; + // end inline asm // begin inline asm - dp4a.u32.u32 %r1712, %r1713, %r5746, 
%r6244; + // xor5 + lop3.b32 %r10126, %r29824, %r29822, %r29802, 0x96; + lop3.b32 %r10126, %r10126, %r29800, %r29798, 0x96; + lop3.b32 %r10127, %r29825, %r29823, %r29803, 0x96; + lop3.b32 %r10127, %r10127, %r29801, %r29799, 0x96; // end inline asm - ld.const.u32 %r1717, [matrix+4]; // begin inline asm - dp4a.u32.u32 %r1716, %r1717, %r5750, %r1712; + // xor5 + lop3.b32 %r10138, %r29820, %r29818, %r29796, 0x96; + lop3.b32 %r10138, %r10138, %r29794, %r29792, 0x96; + lop3.b32 %r10139, %r29821, %r29819, %r29797, 0x96; + lop3.b32 %r10139, %r10139, %r29795, %r29793, 0x96; // end inline asm - ld.const.u32 %r1721, [matrix+8]; // begin inline asm - dp4a.u32.u32 %r1720, %r1721, %r5754, %r1716; + // xor5 + lop3.b32 %r10150, %r29816, %r29790, %r29788, 0x96; + lop3.b32 %r10150, %r10150, %r29786, %r29784, 0x96; + lop3.b32 %r10151, %r29817, %r29791, %r29789, 0x96; + lop3.b32 %r10151, %r10151, %r29787, %r29785, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10162, %r29814, %r29782, %r29780, 0x96; + lop3.b32 %r10162, %r10162, %r29778, %r29776, 0x96; + lop3.b32 %r10163, %r29815, %r29783, %r29781, 0x96; + lop3.b32 %r10163, %r10163, %r29779, %r29777, 0x96; // end inline asm - ld.const.u32 %r1725, [matrix+12]; // begin inline asm - dp4a.u32.u32 %r1724, %r1725, %r5758, %r1720; + shf.l.wrap.b32 %r10174, %r10127, %r10126, %r10083; // end inline asm - ld.const.u32 %r1729, [matrix+16]; // begin inline asm - dp4a.u32.u32 %r1728, %r1729, %r5762, %r1724; + shf.l.wrap.b32 %r10178, %r10126, %r10127, %r10083; // end inline asm - ld.const.u32 %r1733, [matrix+20]; + xor.b32 %r10608, %r10174, %r10162; + xor.b32 %r10609, %r10178, %r10163; + xor.b32 %r10441, %r29812, %r10608; + xor.b32 %r10444, %r29813, %r10609; + xor.b32 %r10348, %r29810, %r10608; + xor.b32 %r10347, %r29811, %r10609; + xor.b32 %r10395, %r29808, %r10608; + xor.b32 %r10396, %r29809, %r10609; + xor.b32 %r10300, %r29806, %r10608; + xor.b32 %r10299, %r29807, %r10609; + xor.b32 %r10251, %r29804, %r10608; + xor.b32 %r10252, %r29805, %r10609; // begin inline asm - dp4a.u32.u32 %r1732, %r1733, %r5766, %r1728; + shf.l.wrap.b32 %r10182, %r10139, %r10138, %r10083; // end inline asm - ld.const.u32 %r1737, [matrix+24]; // begin inline asm - dp4a.u32.u32 %r1736, %r1737, %r5770, %r1732; + shf.l.wrap.b32 %r10186, %r10138, %r10139, %r10083; // end inline asm - ld.const.u32 %r1741, [matrix+28]; + xor.b32 %r10610, %r10182, %r10114; + xor.b32 %r10611, %r10186, %r10115; + xor.b32 %r10403, %r29824, %r10610; + xor.b32 %r10404, %r29825, %r10611; + xor.b32 %r10220, %r29822, %r10610; + xor.b32 %r10219, %r29823, %r10611; + xor.b32 %r10379, %r29802, %r10610; + xor.b32 %r10380, %r29803, %r10611; + xor.b32 %r10340, %r29800, %r10610; + xor.b32 %r10339, %r29801, %r10611; + xor.b32 %r10323, %r29798, %r10610; + xor.b32 %r10324, %r29799, %r10611; // begin inline asm - dp4a.u32.u32 %r1740, %r1741, %r5774, %r1736; + shf.l.wrap.b32 %r10190, %r10151, %r10150, %r10083; // end inline asm - ld.const.u32 %r1745, [matrix+32]; // begin inline asm - dp4a.u32.u32 %r1744, %r1745, %r5778, %r1740; + shf.l.wrap.b32 %r10194, %r10150, %r10151, %r10083; // end inline asm - ld.const.u32 %r1749, [matrix+36]; + xor.b32 %r10612, %r10190, %r10126; + xor.b32 %r10613, %r10194, %r10127; + xor.b32 %r10260, %r29820, %r10612; + xor.b32 %r10259, %r29821, %r10613; + xor.b32 %r10387, %r29818, %r10612; + xor.b32 %r10388, %r29819, %r10613; + xor.b32 %r10268, %r29796, %r10612; + xor.b32 %r10267, %r29797, %r10613; + xor.b32 %r10371, %r29794, %r10612; + xor.b32 %r10372, %r29795, %r10613; + xor.b32 %r10236, 
%r29792, %r10612; + xor.b32 %r10235, %r29793, %r10613; // begin inline asm - dp4a.u32.u32 %r1748, %r1749, %r5782, %r1744; + shf.l.wrap.b32 %r10198, %r10163, %r10162, %r10083; // end inline asm - ld.const.u32 %r1753, [matrix+40]; // begin inline asm - dp4a.u32.u32 %r1752, %r1753, %r5786, %r1748; + shf.l.wrap.b32 %r10202, %r10162, %r10163, %r10083; // end inline asm - ld.const.u32 %r1757, [matrix+44]; + xor.b32 %r10614, %r10198, %r10138; + xor.b32 %r10615, %r10202, %r10139; + xor.b32 %r10355, %r29816, %r10614; + xor.b32 %r10356, %r29817, %r10615; + xor.b32 %r10332, %r29790, %r10614; + xor.b32 %r10331, %r29791, %r10615; + xor.b32 %r10275, %r29788, %r10614; + xor.b32 %r10276, %r29789, %r10615; + xor.b32 %r10363, %r29786, %r10614; + xor.b32 %r10364, %r29787, %r10615; + xor.b32 %r10292, %r29784, %r10614; + xor.b32 %r10291, %r29785, %r10615; // begin inline asm - dp4a.u32.u32 %r1756, %r1757, %r5790, %r1752; + shf.l.wrap.b32 %r10206, %r10115, %r10114, %r10083; // end inline asm - ld.const.u32 %r1761, [matrix+48]; // begin inline asm - dp4a.u32.u32 %r1760, %r1761, %r5794, %r1756; + shf.l.wrap.b32 %r10210, %r10114, %r10115, %r10083; // end inline asm - ld.const.u32 %r1765, [matrix+52]; + xor.b32 %r10616, %r10206, %r10150; + xor.b32 %r10617, %r10210, %r10151; + xor.b32 %r10307, %r29814, %r10616; + xor.b32 %r10308, %r29815, %r10617; + xor.b32 %r10227, %r29782, %r10616; + xor.b32 %r10228, %r29783, %r10617; + xor.b32 %r10244, %r29780, %r10616; + xor.b32 %r10243, %r29781, %r10617; + xor.b32 %r10283, %r29778, %r10616; + xor.b32 %r10284, %r29779, %r10617; + xor.b32 %r10315, %r29776, %r10616; + xor.b32 %r10316, %r29777, %r10617; + mov.u32 %r10221, 44; // begin inline asm - dp4a.u32.u32 %r1764, %r1765, %r5798, %r1760; + shf.l.wrap.b32 %r10214, %r10220, %r10219, %r10221; // end inline asm - ld.const.u32 %r1769, [matrix+56]; // begin inline asm - dp4a.u32.u32 %r1768, %r1769, %r5802, %r1764; + shf.l.wrap.b32 %r10218, %r10219, %r10220, %r10221; // end inline asm - ld.const.u32 %r1773, [matrix+60]; + mov.u32 %r10229, 20; // begin inline asm - dp4a.u32.u32 %r1772, %r1773, %r5806, %r1768; + shf.l.wrap.b32 %r10222, %r10228, %r10227, %r10229; // end inline asm - ld.const.u32 %r1777, [matrix+64]; // begin inline asm - dp4a.u32.u32 %r1776, %r1777, %r5746, %r6244; + shf.l.wrap.b32 %r10226, %r10227, %r10228, %r10229; // end inline asm - ld.const.u32 %r1781, [matrix+68]; + mov.u32 %r10237, 61; // begin inline asm - dp4a.u32.u32 %r1780, %r1781, %r5750, %r1776; + shf.l.wrap.b32 %r10230, %r10236, %r10235, %r10237; // end inline asm - ld.const.u32 %r1785, [matrix+72]; // begin inline asm - dp4a.u32.u32 %r1784, %r1785, %r5754, %r1780; + shf.l.wrap.b32 %r10234, %r10235, %r10236, %r10237; // end inline asm - ld.const.u32 %r1789, [matrix+76]; + mov.u32 %r10245, 39; // begin inline asm - dp4a.u32.u32 %r1788, %r1789, %r5758, %r1784; + shf.l.wrap.b32 %r10238, %r10244, %r10243, %r10245; // end inline asm - ld.const.u32 %r1793, [matrix+80]; // begin inline asm - dp4a.u32.u32 %r1792, %r1793, %r5762, %r1788; + shf.l.wrap.b32 %r10242, %r10243, %r10244, %r10245; // end inline asm - ld.const.u32 %r1797, [matrix+84]; + mov.u32 %r10253, 18; // begin inline asm - dp4a.u32.u32 %r1796, %r1797, %r5766, %r1792; + shf.l.wrap.b32 %r10246, %r10252, %r10251, %r10253; // end inline asm - ld.const.u32 %r1801, [matrix+88]; // begin inline asm - dp4a.u32.u32 %r1800, %r1801, %r5770, %r1796; + shf.l.wrap.b32 %r10250, %r10251, %r10252, %r10253; // end inline asm - ld.const.u32 %r1805, [matrix+92]; + mov.u32 %r10261, 62; // begin inline asm - dp4a.u32.u32 
%r1804, %r1805, %r5774, %r1800; + shf.l.wrap.b32 %r10254, %r10260, %r10259, %r10261; // end inline asm - ld.const.u32 %r1809, [matrix+96]; // begin inline asm - dp4a.u32.u32 %r1808, %r1809, %r5778, %r1804; + shf.l.wrap.b32 %r10258, %r10259, %r10260, %r10261; // end inline asm - ld.const.u32 %r1813, [matrix+100]; + mov.u32 %r10269, 43; // begin inline asm - dp4a.u32.u32 %r1812, %r1813, %r5782, %r1808; + shf.l.wrap.b32 %r10262, %r10268, %r10267, %r10269; // end inline asm - ld.const.u32 %r1817, [matrix+104]; // begin inline asm - dp4a.u32.u32 %r1816, %r1817, %r5786, %r1812; + shf.l.wrap.b32 %r10266, %r10267, %r10268, %r10269; // end inline asm - ld.const.u32 %r1821, [matrix+108]; + mov.u32 %r10277, 25; // begin inline asm - dp4a.u32.u32 %r1820, %r1821, %r5790, %r1816; + shf.l.wrap.b32 %r10270, %r10276, %r10275, %r10277; // end inline asm - ld.const.u32 %r1825, [matrix+112]; // begin inline asm - dp4a.u32.u32 %r1824, %r1825, %r5794, %r1820; + shf.l.wrap.b32 %r10274, %r10275, %r10276, %r10277; // end inline asm - ld.const.u32 %r1829, [matrix+116]; + mov.u32 %r10285, 8; // begin inline asm - dp4a.u32.u32 %r1828, %r1829, %r5798, %r1824; + shf.l.wrap.b32 %r10278, %r10284, %r10283, %r10285; // end inline asm - ld.const.u32 %r1833, [matrix+120]; // begin inline asm - dp4a.u32.u32 %r1832, %r1833, %r5802, %r1828; + shf.l.wrap.b32 %r10282, %r10283, %r10284, %r10285; // end inline asm - ld.const.u32 %r1837, [matrix+124]; + mov.u32 %r10293, 56; // begin inline asm - dp4a.u32.u32 %r1836, %r1837, %r5806, %r1832; + shf.l.wrap.b32 %r10286, %r10292, %r10291, %r10293; // end inline asm - shr.u32 %r5966, %r1772, 6; - and.b32 %r5967, %r5966, 240; - shr.u32 %r5968, %r1836, 10; - or.b32 %r5969, %r5968, %r5967; - xor.b32 %r5970, %r9, %r5969; - ld.const.u32 %r1841, [matrix+128]; // begin inline asm - dp4a.u32.u32 %r1840, %r1841, %r5746, %r6244; + shf.l.wrap.b32 %r10290, %r10291, %r10292, %r10293; // end inline asm - ld.const.u32 %r1845, [matrix+132]; + mov.u32 %r10301, 41; // begin inline asm - dp4a.u32.u32 %r1844, %r1845, %r5750, %r1840; + shf.l.wrap.b32 %r10294, %r10300, %r10299, %r10301; // end inline asm - ld.const.u32 %r1849, [matrix+136]; // begin inline asm - dp4a.u32.u32 %r1848, %r1849, %r5754, %r1844; + shf.l.wrap.b32 %r10298, %r10299, %r10300, %r10301; // end inline asm - ld.const.u32 %r1853, [matrix+140]; + mov.u32 %r10309, 27; // begin inline asm - dp4a.u32.u32 %r1852, %r1853, %r5758, %r1848; + shf.l.wrap.b32 %r10302, %r10308, %r10307, %r10309; // end inline asm - ld.const.u32 %r1857, [matrix+144]; // begin inline asm - dp4a.u32.u32 %r1856, %r1857, %r5762, %r1852; + shf.l.wrap.b32 %r10306, %r10307, %r10308, %r10309; // end inline asm - ld.const.u32 %r1861, [matrix+148]; + mov.u32 %r10317, 14; // begin inline asm - dp4a.u32.u32 %r1860, %r1861, %r5766, %r1856; + shf.l.wrap.b32 %r10310, %r10316, %r10315, %r10317; // end inline asm - ld.const.u32 %r1865, [matrix+152]; // begin inline asm - dp4a.u32.u32 %r1864, %r1865, %r5770, %r1860; + shf.l.wrap.b32 %r10314, %r10315, %r10316, %r10317; // end inline asm - ld.const.u32 %r1869, [matrix+156]; + mov.u32 %r10325, 2; // begin inline asm - dp4a.u32.u32 %r1868, %r1869, %r5774, %r1864; + shf.l.wrap.b32 %r10318, %r10324, %r10323, %r10325; // end inline asm - ld.const.u32 %r1873, [matrix+160]; // begin inline asm - dp4a.u32.u32 %r1872, %r1873, %r5778, %r1868; + shf.l.wrap.b32 %r10322, %r10323, %r10324, %r10325; // end inline asm - ld.const.u32 %r1877, [matrix+164]; + mov.u32 %r10333, 55; // begin inline asm - dp4a.u32.u32 %r1876, %r1877, %r5782, %r1872; + 
shf.l.wrap.b32 %r10326, %r10332, %r10331, %r10333; // end inline asm - ld.const.u32 %r1881, [matrix+168]; // begin inline asm - dp4a.u32.u32 %r1880, %r1881, %r5786, %r1876; + shf.l.wrap.b32 %r10330, %r10331, %r10332, %r10333; // end inline asm - ld.const.u32 %r1885, [matrix+172]; + mov.u32 %r10341, 45; // begin inline asm - dp4a.u32.u32 %r1884, %r1885, %r5790, %r1880; + shf.l.wrap.b32 %r10334, %r10340, %r10339, %r10341; // end inline asm - ld.const.u32 %r1889, [matrix+176]; // begin inline asm - dp4a.u32.u32 %r1888, %r1889, %r5794, %r1884; + shf.l.wrap.b32 %r10338, %r10339, %r10340, %r10341; // end inline asm - ld.const.u32 %r1893, [matrix+180]; + mov.u32 %r10349, 36; // begin inline asm - dp4a.u32.u32 %r1892, %r1893, %r5798, %r1888; + shf.l.wrap.b32 %r10342, %r10348, %r10347, %r10349; // end inline asm - ld.const.u32 %r1897, [matrix+184]; // begin inline asm - dp4a.u32.u32 %r1896, %r1897, %r5802, %r1892; + shf.l.wrap.b32 %r10346, %r10347, %r10348, %r10349; // end inline asm - ld.const.u32 %r1901, [matrix+188]; + mov.u32 %r10357, 28; // begin inline asm - dp4a.u32.u32 %r1900, %r1901, %r5806, %r1896; + shf.l.wrap.b32 %r10350, %r10356, %r10355, %r10357; // end inline asm - ld.const.u32 %r1905, [matrix+192]; // begin inline asm - dp4a.u32.u32 %r1904, %r1905, %r5746, %r6244; + shf.l.wrap.b32 %r10354, %r10355, %r10356, %r10357; // end inline asm - ld.const.u32 %r1909, [matrix+196]; + mov.u32 %r10365, 21; // begin inline asm - dp4a.u32.u32 %r1908, %r1909, %r5750, %r1904; + shf.l.wrap.b32 %r10358, %r10364, %r10363, %r10365; // end inline asm - ld.const.u32 %r1913, [matrix+200]; // begin inline asm - dp4a.u32.u32 %r1912, %r1913, %r5754, %r1908; + shf.l.wrap.b32 %r10362, %r10363, %r10364, %r10365; // end inline asm - ld.const.u32 %r1917, [matrix+204]; + mov.u32 %r10373, 15; // begin inline asm - dp4a.u32.u32 %r1916, %r1917, %r5758, %r1912; + shf.l.wrap.b32 %r10366, %r10372, %r10371, %r10373; // end inline asm - ld.const.u32 %r1921, [matrix+208]; // begin inline asm - dp4a.u32.u32 %r1920, %r1921, %r5762, %r1916; + shf.l.wrap.b32 %r10370, %r10371, %r10372, %r10373; // end inline asm - ld.const.u32 %r1925, [matrix+212]; + mov.u32 %r10381, 10; // begin inline asm - dp4a.u32.u32 %r1924, %r1925, %r5766, %r1920; + shf.l.wrap.b32 %r10374, %r10380, %r10379, %r10381; // end inline asm - ld.const.u32 %r1929, [matrix+216]; // begin inline asm - dp4a.u32.u32 %r1928, %r1929, %r5770, %r1924; + shf.l.wrap.b32 %r10378, %r10379, %r10380, %r10381; // end inline asm - ld.const.u32 %r1933, [matrix+220]; + mov.u32 %r10389, 6; // begin inline asm - dp4a.u32.u32 %r1932, %r1933, %r5774, %r1928; + shf.l.wrap.b32 %r10382, %r10388, %r10387, %r10389; // end inline asm - ld.const.u32 %r1937, [matrix+224]; // begin inline asm - dp4a.u32.u32 %r1936, %r1937, %r5778, %r1932; + shf.l.wrap.b32 %r10386, %r10387, %r10388, %r10389; // end inline asm - ld.const.u32 %r1941, [matrix+228]; + mov.u32 %r10397, 3; // begin inline asm - dp4a.u32.u32 %r1940, %r1941, %r5782, %r1936; + shf.l.wrap.b32 %r10390, %r10396, %r10395, %r10397; // end inline asm - ld.const.u32 %r1945, [matrix+232]; // begin inline asm - dp4a.u32.u32 %r1944, %r1945, %r5786, %r1940; + shf.l.wrap.b32 %r10394, %r10395, %r10396, %r10397; // end inline asm - ld.const.u32 %r1949, [matrix+236]; // begin inline asm - dp4a.u32.u32 %r1948, %r1949, %r5790, %r1944; + shf.l.wrap.b32 %r10398, %r10404, %r10403, %r10083; // end inline asm - ld.const.u32 %r1953, [matrix+240]; // begin inline asm - dp4a.u32.u32 %r1952, %r1953, %r5794, %r1948; + shf.l.wrap.b32 %r10402, %r10403, %r10404, 
%r10083; // end inline asm - ld.const.u32 %r1957, [matrix+244]; // begin inline asm - dp4a.u32.u32 %r1956, %r1957, %r5798, %r1952; + // chi + lop3.b32 %r10406, %r10441, %r10214, %r10262, 0xD2; + lop3.b32 %r10407, %r10444, %r10218, %r10266, 0xD2; // end inline asm - ld.const.u32 %r1961, [matrix+248]; // begin inline asm - dp4a.u32.u32 %r1960, %r1961, %r5802, %r1956; + // chi + lop3.b32 %r29824, %r10214, %r10262, %r10358, 0xD2; + lop3.b32 %r29825, %r10218, %r10266, %r10362, 0xD2; // end inline asm - ld.const.u32 %r1965, [matrix+252]; // begin inline asm - dp4a.u32.u32 %r1964, %r1965, %r5806, %r1960; + // chi + lop3.b32 %r29820, %r10262, %r10358, %r10310, 0xD2; + lop3.b32 %r29821, %r10266, %r10362, %r10314, 0xD2; // end inline asm - shr.u32 %r5971, %r1900, 6; - and.b32 %r5972, %r5971, 240; - shr.u32 %r5973, %r1964, 10; - or.b32 %r5974, %r5973, %r5972; - xor.b32 %r5975, %r5810, %r5974; - ld.const.u32 %r1969, [matrix+256]; // begin inline asm - dp4a.u32.u32 %r1968, %r1969, %r5746, %r6244; + // chi + lop3.b32 %r29816, %r10358, %r10310, %r10441, 0xD2; + lop3.b32 %r29817, %r10362, %r10314, %r10444, 0xD2; // end inline asm - ld.const.u32 %r1973, [matrix+260]; // begin inline asm - dp4a.u32.u32 %r1972, %r1973, %r5750, %r1968; + // chi + lop3.b32 %r29814, %r10310, %r10441, %r10214, 0xD2; + lop3.b32 %r29815, %r10314, %r10444, %r10218, 0xD2; // end inline asm - ld.const.u32 %r1977, [matrix+264]; // begin inline asm - dp4a.u32.u32 %r1976, %r1977, %r5754, %r1972; + // chi + lop3.b32 %r29810, %r10350, %r10222, %r10390, 0xD2; + lop3.b32 %r29811, %r10354, %r10226, %r10394, 0xD2; // end inline asm - ld.const.u32 %r1981, [matrix+268]; // begin inline asm - dp4a.u32.u32 %r1980, %r1981, %r5758, %r1976; + // chi + lop3.b32 %r29822, %r10222, %r10390, %r10334, 0xD2; + lop3.b32 %r29823, %r10226, %r10394, %r10338, 0xD2; // end inline asm - ld.const.u32 %r1985, [matrix+272]; // begin inline asm - dp4a.u32.u32 %r1984, %r1985, %r5762, %r1980; + // chi + lop3.b32 %r29818, %r10390, %r10334, %r10230, 0xD2; + lop3.b32 %r29819, %r10394, %r10338, %r10234, 0xD2; // end inline asm - ld.const.u32 %r1989, [matrix+276]; // begin inline asm - dp4a.u32.u32 %r1988, %r1989, %r5766, %r1984; + // chi + lop3.b32 %r29790, %r10334, %r10230, %r10350, 0xD2; + lop3.b32 %r29791, %r10338, %r10234, %r10354, 0xD2; // end inline asm - ld.const.u32 %r1993, [matrix+280]; + st.local.v2.u32 [%rd3+88], {%r29790, %r29791}; // begin inline asm - dp4a.u32.u32 %r1992, %r1993, %r5770, %r1988; + // chi + lop3.b32 %r29782, %r10230, %r10350, %r10222, 0xD2; + lop3.b32 %r29783, %r10234, %r10354, %r10226, 0xD2; // end inline asm - ld.const.u32 %r1997, [matrix+284]; + st.local.v2.u32 [%rd3+96], {%r29782, %r29783}; // begin inline asm - dp4a.u32.u32 %r1996, %r1997, %r5774, %r1992; + // chi + lop3.b32 %r29808, %r10398, %r10382, %r10270, 0xD2; + lop3.b32 %r29809, %r10402, %r10386, %r10274, 0xD2; // end inline asm - ld.const.u32 %r2001, [matrix+288]; + st.local.v2.u32 [%rd3+104], {%r29808, %r29809}; // begin inline asm - dp4a.u32.u32 %r2000, %r2001, %r5778, %r1996; + // chi + lop3.b32 %r29802, %r10382, %r10270, %r10278, 0xD2; + lop3.b32 %r29803, %r10386, %r10274, %r10282, 0xD2; // end inline asm - ld.const.u32 %r2005, [matrix+292]; + st.local.v2.u32 [%rd3+112], {%r29802, %r29803}; // begin inline asm - dp4a.u32.u32 %r2004, %r2005, %r5782, %r2000; + // chi + lop3.b32 %r29796, %r10270, %r10278, %r10246, 0xD2; + lop3.b32 %r29797, %r10274, %r10282, %r10250, 0xD2; // end inline asm - ld.const.u32 %r2009, [matrix+296]; + st.local.v2.u32 [%rd3+120], {%r29796, %r29797}; // 
begin inline asm - dp4a.u32.u32 %r2008, %r2009, %r5786, %r2004; + // chi + lop3.b32 %r29788, %r10278, %r10246, %r10398, 0xD2; + lop3.b32 %r29789, %r10282, %r10250, %r10402, 0xD2; // end inline asm - ld.const.u32 %r2013, [matrix+300]; + st.local.v2.u32 [%rd3+128], {%r29788, %r29789}; // begin inline asm - dp4a.u32.u32 %r2012, %r2013, %r5790, %r2008; + // chi + lop3.b32 %r29780, %r10246, %r10398, %r10382, 0xD2; + lop3.b32 %r29781, %r10250, %r10402, %r10386, 0xD2; // end inline asm - ld.const.u32 %r2017, [matrix+304]; + st.local.v2.u32 [%rd3+136], {%r29780, %r29781}; // begin inline asm - dp4a.u32.u32 %r2016, %r2017, %r5794, %r2012; + // chi + lop3.b32 %r29806, %r10302, %r10342, %r10374, 0xD2; + lop3.b32 %r29807, %r10306, %r10346, %r10378, 0xD2; // end inline asm - ld.const.u32 %r2021, [matrix+308]; + st.local.v2.u32 [%rd3+144], {%r29806, %r29807}; // begin inline asm - dp4a.u32.u32 %r2020, %r2021, %r5798, %r2016; + // chi + lop3.b32 %r29800, %r10342, %r10374, %r10366, 0xD2; + lop3.b32 %r29801, %r10346, %r10378, %r10370, 0xD2; // end inline asm - ld.const.u32 %r2025, [matrix+312]; + st.local.v2.u32 [%rd3+152], {%r29800, %r29801}; // begin inline asm - dp4a.u32.u32 %r2024, %r2025, %r5802, %r2020; + // chi + lop3.b32 %r29794, %r10374, %r10366, %r10286, 0xD2; + lop3.b32 %r29795, %r10378, %r10370, %r10290, 0xD2; // end inline asm - ld.const.u32 %r2029, [matrix+316]; + st.local.v2.u32 [%rd3+160], {%r29794, %r29795}; // begin inline asm - dp4a.u32.u32 %r2028, %r2029, %r5806, %r2024; + // chi + lop3.b32 %r29786, %r10366, %r10286, %r10302, 0xD2; + lop3.b32 %r29787, %r10370, %r10290, %r10306, 0xD2; // end inline asm - ld.const.u32 %r2033, [matrix+320]; + st.local.v2.u32 [%rd3+168], {%r29786, %r29787}; // begin inline asm - dp4a.u32.u32 %r2032, %r2033, %r5746, %r6244; + // chi + lop3.b32 %r29778, %r10286, %r10302, %r10342, 0xD2; + lop3.b32 %r29779, %r10290, %r10306, %r10346, 0xD2; // end inline asm - ld.const.u32 %r2037, [matrix+324]; + st.local.v2.u32 [%rd3+176], {%r29778, %r29779}; // begin inline asm - dp4a.u32.u32 %r2036, %r2037, %r5750, %r2032; + // chi + lop3.b32 %r29804, %r10254, %r10326, %r10238, 0xD2; + lop3.b32 %r29805, %r10258, %r10330, %r10242, 0xD2; // end inline asm - ld.const.u32 %r2041, [matrix+328]; + st.local.v2.u32 [%rd3+184], {%r29804, %r29805}; // begin inline asm - dp4a.u32.u32 %r2040, %r2041, %r5754, %r2036; + // chi + lop3.b32 %r29798, %r10326, %r10238, %r10294, 0xD2; + lop3.b32 %r29799, %r10330, %r10242, %r10298, 0xD2; // end inline asm - ld.const.u32 %r2045, [matrix+332]; + st.local.v2.u32 [%rd3+192], {%r29798, %r29799}; // begin inline asm - dp4a.u32.u32 %r2044, %r2045, %r5758, %r2040; + // chi + lop3.b32 %r29792, %r10238, %r10294, %r10318, 0xD2; + lop3.b32 %r29793, %r10242, %r10298, %r10322, 0xD2; // end inline asm - ld.const.u32 %r2049, [matrix+336]; + st.local.v2.u32 [%rd3+200], {%r29792, %r29793}; // begin inline asm - dp4a.u32.u32 %r2048, %r2049, %r5762, %r2044; + // chi + lop3.b32 %r29784, %r10294, %r10318, %r10254, 0xD2; + lop3.b32 %r29785, %r10298, %r10322, %r10258, 0xD2; // end inline asm - ld.const.u32 %r2053, [matrix+340]; + st.local.v2.u32 [%rd3+208], {%r29784, %r29785}; // begin inline asm - dp4a.u32.u32 %r2052, %r2053, %r5766, %r2048; + // chi + lop3.b32 %r29776, %r10318, %r10254, %r10326, 0xD2; + lop3.b32 %r29777, %r10322, %r10258, %r10330, 0xD2; // end inline asm - ld.const.u32 %r2057, [matrix+344]; + st.local.v2.u32 [%rd3+216], {%r29776, %r29777}; + mul.wide.s32 %rd595, %r29826, 8; + mov.u64 %rd596, keccak_round_constants; + cvta.const.u64 %rd597, %rd596; + 
add.s64 %rd594, %rd597, %rd595; // begin inline asm - dp4a.u32.u32 %r2056, %r2057, %r5770, %r2052; + ld.global.nc.v2.u32 {%r10606,%r10607}, [%rd594]; // end inline asm - ld.const.u32 %r2061, [matrix+348]; + xor.b32 %r29812, %r10406, %r10606; + xor.b32 %r29813, %r10407, %r10607; + add.s32 %r29826, %r29826, 1; + setp.lt.u32 %p23, %r29826, 23; + @%p23 bra $L__BB2_34; + + add.u64 %rd84, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r29824, %r29825}; + st.local.v2.u32 [%rd3+72], {%r29822, %r29823}; + st.local.v2.u32 [%rd3+40], {%r29820, %r29821}; + st.local.v2.u32 [%rd3+80], {%r29818, %r29819}; + st.local.v2.u32 [%rd3+48], {%r29816, %r29817}; + st.local.v2.u32 [%rd3+56], {%r29814, %r29815}; + st.local.v2.u32 [%rd3+24], {%r29812, %r29813}; // begin inline asm - dp4a.u32.u32 %r2060, %r2061, %r5774, %r2056; + // xor5 + lop3.b32 %r10618, %r29812, %r29810, %r29808, 0x96; + lop3.b32 %r10618, %r10618, %r29806, %r29804, 0x96; + lop3.b32 %r10619, %r29813, %r29811, %r29809, 0x96; + lop3.b32 %r10619, %r10619, %r29807, %r29805, 0x96; // end inline asm - ld.const.u32 %r2065, [matrix+352]; // begin inline asm - dp4a.u32.u32 %r2064, %r2065, %r5778, %r2060; + // xor5 + lop3.b32 %r10630, %r29824, %r29822, %r29802, 0x96; + lop3.b32 %r10630, %r10630, %r29800, %r29798, 0x96; + lop3.b32 %r10631, %r29825, %r29823, %r29803, 0x96; + lop3.b32 %r10631, %r10631, %r29801, %r29799, 0x96; // end inline asm - ld.const.u32 %r2069, [matrix+356]; // begin inline asm - dp4a.u32.u32 %r2068, %r2069, %r5782, %r2064; + // xor5 + lop3.b32 %r10642, %r29820, %r29818, %r29796, 0x96; + lop3.b32 %r10642, %r10642, %r29794, %r29792, 0x96; + lop3.b32 %r10643, %r29821, %r29819, %r29797, 0x96; + lop3.b32 %r10643, %r10643, %r29795, %r29793, 0x96; // end inline asm - ld.const.u32 %r2073, [matrix+360]; // begin inline asm - dp4a.u32.u32 %r2072, %r2073, %r5786, %r2068; + // xor5 + lop3.b32 %r10654, %r29816, %r29790, %r29788, 0x96; + lop3.b32 %r10654, %r10654, %r29786, %r29784, 0x96; + lop3.b32 %r10655, %r29817, %r29791, %r29789, 0x96; + lop3.b32 %r10655, %r10655, %r29787, %r29785, 0x96; // end inline asm - ld.const.u32 %r2077, [matrix+364]; // begin inline asm - dp4a.u32.u32 %r2076, %r2077, %r5790, %r2072; + // xor5 + lop3.b32 %r10666, %r29814, %r29782, %r29780, 0x96; + lop3.b32 %r10666, %r10666, %r29778, %r29776, 0x96; + lop3.b32 %r10667, %r29815, %r29783, %r29781, 0x96; + lop3.b32 %r10667, %r10667, %r29779, %r29777, 0x96; // end inline asm - ld.const.u32 %r2081, [matrix+368]; + mov.u32 %r10870, 1; // begin inline asm - dp4a.u32.u32 %r2080, %r2081, %r5794, %r2076; + shf.l.wrap.b32 %r10678, %r10631, %r10630, %r10870; // end inline asm - ld.const.u32 %r2085, [matrix+372]; // begin inline asm - dp4a.u32.u32 %r2084, %r2085, %r5798, %r2080; + shf.l.wrap.b32 %r10682, %r10630, %r10631, %r10870; // end inline asm - ld.const.u32 %r2089, [matrix+376]; + xor.b32 %r10897, %r10678, %r10666; + xor.b32 %r10898, %r10682, %r10667; + xor.b32 %r10825, %r29812, %r10897; + xor.b32 %r10828, %r29813, %r10898; + xor.b32 %r10788, %r29809, %r10898; + xor.b32 %r10787, %r29808, %r10897; + st.local.v2.u32 [%rd3+104], {%r10787, %r10788}; // begin inline asm - dp4a.u32.u32 %r2088, %r2089, %r5802, %r2084; + shf.l.wrap.b32 %r10686, %r10643, %r10642, %r10870; // end inline asm - ld.const.u32 %r2093, [matrix+380]; // begin inline asm - dp4a.u32.u32 %r2092, %r2093, %r5806, %r2088; + shf.l.wrap.b32 %r10690, %r10642, %r10643, %r10870; // end inline asm - shr.u32 %r5976, %r2028, 6; - and.b32 %r5977, %r5976, 240; - shr.u32 %r5978, %r2092, 10; - or.b32 %r5979, %r5978, %r5977; - xor.b32 
%r5980, %r5822, %r5979; - ld.const.u32 %r2097, [matrix+384]; + xor.b32 %r10899, %r10686, %r10618; + xor.b32 %r10900, %r10690, %r10619; + xor.b32 %r10724, %r29822, %r10899; + xor.b32 %r10723, %r29823, %r10900; + xor.b32 %r10763, %r29801, %r10900; + xor.b32 %r10764, %r29800, %r10899; + st.local.v2.u32 [%rd3+152], {%r10764, %r10763}; // begin inline asm - dp4a.u32.u32 %r2096, %r2097, %r5746, %r6244; + shf.l.wrap.b32 %r10694, %r10655, %r10654, %r10870; // end inline asm - ld.const.u32 %r2101, [matrix+388]; // begin inline asm - dp4a.u32.u32 %r2100, %r2101, %r5750, %r2096; + shf.l.wrap.b32 %r10698, %r10654, %r10655, %r10870; // end inline asm - ld.const.u32 %r2105, [matrix+392]; + xor.b32 %r10901, %r10694, %r10630; + xor.b32 %r10902, %r10698, %r10631; + xor.b32 %r10747, %r29797, %r10902; + xor.b32 %r10748, %r29796, %r10901; + st.local.v2.u32 [%rd3+120], {%r10748, %r10747}; + xor.b32 %r10739, %r29793, %r10902; + xor.b32 %r10740, %r29792, %r10901; + st.local.v2.u32 [%rd3+200], {%r10740, %r10739}; // begin inline asm - dp4a.u32.u32 %r2104, %r2105, %r5754, %r2100; + shf.l.wrap.b32 %r10702, %r10667, %r10666, %r10870; // end inline asm - ld.const.u32 %r2109, [matrix+396]; // begin inline asm - dp4a.u32.u32 %r2108, %r2109, %r5758, %r2104; + shf.l.wrap.b32 %r10706, %r10666, %r10667, %r10870; // end inline asm - ld.const.u32 %r2113, [matrix+400]; + xor.b32 %r10903, %r10702, %r10642; + xor.b32 %r10904, %r10706, %r10643; + xor.b32 %r10771, %r29816, %r10903; + xor.b32 %r10772, %r29817, %r10904; + xor.b32 %r10780, %r29787, %r10904; + xor.b32 %r10779, %r29786, %r10903; + st.local.v2.u32 [%rd3+168], {%r10779, %r10780}; // begin inline asm - dp4a.u32.u32 %r2112, %r2113, %r5762, %r2108; + shf.l.wrap.b32 %r10710, %r10619, %r10618, %r10870; // end inline asm - ld.const.u32 %r2117, [matrix+404]; // begin inline asm - dp4a.u32.u32 %r2116, %r2117, %r5766, %r2112; + shf.l.wrap.b32 %r10714, %r10618, %r10619, %r10870; // end inline asm - ld.const.u32 %r2121, [matrix+408]; + xor.b32 %r10905, %r10710, %r10654; + xor.b32 %r10906, %r10714, %r10655; + xor.b32 %r10731, %r29782, %r10905; + xor.b32 %r10732, %r29783, %r10906; + xor.b32 %r10756, %r29777, %r10906; + xor.b32 %r10755, %r29776, %r10905; + st.local.v2.u32 [%rd3+216], {%r10755, %r10756}; // begin inline asm - dp4a.u32.u32 %r2120, %r2121, %r5770, %r2116; + shf.l.wrap.b32 %r10718, %r10724, %r10723, %r10221; // end inline asm - ld.const.u32 %r2125, [matrix+412]; // begin inline asm - dp4a.u32.u32 %r2124, %r2125, %r5774, %r2120; + shf.l.wrap.b32 %r10722, %r10723, %r10724, %r10221; // end inline asm - ld.const.u32 %r2129, [matrix+416]; // begin inline asm - dp4a.u32.u32 %r2128, %r2129, %r5778, %r2124; + shf.l.wrap.b32 %r10726, %r10732, %r10731, %r10229; // end inline asm - ld.const.u32 %r2133, [matrix+420]; // begin inline asm - dp4a.u32.u32 %r2132, %r2133, %r5782, %r2128; + shf.l.wrap.b32 %r10730, %r10731, %r10732, %r10229; // end inline asm - ld.const.u32 %r2137, [matrix+424]; // begin inline asm - dp4a.u32.u32 %r2136, %r2137, %r5786, %r2132; + shf.l.wrap.b32 %r10738, %r10739, %r10740, %r10237; // end inline asm - ld.const.u32 %r2141, [matrix+428]; // begin inline asm - dp4a.u32.u32 %r2140, %r2141, %r5790, %r2136; + shf.l.wrap.b32 %r10734, %r10740, %r10739, %r10237; // end inline asm - ld.const.u32 %r2145, [matrix+432]; + st.local.v2.u32 [%rd3+96], {%r10734, %r10738}; // begin inline asm - dp4a.u32.u32 %r2144, %r2145, %r5794, %r2140; + shf.l.wrap.b32 %r10742, %r10748, %r10747, %r10269; // end inline asm - ld.const.u32 %r2149, [matrix+436]; // begin inline asm - 
dp4a.u32.u32 %r2148, %r2149, %r5798, %r2144; + shf.l.wrap.b32 %r10746, %r10747, %r10748, %r10269; // end inline asm - ld.const.u32 %r2153, [matrix+440]; // begin inline asm - dp4a.u32.u32 %r2152, %r2153, %r5802, %r2148; + shf.l.wrap.b32 %r10750, %r10756, %r10755, %r10317; // end inline asm - ld.const.u32 %r2157, [matrix+444]; // begin inline asm - dp4a.u32.u32 %r2156, %r2157, %r5806, %r2152; + shf.l.wrap.b32 %r10754, %r10755, %r10756, %r10317; // end inline asm - ld.const.u32 %r2161, [matrix+448]; // begin inline asm - dp4a.u32.u32 %r2160, %r2161, %r5746, %r6244; + shf.l.wrap.b32 %r10762, %r10763, %r10764, %r10341; // end inline asm - ld.const.u32 %r2165, [matrix+452]; // begin inline asm - dp4a.u32.u32 %r2164, %r2165, %r5750, %r2160; + shf.l.wrap.b32 %r10758, %r10764, %r10763, %r10341; // end inline asm - ld.const.u32 %r2169, [matrix+456]; + st.local.v2.u32 [%rd3+88], {%r10758, %r10762}; // begin inline asm - dp4a.u32.u32 %r2168, %r2169, %r5754, %r2164; + shf.l.wrap.b32 %r10766, %r10772, %r10771, %r10357; // end inline asm - ld.const.u32 %r2173, [matrix+460]; // begin inline asm - dp4a.u32.u32 %r2172, %r2173, %r5758, %r2168; + shf.l.wrap.b32 %r10770, %r10771, %r10772, %r10357; // end inline asm - ld.const.u32 %r2177, [matrix+464]; // begin inline asm - dp4a.u32.u32 %r2176, %r2177, %r5762, %r2172; + shf.l.wrap.b32 %r10774, %r10780, %r10779, %r10365; // end inline asm - ld.const.u32 %r2181, [matrix+468]; // begin inline asm - dp4a.u32.u32 %r2180, %r2181, %r5766, %r2176; + shf.l.wrap.b32 %r10778, %r10779, %r10780, %r10365; // end inline asm - ld.const.u32 %r2185, [matrix+472]; // begin inline asm - dp4a.u32.u32 %r2184, %r2185, %r5770, %r2180; + shf.l.wrap.b32 %r10782, %r10788, %r10787, %r10397; // end inline asm - ld.const.u32 %r2189, [matrix+476]; // begin inline asm - dp4a.u32.u32 %r2188, %r2189, %r5774, %r2184; + shf.l.wrap.b32 %r10786, %r10787, %r10788, %r10397; // end inline asm - ld.const.u32 %r2193, [matrix+480]; // begin inline asm - dp4a.u32.u32 %r2192, %r2193, %r5778, %r2188; + // chi + lop3.b32 %r10790, %r10825, %r10718, %r10742, 0xD2; + lop3.b32 %r10791, %r10828, %r10722, %r10746, 0xD2; // end inline asm - ld.const.u32 %r2197, [matrix+484]; // begin inline asm - dp4a.u32.u32 %r2196, %r2197, %r5782, %r2192; + // chi + lop3.b32 %r29959, %r10718, %r10742, %r10774, 0xD2; + lop3.b32 %r29960, %r10722, %r10746, %r10778, 0xD2; // end inline asm - ld.const.u32 %r2201, [matrix+488]; + st.local.v2.u32 [%rd3+32], {%r29959, %r29960}; // begin inline asm - dp4a.u32.u32 %r2200, %r2201, %r5786, %r2196; + // chi + lop3.b32 %r29955, %r10742, %r10774, %r10750, 0xD2; + lop3.b32 %r29956, %r10746, %r10778, %r10754, 0xD2; // end inline asm - ld.const.u32 %r2205, [matrix+492]; + st.local.v2.u32 [%rd3+40], {%r29955, %r29956}; // begin inline asm - dp4a.u32.u32 %r2204, %r2205, %r5790, %r2200; + // chi + lop3.b32 %r29951, %r10774, %r10750, %r10825, 0xD2; + lop3.b32 %r29952, %r10778, %r10754, %r10828, 0xD2; // end inline asm - ld.const.u32 %r2209, [matrix+496]; + st.local.v2.u32 [%rd3+48], {%r29951, %r29952}; // begin inline asm - dp4a.u32.u32 %r2208, %r2209, %r5794, %r2204; + // chi + lop3.b32 %r29949, %r10750, %r10825, %r10718, 0xD2; + lop3.b32 %r29950, %r10754, %r10828, %r10722, 0xD2; // end inline asm - ld.const.u32 %r2213, [matrix+500]; + st.local.v2.u32 [%rd3+56], {%r29949, %r29950}; // begin inline asm - dp4a.u32.u32 %r2212, %r2213, %r5798, %r2208; + // chi + lop3.b32 %r29945, %r10766, %r10726, %r10782, 0xD2; + lop3.b32 %r29946, %r10770, %r10730, %r10786, 0xD2; // end inline asm - ld.const.u32 
%r2217, [matrix+504]; + st.local.v2.u32 [%rd3+64], {%r29945, %r29946}; // begin inline asm - dp4a.u32.u32 %r2216, %r2217, %r5802, %r2212; + // chi + lop3.b32 %r29957, %r10726, %r10782, %r10758, 0xD2; + lop3.b32 %r29958, %r10730, %r10786, %r10762, 0xD2; // end inline asm - ld.const.u32 %r2221, [matrix+508]; + st.local.v2.u32 [%rd3+72], {%r29957, %r29958}; // begin inline asm - dp4a.u32.u32 %r2220, %r2221, %r5806, %r2216; + // chi + lop3.b32 %r29953, %r10782, %r10758, %r10734, 0xD2; + lop3.b32 %r29954, %r10786, %r10762, %r10738, 0xD2; // end inline asm - shr.u32 %r5981, %r2156, 6; - and.b32 %r5982, %r5981, 240; - shr.u32 %r5983, %r2220, 10; - or.b32 %r5984, %r5983, %r5982; - xor.b32 %r5985, %r5824, %r5984; - ld.const.u32 %r2225, [matrix+512]; + st.local.v2.u32 [%rd3+80], {%r29953, %r29954}; + add.s64 %rd598, %rd597, 184; // begin inline asm - dp4a.u32.u32 %r2224, %r2225, %r5746, %r6244; + ld.global.nc.v2.u32 {%r10854,%r10855}, [%rd598]; + // end inline asm + xor.b32 %r29947, %r10790, %r10854; + xor.b32 %r29948, %r10791, %r10855; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; + st.local.u64 [%rd84], %rd354; + mov.u64 %rd602, 1179641; + st.local.u64 [%rd84+8], %rd602; + st.local.u32 [%rd84+16], %r31; + ld.global.u64 %rd603, [%rd35]; + ld.global.u64 %rd604, [%rd35+8]; + ld.global.u64 %rd605, [%rd35+16]; + ld.global.u64 %rd606, [%rd35+24]; + ld.global.u64 %rd607, [%rd35+32]; + ld.global.u64 %rd608, [%rd35+40]; + ld.global.u64 %rd609, [%rd35+48]; + ld.global.u64 %rd610, [%rd35+56]; + st.local.u64 [%rd84+32], %rd604; + st.local.u64 [%rd84+40], %rd605; + st.local.u64 [%rd84+48], %rd606; + st.local.u64 [%rd84+56], %rd607; + st.local.u64 [%rd84+64], %rd608; + st.local.u64 [%rd84+72], %rd609; + st.local.u64 [%rd84+80], %rd610; + cvt.u32.u64 %r10907, %rd603; + xor.b32 %r10908, %r31, %r10907; + st.local.u64 [%rd84+24], %rd603; + st.local.u32 [%rd84+24], %r10908; + mov.u32 %r29827, 0; + st.local.v2.u32 [%rd84+96], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+104], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+112], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+120], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+128], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+136], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+144], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+152], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+160], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+168], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+176], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+184], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+192], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+200], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+208], {%r29827, %r29827}; + st.local.v2.u32 [%rd84+216], {%r29827, %r29827}; + mov.u32 %r29842, -2147483648; + st.local.v2.u32 [%rd84+88], {%r10870, %r29842}; + ld.local.v2.u32 {%r29863, %r29864}, [%rd84+24]; + mov.b64 {%r29861, %r29862}, %rd608; + shr.u64 %rd611, %rd604, 32; + cvt.u32.u64 %r29875, %rd604; + cvt.u32.u64 %r29876, %rd611; + shr.u64 %rd612, %rd609, 32; + cvt.u32.u64 %r29873, %rd609; + cvt.u32.u64 %r29874, %rd612; + shr.u64 %rd613, %rd605, 32; + cvt.u32.u64 %r29871, %rd605; + cvt.u32.u64 %r29872, %rd613; + shr.u64 %rd614, %rd610, 32; + cvt.u32.u64 %r29869, %rd610; + cvt.u32.u64 %r29870, %rd614; + shr.u64 %rd615, %rd606, 32; + cvt.u32.u64 %r29867, %rd606; + cvt.u32.u64 %r29868, %rd615; + shr.u64 %rd616, %rd607, 32; + cvt.u32.u64 %r29865, %rd607; + cvt.u32.u64 %r29866, %rd616; + mov.u32 %r29828, %r29827; + mov.u32 %r29829, %r29827; + mov.u32 %r29830, %r29827; + mov.u32 %r29831, %r29827; 
+ mov.u32 %r29832, %r29827; + mov.u32 %r29833, %r29827; + mov.u32 %r29834, %r29827; + mov.u32 %r29835, %r29827; + mov.u32 %r29836, %r29827; + mov.u32 %r29837, %r29827; + mov.u32 %r29838, %r29827; + mov.u32 %r29839, %r29827; + mov.u32 %r29840, %r29827; + mov.u32 %r29841, %r10870; + mov.u32 %r29843, %r29827; + mov.u32 %r29844, %r29827; + mov.u32 %r29845, %r29827; + mov.u32 %r29846, %r29827; + mov.u32 %r29847, %r29827; + mov.u32 %r29848, %r29827; + mov.u32 %r29849, %r29827; + mov.u32 %r29850, %r29827; + mov.u32 %r29851, %r29827; + mov.u32 %r29852, %r29827; + mov.u32 %r29853, %r29827; + mov.u32 %r29854, %r29827; + mov.u32 %r29855, %r29827; + mov.u32 %r29856, %r29827; + mov.u32 %r29857, %r29827; + mov.u32 %r29858, %r29827; + mov.u32 %r29859, %r29827; + mov.u32 %r29860, %r29827; + mov.u32 %r29877, %r29827; + +$L__BB2_36: + // begin inline asm + // xor5 + lop3.b32 %r10911, %r29863, %r29861, %r29859, 0x96; + lop3.b32 %r10911, %r10911, %r29857, %r29855, 0x96; + lop3.b32 %r10912, %r29864, %r29862, %r29860, 0x96; + lop3.b32 %r10912, %r10912, %r29858, %r29856, 0x96; // end inline asm - ld.const.u32 %r2229, [matrix+516]; // begin inline asm - dp4a.u32.u32 %r2228, %r2229, %r5750, %r2224; + // xor5 + lop3.b32 %r10923, %r29875, %r29873, %r29853, 0x96; + lop3.b32 %r10923, %r10923, %r29851, %r29849, 0x96; + lop3.b32 %r10924, %r29876, %r29874, %r29854, 0x96; + lop3.b32 %r10924, %r10924, %r29852, %r29850, 0x96; // end inline asm - ld.const.u32 %r2233, [matrix+520]; // begin inline asm - dp4a.u32.u32 %r2232, %r2233, %r5754, %r2228; + // xor5 + lop3.b32 %r10935, %r29871, %r29869, %r29847, 0x96; + lop3.b32 %r10935, %r10935, %r29845, %r29843, 0x96; + lop3.b32 %r10936, %r29872, %r29870, %r29848, 0x96; + lop3.b32 %r10936, %r10936, %r29846, %r29844, 0x96; // end inline asm - ld.const.u32 %r2237, [matrix+524]; // begin inline asm - dp4a.u32.u32 %r2236, %r2237, %r5758, %r2232; + // xor5 + lop3.b32 %r10947, %r29867, %r29841, %r29839, 0x96; + lop3.b32 %r10947, %r10947, %r29837, %r29835, 0x96; + lop3.b32 %r10948, %r29868, %r29842, %r29840, 0x96; + lop3.b32 %r10948, %r10948, %r29838, %r29836, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r10959, %r29865, %r29833, %r29831, 0x96; + lop3.b32 %r10959, %r10959, %r29829, %r29827, 0x96; + lop3.b32 %r10960, %r29866, %r29834, %r29832, 0x96; + lop3.b32 %r10960, %r10960, %r29830, %r29828, 0x96; // end inline asm - ld.const.u32 %r2241, [matrix+528]; // begin inline asm - dp4a.u32.u32 %r2240, %r2241, %r5762, %r2236; + shf.l.wrap.b32 %r10971, %r10924, %r10923, %r10870; // end inline asm - ld.const.u32 %r2245, [matrix+532]; // begin inline asm - dp4a.u32.u32 %r2244, %r2245, %r5766, %r2240; + shf.l.wrap.b32 %r10975, %r10923, %r10924, %r10870; // end inline asm - ld.const.u32 %r2249, [matrix+536]; + xor.b32 %r11405, %r10971, %r10959; + xor.b32 %r11406, %r10975, %r10960; + xor.b32 %r11238, %r29863, %r11405; + xor.b32 %r11241, %r29864, %r11406; + xor.b32 %r11145, %r29861, %r11405; + xor.b32 %r11144, %r29862, %r11406; + xor.b32 %r11192, %r29859, %r11405; + xor.b32 %r11193, %r29860, %r11406; + xor.b32 %r11097, %r29857, %r11405; + xor.b32 %r11096, %r29858, %r11406; + xor.b32 %r11048, %r29855, %r11405; + xor.b32 %r11049, %r29856, %r11406; // begin inline asm - dp4a.u32.u32 %r2248, %r2249, %r5770, %r2244; + shf.l.wrap.b32 %r10979, %r10936, %r10935, %r10870; // end inline asm - ld.const.u32 %r2253, [matrix+540]; // begin inline asm - dp4a.u32.u32 %r2252, %r2253, %r5774, %r2248; + shf.l.wrap.b32 %r10983, %r10935, %r10936, %r10870; // end inline asm - ld.const.u32 %r2257, 
[matrix+544]; + xor.b32 %r11407, %r10979, %r10911; + xor.b32 %r11408, %r10983, %r10912; + xor.b32 %r11200, %r29875, %r11407; + xor.b32 %r11201, %r29876, %r11408; + xor.b32 %r11017, %r29873, %r11407; + xor.b32 %r11016, %r29874, %r11408; + xor.b32 %r11176, %r29853, %r11407; + xor.b32 %r11177, %r29854, %r11408; + xor.b32 %r11137, %r29851, %r11407; + xor.b32 %r11136, %r29852, %r11408; + xor.b32 %r11120, %r29849, %r11407; + xor.b32 %r11121, %r29850, %r11408; // begin inline asm - dp4a.u32.u32 %r2256, %r2257, %r5778, %r2252; + shf.l.wrap.b32 %r10987, %r10948, %r10947, %r10870; // end inline asm - ld.const.u32 %r2261, [matrix+548]; // begin inline asm - dp4a.u32.u32 %r2260, %r2261, %r5782, %r2256; + shf.l.wrap.b32 %r10991, %r10947, %r10948, %r10870; // end inline asm - ld.const.u32 %r2265, [matrix+552]; + xor.b32 %r11409, %r10987, %r10923; + xor.b32 %r11410, %r10991, %r10924; + xor.b32 %r11057, %r29871, %r11409; + xor.b32 %r11056, %r29872, %r11410; + xor.b32 %r11184, %r29869, %r11409; + xor.b32 %r11185, %r29870, %r11410; + xor.b32 %r11065, %r29847, %r11409; + xor.b32 %r11064, %r29848, %r11410; + xor.b32 %r11168, %r29845, %r11409; + xor.b32 %r11169, %r29846, %r11410; + xor.b32 %r11033, %r29843, %r11409; + xor.b32 %r11032, %r29844, %r11410; // begin inline asm - dp4a.u32.u32 %r2264, %r2265, %r5786, %r2260; + shf.l.wrap.b32 %r10995, %r10960, %r10959, %r10870; // end inline asm - ld.const.u32 %r2269, [matrix+556]; // begin inline asm - dp4a.u32.u32 %r2268, %r2269, %r5790, %r2264; + shf.l.wrap.b32 %r10999, %r10959, %r10960, %r10870; // end inline asm - ld.const.u32 %r2273, [matrix+560]; + xor.b32 %r11411, %r10995, %r10935; + xor.b32 %r11412, %r10999, %r10936; + xor.b32 %r11152, %r29867, %r11411; + xor.b32 %r11153, %r29868, %r11412; + xor.b32 %r11129, %r29841, %r11411; + xor.b32 %r11128, %r29842, %r11412; + xor.b32 %r11072, %r29839, %r11411; + xor.b32 %r11073, %r29840, %r11412; + xor.b32 %r11160, %r29837, %r11411; + xor.b32 %r11161, %r29838, %r11412; + xor.b32 %r11089, %r29835, %r11411; + xor.b32 %r11088, %r29836, %r11412; // begin inline asm - dp4a.u32.u32 %r2272, %r2273, %r5794, %r2268; + shf.l.wrap.b32 %r11003, %r10912, %r10911, %r10870; // end inline asm - ld.const.u32 %r2277, [matrix+564]; // begin inline asm - dp4a.u32.u32 %r2276, %r2277, %r5798, %r2272; + shf.l.wrap.b32 %r11007, %r10911, %r10912, %r10870; // end inline asm - ld.const.u32 %r2281, [matrix+568]; + xor.b32 %r11413, %r11003, %r10947; + xor.b32 %r11414, %r11007, %r10948; + xor.b32 %r11104, %r29865, %r11413; + xor.b32 %r11105, %r29866, %r11414; + xor.b32 %r11024, %r29833, %r11413; + xor.b32 %r11025, %r29834, %r11414; + xor.b32 %r11041, %r29831, %r11413; + xor.b32 %r11040, %r29832, %r11414; + xor.b32 %r11080, %r29829, %r11413; + xor.b32 %r11081, %r29830, %r11414; + xor.b32 %r11112, %r29827, %r11413; + xor.b32 %r11113, %r29828, %r11414; + mov.u32 %r11018, 44; // begin inline asm - dp4a.u32.u32 %r2280, %r2281, %r5802, %r2276; + shf.l.wrap.b32 %r11011, %r11017, %r11016, %r11018; // end inline asm - ld.const.u32 %r2285, [matrix+572]; // begin inline asm - dp4a.u32.u32 %r2284, %r2285, %r5806, %r2280; + shf.l.wrap.b32 %r11015, %r11016, %r11017, %r11018; // end inline asm - ld.const.u32 %r2289, [matrix+576]; + mov.u32 %r11026, 20; // begin inline asm - dp4a.u32.u32 %r2288, %r2289, %r5746, %r6244; + shf.l.wrap.b32 %r11019, %r11025, %r11024, %r11026; // end inline asm - ld.const.u32 %r2293, [matrix+580]; // begin inline asm - dp4a.u32.u32 %r2292, %r2293, %r5750, %r2288; + shf.l.wrap.b32 %r11023, %r11024, %r11025, %r11026; // end inline asm - 
ld.const.u32 %r2297, [matrix+584]; + mov.u32 %r11034, 61; // begin inline asm - dp4a.u32.u32 %r2296, %r2297, %r5754, %r2292; + shf.l.wrap.b32 %r11027, %r11033, %r11032, %r11034; // end inline asm - ld.const.u32 %r2301, [matrix+588]; // begin inline asm - dp4a.u32.u32 %r2300, %r2301, %r5758, %r2296; + shf.l.wrap.b32 %r11031, %r11032, %r11033, %r11034; // end inline asm - ld.const.u32 %r2305, [matrix+592]; + mov.u32 %r11042, 39; // begin inline asm - dp4a.u32.u32 %r2304, %r2305, %r5762, %r2300; + shf.l.wrap.b32 %r11035, %r11041, %r11040, %r11042; // end inline asm - ld.const.u32 %r2309, [matrix+596]; // begin inline asm - dp4a.u32.u32 %r2308, %r2309, %r5766, %r2304; + shf.l.wrap.b32 %r11039, %r11040, %r11041, %r11042; // end inline asm - ld.const.u32 %r2313, [matrix+600]; + mov.u32 %r11050, 18; // begin inline asm - dp4a.u32.u32 %r2312, %r2313, %r5770, %r2308; + shf.l.wrap.b32 %r11043, %r11049, %r11048, %r11050; // end inline asm - ld.const.u32 %r2317, [matrix+604]; // begin inline asm - dp4a.u32.u32 %r2316, %r2317, %r5774, %r2312; + shf.l.wrap.b32 %r11047, %r11048, %r11049, %r11050; // end inline asm - ld.const.u32 %r2321, [matrix+608]; + mov.u32 %r11058, 62; // begin inline asm - dp4a.u32.u32 %r2320, %r2321, %r5778, %r2316; + shf.l.wrap.b32 %r11051, %r11057, %r11056, %r11058; // end inline asm - ld.const.u32 %r2325, [matrix+612]; // begin inline asm - dp4a.u32.u32 %r2324, %r2325, %r5782, %r2320; + shf.l.wrap.b32 %r11055, %r11056, %r11057, %r11058; // end inline asm - ld.const.u32 %r2329, [matrix+616]; + mov.u32 %r11066, 43; // begin inline asm - dp4a.u32.u32 %r2328, %r2329, %r5786, %r2324; + shf.l.wrap.b32 %r11059, %r11065, %r11064, %r11066; // end inline asm - ld.const.u32 %r2333, [matrix+620]; // begin inline asm - dp4a.u32.u32 %r2332, %r2333, %r5790, %r2328; + shf.l.wrap.b32 %r11063, %r11064, %r11065, %r11066; // end inline asm - ld.const.u32 %r2337, [matrix+624]; + mov.u32 %r11074, 25; // begin inline asm - dp4a.u32.u32 %r2336, %r2337, %r5794, %r2332; + shf.l.wrap.b32 %r11067, %r11073, %r11072, %r11074; // end inline asm - ld.const.u32 %r2341, [matrix+628]; // begin inline asm - dp4a.u32.u32 %r2340, %r2341, %r5798, %r2336; + shf.l.wrap.b32 %r11071, %r11072, %r11073, %r11074; // end inline asm - ld.const.u32 %r2345, [matrix+632]; + mov.u32 %r11082, 8; // begin inline asm - dp4a.u32.u32 %r2344, %r2345, %r5802, %r2340; + shf.l.wrap.b32 %r11075, %r11081, %r11080, %r11082; // end inline asm - ld.const.u32 %r2349, [matrix+636]; // begin inline asm - dp4a.u32.u32 %r2348, %r2349, %r5806, %r2344; + shf.l.wrap.b32 %r11079, %r11080, %r11081, %r11082; // end inline asm - shr.u32 %r5986, %r2284, 6; - and.b32 %r5987, %r5986, 240; - shr.u32 %r5988, %r2348, 10; - or.b32 %r5989, %r5988, %r5987; - cvt.u64.u32 %rd211, %r5989; - xor.b64 %rd212, %rd10, %rd211; - ld.const.u32 %r2353, [matrix+640]; + mov.u32 %r11090, 56; // begin inline asm - dp4a.u32.u32 %r2352, %r2353, %r5746, %r6244; + shf.l.wrap.b32 %r11083, %r11089, %r11088, %r11090; // end inline asm - ld.const.u32 %r2357, [matrix+644]; // begin inline asm - dp4a.u32.u32 %r2356, %r2357, %r5750, %r2352; + shf.l.wrap.b32 %r11087, %r11088, %r11089, %r11090; // end inline asm - ld.const.u32 %r2361, [matrix+648]; + mov.u32 %r11098, 41; // begin inline asm - dp4a.u32.u32 %r2360, %r2361, %r5754, %r2356; + shf.l.wrap.b32 %r11091, %r11097, %r11096, %r11098; // end inline asm - ld.const.u32 %r2365, [matrix+652]; // begin inline asm - dp4a.u32.u32 %r2364, %r2365, %r5758, %r2360; + shf.l.wrap.b32 %r11095, %r11096, %r11097, %r11098; // end inline asm - ld.const.u32 
%r2369, [matrix+656]; + mov.u32 %r11106, 27; // begin inline asm - dp4a.u32.u32 %r2368, %r2369, %r5762, %r2364; + shf.l.wrap.b32 %r11099, %r11105, %r11104, %r11106; // end inline asm - ld.const.u32 %r2373, [matrix+660]; // begin inline asm - dp4a.u32.u32 %r2372, %r2373, %r5766, %r2368; + shf.l.wrap.b32 %r11103, %r11104, %r11105, %r11106; // end inline asm - ld.const.u32 %r2377, [matrix+664]; + mov.u32 %r11114, 14; // begin inline asm - dp4a.u32.u32 %r2376, %r2377, %r5770, %r2372; + shf.l.wrap.b32 %r11107, %r11113, %r11112, %r11114; // end inline asm - ld.const.u32 %r2381, [matrix+668]; // begin inline asm - dp4a.u32.u32 %r2380, %r2381, %r5774, %r2376; + shf.l.wrap.b32 %r11111, %r11112, %r11113, %r11114; // end inline asm - ld.const.u32 %r2385, [matrix+672]; + mov.u32 %r11122, 2; // begin inline asm - dp4a.u32.u32 %r2384, %r2385, %r5778, %r2380; + shf.l.wrap.b32 %r11115, %r11121, %r11120, %r11122; // end inline asm - ld.const.u32 %r2389, [matrix+676]; // begin inline asm - dp4a.u32.u32 %r2388, %r2389, %r5782, %r2384; + shf.l.wrap.b32 %r11119, %r11120, %r11121, %r11122; // end inline asm - ld.const.u32 %r2393, [matrix+680]; + mov.u32 %r11130, 55; // begin inline asm - dp4a.u32.u32 %r2392, %r2393, %r5786, %r2388; + shf.l.wrap.b32 %r11123, %r11129, %r11128, %r11130; // end inline asm - ld.const.u32 %r2397, [matrix+684]; // begin inline asm - dp4a.u32.u32 %r2396, %r2397, %r5790, %r2392; + shf.l.wrap.b32 %r11127, %r11128, %r11129, %r11130; // end inline asm - ld.const.u32 %r2401, [matrix+688]; + mov.u32 %r11138, 45; // begin inline asm - dp4a.u32.u32 %r2400, %r2401, %r5794, %r2396; + shf.l.wrap.b32 %r11131, %r11137, %r11136, %r11138; // end inline asm - ld.const.u32 %r2405, [matrix+692]; // begin inline asm - dp4a.u32.u32 %r2404, %r2405, %r5798, %r2400; + shf.l.wrap.b32 %r11135, %r11136, %r11137, %r11138; // end inline asm - ld.const.u32 %r2409, [matrix+696]; + mov.u32 %r11146, 36; // begin inline asm - dp4a.u32.u32 %r2408, %r2409, %r5802, %r2404; + shf.l.wrap.b32 %r11139, %r11145, %r11144, %r11146; // end inline asm - ld.const.u32 %r2413, [matrix+700]; // begin inline asm - dp4a.u32.u32 %r2412, %r2413, %r5806, %r2408; + shf.l.wrap.b32 %r11143, %r11144, %r11145, %r11146; // end inline asm - ld.const.u32 %r2417, [matrix+704]; + mov.u32 %r11154, 28; // begin inline asm - dp4a.u32.u32 %r2416, %r2417, %r5746, %r6244; + shf.l.wrap.b32 %r11147, %r11153, %r11152, %r11154; // end inline asm - ld.const.u32 %r2421, [matrix+708]; // begin inline asm - dp4a.u32.u32 %r2420, %r2421, %r5750, %r2416; + shf.l.wrap.b32 %r11151, %r11152, %r11153, %r11154; // end inline asm - ld.const.u32 %r2425, [matrix+712]; + mov.u32 %r11162, 21; // begin inline asm - dp4a.u32.u32 %r2424, %r2425, %r5754, %r2420; + shf.l.wrap.b32 %r11155, %r11161, %r11160, %r11162; // end inline asm - ld.const.u32 %r2429, [matrix+716]; // begin inline asm - dp4a.u32.u32 %r2428, %r2429, %r5758, %r2424; + shf.l.wrap.b32 %r11159, %r11160, %r11161, %r11162; // end inline asm - ld.const.u32 %r2433, [matrix+720]; + mov.u32 %r11170, 15; // begin inline asm - dp4a.u32.u32 %r2432, %r2433, %r5762, %r2428; + shf.l.wrap.b32 %r11163, %r11169, %r11168, %r11170; // end inline asm - ld.const.u32 %r2437, [matrix+724]; // begin inline asm - dp4a.u32.u32 %r2436, %r2437, %r5766, %r2432; + shf.l.wrap.b32 %r11167, %r11168, %r11169, %r11170; // end inline asm - ld.const.u32 %r2441, [matrix+728]; + mov.u32 %r11178, 10; // begin inline asm - dp4a.u32.u32 %r2440, %r2441, %r5770, %r2436; + shf.l.wrap.b32 %r11171, %r11177, %r11176, %r11178; // end inline asm - ld.const.u32 
%r2445, [matrix+732]; // begin inline asm - dp4a.u32.u32 %r2444, %r2445, %r5774, %r2440; + shf.l.wrap.b32 %r11175, %r11176, %r11177, %r11178; // end inline asm - ld.const.u32 %r2449, [matrix+736]; + mov.u32 %r11186, 6; // begin inline asm - dp4a.u32.u32 %r2448, %r2449, %r5778, %r2444; + shf.l.wrap.b32 %r11179, %r11185, %r11184, %r11186; // end inline asm - ld.const.u32 %r2453, [matrix+740]; // begin inline asm - dp4a.u32.u32 %r2452, %r2453, %r5782, %r2448; + shf.l.wrap.b32 %r11183, %r11184, %r11185, %r11186; // end inline asm - ld.const.u32 %r2457, [matrix+744]; + mov.u32 %r11194, 3; // begin inline asm - dp4a.u32.u32 %r2456, %r2457, %r5786, %r2452; + shf.l.wrap.b32 %r11187, %r11193, %r11192, %r11194; // end inline asm - ld.const.u32 %r2461, [matrix+748]; // begin inline asm - dp4a.u32.u32 %r2460, %r2461, %r5790, %r2456; + shf.l.wrap.b32 %r11191, %r11192, %r11193, %r11194; // end inline asm - ld.const.u32 %r2465, [matrix+752]; // begin inline asm - dp4a.u32.u32 %r2464, %r2465, %r5794, %r2460; + shf.l.wrap.b32 %r11195, %r11201, %r11200, %r10870; // end inline asm - ld.const.u32 %r2469, [matrix+756]; // begin inline asm - dp4a.u32.u32 %r2468, %r2469, %r5798, %r2464; + shf.l.wrap.b32 %r11199, %r11200, %r11201, %r10870; // end inline asm - ld.const.u32 %r2473, [matrix+760]; // begin inline asm - dp4a.u32.u32 %r2472, %r2473, %r5802, %r2468; + // chi + lop3.b32 %r11203, %r11238, %r11011, %r11059, 0xD2; + lop3.b32 %r11204, %r11241, %r11015, %r11063, 0xD2; // end inline asm - ld.const.u32 %r2477, [matrix+764]; // begin inline asm - dp4a.u32.u32 %r2476, %r2477, %r5806, %r2472; + // chi + lop3.b32 %r29875, %r11011, %r11059, %r11155, 0xD2; + lop3.b32 %r29876, %r11015, %r11063, %r11159, 0xD2; // end inline asm - shr.u32 %r5990, %r2412, 6; - and.b32 %r5991, %r5990, 240; - shr.u32 %r5992, %r2476, 10; - or.b32 %r5993, %r5992, %r5991; - cvt.u64.u32 %rd213, %r5993; - xor.b64 %rd214, %rd11, %rd213; - ld.const.u32 %r2481, [matrix+768]; // begin inline asm - dp4a.u32.u32 %r2480, %r2481, %r5746, %r6244; + // chi + lop3.b32 %r29871, %r11059, %r11155, %r11107, 0xD2; + lop3.b32 %r29872, %r11063, %r11159, %r11111, 0xD2; // end inline asm - ld.const.u32 %r2485, [matrix+772]; // begin inline asm - dp4a.u32.u32 %r2484, %r2485, %r5750, %r2480; + // chi + lop3.b32 %r29867, %r11155, %r11107, %r11238, 0xD2; + lop3.b32 %r29868, %r11159, %r11111, %r11241, 0xD2; // end inline asm - ld.const.u32 %r2489, [matrix+776]; // begin inline asm - dp4a.u32.u32 %r2488, %r2489, %r5754, %r2484; + // chi + lop3.b32 %r29865, %r11107, %r11238, %r11011, 0xD2; + lop3.b32 %r29866, %r11111, %r11241, %r11015, 0xD2; // end inline asm - ld.const.u32 %r2493, [matrix+780]; // begin inline asm - dp4a.u32.u32 %r2492, %r2493, %r5758, %r2488; + // chi + lop3.b32 %r29861, %r11147, %r11019, %r11187, 0xD2; + lop3.b32 %r29862, %r11151, %r11023, %r11191, 0xD2; // end inline asm - ld.const.u32 %r2497, [matrix+784]; // begin inline asm - dp4a.u32.u32 %r2496, %r2497, %r5762, %r2492; + // chi + lop3.b32 %r29873, %r11019, %r11187, %r11131, 0xD2; + lop3.b32 %r29874, %r11023, %r11191, %r11135, 0xD2; // end inline asm - ld.const.u32 %r2501, [matrix+788]; // begin inline asm - dp4a.u32.u32 %r2500, %r2501, %r5766, %r2496; + // chi + lop3.b32 %r29869, %r11187, %r11131, %r11027, 0xD2; + lop3.b32 %r29870, %r11191, %r11135, %r11031, 0xD2; // end inline asm - ld.const.u32 %r2505, [matrix+792]; // begin inline asm - dp4a.u32.u32 %r2504, %r2505, %r5770, %r2500; + // chi + lop3.b32 %r29841, %r11131, %r11027, %r11147, 0xD2; + lop3.b32 %r29842, %r11135, %r11031, %r11151, 0xD2; 
// end inline asm - ld.const.u32 %r2509, [matrix+796]; + st.local.v2.u32 [%rd84+88], {%r29841, %r29842}; // begin inline asm - dp4a.u32.u32 %r2508, %r2509, %r5774, %r2504; + // chi + lop3.b32 %r29833, %r11027, %r11147, %r11019, 0xD2; + lop3.b32 %r29834, %r11031, %r11151, %r11023, 0xD2; // end inline asm - ld.const.u32 %r2513, [matrix+800]; + st.local.v2.u32 [%rd84+96], {%r29833, %r29834}; // begin inline asm - dp4a.u32.u32 %r2512, %r2513, %r5778, %r2508; + // chi + lop3.b32 %r29859, %r11195, %r11179, %r11067, 0xD2; + lop3.b32 %r29860, %r11199, %r11183, %r11071, 0xD2; // end inline asm - ld.const.u32 %r2517, [matrix+804]; + st.local.v2.u32 [%rd84+104], {%r29859, %r29860}; // begin inline asm - dp4a.u32.u32 %r2516, %r2517, %r5782, %r2512; + // chi + lop3.b32 %r29853, %r11179, %r11067, %r11075, 0xD2; + lop3.b32 %r29854, %r11183, %r11071, %r11079, 0xD2; // end inline asm - ld.const.u32 %r2521, [matrix+808]; + st.local.v2.u32 [%rd84+112], {%r29853, %r29854}; // begin inline asm - dp4a.u32.u32 %r2520, %r2521, %r5786, %r2516; + // chi + lop3.b32 %r29847, %r11067, %r11075, %r11043, 0xD2; + lop3.b32 %r29848, %r11071, %r11079, %r11047, 0xD2; // end inline asm - ld.const.u32 %r2525, [matrix+812]; + st.local.v2.u32 [%rd84+120], {%r29847, %r29848}; // begin inline asm - dp4a.u32.u32 %r2524, %r2525, %r5790, %r2520; + // chi + lop3.b32 %r29839, %r11075, %r11043, %r11195, 0xD2; + lop3.b32 %r29840, %r11079, %r11047, %r11199, 0xD2; // end inline asm - ld.const.u32 %r2529, [matrix+816]; + st.local.v2.u32 [%rd84+128], {%r29839, %r29840}; // begin inline asm - dp4a.u32.u32 %r2528, %r2529, %r5794, %r2524; + // chi + lop3.b32 %r29831, %r11043, %r11195, %r11179, 0xD2; + lop3.b32 %r29832, %r11047, %r11199, %r11183, 0xD2; // end inline asm - ld.const.u32 %r2533, [matrix+820]; + st.local.v2.u32 [%rd84+136], {%r29831, %r29832}; // begin inline asm - dp4a.u32.u32 %r2532, %r2533, %r5798, %r2528; + // chi + lop3.b32 %r29857, %r11099, %r11139, %r11171, 0xD2; + lop3.b32 %r29858, %r11103, %r11143, %r11175, 0xD2; // end inline asm - ld.const.u32 %r2537, [matrix+824]; + st.local.v2.u32 [%rd84+144], {%r29857, %r29858}; // begin inline asm - dp4a.u32.u32 %r2536, %r2537, %r5802, %r2532; + // chi + lop3.b32 %r29851, %r11139, %r11171, %r11163, 0xD2; + lop3.b32 %r29852, %r11143, %r11175, %r11167, 0xD2; // end inline asm - ld.const.u32 %r2541, [matrix+828]; + st.local.v2.u32 [%rd84+152], {%r29851, %r29852}; // begin inline asm - dp4a.u32.u32 %r2540, %r2541, %r5806, %r2536; + // chi + lop3.b32 %r29845, %r11171, %r11163, %r11083, 0xD2; + lop3.b32 %r29846, %r11175, %r11167, %r11087, 0xD2; // end inline asm - ld.const.u32 %r2545, [matrix+832]; + st.local.v2.u32 [%rd84+160], {%r29845, %r29846}; // begin inline asm - dp4a.u32.u32 %r2544, %r2545, %r5746, %r6244; + // chi + lop3.b32 %r29837, %r11163, %r11083, %r11099, 0xD2; + lop3.b32 %r29838, %r11167, %r11087, %r11103, 0xD2; // end inline asm - ld.const.u32 %r2549, [matrix+836]; + st.local.v2.u32 [%rd84+168], {%r29837, %r29838}; // begin inline asm - dp4a.u32.u32 %r2548, %r2549, %r5750, %r2544; + // chi + lop3.b32 %r29829, %r11083, %r11099, %r11139, 0xD2; + lop3.b32 %r29830, %r11087, %r11103, %r11143, 0xD2; // end inline asm - ld.const.u32 %r2553, [matrix+840]; + st.local.v2.u32 [%rd84+176], {%r29829, %r29830}; // begin inline asm - dp4a.u32.u32 %r2552, %r2553, %r5754, %r2548; + // chi + lop3.b32 %r29855, %r11051, %r11123, %r11035, 0xD2; + lop3.b32 %r29856, %r11055, %r11127, %r11039, 0xD2; // end inline asm - ld.const.u32 %r2557, [matrix+844]; + st.local.v2.u32 [%rd84+184], {%r29855, 
%r29856}; // begin inline asm - dp4a.u32.u32 %r2556, %r2557, %r5758, %r2552; + // chi + lop3.b32 %r29849, %r11123, %r11035, %r11091, 0xD2; + lop3.b32 %r29850, %r11127, %r11039, %r11095, 0xD2; // end inline asm - ld.const.u32 %r2561, [matrix+848]; + st.local.v2.u32 [%rd84+192], {%r29849, %r29850}; // begin inline asm - dp4a.u32.u32 %r2560, %r2561, %r5762, %r2556; + // chi + lop3.b32 %r29843, %r11035, %r11091, %r11115, 0xD2; + lop3.b32 %r29844, %r11039, %r11095, %r11119, 0xD2; // end inline asm - ld.const.u32 %r2565, [matrix+852]; + st.local.v2.u32 [%rd84+200], {%r29843, %r29844}; // begin inline asm - dp4a.u32.u32 %r2564, %r2565, %r5766, %r2560; + // chi + lop3.b32 %r29835, %r11091, %r11115, %r11051, 0xD2; + lop3.b32 %r29836, %r11095, %r11119, %r11055, 0xD2; // end inline asm - ld.const.u32 %r2569, [matrix+856]; + st.local.v2.u32 [%rd84+208], {%r29835, %r29836}; // begin inline asm - dp4a.u32.u32 %r2568, %r2569, %r5770, %r2564; + // chi + lop3.b32 %r29827, %r11115, %r11051, %r11123, 0xD2; + lop3.b32 %r29828, %r11119, %r11055, %r11127, 0xD2; // end inline asm - ld.const.u32 %r2573, [matrix+860]; + st.local.v2.u32 [%rd84+216], {%r29827, %r29828}; + mul.wide.s32 %rd618, %r29877, 8; + add.s64 %rd617, %rd597, %rd618; // begin inline asm - dp4a.u32.u32 %r2572, %r2573, %r5774, %r2568; + ld.global.nc.v2.u32 {%r11403,%r11404}, [%rd617]; // end inline asm - ld.const.u32 %r2577, [matrix+864]; + xor.b32 %r29863, %r11203, %r11403; + xor.b32 %r29864, %r11204, %r11404; + add.s32 %r29877, %r29877, 1; + setp.lt.u32 %p24, %r29877, 23; + @%p24 bra $L__BB2_36; + + mov.u32 %r29910, 0; + mov.u32 %r11514, 1; + st.local.v2.u32 [%rd84+32], {%r29875, %r29876}; + st.local.v2.u32 [%rd84+72], {%r29873, %r29874}; + st.local.v2.u32 [%rd84+40], {%r29871, %r29872}; + st.local.v2.u32 [%rd84+80], {%r29869, %r29870}; + st.local.v2.u32 [%rd84+48], {%r29867, %r29868}; + st.local.v2.u32 [%rd84+56], {%r29865, %r29866}; + st.local.v2.u32 [%rd84+24], {%r29863, %r29864}; // begin inline asm - dp4a.u32.u32 %r2576, %r2577, %r5778, %r2572; + // xor5 + lop3.b32 %r11415, %r29863, %r29861, %r29859, 0x96; + lop3.b32 %r11415, %r11415, %r29857, %r29855, 0x96; + lop3.b32 %r11416, %r29864, %r29862, %r29860, 0x96; + lop3.b32 %r11416, %r11416, %r29858, %r29856, 0x96; // end inline asm - ld.const.u32 %r2581, [matrix+868]; // begin inline asm - dp4a.u32.u32 %r2580, %r2581, %r5782, %r2576; + // xor5 + lop3.b32 %r11427, %r29875, %r29873, %r29853, 0x96; + lop3.b32 %r11427, %r11427, %r29851, %r29849, 0x96; + lop3.b32 %r11428, %r29876, %r29874, %r29854, 0x96; + lop3.b32 %r11428, %r11428, %r29852, %r29850, 0x96; // end inline asm - ld.const.u32 %r2585, [matrix+872]; // begin inline asm - dp4a.u32.u32 %r2584, %r2585, %r5786, %r2580; + // xor5 + lop3.b32 %r11439, %r29871, %r29869, %r29847, 0x96; + lop3.b32 %r11439, %r11439, %r29845, %r29843, 0x96; + lop3.b32 %r11440, %r29872, %r29870, %r29848, 0x96; + lop3.b32 %r11440, %r11440, %r29846, %r29844, 0x96; // end inline asm - ld.const.u32 %r2589, [matrix+876]; // begin inline asm - dp4a.u32.u32 %r2588, %r2589, %r5790, %r2584; + // xor5 + lop3.b32 %r11451, %r29867, %r29841, %r29839, 0x96; + lop3.b32 %r11451, %r11451, %r29837, %r29835, 0x96; + lop3.b32 %r11452, %r29868, %r29842, %r29840, 0x96; + lop3.b32 %r11452, %r11452, %r29838, %r29836, 0x96; // end inline asm - ld.const.u32 %r2593, [matrix+880]; // begin inline asm - dp4a.u32.u32 %r2592, %r2593, %r5794, %r2588; + // xor5 + lop3.b32 %r11463, %r29865, %r29833, %r29831, 0x96; + lop3.b32 %r11463, %r11463, %r29829, %r29827, 0x96; + lop3.b32 %r11464, %r29866, 
%r29834, %r29832, 0x96; + lop3.b32 %r11464, %r11464, %r29830, %r29828, 0x96; // end inline asm - ld.const.u32 %r2597, [matrix+884]; // begin inline asm - dp4a.u32.u32 %r2596, %r2597, %r5798, %r2592; + shf.l.wrap.b32 %r11475, %r11428, %r11427, %r11514; // end inline asm - ld.const.u32 %r2601, [matrix+888]; // begin inline asm - dp4a.u32.u32 %r2600, %r2601, %r5802, %r2596; + shf.l.wrap.b32 %r11479, %r11427, %r11428, %r11514; // end inline asm - ld.const.u32 %r2605, [matrix+892]; + xor.b32 %r11654, %r11475, %r11463; + xor.b32 %r11655, %r11479, %r11464; + xor.b32 %r11622, %r29863, %r11654; + xor.b32 %r11625, %r29864, %r11655; + xor.b32 %r11585, %r29860, %r11655; + xor.b32 %r11584, %r29859, %r11654; + st.local.v2.u32 [%rd84+104], {%r11584, %r11585}; // begin inline asm - dp4a.u32.u32 %r2604, %r2605, %r5806, %r2600; + shf.l.wrap.b32 %r11483, %r11440, %r11439, %r11514; // end inline asm - shr.u32 %r5994, %r2540, 6; - and.b32 %r5995, %r5994, 240; - shr.u32 %r5996, %r2604, 10; - or.b32 %r5997, %r5996, %r5995; - cvt.u64.u32 %rd215, %r5997; - xor.b64 %rd216, %rd12, %rd215; - ld.const.u32 %r2609, [matrix+896]; // begin inline asm - dp4a.u32.u32 %r2608, %r2609, %r5746, %r6244; + shf.l.wrap.b32 %r11487, %r11439, %r11440, %r11514; // end inline asm - ld.const.u32 %r2613, [matrix+900]; + xor.b32 %r11656, %r11483, %r11415; + xor.b32 %r11657, %r11487, %r11416; + xor.b32 %r11521, %r29873, %r11656; + xor.b32 %r11520, %r29874, %r11657; + xor.b32 %r11560, %r29852, %r11657; + xor.b32 %r11561, %r29851, %r11656; + st.local.v2.u32 [%rd84+152], {%r11561, %r11560}; // begin inline asm - dp4a.u32.u32 %r2612, %r2613, %r5750, %r2608; + shf.l.wrap.b32 %r11491, %r11452, %r11451, %r11514; // end inline asm - ld.const.u32 %r2617, [matrix+904]; // begin inline asm - dp4a.u32.u32 %r2616, %r2617, %r5754, %r2612; + shf.l.wrap.b32 %r11495, %r11451, %r11452, %r11514; // end inline asm - ld.const.u32 %r2621, [matrix+908]; + xor.b32 %r11658, %r11491, %r11427; + xor.b32 %r11659, %r11495, %r11428; + xor.b32 %r11544, %r29848, %r11659; + xor.b32 %r11545, %r29847, %r11658; + st.local.v2.u32 [%rd84+120], {%r11545, %r11544}; + xor.b32 %r11536, %r29844, %r11659; + xor.b32 %r11537, %r29843, %r11658; + st.local.v2.u32 [%rd84+200], {%r11537, %r11536}; // begin inline asm - dp4a.u32.u32 %r2620, %r2621, %r5758, %r2616; + shf.l.wrap.b32 %r11499, %r11464, %r11463, %r11514; // end inline asm - ld.const.u32 %r2625, [matrix+912]; // begin inline asm - dp4a.u32.u32 %r2624, %r2625, %r5762, %r2620; + shf.l.wrap.b32 %r11503, %r11463, %r11464, %r11514; // end inline asm - ld.const.u32 %r2629, [matrix+916]; + xor.b32 %r11660, %r11499, %r11439; + xor.b32 %r11661, %r11503, %r11440; + xor.b32 %r11568, %r29867, %r11660; + xor.b32 %r11569, %r29868, %r11661; + xor.b32 %r11577, %r29838, %r11661; + xor.b32 %r11576, %r29837, %r11660; + st.local.v2.u32 [%rd84+168], {%r11576, %r11577}; // begin inline asm - dp4a.u32.u32 %r2628, %r2629, %r5766, %r2624; + shf.l.wrap.b32 %r11507, %r11416, %r11415, %r11514; // end inline asm - ld.const.u32 %r2633, [matrix+920]; // begin inline asm - dp4a.u32.u32 %r2632, %r2633, %r5770, %r2628; + shf.l.wrap.b32 %r11511, %r11415, %r11416, %r11514; // end inline asm - ld.const.u32 %r2637, [matrix+924]; + xor.b32 %r11662, %r11507, %r11451; + xor.b32 %r11663, %r11511, %r11452; + xor.b32 %r11528, %r29833, %r11662; + xor.b32 %r11529, %r29834, %r11663; + xor.b32 %r11553, %r29828, %r11663; + xor.b32 %r11552, %r29827, %r11662; + st.local.v2.u32 [%rd84+216], {%r11552, %r11553}; // begin inline asm - dp4a.u32.u32 %r2636, %r2637, %r5774, %r2632; + 
shf.l.wrap.b32 %r11515, %r11521, %r11520, %r11018; // end inline asm - ld.const.u32 %r2641, [matrix+928]; // begin inline asm - dp4a.u32.u32 %r2640, %r2641, %r5778, %r2636; + shf.l.wrap.b32 %r11519, %r11520, %r11521, %r11018; // end inline asm - ld.const.u32 %r2645, [matrix+932]; // begin inline asm - dp4a.u32.u32 %r2644, %r2645, %r5782, %r2640; + shf.l.wrap.b32 %r11523, %r11529, %r11528, %r11026; // end inline asm - ld.const.u32 %r2649, [matrix+936]; // begin inline asm - dp4a.u32.u32 %r2648, %r2649, %r5786, %r2644; + shf.l.wrap.b32 %r11527, %r11528, %r11529, %r11026; // end inline asm - ld.const.u32 %r2653, [matrix+940]; // begin inline asm - dp4a.u32.u32 %r2652, %r2653, %r5790, %r2648; + shf.l.wrap.b32 %r11535, %r11536, %r11537, %r11034; // end inline asm - ld.const.u32 %r2657, [matrix+944]; // begin inline asm - dp4a.u32.u32 %r2656, %r2657, %r5794, %r2652; + shf.l.wrap.b32 %r11531, %r11537, %r11536, %r11034; // end inline asm - ld.const.u32 %r2661, [matrix+948]; + st.local.v2.u32 [%rd84+96], {%r11531, %r11535}; // begin inline asm - dp4a.u32.u32 %r2660, %r2661, %r5798, %r2656; + shf.l.wrap.b32 %r11539, %r11545, %r11544, %r11066; // end inline asm - ld.const.u32 %r2665, [matrix+952]; // begin inline asm - dp4a.u32.u32 %r2664, %r2665, %r5802, %r2660; + shf.l.wrap.b32 %r11543, %r11544, %r11545, %r11066; // end inline asm - ld.const.u32 %r2669, [matrix+956]; // begin inline asm - dp4a.u32.u32 %r2668, %r2669, %r5806, %r2664; + shf.l.wrap.b32 %r11547, %r11553, %r11552, %r11114; // end inline asm - ld.const.u32 %r2673, [matrix+960]; // begin inline asm - dp4a.u32.u32 %r2672, %r2673, %r5746, %r6244; + shf.l.wrap.b32 %r11551, %r11552, %r11553, %r11114; // end inline asm - ld.const.u32 %r2677, [matrix+964]; // begin inline asm - dp4a.u32.u32 %r2676, %r2677, %r5750, %r2672; + shf.l.wrap.b32 %r11559, %r11560, %r11561, %r11138; // end inline asm - ld.const.u32 %r2681, [matrix+968]; // begin inline asm - dp4a.u32.u32 %r2680, %r2681, %r5754, %r2676; + shf.l.wrap.b32 %r11555, %r11561, %r11560, %r11138; // end inline asm - ld.const.u32 %r2685, [matrix+972]; + st.local.v2.u32 [%rd84+88], {%r11555, %r11559}; // begin inline asm - dp4a.u32.u32 %r2684, %r2685, %r5758, %r2680; + shf.l.wrap.b32 %r11563, %r11569, %r11568, %r11154; // end inline asm - ld.const.u32 %r2689, [matrix+976]; // begin inline asm - dp4a.u32.u32 %r2688, %r2689, %r5762, %r2684; + shf.l.wrap.b32 %r11567, %r11568, %r11569, %r11154; // end inline asm - ld.const.u32 %r2693, [matrix+980]; // begin inline asm - dp4a.u32.u32 %r2692, %r2693, %r5766, %r2688; + shf.l.wrap.b32 %r11571, %r11577, %r11576, %r11162; // end inline asm - ld.const.u32 %r2697, [matrix+984]; // begin inline asm - dp4a.u32.u32 %r2696, %r2697, %r5770, %r2692; + shf.l.wrap.b32 %r11575, %r11576, %r11577, %r11162; // end inline asm - ld.const.u32 %r2701, [matrix+988]; // begin inline asm - dp4a.u32.u32 %r2700, %r2701, %r5774, %r2696; + shf.l.wrap.b32 %r11579, %r11585, %r11584, %r11194; // end inline asm - ld.const.u32 %r2705, [matrix+992]; // begin inline asm - dp4a.u32.u32 %r2704, %r2705, %r5778, %r2700; + shf.l.wrap.b32 %r11583, %r11584, %r11585, %r11194; // end inline asm - ld.const.u32 %r2709, [matrix+996]; // begin inline asm - dp4a.u32.u32 %r2708, %r2709, %r5782, %r2704; + // chi + lop3.b32 %r11587, %r11622, %r11515, %r11539, 0xD2; + lop3.b32 %r11588, %r11625, %r11519, %r11543, 0xD2; // end inline asm - ld.const.u32 %r2713, [matrix+1000]; // begin inline asm - dp4a.u32.u32 %r2712, %r2713, %r5786, %r2708; + // chi + lop3.b32 %r30010, %r11515, %r11539, %r11571, 0xD2; + 
lop3.b32 %r30011, %r11519, %r11543, %r11575, 0xD2; // end inline asm - ld.const.u32 %r2717, [matrix+1004]; + st.local.v2.u32 [%rd84+32], {%r30010, %r30011}; // begin inline asm - dp4a.u32.u32 %r2716, %r2717, %r5790, %r2712; + // chi + lop3.b32 %r30006, %r11539, %r11571, %r11547, 0xD2; + lop3.b32 %r30007, %r11543, %r11575, %r11551, 0xD2; // end inline asm - ld.const.u32 %r2721, [matrix+1008]; + st.local.v2.u32 [%rd84+40], {%r30006, %r30007}; // begin inline asm - dp4a.u32.u32 %r2720, %r2721, %r5794, %r2716; + // chi + lop3.b32 %r30002, %r11571, %r11547, %r11622, 0xD2; + lop3.b32 %r30003, %r11575, %r11551, %r11625, 0xD2; // end inline asm - ld.const.u32 %r2725, [matrix+1012]; + st.local.v2.u32 [%rd84+48], {%r30002, %r30003}; // begin inline asm - dp4a.u32.u32 %r2724, %r2725, %r5798, %r2720; + // chi + lop3.b32 %r30000, %r11547, %r11622, %r11515, 0xD2; + lop3.b32 %r30001, %r11551, %r11625, %r11519, 0xD2; // end inline asm - ld.const.u32 %r2729, [matrix+1016]; + st.local.v2.u32 [%rd84+56], {%r30000, %r30001}; // begin inline asm - dp4a.u32.u32 %r2728, %r2729, %r5802, %r2724; + // chi + lop3.b32 %r29996, %r11563, %r11523, %r11579, 0xD2; + lop3.b32 %r29997, %r11567, %r11527, %r11583, 0xD2; // end inline asm - ld.const.u32 %r2733, [matrix+1020]; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; // begin inline asm - dp4a.u32.u32 %r2732, %r2733, %r5806, %r2728; + // chi + lop3.b32 %r30008, %r11523, %r11579, %r11555, 0xD2; + lop3.b32 %r30009, %r11527, %r11583, %r11559, 0xD2; // end inline asm - shr.u32 %r5998, %r2668, 6; - and.b32 %r5999, %r5998, 240; - ld.const.u32 %r2737, [matrix+1024]; + st.local.v2.u32 [%rd84+72], {%r30008, %r30009}; // begin inline asm - dp4a.u32.u32 %r2736, %r2737, %r5746, %r6244; + // chi + lop3.b32 %r30004, %r11579, %r11555, %r11531, 0xD2; + lop3.b32 %r30005, %r11583, %r11559, %r11535, 0xD2; // end inline asm - ld.const.u32 %r2741, [matrix+1028]; + st.local.v2.u32 [%rd84+80], {%r30004, %r30005}; // begin inline asm - dp4a.u32.u32 %r2740, %r2741, %r5750, %r2736; + ld.global.nc.v2.u32 {%r11651,%r11652}, [%rd598]; + // end inline asm + xor.b32 %r29998, %r11587, %r11651; + xor.b32 %r29999, %r11588, %r11652; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + add.s64 %rd86, %rd84, 24; + add.s64 %rd87, %rd3, 24; + +$L__BB2_38: + shl.b32 %r11664, %r29910, 2; + cvt.u64.u32 %rd628, %r11664; + and.b64 %rd629, %rd628, 60; + add.s64 %rd630, %rd87, %rd629; + xor.b32 %r11665, %r30, %r29910; + mul.lo.s32 %r11666, %r11665, 16777619; + ld.local.u32 %r11667, [%rd630]; + xor.b32 %r11668, %r11666, %r11667; + mul.wide.u32 %rd631, %r11668, -954391867; + shr.u64 %rd632, %rd631, 32; + cvt.u32.u64 %r11669, %rd632; + sub.s32 %r11670, %r11668, %r11669; + shr.u32 %r11671, %r11670, 1; + add.s32 %r11672, %r11671, %r11669; + shr.u32 %r11673, %r11672, 20; + mul.lo.s32 %r11674, %r11673, 1179641; + sub.s32 %r11675, %r11668, %r11674; + mul.wide.u32 %rd633, %r11675, 64; + add.s64 %rd634, %rd471, %rd633; + mul.lo.s32 %r11676, %r29947, 16777619; + ld.global.u32 %r11677, [%rd634]; + xor.b32 %r29947, %r11676, %r11677; + mul.lo.s32 %r11678, %r29948, 16777619; + ld.global.u32 %r11679, [%rd634+4]; + xor.b32 %r29948, %r11678, %r11679; + mul.lo.s32 %r11680, %r29959, 16777619; + ld.global.u32 %r11681, [%rd634+8]; + mul.lo.s32 %r11682, %r29960, 16777619; + ld.global.u32 %r11683, [%rd634+12]; + xor.b32 %r11684, %r11682, %r11683; + xor.b32 %r29959, %r11680, %r11681; + mov.b64 %rd635, {%r29959, %r11684}; + mul.lo.s32 %r11685, %r29955, 16777619; + ld.global.u32 %r11686, [%rd634+16]; + mul.lo.s32 %r11687, %r29956, 16777619; 
+ ld.global.u32 %r11688, [%rd634+20]; + xor.b32 %r11689, %r11687, %r11688; + xor.b32 %r29955, %r11685, %r11686; + mov.b64 %rd636, {%r29955, %r11689}; + mul.lo.s32 %r11690, %r29951, 16777619; + ld.global.u32 %r11691, [%rd634+24]; + mul.lo.s32 %r11692, %r29952, 16777619; + ld.global.u32 %r11693, [%rd634+28]; + xor.b32 %r11694, %r11692, %r11693; + xor.b32 %r29951, %r11690, %r11691; + mov.b64 %rd637, {%r29951, %r11694}; + mul.lo.s32 %r11695, %r29949, 16777619; + ld.global.u32 %r11696, [%rd634+32]; + mul.lo.s32 %r11697, %r29950, 16777619; + ld.global.u32 %r11698, [%rd634+36]; + xor.b32 %r11699, %r11697, %r11698; + xor.b32 %r29949, %r11695, %r11696; + mov.b64 %rd638, {%r29949, %r11699}; + mul.lo.s32 %r11700, %r29945, 16777619; + ld.global.u32 %r11701, [%rd634+40]; + xor.b32 %r29945, %r11700, %r11701; + mul.lo.s32 %r11702, %r29946, 16777619; + ld.global.u32 %r11703, [%rd634+44]; + xor.b32 %r29946, %r11702, %r11703; + mul.lo.s32 %r11704, %r29957, 16777619; + ld.global.u32 %r11705, [%rd634+48]; + mul.lo.s32 %r11706, %r29958, 16777619; + ld.global.u32 %r11707, [%rd634+52]; + xor.b32 %r11708, %r11706, %r11707; + xor.b32 %r29957, %r11704, %r11705; + mov.b64 %rd639, {%r29957, %r11708}; + mul.lo.s32 %r11709, %r29953, 16777619; + ld.global.u32 %r11710, [%rd634+56]; + mul.lo.s32 %r11711, %r29954, 16777619; + ld.global.u32 %r11712, [%rd634+60]; + xor.b32 %r11713, %r11711, %r11712; + xor.b32 %r29953, %r11709, %r11710; + mov.b64 %rd640, {%r29953, %r11713}; + st.local.v2.u32 [%rd3+24], {%r29947, %r29948}; + st.local.v2.u32 [%rd3+32], {%r29959, %r11684}; + st.local.v2.u32 [%rd3+40], {%r29955, %r11689}; + st.local.v2.u32 [%rd3+48], {%r29951, %r11694}; + st.local.v2.u32 [%rd3+56], {%r29949, %r11699}; + st.local.v2.u32 [%rd3+64], {%r29945, %r29946}; + st.local.v2.u32 [%rd3+72], {%r29957, %r11708}; + st.local.v2.u32 [%rd3+80], {%r29953, %r11713}; + add.s64 %rd641, %rd86, %rd629; + xor.b32 %r11714, %r31, %r29910; + mul.lo.s32 %r11715, %r11714, 16777619; + ld.local.u32 %r11716, [%rd641]; + xor.b32 %r11717, %r11715, %r11716; + mul.wide.u32 %rd642, %r11717, -954391867; + shr.u64 %rd643, %rd642, 32; + cvt.u32.u64 %r11718, %rd643; + sub.s32 %r11719, %r11717, %r11718; + shr.u32 %r11720, %r11719, 1; + add.s32 %r11721, %r11720, %r11718; + shr.u32 %r11722, %r11721, 20; + mul.lo.s32 %r11723, %r11722, 1179641; + sub.s32 %r11724, %r11717, %r11723; + mul.wide.u32 %rd644, %r11724, 64; + add.s64 %rd645, %rd471, %rd644; + mul.lo.s32 %r11725, %r29998, 16777619; + ld.global.u32 %r11726, [%rd645]; + xor.b32 %r29998, %r11725, %r11726; + mul.lo.s32 %r11727, %r29999, 16777619; + ld.global.u32 %r11728, [%rd645+4]; + xor.b32 %r29999, %r11727, %r11728; + mul.lo.s32 %r11729, %r30010, 16777619; + ld.global.u32 %r11730, [%rd645+8]; + mul.lo.s32 %r11731, %r30011, 16777619; + ld.global.u32 %r11732, [%rd645+12]; + xor.b32 %r11733, %r11731, %r11732; + xor.b32 %r30010, %r11729, %r11730; + mov.b64 %rd646, {%r30010, %r11733}; + mul.lo.s32 %r11734, %r30006, 16777619; + ld.global.u32 %r11735, [%rd645+16]; + mul.lo.s32 %r11736, %r30007, 16777619; + ld.global.u32 %r11737, [%rd645+20]; + xor.b32 %r11738, %r11736, %r11737; + xor.b32 %r30006, %r11734, %r11735; + mov.b64 %rd647, {%r30006, %r11738}; + mul.lo.s32 %r11739, %r30002, 16777619; + ld.global.u32 %r11740, [%rd645+24]; + mul.lo.s32 %r11741, %r30003, 16777619; + ld.global.u32 %r11742, [%rd645+28]; + xor.b32 %r11743, %r11741, %r11742; + xor.b32 %r30002, %r11739, %r11740; + mov.b64 %rd648, {%r30002, %r11743}; + mul.lo.s32 %r11744, %r30000, 16777619; + ld.global.u32 %r11745, [%rd645+32]; + mul.lo.s32 
%r11746, %r30001, 16777619; + ld.global.u32 %r11747, [%rd645+36]; + xor.b32 %r11748, %r11746, %r11747; + xor.b32 %r30000, %r11744, %r11745; + mov.b64 %rd649, {%r30000, %r11748}; + mul.lo.s32 %r11749, %r29996, 16777619; + ld.global.u32 %r11750, [%rd645+40]; + xor.b32 %r29996, %r11749, %r11750; + mul.lo.s32 %r11751, %r29997, 16777619; + ld.global.u32 %r11752, [%rd645+44]; + xor.b32 %r29997, %r11751, %r11752; + mul.lo.s32 %r11753, %r30008, 16777619; + ld.global.u32 %r11754, [%rd645+48]; + mul.lo.s32 %r11755, %r30009, 16777619; + ld.global.u32 %r11756, [%rd645+52]; + xor.b32 %r11757, %r11755, %r11756; + xor.b32 %r30008, %r11753, %r11754; + mov.b64 %rd650, {%r30008, %r11757}; + mul.lo.s32 %r11758, %r30004, 16777619; + ld.global.u32 %r11759, [%rd645+56]; + mul.lo.s32 %r11760, %r30005, 16777619; + ld.global.u32 %r11761, [%rd645+60]; + xor.b32 %r11762, %r11760, %r11761; + xor.b32 %r30004, %r11758, %r11759; + mov.b64 %rd651, {%r30004, %r11762}; + st.local.v2.u32 [%rd84+24], {%r29998, %r29999}; + st.local.v2.u32 [%rd84+32], {%r30010, %r11733}; + st.local.v2.u32 [%rd84+40], {%r30006, %r11738}; + st.local.v2.u32 [%rd84+48], {%r30002, %r11743}; + st.local.v2.u32 [%rd84+56], {%r30000, %r11748}; + st.local.v2.u32 [%rd84+64], {%r29996, %r29997}; + st.local.v2.u32 [%rd84+72], {%r30008, %r11757}; + st.local.v2.u32 [%rd84+80], {%r30004, %r11762}; + add.s32 %r29910, %r29910, 1; + setp.lt.u32 %p25, %r29910, 512; + shr.u64 %rd652, %rd635, 32; + cvt.u32.u64 %r29960, %rd652; + shr.u64 %rd653, %rd636, 32; + cvt.u32.u64 %r29956, %rd653; + shr.u64 %rd654, %rd637, 32; + cvt.u32.u64 %r29952, %rd654; + shr.u64 %rd655, %rd638, 32; + cvt.u32.u64 %r29950, %rd655; + shr.u64 %rd656, %rd639, 32; + cvt.u32.u64 %r29958, %rd656; + shr.u64 %rd657, %rd640, 32; + cvt.u32.u64 %r29954, %rd657; + shr.u64 %rd658, %rd646, 32; + cvt.u32.u64 %r30011, %rd658; + shr.u64 %rd659, %rd647, 32; + cvt.u32.u64 %r30007, %rd659; + shr.u64 %rd660, %rd648, 32; + cvt.u32.u64 %r30003, %rd660; + shr.u64 %rd661, %rd649, 32; + cvt.u32.u64 %r30001, %rd661; + shr.u64 %rd662, %rd650, 32; + cvt.u32.u64 %r30009, %rd662; + shr.u64 %rd663, %rd651, 32; + cvt.u32.u64 %r30005, %rd663; + @%p25 bra $L__BB2_38; + + mov.u32 %r29911, 0; + st.local.v2.u32 [%rd3+96], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+104], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+112], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+120], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+128], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+136], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+144], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+152], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+160], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+168], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+176], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+184], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+192], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+200], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+208], {%r29911, %r29911}; + st.local.v2.u32 [%rd3+216], {%r29911, %r29911}; + mov.u32 %r29926, -2147483648; + mov.u32 %r11777, 1; + st.local.v2.u32 [%rd3+88], {%r11777, %r29926}; + mov.u32 %r29912, %r29911; + mov.u32 %r29913, %r29911; + mov.u32 %r29914, %r29911; + mov.u32 %r29915, %r29911; + mov.u32 %r29916, %r29911; + mov.u32 %r29917, %r29911; + mov.u32 %r29918, %r29911; + mov.u32 %r29919, %r29911; + mov.u32 %r29920, %r29911; + mov.u32 %r29921, %r29911; + mov.u32 %r29922, %r29911; + mov.u32 %r29923, %r29911; + mov.u32 %r29924, %r29911; + mov.u32 %r29925, %r11777; + mov.u32 %r29927, %r29911; + mov.u32 %r29928, %r29911; + mov.u32 
%r29929, %r29911; + mov.u32 %r29930, %r29911; + mov.u32 %r29931, %r29911; + mov.u32 %r29932, %r29911; + mov.u32 %r29933, %r29911; + mov.u32 %r29934, %r29911; + mov.u32 %r29935, %r29911; + mov.u32 %r29936, %r29911; + mov.u32 %r29937, %r29911; + mov.u32 %r29938, %r29911; + mov.u32 %r29939, %r29911; + mov.u32 %r29940, %r29911; + mov.u32 %r29941, %r29911; + mov.u32 %r29942, %r29911; + mov.u32 %r29943, %r29911; + mov.u32 %r29944, %r29911; + mov.u32 %r29961, %r29911; + +$L__BB2_40: + // begin inline asm + // xor5 + lop3.b32 %r11804, %r29947, %r29945, %r29943, 0x96; + lop3.b32 %r11804, %r11804, %r29941, %r29939, 0x96; + lop3.b32 %r11805, %r29948, %r29946, %r29944, 0x96; + lop3.b32 %r11805, %r11805, %r29942, %r29940, 0x96; // end inline asm - ld.const.u32 %r2745, [matrix+1032]; // begin inline asm - dp4a.u32.u32 %r2744, %r2745, %r5754, %r2740; + // xor5 + lop3.b32 %r11816, %r29959, %r29957, %r29937, 0x96; + lop3.b32 %r11816, %r11816, %r29935, %r29933, 0x96; + lop3.b32 %r11817, %r29960, %r29958, %r29938, 0x96; + lop3.b32 %r11817, %r11817, %r29936, %r29934, 0x96; // end inline asm - ld.const.u32 %r2749, [matrix+1036]; // begin inline asm - dp4a.u32.u32 %r2748, %r2749, %r5758, %r2744; + // xor5 + lop3.b32 %r11828, %r29955, %r29953, %r29931, 0x96; + lop3.b32 %r11828, %r11828, %r29929, %r29927, 0x96; + lop3.b32 %r11829, %r29956, %r29954, %r29932, 0x96; + lop3.b32 %r11829, %r11829, %r29930, %r29928, 0x96; // end inline asm - ld.const.u32 %r2753, [matrix+1040]; // begin inline asm - dp4a.u32.u32 %r2752, %r2753, %r5762, %r2748; + // xor5 + lop3.b32 %r11840, %r29951, %r29925, %r29923, 0x96; + lop3.b32 %r11840, %r11840, %r29921, %r29919, 0x96; + lop3.b32 %r11841, %r29952, %r29926, %r29924, 0x96; + lop3.b32 %r11841, %r11841, %r29922, %r29920, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r11852, %r29949, %r29917, %r29915, 0x96; + lop3.b32 %r11852, %r11852, %r29913, %r29911, 0x96; + lop3.b32 %r11853, %r29950, %r29918, %r29916, 0x96; + lop3.b32 %r11853, %r11853, %r29914, %r29912, 0x96; // end inline asm - ld.const.u32 %r2757, [matrix+1044]; // begin inline asm - dp4a.u32.u32 %r2756, %r2757, %r5766, %r2752; + shf.l.wrap.b32 %r11864, %r11817, %r11816, %r11777; // end inline asm - ld.const.u32 %r2761, [matrix+1048]; // begin inline asm - dp4a.u32.u32 %r2760, %r2761, %r5770, %r2756; + shf.l.wrap.b32 %r11868, %r11816, %r11817, %r11777; // end inline asm - ld.const.u32 %r2765, [matrix+1052]; + xor.b32 %r12298, %r11864, %r11852; + xor.b32 %r12299, %r11868, %r11853; + xor.b32 %r12131, %r29947, %r12298; + xor.b32 %r12134, %r29948, %r12299; + xor.b32 %r12038, %r29945, %r12298; + xor.b32 %r12037, %r29946, %r12299; + xor.b32 %r12085, %r29943, %r12298; + xor.b32 %r12086, %r29944, %r12299; + xor.b32 %r11990, %r29941, %r12298; + xor.b32 %r11989, %r29942, %r12299; + xor.b32 %r11941, %r29939, %r12298; + xor.b32 %r11942, %r29940, %r12299; // begin inline asm - dp4a.u32.u32 %r2764, %r2765, %r5774, %r2760; + shf.l.wrap.b32 %r11872, %r11829, %r11828, %r11777; // end inline asm - ld.const.u32 %r2769, [matrix+1056]; // begin inline asm - dp4a.u32.u32 %r2768, %r2769, %r5778, %r2764; + shf.l.wrap.b32 %r11876, %r11828, %r11829, %r11777; // end inline asm - ld.const.u32 %r2773, [matrix+1060]; + xor.b32 %r12300, %r11872, %r11804; + xor.b32 %r12301, %r11876, %r11805; + xor.b32 %r12093, %r29959, %r12300; + xor.b32 %r12094, %r29960, %r12301; + xor.b32 %r11910, %r29957, %r12300; + xor.b32 %r11909, %r29958, %r12301; + xor.b32 %r12069, %r29937, %r12300; + xor.b32 %r12070, %r29938, %r12301; + xor.b32 %r12030, %r29935, 
[... the added lines finish applying the theta parities of the round: xor.b32 pairs fold the parity words %r12300..%r12307 into both 32-bit halves of every state lane (%r29911..%r29960), while the removed lines continue the heavyhash walk over the matrix rows [matrix+1064] through [matrix+1276] with chained dp4a.u32.u32 dot products, e.g.:]
-	ld.const.u32 	%r2777, [matrix+1064];
 	// begin inline asm
-	dp4a.u32.u32 %r2776, %r2777, %r5786, %r2772;
+	shf.l.wrap.b32 %r11884, %r11840, %r11841, %r11777;
 	// end inline asm
+	xor.b32  	%r12302, %r11880, %r11816;
+	xor.b32  	%r12303, %r11884, %r11817;
[the hunk then performs the rho rotations of the round: for each lane a mov.u32 loads the rotation constant (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, ...) into a register such as %r11911, and a pair of shf.l.wrap.b32 funnel shifts rotates the lane's two 32-bit halves]
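Each rho rotation above is a pair of shf.l.wrap.b32 instructions acting on the two 32-bit halves of one 64-bit lane. A minimal CUDA sketch of what such a pair computes, assuming only that a lane is split lo/hi the way the PTX registers are (the helper name is ours, not taken from kaspa-cuda.cu):

#include <cstdint>

// rotl64 built from two 32-bit funnel shifts -- the pattern behind every
// shf.l.wrap.b32 pair in this hunk.
__device__ __forceinline__ uint64_t rotl64(uint64_t x, unsigned n) {
    uint32_t lo = static_cast<uint32_t>(x);
    uint32_t hi = static_cast<uint32_t>(x >> 32);
    // shf.l.wrap reduces the shift mod 32; for n >= 32 the halves swap roles.
    uint32_t new_hi = (n & 32) ? __funnelshift_l(hi, lo, n)
                               : __funnelshift_l(lo, hi, n);
    uint32_t new_lo = (n & 32) ? __funnelshift_l(lo, hi, n)
                               : __funnelshift_l(hi, lo, n);
    return (static_cast<uint64_t>(new_hi) << 32) | new_lo;
}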
[the removed side keeps walking the dp4a matrix rows from [matrix+1256] onward and also drops the old nibble packing of the heavyhash products (shr.u32 / and.b32 %r…, 240 / or.b32 / xor.b32 sequences producing %r6004, %r6009, ...); the added side is the chi step of the round, two lop3.b32 ops with truth table 0xD2 per lane, whose results are stored back to the local state, e.g.:]
+	// chi
+	lop3.b32 %r29959, %r11904, %r11952, %r12048, 0xD2;
+	lop3.b32 %r29960, %r11908, %r11956, %r12052, 0xD2;
	...
+	st.local.v2.u32 [%rd3+88], {%r29925, %r29926};
+	st.local.v2.u32 [%rd3+216], {%r29911, %r29912};
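The 0xD2 immediate on those lop3.b32 ops is the truth table of the Keccak chi function a ^ (~b & c): evaluating it for the eight input combinations gives the bit pattern 1101'0010 = 0xD2. A sketch with inline PTX so the mapping is explicit (helper name ours):

#include <cstdint>

// chi on one 32-bit half-lane: a ^ (~b & c), expressed as a single
// three-input logic op exactly as in the chi blocks above.
__device__ __forceinline__ uint32_t chi32(uint32_t a, uint32_t b, uint32_t c) {
    uint32_t d;
    asm("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
    return d;                 // same value as a ^ (~b & c)
}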
[the remaining chi results are stored to [%rd3+200]..[%rd3+216], then the round ends with iota and the loop back-edge: the round constant is fetched through the read-only cache, xored into lane 0, and control returns to $L__BB2_40 while the round counter is below 23:]
+	mul.wide.s32 	%rd665, %r29961, 8;
+	add.s64 	%rd664, %rd597, %rd665;
+	ld.global.nc.v2.u32 {%r12296,%r12297}, [%rd664];
+	xor.b32  	%r29947, %r12096, %r12296;
+	xor.b32  	%r29948, %r12097, %r12297;
+	add.s32 	%r29961, %r29961, 1;
+	setp.lt.u32 	%p26, %r29961, 23;
+	@%p26 bra 	$L__BB2_40;
[after the loop the surviving lanes are stored to [%rd3+24]..[%rd3+80] and a peeled final round begins with the "// xor5" column parities, lop3.b32 chains with truth table 0x96]
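The iota step at the loop tail reads the round constant through the read-only data cache (ld.global.nc) as a 32-bit pair. A hedged CUDA equivalent, assuming rc_table stands in for the table pointer held in %rd597 (all names ours):

#include <cstdint>

// iota: xor the current round constant into lane (0,0), fetched as two
// 32-bit words; __ldg() is what nvcc lowers to ld.global.nc.
__device__ void iota_step(uint32_t lane0_lo_hi[2],
                          const uint64_t *rc_table, int round) {
    uint2 c = __ldg(reinterpret_cast<const uint2 *>(rc_table + round));
    lane0_lo_hi[0] ^= c.x;    // xor.b32 %r29947, %r12096, %r12296
    lane0_lo_hi[1] ^= c.y;    // xor.b32 %r29948, %r12097, %r12297
}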
[the peeled round continues: shf.l.wrap.b32 by 1 produces the theta D words (%r12587..%r12596), which are xored across all lanes with the intermediate results written to [%rd3+88]..[%rd3+216]; the rho/pi rotations then rerun, this time re-using the rotation-constant registers already loaded in the loop (%r11911 = 44, %r11919 = 20, %r11927 = 61, ...), before the peeled chi begins with lop3.b32 0xD2 pairs stored to [%rd3+32]..[%rd3+56]; throughout, the removed side keeps deleting the dp4a walk over [matrix+1404]..[matrix+1520]]
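The "// xor5" comments mark five-input XORs folded into two lop3.b32 ops with immediate 0x96, which is the truth table of the three-input XOR a ^ b ^ c. A sketch of that fold (helper name ours):

#include <cstdint>

// Column parity for theta: five 32-bit words xored together using two
// chained three-input lop3 ops, the pattern behind every "// xor5" pair.
__device__ __forceinline__ uint32_t xor5(uint32_t a, uint32_t b, uint32_t c,
                                         uint32_t d, uint32_t e) {
    uint32_t t;
    asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(t) : "r"(a), "r"(b), "r"(c));
    asm("lop3.b32 %0, %0, %1, %2, 0x96;" : "+r"(t) : "r"(d), "r"(e));
    return t;                 // a ^ b ^ c ^ d ^ e
}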
[the peeled chi finishes, storing lanes to [%rd3+40]..[%rd3+80]; the last iota xors in round constant 23 from [%rd598] and the first 256 bits of the state are packed into %rd1261..%rd1264 as the digest of this absorb:]
+	ld.global.nc.v2.u32 {%r12544,%r12545}, [%rd598];
+	xor.b32  	%r12597, %r12481, %r12545;
+	xor.b32  	%r12598, %r12480, %r12544;
+	mov.b64 	%rd1261, {%r12598, %r12597};
[the second sponge is then prepared: the lanes at [%rd84+96]..[%rd84+216] are zeroed, the pad pair {%r12560 = 1, %r29977 = -2147483648} is stored at [%rd84+88], and a run of mov.u32 %r299xx, %r29962 copies begins initialising the working registers]
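Those zero stores plus the {1, -2147483648} pair at offset +88 read like Keccak pad10*1 for a 64-byte block in a 72-byte rate: the 0x01 byte right after the message and the 0x80 byte at the end of the rate fall into the same lane. That rate is our inference from the offsets, not something stated in the source; a sketch under that assumption:

#include <cstdint>

// Re-arm the sponge for a 64-byte block, assuming a 72-byte rate: lane 8
// carries both padding bytes (0x01 at byte 64, 0x80 at byte 71), i.e. the
// {1, 0x80000000} register pair stored at [%rd84+88]; lanes 9..24 are the
// zero stores at +96..+216.
__device__ void pad_64byte_block(uint64_t st[25]) {
    st[8] = 0x8000000000000001ULL;
    for (int i = 9; i < 25; ++i)
        st[i] = 0;
}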
[the register initialisation continues through %r30012]

$L__BB2_42:
[the second permutation loop, identical in structure to $L__BB2_40 but operating on the %r299xx/%r300xx state, opens with the xor5/lop3 0x96 column parities while the removed dp4a rows from [matrix+1548] onward keep dropping out:]
+	// xor5
+	lop3.b32 %r12599, %r29998, %r29996, %r29994, 0x96;
+	lop3.b32 %r12599, %r12599, %r29992, %r29990, 0x96;
+	lop3.b32 %r12600, %r29999, %r29997, %r29995, 0x96;
+	lop3.b32 %r12600, %r12600, %r29993, %r29991, 0x96;
[the loop body repeats the round pattern, interleaved with the removed dp4a walk over the next matrix rows up to [matrix+1880]: shf.l.wrap.b32 by 1 produces the theta D words, xor.b32 pairs spread them across all lanes, the rho rotations run with the constants 44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3 (loaded into %r12706, %r12714, %r12722, ...), and the chi lop3 0xD2 block writes the new state %r29962..%r30011 back to [%rd84+88]..[%rd84+216]; the removed side also deletes the old 64-bit nibble packing (shr.u32 / and.b32 / or.b32 / cvt.u64.u32 followed by xor.b64 into %rd218, %rd220, %rd222)]
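Both unrolled loops implement the same permutation, so a compact reference is useful when checking the register-level diff. This is a textbook Keccak-f[1600] written against the rho and pi tables that appear verbatim in this file's .global sections; it is a readable stand-in for what the compiler unrolled, not the project's kernel code:

#include <cstdint>

// Round constants for iota (standard Keccak-f[1600] list; matches the byte
// dump of the RC / keccak_round_constants sections above).
__constant__ uint64_t RC64[24] = {
    0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
    0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
    0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
    0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
    0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};

__device__ __forceinline__ uint64_t rotl(uint64_t x, int n) {
    return (x << n) | (x >> (64 - n));
}

__device__ void keccak_f1600(uint64_t a[25]) {
    // Same tables as the .global rho[24] and pi[24] sections of this file.
    const int rho[24] = {1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
                         27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44};
    const int pi[24]  = {10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
                         15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1};
    uint64_t b[5], t;
    for (int r = 0; r < 24; ++r) {
        // theta: the "// xor5" lop3 chains, one parity word per column
        for (int i = 0; i < 5; ++i)
            b[i] = a[i] ^ a[i + 5] ^ a[i + 10] ^ a[i + 15] ^ a[i + 20];
        for (int i = 0; i < 5; ++i) {
            t = b[(i + 4) % 5] ^ rotl(b[(i + 1) % 5], 1);
            for (int j = 0; j < 25; j += 5) a[i + j] ^= t;
        }
        // rho + pi: the shf.l.wrap rotations with constants 1..44
        t = a[1];
        for (int i = 0; i < 24; ++i) {
            uint64_t tmp = a[pi[i]];
            a[pi[i]] = rotl(t, rho[i]);
            t = tmp;
        }
        // chi: the lop3 ... 0xD2 blocks
        for (int j = 0; j < 25; j += 5) {
            for (int i = 0; i < 5; ++i) b[i] = a[j + i];
            for (int i = 0; i < 5; ++i)
                a[j + i] = b[i] ^ (~b[(i + 1) % 5] & b[(i + 2) % 5]);
        }
        // iota: the ld.global.nc round-constant xor
        a[0] ^= RC64[r];
    }
}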
[the loop tail mirrors the first sponge: the chi results are stored, the round constant is loaded from the table indexed by %r30012 and xored in, then
+	add.s32 	%r30012, %r30012, 1;
+	setp.lt.u32 	%p27, %r30012, 23;
+	@%p27 bra 	$L__BB2_42;
followed by the peeled final round (xor5 parities, theta D words, rho rotations re-using %r12706/%r12714/..., chi into [%rd84+32]..[%rd84+80]) and the closing iota from [%rd598]; the result lanes land in [%rd84+24] and control jumps on with bra.uni $L__BB2_44]

$L__BB2_22:
[a fresh absorb is set up here: a scratch header is written (st.local.u64 [%rd3], %rd354; the literal 1179641 at [%rd3+8]; the 32-bit id %r30 at [%rd3+16]), eight 64-bit words are loaded from [%rd34]..[%rd34+56] and stored to [%rd3+24]..[%rd3+80], %r30 is xored into the low word of lane 0 (xor.b32 %r6827, %r30, %r6826), and the lanes from [%rd3+96] onward start being zeroed]
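The block at $L__BB2_22 therefore absorbs a 64-byte seed plus a per-thread 32-bit value before the next permutation. A sketch with hypothetical names; the meaning of the 1179641 header word is not recoverable from the PTX alone, so it is left out:

#include <cstdint>

// Absorb a 64-byte seed and a per-thread id, as $L__BB2_22 does: eight
// ld.global.u64 / st.local.u64 pairs fill lanes 0..7, the id is xored into
// the low half of lane 0, and lanes 8..24 get the padding/zero pattern.
__device__ void absorb_seed(uint64_t st[25],
                            const uint64_t seed[8], uint32_t id) {
    for (int i = 0; i < 8; ++i)
        st[i] = seed[i];
    st[0] ^= id;                       // xor.b32 %r6827, %r30, %r6826
    st[8] = 0x8000000000000001ULL;     // same pad pair, {1, 0x80000000}
    for (int i = 9; i < 25; ++i)
        st[i] = 0;
}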
[%rd3+216], {%r29539, %r29539}; + mov.u32 %r29554, -2147483648; + mov.u32 %r6799, 1; + st.local.v2.u32 [%rd3+88], {%r6799, %r29554}; + ld.local.v2.u32 {%r29575, %r29576}, [%rd3+24]; + mov.b64 {%r29573, %r29574}, %rd485; + shr.u64 %rd488, %rd481, 32; + cvt.u32.u64 %r29587, %rd481; + cvt.u32.u64 %r29588, %rd488; + shr.u64 %rd489, %rd486, 32; + cvt.u32.u64 %r29585, %rd486; + cvt.u32.u64 %r29586, %rd489; + shr.u64 %rd490, %rd482, 32; + cvt.u32.u64 %r29583, %rd482; + cvt.u32.u64 %r29584, %rd490; + shr.u64 %rd491, %rd487, 32; + cvt.u32.u64 %r29581, %rd487; + cvt.u32.u64 %r29582, %rd491; + shr.u64 %rd492, %rd483, 32; + cvt.u32.u64 %r29579, %rd483; + cvt.u32.u64 %r29580, %rd492; + shr.u64 %rd493, %rd484, 32; + cvt.u32.u64 %r29577, %rd484; + cvt.u32.u64 %r29578, %rd493; + mov.u32 %r29540, %r29539; + mov.u32 %r29541, %r29539; + mov.u32 %r29542, %r29539; + mov.u32 %r29543, %r29539; + mov.u32 %r29544, %r29539; + mov.u32 %r29545, %r29539; + mov.u32 %r29546, %r29539; + mov.u32 %r29547, %r29539; + mov.u32 %r29548, %r29539; + mov.u32 %r29549, %r29539; + mov.u32 %r29550, %r29539; + mov.u32 %r29551, %r29539; + mov.u32 %r29552, %r29539; + mov.u32 %r29553, %r6799; + mov.u32 %r29555, %r29539; + mov.u32 %r29556, %r29539; + mov.u32 %r29557, %r29539; + mov.u32 %r29558, %r29539; + mov.u32 %r29559, %r29539; + mov.u32 %r29560, %r29539; + mov.u32 %r29561, %r29539; + mov.u32 %r29562, %r29539; + mov.u32 %r29563, %r29539; + mov.u32 %r29564, %r29539; + mov.u32 %r29565, %r29539; + mov.u32 %r29566, %r29539; + mov.u32 %r29567, %r29539; + mov.u32 %r29568, %r29539; + mov.u32 %r29569, %r29539; + mov.u32 %r29570, %r29539; + mov.u32 %r29571, %r29539; + mov.u32 %r29572, %r29539; + mov.u32 %r29589, %r29539; + +$L__BB2_23: // begin inline asm - dp4a.u32.u32 %r3776, %r3777, %r5762, %r3772; + // xor5 + lop3.b32 %r6830, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r6830, %r6830, %r29569, %r29567, 0x96; + lop3.b32 %r6831, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r6831, %r6831, %r29570, %r29568, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6842, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r6842, %r6842, %r29563, %r29561, 0x96; + lop3.b32 %r6843, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r6843, %r6843, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r3781, [matrix+2068]; // begin inline asm - dp4a.u32.u32 %r3780, %r3781, %r5766, %r3776; + // xor5 + lop3.b32 %r6854, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r6854, %r6854, %r29557, %r29555, 0x96; + lop3.b32 %r6855, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r6855, %r6855, %r29558, %r29556, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r6866, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r6866, %r6866, %r29549, %r29547, 0x96; + lop3.b32 %r6867, %r29580, %r29554, %r29552, 0x96; + lop3.b32 %r6867, %r6867, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r3785, [matrix+2072]; // begin inline asm - dp4a.u32.u32 %r3784, %r3785, %r5770, %r3780; + // xor5 + lop3.b32 %r6878, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r6878, %r6878, %r29541, %r29539, 0x96; + lop3.b32 %r6879, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r6879, %r6879, %r29542, %r29540, 0x96; // end inline asm - ld.const.u32 %r3789, [matrix+2076]; // begin inline asm - dp4a.u32.u32 %r3788, %r3789, %r5774, %r3784; + shf.l.wrap.b32 %r6890, %r6843, %r6842, %r6799; // end inline asm - ld.const.u32 %r3793, [matrix+2080]; // begin inline asm - dp4a.u32.u32 %r3792, %r3793, %r5778, %r3788; + shf.l.wrap.b32 %r6894, %r6842, %r6843, %r6799; // end inline asm - 
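// What this hunk does: the removed '-' lines are the old matrix pipeline
// (ld.const reads of matrix[] feeding dp4a.u32.u32 dot-product chains); the
// added '+' lines replace it with a Keccak-f[1600] permutation kept in local
// memory. $L__BB2_23 above is the round loop: each iteration applies theta
// (column parities via lop3 with LUT 0x96, a three-input XOR), rho/pi (lane
// rotations via shf.l.wrap.b32 funnel-shift pairs), chi (lop3 with LUT 0xD2)
// and iota (XOR of lane 0 with keccak_round_constants[i]). The loop runs
// while %r29589 < 23; the 24th round is unrolled after the loop, reading the
// last round constant at keccak_round_constants+184 (23*8).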
ld.const.u32 %r3797, [matrix+2084]; + xor.b32 %r7324, %r6890, %r6878; + xor.b32 %r7325, %r6894, %r6879; + xor.b32 %r7157, %r29575, %r7324; + xor.b32 %r7160, %r29576, %r7325; + xor.b32 %r7064, %r29573, %r7324; + xor.b32 %r7063, %r29574, %r7325; + xor.b32 %r7111, %r29571, %r7324; + xor.b32 %r7112, %r29572, %r7325; + xor.b32 %r7016, %r29569, %r7324; + xor.b32 %r7015, %r29570, %r7325; + xor.b32 %r6967, %r29567, %r7324; + xor.b32 %r6968, %r29568, %r7325; // begin inline asm - dp4a.u32.u32 %r3796, %r3797, %r5782, %r3792; + shf.l.wrap.b32 %r6898, %r6855, %r6854, %r6799; // end inline asm - ld.const.u32 %r3801, [matrix+2088]; // begin inline asm - dp4a.u32.u32 %r3800, %r3801, %r5786, %r3796; + shf.l.wrap.b32 %r6902, %r6854, %r6855, %r6799; // end inline asm - ld.const.u32 %r3805, [matrix+2092]; + xor.b32 %r7326, %r6898, %r6830; + xor.b32 %r7327, %r6902, %r6831; + xor.b32 %r7119, %r29587, %r7326; + xor.b32 %r7120, %r29588, %r7327; + xor.b32 %r6936, %r29585, %r7326; + xor.b32 %r6935, %r29586, %r7327; + xor.b32 %r7095, %r29565, %r7326; + xor.b32 %r7096, %r29566, %r7327; + xor.b32 %r7056, %r29563, %r7326; + xor.b32 %r7055, %r29564, %r7327; + xor.b32 %r7039, %r29561, %r7326; + xor.b32 %r7040, %r29562, %r7327; // begin inline asm - dp4a.u32.u32 %r3804, %r3805, %r5790, %r3800; + shf.l.wrap.b32 %r6906, %r6867, %r6866, %r6799; // end inline asm - ld.const.u32 %r3809, [matrix+2096]; // begin inline asm - dp4a.u32.u32 %r3808, %r3809, %r5794, %r3804; + shf.l.wrap.b32 %r6910, %r6866, %r6867, %r6799; // end inline asm - ld.const.u32 %r3813, [matrix+2100]; + xor.b32 %r7328, %r6906, %r6842; + xor.b32 %r7329, %r6910, %r6843; + xor.b32 %r6976, %r29583, %r7328; + xor.b32 %r6975, %r29584, %r7329; + xor.b32 %r7103, %r29581, %r7328; + xor.b32 %r7104, %r29582, %r7329; + xor.b32 %r6984, %r29559, %r7328; + xor.b32 %r6983, %r29560, %r7329; + xor.b32 %r7087, %r29557, %r7328; + xor.b32 %r7088, %r29558, %r7329; + xor.b32 %r6952, %r29555, %r7328; + xor.b32 %r6951, %r29556, %r7329; // begin inline asm - dp4a.u32.u32 %r3812, %r3813, %r5798, %r3808; + shf.l.wrap.b32 %r6914, %r6879, %r6878, %r6799; // end inline asm - ld.const.u32 %r3817, [matrix+2104]; // begin inline asm - dp4a.u32.u32 %r3816, %r3817, %r5802, %r3812; + shf.l.wrap.b32 %r6918, %r6878, %r6879, %r6799; // end inline asm - ld.const.u32 %r3821, [matrix+2108]; + xor.b32 %r7330, %r6914, %r6854; + xor.b32 %r7331, %r6918, %r6855; + xor.b32 %r7071, %r29579, %r7330; + xor.b32 %r7072, %r29580, %r7331; + xor.b32 %r7048, %r29553, %r7330; + xor.b32 %r7047, %r29554, %r7331; + xor.b32 %r6991, %r29551, %r7330; + xor.b32 %r6992, %r29552, %r7331; + xor.b32 %r7079, %r29549, %r7330; + xor.b32 %r7080, %r29550, %r7331; + xor.b32 %r7008, %r29547, %r7330; + xor.b32 %r7007, %r29548, %r7331; // begin inline asm - dp4a.u32.u32 %r3820, %r3821, %r5806, %r3816; + shf.l.wrap.b32 %r6922, %r6831, %r6830, %r6799; // end inline asm - ld.const.u32 %r3825, [matrix+2112]; // begin inline asm - dp4a.u32.u32 %r3824, %r3825, %r5746, %r6244; + shf.l.wrap.b32 %r6926, %r6830, %r6831, %r6799; // end inline asm - ld.const.u32 %r3829, [matrix+2116]; + xor.b32 %r7332, %r6922, %r6866; + xor.b32 %r7333, %r6926, %r6867; + xor.b32 %r7023, %r29577, %r7332; + xor.b32 %r7024, %r29578, %r7333; + xor.b32 %r6943, %r29545, %r7332; + xor.b32 %r6944, %r29546, %r7333; + xor.b32 %r6960, %r29543, %r7332; + xor.b32 %r6959, %r29544, %r7333; + xor.b32 %r6999, %r29541, %r7332; + xor.b32 %r7000, %r29542, %r7333; + xor.b32 %r7031, %r29539, %r7332; + xor.b32 %r7032, %r29540, %r7333; + mov.u32 %r6937, 44; // begin inline asm - 
dp4a.u32.u32 %r3828, %r3829, %r5750, %r3824; + shf.l.wrap.b32 %r6930, %r6936, %r6935, %r6937; // end inline asm - ld.const.u32 %r3833, [matrix+2120]; // begin inline asm - dp4a.u32.u32 %r3832, %r3833, %r5754, %r3828; + shf.l.wrap.b32 %r6934, %r6935, %r6936, %r6937; // end inline asm - ld.const.u32 %r3837, [matrix+2124]; + mov.u32 %r6945, 20; // begin inline asm - dp4a.u32.u32 %r3836, %r3837, %r5758, %r3832; + shf.l.wrap.b32 %r6938, %r6944, %r6943, %r6945; // end inline asm - ld.const.u32 %r3841, [matrix+2128]; // begin inline asm - dp4a.u32.u32 %r3840, %r3841, %r5762, %r3836; + shf.l.wrap.b32 %r6942, %r6943, %r6944, %r6945; // end inline asm - ld.const.u32 %r3845, [matrix+2132]; + mov.u32 %r6953, 61; // begin inline asm - dp4a.u32.u32 %r3844, %r3845, %r5766, %r3840; + shf.l.wrap.b32 %r6946, %r6952, %r6951, %r6953; // end inline asm - ld.const.u32 %r3849, [matrix+2136]; // begin inline asm - dp4a.u32.u32 %r3848, %r3849, %r5770, %r3844; + shf.l.wrap.b32 %r6950, %r6951, %r6952, %r6953; // end inline asm - ld.const.u32 %r3853, [matrix+2140]; + mov.u32 %r6961, 39; // begin inline asm - dp4a.u32.u32 %r3852, %r3853, %r5774, %r3848; + shf.l.wrap.b32 %r6954, %r6960, %r6959, %r6961; // end inline asm - ld.const.u32 %r3857, [matrix+2144]; // begin inline asm - dp4a.u32.u32 %r3856, %r3857, %r5778, %r3852; + shf.l.wrap.b32 %r6958, %r6959, %r6960, %r6961; // end inline asm - ld.const.u32 %r3861, [matrix+2148]; + mov.u32 %r6969, 18; // begin inline asm - dp4a.u32.u32 %r3860, %r3861, %r5782, %r3856; + shf.l.wrap.b32 %r6962, %r6968, %r6967, %r6969; // end inline asm - ld.const.u32 %r3865, [matrix+2152]; // begin inline asm - dp4a.u32.u32 %r3864, %r3865, %r5786, %r3860; + shf.l.wrap.b32 %r6966, %r6967, %r6968, %r6969; // end inline asm - ld.const.u32 %r3869, [matrix+2156]; + mov.u32 %r6977, 62; // begin inline asm - dp4a.u32.u32 %r3868, %r3869, %r5790, %r3864; + shf.l.wrap.b32 %r6970, %r6976, %r6975, %r6977; // end inline asm - ld.const.u32 %r3873, [matrix+2160]; // begin inline asm - dp4a.u32.u32 %r3872, %r3873, %r5794, %r3868; + shf.l.wrap.b32 %r6974, %r6975, %r6976, %r6977; // end inline asm - ld.const.u32 %r3877, [matrix+2164]; + mov.u32 %r6985, 43; // begin inline asm - dp4a.u32.u32 %r3876, %r3877, %r5798, %r3872; + shf.l.wrap.b32 %r6978, %r6984, %r6983, %r6985; // end inline asm - ld.const.u32 %r3881, [matrix+2168]; // begin inline asm - dp4a.u32.u32 %r3880, %r3881, %r5802, %r3876; + shf.l.wrap.b32 %r6982, %r6983, %r6984, %r6985; // end inline asm - ld.const.u32 %r3885, [matrix+2172]; + mov.u32 %r6993, 25; // begin inline asm - dp4a.u32.u32 %r3884, %r3885, %r5806, %r3880; + shf.l.wrap.b32 %r6986, %r6992, %r6991, %r6993; // end inline asm - shr.u32 %r6034, %r3820, 6; - and.b32 %r6035, %r6034, 240; - shr.u32 %r6036, %r3884, 10; - or.b32 %r6037, %r6036, %r6035; - xor.b32 %r6038, %r13, %r6037; - ld.const.u32 %r3889, [matrix+2176]; // begin inline asm - dp4a.u32.u32 %r3888, %r3889, %r5746, %r6244; + shf.l.wrap.b32 %r6990, %r6991, %r6992, %r6993; // end inline asm - ld.const.u32 %r3893, [matrix+2180]; + mov.u32 %r7001, 8; // begin inline asm - dp4a.u32.u32 %r3892, %r3893, %r5750, %r3888; + shf.l.wrap.b32 %r6994, %r7000, %r6999, %r7001; // end inline asm - ld.const.u32 %r3897, [matrix+2184]; // begin inline asm - dp4a.u32.u32 %r3896, %r3897, %r5754, %r3892; + shf.l.wrap.b32 %r6998, %r6999, %r7000, %r7001; // end inline asm - ld.const.u32 %r3901, [matrix+2188]; + mov.u32 %r7009, 56; // begin inline asm - dp4a.u32.u32 %r3900, %r3901, %r5758, %r3896; + shf.l.wrap.b32 %r7002, %r7008, %r7007, %r7009; // end 
inline asm - ld.const.u32 %r3905, [matrix+2192]; // begin inline asm - dp4a.u32.u32 %r3904, %r3905, %r5762, %r3900; + shf.l.wrap.b32 %r7006, %r7007, %r7008, %r7009; // end inline asm - ld.const.u32 %r3909, [matrix+2196]; + mov.u32 %r7017, 41; // begin inline asm - dp4a.u32.u32 %r3908, %r3909, %r5766, %r3904; + shf.l.wrap.b32 %r7010, %r7016, %r7015, %r7017; // end inline asm - ld.const.u32 %r3913, [matrix+2200]; // begin inline asm - dp4a.u32.u32 %r3912, %r3913, %r5770, %r3908; + shf.l.wrap.b32 %r7014, %r7015, %r7016, %r7017; // end inline asm - ld.const.u32 %r3917, [matrix+2204]; + mov.u32 %r7025, 27; // begin inline asm - dp4a.u32.u32 %r3916, %r3917, %r5774, %r3912; + shf.l.wrap.b32 %r7018, %r7024, %r7023, %r7025; // end inline asm - ld.const.u32 %r3921, [matrix+2208]; // begin inline asm - dp4a.u32.u32 %r3920, %r3921, %r5778, %r3916; + shf.l.wrap.b32 %r7022, %r7023, %r7024, %r7025; // end inline asm - ld.const.u32 %r3925, [matrix+2212]; + mov.u32 %r7033, 14; // begin inline asm - dp4a.u32.u32 %r3924, %r3925, %r5782, %r3920; + shf.l.wrap.b32 %r7026, %r7032, %r7031, %r7033; // end inline asm - ld.const.u32 %r3929, [matrix+2216]; // begin inline asm - dp4a.u32.u32 %r3928, %r3929, %r5786, %r3924; + shf.l.wrap.b32 %r7030, %r7031, %r7032, %r7033; // end inline asm - ld.const.u32 %r3933, [matrix+2220]; + mov.u32 %r7041, 2; // begin inline asm - dp4a.u32.u32 %r3932, %r3933, %r5790, %r3928; + shf.l.wrap.b32 %r7034, %r7040, %r7039, %r7041; // end inline asm - ld.const.u32 %r3937, [matrix+2224]; // begin inline asm - dp4a.u32.u32 %r3936, %r3937, %r5794, %r3932; + shf.l.wrap.b32 %r7038, %r7039, %r7040, %r7041; // end inline asm - ld.const.u32 %r3941, [matrix+2228]; + mov.u32 %r7049, 55; // begin inline asm - dp4a.u32.u32 %r3940, %r3941, %r5798, %r3936; + shf.l.wrap.b32 %r7042, %r7048, %r7047, %r7049; // end inline asm - ld.const.u32 %r3945, [matrix+2232]; // begin inline asm - dp4a.u32.u32 %r3944, %r3945, %r5802, %r3940; + shf.l.wrap.b32 %r7046, %r7047, %r7048, %r7049; // end inline asm - ld.const.u32 %r3949, [matrix+2236]; + mov.u32 %r7057, 45; // begin inline asm - dp4a.u32.u32 %r3948, %r3949, %r5806, %r3944; + shf.l.wrap.b32 %r7050, %r7056, %r7055, %r7057; // end inline asm - ld.const.u32 %r3953, [matrix+2240]; // begin inline asm - dp4a.u32.u32 %r3952, %r3953, %r5746, %r6244; + shf.l.wrap.b32 %r7054, %r7055, %r7056, %r7057; // end inline asm - ld.const.u32 %r3957, [matrix+2244]; + mov.u32 %r7065, 36; // begin inline asm - dp4a.u32.u32 %r3956, %r3957, %r5750, %r3952; + shf.l.wrap.b32 %r7058, %r7064, %r7063, %r7065; // end inline asm - ld.const.u32 %r3961, [matrix+2248]; // begin inline asm - dp4a.u32.u32 %r3960, %r3961, %r5754, %r3956; + shf.l.wrap.b32 %r7062, %r7063, %r7064, %r7065; // end inline asm - ld.const.u32 %r3965, [matrix+2252]; + mov.u32 %r7073, 28; // begin inline asm - dp4a.u32.u32 %r3964, %r3965, %r5758, %r3960; + shf.l.wrap.b32 %r7066, %r7072, %r7071, %r7073; // end inline asm - ld.const.u32 %r3969, [matrix+2256]; // begin inline asm - dp4a.u32.u32 %r3968, %r3969, %r5762, %r3964; + shf.l.wrap.b32 %r7070, %r7071, %r7072, %r7073; // end inline asm - ld.const.u32 %r3973, [matrix+2260]; + mov.u32 %r7081, 21; // begin inline asm - dp4a.u32.u32 %r3972, %r3973, %r5766, %r3968; + shf.l.wrap.b32 %r7074, %r7080, %r7079, %r7081; // end inline asm - ld.const.u32 %r3977, [matrix+2264]; // begin inline asm - dp4a.u32.u32 %r3976, %r3977, %r5770, %r3972; + shf.l.wrap.b32 %r7078, %r7079, %r7080, %r7081; // end inline asm - ld.const.u32 %r3981, [matrix+2268]; + mov.u32 %r7089, 15; // begin inline 
asm - dp4a.u32.u32 %r3980, %r3981, %r5774, %r3976; + shf.l.wrap.b32 %r7082, %r7088, %r7087, %r7089; // end inline asm - ld.const.u32 %r3985, [matrix+2272]; // begin inline asm - dp4a.u32.u32 %r3984, %r3985, %r5778, %r3980; + shf.l.wrap.b32 %r7086, %r7087, %r7088, %r7089; // end inline asm - ld.const.u32 %r3989, [matrix+2276]; + mov.u32 %r7097, 10; // begin inline asm - dp4a.u32.u32 %r3988, %r3989, %r5782, %r3984; + shf.l.wrap.b32 %r7090, %r7096, %r7095, %r7097; // end inline asm - ld.const.u32 %r3993, [matrix+2280]; // begin inline asm - dp4a.u32.u32 %r3992, %r3993, %r5786, %r3988; + shf.l.wrap.b32 %r7094, %r7095, %r7096, %r7097; // end inline asm - ld.const.u32 %r3997, [matrix+2284]; + mov.u32 %r7105, 6; // begin inline asm - dp4a.u32.u32 %r3996, %r3997, %r5790, %r3992; + shf.l.wrap.b32 %r7098, %r7104, %r7103, %r7105; // end inline asm - ld.const.u32 %r4001, [matrix+2288]; // begin inline asm - dp4a.u32.u32 %r4000, %r4001, %r5794, %r3996; + shf.l.wrap.b32 %r7102, %r7103, %r7104, %r7105; // end inline asm - ld.const.u32 %r4005, [matrix+2292]; + mov.u32 %r7113, 3; // begin inline asm - dp4a.u32.u32 %r4004, %r4005, %r5798, %r4000; + shf.l.wrap.b32 %r7106, %r7112, %r7111, %r7113; // end inline asm - ld.const.u32 %r4009, [matrix+2296]; // begin inline asm - dp4a.u32.u32 %r4008, %r4009, %r5802, %r4004; + shf.l.wrap.b32 %r7110, %r7111, %r7112, %r7113; // end inline asm - ld.const.u32 %r4013, [matrix+2300]; // begin inline asm - dp4a.u32.u32 %r4012, %r4013, %r5806, %r4008; + shf.l.wrap.b32 %r7114, %r7120, %r7119, %r6799; // end inline asm - shr.u32 %r6039, %r3948, 6; - and.b32 %r6040, %r6039, 240; - shr.u32 %r6041, %r4012, 10; - or.b32 %r6042, %r6041, %r6040; - xor.b32 %r6043, %r5886, %r6042; - ld.const.u32 %r4017, [matrix+2304]; // begin inline asm - dp4a.u32.u32 %r4016, %r4017, %r5746, %r6244; + shf.l.wrap.b32 %r7118, %r7119, %r7120, %r6799; // end inline asm - ld.const.u32 %r4021, [matrix+2308]; // begin inline asm - dp4a.u32.u32 %r4020, %r4021, %r5750, %r4016; + // chi + lop3.b32 %r7122, %r7157, %r6930, %r6978, 0xD2; + lop3.b32 %r7123, %r7160, %r6934, %r6982, 0xD2; // end inline asm - ld.const.u32 %r4025, [matrix+2312]; // begin inline asm - dp4a.u32.u32 %r4024, %r4025, %r5754, %r4020; + // chi + lop3.b32 %r29587, %r6930, %r6978, %r7074, 0xD2; + lop3.b32 %r29588, %r6934, %r6982, %r7078, 0xD2; // end inline asm - ld.const.u32 %r4029, [matrix+2316]; // begin inline asm - dp4a.u32.u32 %r4028, %r4029, %r5758, %r4024; + // chi + lop3.b32 %r29583, %r6978, %r7074, %r7026, 0xD2; + lop3.b32 %r29584, %r6982, %r7078, %r7030, 0xD2; // end inline asm - ld.const.u32 %r4033, [matrix+2320]; // begin inline asm - dp4a.u32.u32 %r4032, %r4033, %r5762, %r4028; + // chi + lop3.b32 %r29579, %r7074, %r7026, %r7157, 0xD2; + lop3.b32 %r29580, %r7078, %r7030, %r7160, 0xD2; // end inline asm - ld.const.u32 %r4037, [matrix+2324]; // begin inline asm - dp4a.u32.u32 %r4036, %r4037, %r5766, %r4032; + // chi + lop3.b32 %r29577, %r7026, %r7157, %r6930, 0xD2; + lop3.b32 %r29578, %r7030, %r7160, %r6934, 0xD2; // end inline asm - ld.const.u32 %r4041, [matrix+2328]; // begin inline asm - dp4a.u32.u32 %r4040, %r4041, %r5770, %r4036; + // chi + lop3.b32 %r29573, %r7066, %r6938, %r7106, 0xD2; + lop3.b32 %r29574, %r7070, %r6942, %r7110, 0xD2; // end inline asm - ld.const.u32 %r4045, [matrix+2332]; // begin inline asm - dp4a.u32.u32 %r4044, %r4045, %r5774, %r4040; + // chi + lop3.b32 %r29585, %r6938, %r7106, %r7050, 0xD2; + lop3.b32 %r29586, %r6942, %r7110, %r7054, 0xD2; // end inline asm - ld.const.u32 %r4049, [matrix+2336]; // 
begin inline asm - dp4a.u32.u32 %r4048, %r4049, %r5778, %r4044; + // chi + lop3.b32 %r29581, %r7106, %r7050, %r6946, 0xD2; + lop3.b32 %r29582, %r7110, %r7054, %r6950, 0xD2; // end inline asm - ld.const.u32 %r4053, [matrix+2340]; // begin inline asm - dp4a.u32.u32 %r4052, %r4053, %r5782, %r4048; + // chi + lop3.b32 %r29553, %r7050, %r6946, %r7066, 0xD2; + lop3.b32 %r29554, %r7054, %r6950, %r7070, 0xD2; // end inline asm - ld.const.u32 %r4057, [matrix+2344]; + st.local.v2.u32 [%rd3+88], {%r29553, %r29554}; // begin inline asm - dp4a.u32.u32 %r4056, %r4057, %r5786, %r4052; + // chi + lop3.b32 %r29545, %r6946, %r7066, %r6938, 0xD2; + lop3.b32 %r29546, %r6950, %r7070, %r6942, 0xD2; // end inline asm - ld.const.u32 %r4061, [matrix+2348]; + st.local.v2.u32 [%rd3+96], {%r29545, %r29546}; // begin inline asm - dp4a.u32.u32 %r4060, %r4061, %r5790, %r4056; + // chi + lop3.b32 %r29571, %r7114, %r7098, %r6986, 0xD2; + lop3.b32 %r29572, %r7118, %r7102, %r6990, 0xD2; // end inline asm - ld.const.u32 %r4065, [matrix+2352]; + st.local.v2.u32 [%rd3+104], {%r29571, %r29572}; // begin inline asm - dp4a.u32.u32 %r4064, %r4065, %r5794, %r4060; + // chi + lop3.b32 %r29565, %r7098, %r6986, %r6994, 0xD2; + lop3.b32 %r29566, %r7102, %r6990, %r6998, 0xD2; // end inline asm - ld.const.u32 %r4069, [matrix+2356]; + st.local.v2.u32 [%rd3+112], {%r29565, %r29566}; // begin inline asm - dp4a.u32.u32 %r4068, %r4069, %r5798, %r4064; + // chi + lop3.b32 %r29559, %r6986, %r6994, %r6962, 0xD2; + lop3.b32 %r29560, %r6990, %r6998, %r6966, 0xD2; // end inline asm - ld.const.u32 %r4073, [matrix+2360]; + st.local.v2.u32 [%rd3+120], {%r29559, %r29560}; // begin inline asm - dp4a.u32.u32 %r4072, %r4073, %r5802, %r4068; + // chi + lop3.b32 %r29551, %r6994, %r6962, %r7114, 0xD2; + lop3.b32 %r29552, %r6998, %r6966, %r7118, 0xD2; // end inline asm - ld.const.u32 %r4077, [matrix+2364]; + st.local.v2.u32 [%rd3+128], {%r29551, %r29552}; // begin inline asm - dp4a.u32.u32 %r4076, %r4077, %r5806, %r4072; + // chi + lop3.b32 %r29543, %r6962, %r7114, %r7098, 0xD2; + lop3.b32 %r29544, %r6966, %r7118, %r7102, 0xD2; // end inline asm - ld.const.u32 %r4081, [matrix+2368]; + st.local.v2.u32 [%rd3+136], {%r29543, %r29544}; // begin inline asm - dp4a.u32.u32 %r4080, %r4081, %r5746, %r6244; + // chi + lop3.b32 %r29569, %r7018, %r7058, %r7090, 0xD2; + lop3.b32 %r29570, %r7022, %r7062, %r7094, 0xD2; // end inline asm - ld.const.u32 %r4085, [matrix+2372]; + st.local.v2.u32 [%rd3+144], {%r29569, %r29570}; // begin inline asm - dp4a.u32.u32 %r4084, %r4085, %r5750, %r4080; + // chi + lop3.b32 %r29563, %r7058, %r7090, %r7082, 0xD2; + lop3.b32 %r29564, %r7062, %r7094, %r7086, 0xD2; // end inline asm - ld.const.u32 %r4089, [matrix+2376]; + st.local.v2.u32 [%rd3+152], {%r29563, %r29564}; // begin inline asm - dp4a.u32.u32 %r4088, %r4089, %r5754, %r4084; + // chi + lop3.b32 %r29557, %r7090, %r7082, %r7002, 0xD2; + lop3.b32 %r29558, %r7094, %r7086, %r7006, 0xD2; // end inline asm - ld.const.u32 %r4093, [matrix+2380]; + st.local.v2.u32 [%rd3+160], {%r29557, %r29558}; // begin inline asm - dp4a.u32.u32 %r4092, %r4093, %r5758, %r4088; + // chi + lop3.b32 %r29549, %r7082, %r7002, %r7018, 0xD2; + lop3.b32 %r29550, %r7086, %r7006, %r7022, 0xD2; // end inline asm - ld.const.u32 %r4097, [matrix+2384]; + st.local.v2.u32 [%rd3+168], {%r29549, %r29550}; // begin inline asm - dp4a.u32.u32 %r4096, %r4097, %r5762, %r4092; + // chi + lop3.b32 %r29541, %r7002, %r7018, %r7058, 0xD2; + lop3.b32 %r29542, %r7006, %r7022, %r7062, 0xD2; // end inline asm - ld.const.u32 %r4101, 
[matrix+2388]; + st.local.v2.u32 [%rd3+176], {%r29541, %r29542}; // begin inline asm - dp4a.u32.u32 %r4100, %r4101, %r5766, %r4096; + // chi + lop3.b32 %r29567, %r6970, %r7042, %r6954, 0xD2; + lop3.b32 %r29568, %r6974, %r7046, %r6958, 0xD2; // end inline asm - ld.const.u32 %r4105, [matrix+2392]; + st.local.v2.u32 [%rd3+184], {%r29567, %r29568}; // begin inline asm - dp4a.u32.u32 %r4104, %r4105, %r5770, %r4100; + // chi + lop3.b32 %r29561, %r7042, %r6954, %r7010, 0xD2; + lop3.b32 %r29562, %r7046, %r6958, %r7014, 0xD2; // end inline asm - ld.const.u32 %r4109, [matrix+2396]; + st.local.v2.u32 [%rd3+192], {%r29561, %r29562}; // begin inline asm - dp4a.u32.u32 %r4108, %r4109, %r5774, %r4104; + // chi + lop3.b32 %r29555, %r6954, %r7010, %r7034, 0xD2; + lop3.b32 %r29556, %r6958, %r7014, %r7038, 0xD2; // end inline asm - ld.const.u32 %r4113, [matrix+2400]; + st.local.v2.u32 [%rd3+200], {%r29555, %r29556}; // begin inline asm - dp4a.u32.u32 %r4112, %r4113, %r5778, %r4108; + // chi + lop3.b32 %r29547, %r7010, %r7034, %r6970, 0xD2; + lop3.b32 %r29548, %r7014, %r7038, %r6974, 0xD2; // end inline asm - ld.const.u32 %r4117, [matrix+2404]; + st.local.v2.u32 [%rd3+208], {%r29547, %r29548}; // begin inline asm - dp4a.u32.u32 %r4116, %r4117, %r5782, %r4112; + // chi + lop3.b32 %r29539, %r7034, %r6970, %r7042, 0xD2; + lop3.b32 %r29540, %r7038, %r6974, %r7046, 0xD2; // end inline asm - ld.const.u32 %r4121, [matrix+2408]; + st.local.v2.u32 [%rd3+216], {%r29539, %r29540}; + mul.wide.s32 %rd495, %r29589, 8; + mov.u64 %rd496, keccak_round_constants; + cvta.const.u64 %rd497, %rd496; + add.s64 %rd494, %rd497, %rd495; // begin inline asm - dp4a.u32.u32 %r4120, %r4121, %r5786, %r4116; + ld.global.nc.v2.u32 {%r7322,%r7323}, [%rd494]; // end inline asm - ld.const.u32 %r4125, [matrix+2412]; + xor.b32 %r29575, %r7122, %r7322; + xor.b32 %r29576, %r7123, %r7323; + add.s32 %r29589, %r29589, 1; + setp.lt.u32 %p18, %r29589, 23; + @%p18 bra $L__BB2_23; + + add.u64 %rd55, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r29587, %r29588}; + st.local.v2.u32 [%rd3+72], {%r29585, %r29586}; + st.local.v2.u32 [%rd3+40], {%r29583, %r29584}; + st.local.v2.u32 [%rd3+80], {%r29581, %r29582}; + st.local.v2.u32 [%rd3+48], {%r29579, %r29580}; + st.local.v2.u32 [%rd3+56], {%r29577, %r29578}; + st.local.v2.u32 [%rd3+24], {%r29575, %r29576}; // begin inline asm - dp4a.u32.u32 %r4124, %r4125, %r5790, %r4120; + // xor5 + lop3.b32 %r7334, %r29575, %r29573, %r29571, 0x96; + lop3.b32 %r7334, %r7334, %r29569, %r29567, 0x96; + lop3.b32 %r7335, %r29576, %r29574, %r29572, 0x96; + lop3.b32 %r7335, %r7335, %r29570, %r29568, 0x96; // end inline asm - ld.const.u32 %r4129, [matrix+2416]; // begin inline asm - dp4a.u32.u32 %r4128, %r4129, %r5794, %r4124; + // xor5 + lop3.b32 %r7346, %r29587, %r29585, %r29565, 0x96; + lop3.b32 %r7346, %r7346, %r29563, %r29561, 0x96; + lop3.b32 %r7347, %r29588, %r29586, %r29566, 0x96; + lop3.b32 %r7347, %r7347, %r29564, %r29562, 0x96; // end inline asm - ld.const.u32 %r4133, [matrix+2420]; // begin inline asm - dp4a.u32.u32 %r4132, %r4133, %r5798, %r4128; + // xor5 + lop3.b32 %r7358, %r29583, %r29581, %r29559, 0x96; + lop3.b32 %r7358, %r7358, %r29557, %r29555, 0x96; + lop3.b32 %r7359, %r29584, %r29582, %r29560, 0x96; + lop3.b32 %r7359, %r7359, %r29558, %r29556, 0x96; // end inline asm - ld.const.u32 %r4137, [matrix+2424]; // begin inline asm - dp4a.u32.u32 %r4136, %r4137, %r5802, %r4132; + // xor5 + lop3.b32 %r7370, %r29579, %r29553, %r29551, 0x96; + lop3.b32 %r7370, %r7370, %r29549, %r29547, 0x96; + lop3.b32 %r7371, %r29580, 
%r29554, %r29552, 0x96; + lop3.b32 %r7371, %r7371, %r29550, %r29548, 0x96; // end inline asm - ld.const.u32 %r4141, [matrix+2428]; // begin inline asm - dp4a.u32.u32 %r4140, %r4141, %r5806, %r4136; + // xor5 + lop3.b32 %r7382, %r29577, %r29545, %r29543, 0x96; + lop3.b32 %r7382, %r7382, %r29541, %r29539, 0x96; + lop3.b32 %r7383, %r29578, %r29546, %r29544, 0x96; + lop3.b32 %r7383, %r7383, %r29542, %r29540, 0x96; // end inline asm - shr.u32 %r6044, %r4076, 6; - and.b32 %r6045, %r6044, 240; - shr.u32 %r6046, %r4140, 10; - or.b32 %r6047, %r6046, %r6045; - xor.b32 %r6048, %r5898, %r6047; - ld.const.u32 %r4145, [matrix+2432]; + mov.u32 %r7586, 1; // begin inline asm - dp4a.u32.u32 %r4144, %r4145, %r5746, %r6244; + shf.l.wrap.b32 %r7394, %r7347, %r7346, %r7586; // end inline asm - ld.const.u32 %r4149, [matrix+2436]; // begin inline asm - dp4a.u32.u32 %r4148, %r4149, %r5750, %r4144; + shf.l.wrap.b32 %r7398, %r7346, %r7347, %r7586; // end inline asm - ld.const.u32 %r4153, [matrix+2440]; + xor.b32 %r7613, %r7394, %r7382; + xor.b32 %r7614, %r7398, %r7383; + xor.b32 %r7541, %r29575, %r7613; + xor.b32 %r7544, %r29576, %r7614; + xor.b32 %r7504, %r29572, %r7614; + xor.b32 %r7503, %r29571, %r7613; + st.local.v2.u32 [%rd3+104], {%r7503, %r7504}; // begin inline asm - dp4a.u32.u32 %r4152, %r4153, %r5754, %r4148; + shf.l.wrap.b32 %r7402, %r7359, %r7358, %r7586; // end inline asm - ld.const.u32 %r4157, [matrix+2444]; // begin inline asm - dp4a.u32.u32 %r4156, %r4157, %r5758, %r4152; + shf.l.wrap.b32 %r7406, %r7358, %r7359, %r7586; // end inline asm - ld.const.u32 %r4161, [matrix+2448]; + xor.b32 %r7615, %r7402, %r7334; + xor.b32 %r7616, %r7406, %r7335; + xor.b32 %r7440, %r29585, %r7615; + xor.b32 %r7439, %r29586, %r7616; + xor.b32 %r7479, %r29564, %r7616; + xor.b32 %r7480, %r29563, %r7615; + st.local.v2.u32 [%rd3+152], {%r7480, %r7479}; // begin inline asm - dp4a.u32.u32 %r4160, %r4161, %r5762, %r4156; + shf.l.wrap.b32 %r7410, %r7371, %r7370, %r7586; // end inline asm - ld.const.u32 %r4165, [matrix+2452]; // begin inline asm - dp4a.u32.u32 %r4164, %r4165, %r5766, %r4160; + shf.l.wrap.b32 %r7414, %r7370, %r7371, %r7586; // end inline asm - ld.const.u32 %r4169, [matrix+2456]; + xor.b32 %r7617, %r7410, %r7346; + xor.b32 %r7618, %r7414, %r7347; + xor.b32 %r7463, %r29560, %r7618; + xor.b32 %r7464, %r29559, %r7617; + st.local.v2.u32 [%rd3+120], {%r7464, %r7463}; + xor.b32 %r7455, %r29556, %r7618; + xor.b32 %r7456, %r29555, %r7617; + st.local.v2.u32 [%rd3+200], {%r7456, %r7455}; // begin inline asm - dp4a.u32.u32 %r4168, %r4169, %r5770, %r4164; + shf.l.wrap.b32 %r7418, %r7383, %r7382, %r7586; // end inline asm - ld.const.u32 %r4173, [matrix+2460]; // begin inline asm - dp4a.u32.u32 %r4172, %r4173, %r5774, %r4168; + shf.l.wrap.b32 %r7422, %r7382, %r7383, %r7586; // end inline asm - ld.const.u32 %r4177, [matrix+2464]; + xor.b32 %r7619, %r7418, %r7358; + xor.b32 %r7620, %r7422, %r7359; + xor.b32 %r7487, %r29579, %r7619; + xor.b32 %r7488, %r29580, %r7620; + xor.b32 %r7496, %r29550, %r7620; + xor.b32 %r7495, %r29549, %r7619; + st.local.v2.u32 [%rd3+168], {%r7495, %r7496}; // begin inline asm - dp4a.u32.u32 %r4176, %r4177, %r5778, %r4172; + shf.l.wrap.b32 %r7426, %r7335, %r7334, %r7586; // end inline asm - ld.const.u32 %r4181, [matrix+2468]; // begin inline asm - dp4a.u32.u32 %r4180, %r4181, %r5782, %r4176; + shf.l.wrap.b32 %r7430, %r7334, %r7335, %r7586; // end inline asm - ld.const.u32 %r4185, [matrix+2472]; + xor.b32 %r7621, %r7426, %r7370; + xor.b32 %r7622, %r7430, %r7371; + xor.b32 %r7447, %r29545, %r7621; + xor.b32 
%r7448, %r29546, %r7622; + xor.b32 %r7472, %r29540, %r7622; + xor.b32 %r7471, %r29539, %r7621; + st.local.v2.u32 [%rd3+216], {%r7471, %r7472}; // begin inline asm - dp4a.u32.u32 %r4184, %r4185, %r5786, %r4180; + shf.l.wrap.b32 %r7434, %r7440, %r7439, %r6937; // end inline asm - ld.const.u32 %r4189, [matrix+2476]; // begin inline asm - dp4a.u32.u32 %r4188, %r4189, %r5790, %r4184; + shf.l.wrap.b32 %r7438, %r7439, %r7440, %r6937; // end inline asm - ld.const.u32 %r4193, [matrix+2480]; // begin inline asm - dp4a.u32.u32 %r4192, %r4193, %r5794, %r4188; + shf.l.wrap.b32 %r7442, %r7448, %r7447, %r6945; // end inline asm - ld.const.u32 %r4197, [matrix+2484]; // begin inline asm - dp4a.u32.u32 %r4196, %r4197, %r5798, %r4192; + shf.l.wrap.b32 %r7446, %r7447, %r7448, %r6945; // end inline asm - ld.const.u32 %r4201, [matrix+2488]; // begin inline asm - dp4a.u32.u32 %r4200, %r4201, %r5802, %r4196; + shf.l.wrap.b32 %r7454, %r7455, %r7456, %r6953; // end inline asm - ld.const.u32 %r4205, [matrix+2492]; // begin inline asm - dp4a.u32.u32 %r4204, %r4205, %r5806, %r4200; + shf.l.wrap.b32 %r7450, %r7456, %r7455, %r6953; // end inline asm - ld.const.u32 %r4209, [matrix+2496]; + st.local.v2.u32 [%rd3+96], {%r7450, %r7454}; // begin inline asm - dp4a.u32.u32 %r4208, %r4209, %r5746, %r6244; + shf.l.wrap.b32 %r7458, %r7464, %r7463, %r6985; // end inline asm - ld.const.u32 %r4213, [matrix+2500]; // begin inline asm - dp4a.u32.u32 %r4212, %r4213, %r5750, %r4208; + shf.l.wrap.b32 %r7462, %r7463, %r7464, %r6985; // end inline asm - ld.const.u32 %r4217, [matrix+2504]; // begin inline asm - dp4a.u32.u32 %r4216, %r4217, %r5754, %r4212; + shf.l.wrap.b32 %r7466, %r7472, %r7471, %r7033; // end inline asm - ld.const.u32 %r4221, [matrix+2508]; // begin inline asm - dp4a.u32.u32 %r4220, %r4221, %r5758, %r4216; + shf.l.wrap.b32 %r7470, %r7471, %r7472, %r7033; // end inline asm - ld.const.u32 %r4225, [matrix+2512]; // begin inline asm - dp4a.u32.u32 %r4224, %r4225, %r5762, %r4220; + shf.l.wrap.b32 %r7478, %r7479, %r7480, %r7057; // end inline asm - ld.const.u32 %r4229, [matrix+2516]; // begin inline asm - dp4a.u32.u32 %r4228, %r4229, %r5766, %r4224; + shf.l.wrap.b32 %r7474, %r7480, %r7479, %r7057; // end inline asm - ld.const.u32 %r4233, [matrix+2520]; + st.local.v2.u32 [%rd3+88], {%r7474, %r7478}; // begin inline asm - dp4a.u32.u32 %r4232, %r4233, %r5770, %r4228; + shf.l.wrap.b32 %r7482, %r7488, %r7487, %r7073; // end inline asm - ld.const.u32 %r4237, [matrix+2524]; // begin inline asm - dp4a.u32.u32 %r4236, %r4237, %r5774, %r4232; + shf.l.wrap.b32 %r7486, %r7487, %r7488, %r7073; // end inline asm - ld.const.u32 %r4241, [matrix+2528]; // begin inline asm - dp4a.u32.u32 %r4240, %r4241, %r5778, %r4236; + shf.l.wrap.b32 %r7490, %r7496, %r7495, %r7081; // end inline asm - ld.const.u32 %r4245, [matrix+2532]; // begin inline asm - dp4a.u32.u32 %r4244, %r4245, %r5782, %r4240; + shf.l.wrap.b32 %r7494, %r7495, %r7496, %r7081; // end inline asm - ld.const.u32 %r4249, [matrix+2536]; // begin inline asm - dp4a.u32.u32 %r4248, %r4249, %r5786, %r4244; + shf.l.wrap.b32 %r7498, %r7504, %r7503, %r7113; // end inline asm - ld.const.u32 %r4253, [matrix+2540]; // begin inline asm - dp4a.u32.u32 %r4252, %r4253, %r5790, %r4248; + shf.l.wrap.b32 %r7502, %r7503, %r7504, %r7113; // end inline asm - ld.const.u32 %r4257, [matrix+2544]; // begin inline asm - dp4a.u32.u32 %r4256, %r4257, %r5794, %r4252; + // chi + lop3.b32 %r7506, %r7541, %r7434, %r7458, 0xD2; + lop3.b32 %r7507, %r7544, %r7438, %r7462, 0xD2; // end inline asm - ld.const.u32 %r4261, 
[matrix+2548]; // begin inline asm - dp4a.u32.u32 %r4260, %r4261, %r5798, %r4256; + // chi + lop3.b32 %r29722, %r7434, %r7458, %r7490, 0xD2; + lop3.b32 %r29723, %r7438, %r7462, %r7494, 0xD2; // end inline asm - ld.const.u32 %r4265, [matrix+2552]; + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; // begin inline asm - dp4a.u32.u32 %r4264, %r4265, %r5802, %r4260; + // chi + lop3.b32 %r29718, %r7458, %r7490, %r7466, 0xD2; + lop3.b32 %r29719, %r7462, %r7494, %r7470, 0xD2; // end inline asm - ld.const.u32 %r4269, [matrix+2556]; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; // begin inline asm - dp4a.u32.u32 %r4268, %r4269, %r5806, %r4264; + // chi + lop3.b32 %r29714, %r7490, %r7466, %r7541, 0xD2; + lop3.b32 %r29715, %r7494, %r7470, %r7544, 0xD2; // end inline asm - shr.u32 %r6049, %r4204, 6; - and.b32 %r6050, %r6049, 240; - shr.u32 %r6051, %r4268, 10; - or.b32 %r6052, %r6051, %r6050; - xor.b32 %r6053, %r5900, %r6052; - ld.const.u32 %r4273, [matrix+2560]; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; // begin inline asm - dp4a.u32.u32 %r4272, %r4273, %r5746, %r6244; + // chi + lop3.b32 %r29712, %r7466, %r7541, %r7434, 0xD2; + lop3.b32 %r29713, %r7470, %r7544, %r7438, 0xD2; // end inline asm - ld.const.u32 %r4277, [matrix+2564]; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; // begin inline asm - dp4a.u32.u32 %r4276, %r4277, %r5750, %r4272; + // chi + lop3.b32 %r29708, %r7482, %r7442, %r7498, 0xD2; + lop3.b32 %r29709, %r7486, %r7446, %r7502, 0xD2; // end inline asm - ld.const.u32 %r4281, [matrix+2568]; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; // begin inline asm - dp4a.u32.u32 %r4280, %r4281, %r5754, %r4276; + // chi + lop3.b32 %r29720, %r7442, %r7498, %r7474, 0xD2; + lop3.b32 %r29721, %r7446, %r7502, %r7478, 0xD2; // end inline asm - ld.const.u32 %r4285, [matrix+2572]; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; // begin inline asm - dp4a.u32.u32 %r4284, %r4285, %r5758, %r4280; + // chi + lop3.b32 %r29716, %r7498, %r7474, %r7450, 0xD2; + lop3.b32 %r29717, %r7502, %r7478, %r7454, 0xD2; // end inline asm - ld.const.u32 %r4289, [matrix+2576]; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + add.s64 %rd498, %rd497, 184; // begin inline asm - dp4a.u32.u32 %r4288, %r4289, %r5762, %r4284; + ld.global.nc.v2.u32 {%r7570,%r7571}, [%rd498]; // end inline asm - ld.const.u32 %r4293, [matrix+2580]; + xor.b32 %r29710, %r7506, %r7570; + xor.b32 %r29711, %r7507, %r7571; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.u64 [%rd55], %rd354; + mov.u64 %rd502, 1179641; + st.local.u64 [%rd55+8], %rd502; + add.s32 %r226, %r30, 1; + st.local.u32 [%rd55+16], %r226; + ld.global.u64 %rd503, [%rd35]; + ld.global.u64 %rd504, [%rd35+8]; + ld.global.u64 %rd505, [%rd35+16]; + ld.global.u64 %rd506, [%rd35+24]; + ld.global.u64 %rd507, [%rd35+32]; + ld.global.u64 %rd508, [%rd35+40]; + ld.global.u64 %rd509, [%rd35+48]; + ld.global.u64 %rd510, [%rd35+56]; + st.local.u64 [%rd55+32], %rd504; + st.local.u64 [%rd55+40], %rd505; + st.local.u64 [%rd55+48], %rd506; + st.local.u64 [%rd55+56], %rd507; + st.local.u64 [%rd55+64], %rd508; + st.local.u64 [%rd55+72], %rd509; + st.local.u64 [%rd55+80], %rd510; + cvt.u32.u64 %r7623, %rd503; + xor.b32 %r7624, %r226, %r7623; + st.local.u64 [%rd55+24], %rd503; + st.local.u32 [%rd55+24], %r7624; + mov.u32 %r29590, 0; + st.local.v2.u32 [%rd55+96], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+104], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+112], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+120], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+128], {%r29590, %r29590}; + 
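// Second sponge: the same context layout is rebuilt at [%rd55] (again with
// the constant 1179641 at +8), but the 32-bit counter is bumped to
// %r226 = %r30 + 1 and XORed into the low word of lane 0 at +24. The state
// words at +96..+216 are zeroed here, and lane 8 at +88 is set to
// {1, 0x80000000} (the 0x8000000000000001 pad10*1 word of a 72-byte-rate
// Keccak) before entering the round loop at $L__BB2_25.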
st.local.v2.u32 [%rd55+136], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+144], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+152], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+160], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+168], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+176], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+184], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+192], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+200], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+208], {%r29590, %r29590}; + st.local.v2.u32 [%rd55+216], {%r29590, %r29590}; + mov.u32 %r29605, -2147483648; + st.local.v2.u32 [%rd55+88], {%r7586, %r29605}; + ld.local.v2.u32 {%r29626, %r29627}, [%rd55+24]; + mov.b64 {%r29624, %r29625}, %rd508; + shr.u64 %rd511, %rd504, 32; + cvt.u32.u64 %r29638, %rd504; + cvt.u32.u64 %r29639, %rd511; + shr.u64 %rd512, %rd509, 32; + cvt.u32.u64 %r29636, %rd509; + cvt.u32.u64 %r29637, %rd512; + shr.u64 %rd513, %rd505, 32; + cvt.u32.u64 %r29634, %rd505; + cvt.u32.u64 %r29635, %rd513; + shr.u64 %rd514, %rd510, 32; + cvt.u32.u64 %r29632, %rd510; + cvt.u32.u64 %r29633, %rd514; + shr.u64 %rd515, %rd506, 32; + cvt.u32.u64 %r29630, %rd506; + cvt.u32.u64 %r29631, %rd515; + shr.u64 %rd516, %rd507, 32; + cvt.u32.u64 %r29628, %rd507; + cvt.u32.u64 %r29629, %rd516; + mov.u32 %r29591, %r29590; + mov.u32 %r29592, %r29590; + mov.u32 %r29593, %r29590; + mov.u32 %r29594, %r29590; + mov.u32 %r29595, %r29590; + mov.u32 %r29596, %r29590; + mov.u32 %r29597, %r29590; + mov.u32 %r29598, %r29590; + mov.u32 %r29599, %r29590; + mov.u32 %r29600, %r29590; + mov.u32 %r29601, %r29590; + mov.u32 %r29602, %r29590; + mov.u32 %r29603, %r29590; + mov.u32 %r29604, %r7586; + mov.u32 %r29606, %r29590; + mov.u32 %r29607, %r29590; + mov.u32 %r29608, %r29590; + mov.u32 %r29609, %r29590; + mov.u32 %r29610, %r29590; + mov.u32 %r29611, %r29590; + mov.u32 %r29612, %r29590; + mov.u32 %r29613, %r29590; + mov.u32 %r29614, %r29590; + mov.u32 %r29615, %r29590; + mov.u32 %r29616, %r29590; + mov.u32 %r29617, %r29590; + mov.u32 %r29618, %r29590; + mov.u32 %r29619, %r29590; + mov.u32 %r29620, %r29590; + mov.u32 %r29621, %r29590; + mov.u32 %r29622, %r29590; + mov.u32 %r29623, %r29590; + mov.u32 %r29640, %r29590; + +$L__BB2_25: // begin inline asm - dp4a.u32.u32 %r4292, %r4293, %r5766, %r4288; + // xor5 + lop3.b32 %r7627, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r7627, %r7627, %r29620, %r29618, 0x96; + lop3.b32 %r7628, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r7628, %r7628, %r29621, %r29619, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7639, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r7639, %r7639, %r29614, %r29612, 0x96; + lop3.b32 %r7640, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r7640, %r7640, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4297, [matrix+2584]; // begin inline asm - dp4a.u32.u32 %r4296, %r4297, %r5770, %r4292; + // xor5 + lop3.b32 %r7651, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r7651, %r7651, %r29608, %r29606, 0x96; + lop3.b32 %r7652, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r7652, %r7652, %r29609, %r29607, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r7663, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r7663, %r7663, %r29600, %r29598, 0x96; + lop3.b32 %r7664, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r7664, %r7664, %r29601, %r29599, 0x96; // end inline asm - ld.const.u32 %r4301, [matrix+2588]; // begin inline asm - dp4a.u32.u32 %r4300, %r4301, %r5774, %r4296; + // xor5 + lop3.b32 %r7675, %r29628, %r29596, %r29594, 0x96; + lop3.b32 
%r7675, %r7675, %r29592, %r29590, 0x96; + lop3.b32 %r7676, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r7676, %r7676, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4305, [matrix+2592]; // begin inline asm - dp4a.u32.u32 %r4304, %r4305, %r5778, %r4300; + shf.l.wrap.b32 %r7687, %r7640, %r7639, %r7586; // end inline asm - ld.const.u32 %r4309, [matrix+2596]; // begin inline asm - dp4a.u32.u32 %r4308, %r4309, %r5782, %r4304; + shf.l.wrap.b32 %r7691, %r7639, %r7640, %r7586; // end inline asm - ld.const.u32 %r4313, [matrix+2600]; + xor.b32 %r8121, %r7687, %r7675; + xor.b32 %r8122, %r7691, %r7676; + xor.b32 %r7954, %r29626, %r8121; + xor.b32 %r7957, %r29627, %r8122; + xor.b32 %r7861, %r29624, %r8121; + xor.b32 %r7860, %r29625, %r8122; + xor.b32 %r7908, %r29622, %r8121; + xor.b32 %r7909, %r29623, %r8122; + xor.b32 %r7813, %r29620, %r8121; + xor.b32 %r7812, %r29621, %r8122; + xor.b32 %r7764, %r29618, %r8121; + xor.b32 %r7765, %r29619, %r8122; // begin inline asm - dp4a.u32.u32 %r4312, %r4313, %r5786, %r4308; + shf.l.wrap.b32 %r7695, %r7652, %r7651, %r7586; // end inline asm - ld.const.u32 %r4317, [matrix+2604]; // begin inline asm - dp4a.u32.u32 %r4316, %r4317, %r5790, %r4312; + shf.l.wrap.b32 %r7699, %r7651, %r7652, %r7586; // end inline asm - ld.const.u32 %r4321, [matrix+2608]; + xor.b32 %r8123, %r7695, %r7627; + xor.b32 %r8124, %r7699, %r7628; + xor.b32 %r7916, %r29638, %r8123; + xor.b32 %r7917, %r29639, %r8124; + xor.b32 %r7733, %r29636, %r8123; + xor.b32 %r7732, %r29637, %r8124; + xor.b32 %r7892, %r29616, %r8123; + xor.b32 %r7893, %r29617, %r8124; + xor.b32 %r7853, %r29614, %r8123; + xor.b32 %r7852, %r29615, %r8124; + xor.b32 %r7836, %r29612, %r8123; + xor.b32 %r7837, %r29613, %r8124; // begin inline asm - dp4a.u32.u32 %r4320, %r4321, %r5794, %r4316; + shf.l.wrap.b32 %r7703, %r7664, %r7663, %r7586; // end inline asm - ld.const.u32 %r4325, [matrix+2612]; // begin inline asm - dp4a.u32.u32 %r4324, %r4325, %r5798, %r4320; + shf.l.wrap.b32 %r7707, %r7663, %r7664, %r7586; // end inline asm - ld.const.u32 %r4329, [matrix+2616]; + xor.b32 %r8125, %r7703, %r7639; + xor.b32 %r8126, %r7707, %r7640; + xor.b32 %r7773, %r29634, %r8125; + xor.b32 %r7772, %r29635, %r8126; + xor.b32 %r7900, %r29632, %r8125; + xor.b32 %r7901, %r29633, %r8126; + xor.b32 %r7781, %r29610, %r8125; + xor.b32 %r7780, %r29611, %r8126; + xor.b32 %r7884, %r29608, %r8125; + xor.b32 %r7885, %r29609, %r8126; + xor.b32 %r7749, %r29606, %r8125; + xor.b32 %r7748, %r29607, %r8126; // begin inline asm - dp4a.u32.u32 %r4328, %r4329, %r5802, %r4324; + shf.l.wrap.b32 %r7711, %r7676, %r7675, %r7586; // end inline asm - ld.const.u32 %r4333, [matrix+2620]; // begin inline asm - dp4a.u32.u32 %r4332, %r4333, %r5806, %r4328; + shf.l.wrap.b32 %r7715, %r7675, %r7676, %r7586; // end inline asm - ld.const.u32 %r4337, [matrix+2624]; + xor.b32 %r8127, %r7711, %r7651; + xor.b32 %r8128, %r7715, %r7652; + xor.b32 %r7868, %r29630, %r8127; + xor.b32 %r7869, %r29631, %r8128; + xor.b32 %r7845, %r29604, %r8127; + xor.b32 %r7844, %r29605, %r8128; + xor.b32 %r7788, %r29602, %r8127; + xor.b32 %r7789, %r29603, %r8128; + xor.b32 %r7876, %r29600, %r8127; + xor.b32 %r7877, %r29601, %r8128; + xor.b32 %r7805, %r29598, %r8127; + xor.b32 %r7804, %r29599, %r8128; // begin inline asm - dp4a.u32.u32 %r4336, %r4337, %r5746, %r6244; + shf.l.wrap.b32 %r7719, %r7628, %r7627, %r7586; // end inline asm - ld.const.u32 %r4341, [matrix+2628]; // begin inline asm - dp4a.u32.u32 %r4340, %r4341, %r5750, %r4336; + shf.l.wrap.b32 %r7723, %r7627, %r7628, %r7586; // end inline asm - 
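// The lop3.b32 LUT immediates fold whole Keccak steps into one instruction
// per 32-bit half: 0x96 computes a ^ b ^ c (theta's column parity, two
// cascaded lop3 per half for each five-lane XOR), and 0xD2 computes
// a ^ (~b & c) (chi). Each rho rotation (offsets 1, 44, 20, 61, 39, 18, 62,
// 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, matching
// Keccak's rho table; theta's C words are also rotated by 1) acts on a
// 64-bit lane split across a register pair, e.g. for r < 32:
//   shf.l.wrap.b32 hi_out, lo, hi, r;   // new high half
//   shf.l.wrap.b32 lo_out, hi, lo, r;   // new low half
// For r >= 32 the two halves simply swap roles, since shf reduces the shift
// amount mod 32.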
ld.const.u32 %r4345, [matrix+2632]; + xor.b32 %r8129, %r7719, %r7663; + xor.b32 %r8130, %r7723, %r7664; + xor.b32 %r7820, %r29628, %r8129; + xor.b32 %r7821, %r29629, %r8130; + xor.b32 %r7740, %r29596, %r8129; + xor.b32 %r7741, %r29597, %r8130; + xor.b32 %r7757, %r29594, %r8129; + xor.b32 %r7756, %r29595, %r8130; + xor.b32 %r7796, %r29592, %r8129; + xor.b32 %r7797, %r29593, %r8130; + xor.b32 %r7828, %r29590, %r8129; + xor.b32 %r7829, %r29591, %r8130; + mov.u32 %r7734, 44; // begin inline asm - dp4a.u32.u32 %r4344, %r4345, %r5754, %r4340; + shf.l.wrap.b32 %r7727, %r7733, %r7732, %r7734; // end inline asm - ld.const.u32 %r4349, [matrix+2636]; // begin inline asm - dp4a.u32.u32 %r4348, %r4349, %r5758, %r4344; + shf.l.wrap.b32 %r7731, %r7732, %r7733, %r7734; // end inline asm - ld.const.u32 %r4353, [matrix+2640]; + mov.u32 %r7742, 20; // begin inline asm - dp4a.u32.u32 %r4352, %r4353, %r5762, %r4348; + shf.l.wrap.b32 %r7735, %r7741, %r7740, %r7742; // end inline asm - ld.const.u32 %r4357, [matrix+2644]; // begin inline asm - dp4a.u32.u32 %r4356, %r4357, %r5766, %r4352; + shf.l.wrap.b32 %r7739, %r7740, %r7741, %r7742; // end inline asm - ld.const.u32 %r4361, [matrix+2648]; + mov.u32 %r7750, 61; // begin inline asm - dp4a.u32.u32 %r4360, %r4361, %r5770, %r4356; + shf.l.wrap.b32 %r7743, %r7749, %r7748, %r7750; // end inline asm - ld.const.u32 %r4365, [matrix+2652]; // begin inline asm - dp4a.u32.u32 %r4364, %r4365, %r5774, %r4360; + shf.l.wrap.b32 %r7747, %r7748, %r7749, %r7750; // end inline asm - ld.const.u32 %r4369, [matrix+2656]; + mov.u32 %r7758, 39; // begin inline asm - dp4a.u32.u32 %r4368, %r4369, %r5778, %r4364; + shf.l.wrap.b32 %r7751, %r7757, %r7756, %r7758; // end inline asm - ld.const.u32 %r4373, [matrix+2660]; // begin inline asm - dp4a.u32.u32 %r4372, %r4373, %r5782, %r4368; + shf.l.wrap.b32 %r7755, %r7756, %r7757, %r7758; // end inline asm - ld.const.u32 %r4377, [matrix+2664]; + mov.u32 %r7766, 18; // begin inline asm - dp4a.u32.u32 %r4376, %r4377, %r5786, %r4372; + shf.l.wrap.b32 %r7759, %r7765, %r7764, %r7766; // end inline asm - ld.const.u32 %r4381, [matrix+2668]; // begin inline asm - dp4a.u32.u32 %r4380, %r4381, %r5790, %r4376; + shf.l.wrap.b32 %r7763, %r7764, %r7765, %r7766; // end inline asm - ld.const.u32 %r4385, [matrix+2672]; + mov.u32 %r7774, 62; // begin inline asm - dp4a.u32.u32 %r4384, %r4385, %r5794, %r4380; + shf.l.wrap.b32 %r7767, %r7773, %r7772, %r7774; // end inline asm - ld.const.u32 %r4389, [matrix+2676]; // begin inline asm - dp4a.u32.u32 %r4388, %r4389, %r5798, %r4384; + shf.l.wrap.b32 %r7771, %r7772, %r7773, %r7774; // end inline asm - ld.const.u32 %r4393, [matrix+2680]; + mov.u32 %r7782, 43; // begin inline asm - dp4a.u32.u32 %r4392, %r4393, %r5802, %r4388; + shf.l.wrap.b32 %r7775, %r7781, %r7780, %r7782; // end inline asm - ld.const.u32 %r4397, [matrix+2684]; // begin inline asm - dp4a.u32.u32 %r4396, %r4397, %r5806, %r4392; + shf.l.wrap.b32 %r7779, %r7780, %r7781, %r7782; // end inline asm - shr.u32 %r6054, %r4332, 6; - and.b32 %r6055, %r6054, 240; - shr.u32 %r6056, %r4396, 10; - or.b32 %r6057, %r6056, %r6055; - cvt.u64.u32 %rd223, %r6057; - xor.b64 %rd224, %rd16, %rd223; - ld.const.u32 %r4401, [matrix+2688]; + mov.u32 %r7790, 25; // begin inline asm - dp4a.u32.u32 %r4400, %r4401, %r5746, %r6244; + shf.l.wrap.b32 %r7783, %r7789, %r7788, %r7790; // end inline asm - ld.const.u32 %r4405, [matrix+2692]; // begin inline asm - dp4a.u32.u32 %r4404, %r4405, %r5750, %r4400; + shf.l.wrap.b32 %r7787, %r7788, %r7789, %r7790; // end inline asm - ld.const.u32 %r4409, 
[matrix+2696]; + mov.u32 %r7798, 8; // begin inline asm - dp4a.u32.u32 %r4408, %r4409, %r5754, %r4404; + shf.l.wrap.b32 %r7791, %r7797, %r7796, %r7798; // end inline asm - ld.const.u32 %r4413, [matrix+2700]; // begin inline asm - dp4a.u32.u32 %r4412, %r4413, %r5758, %r4408; + shf.l.wrap.b32 %r7795, %r7796, %r7797, %r7798; // end inline asm - ld.const.u32 %r4417, [matrix+2704]; + mov.u32 %r7806, 56; // begin inline asm - dp4a.u32.u32 %r4416, %r4417, %r5762, %r4412; + shf.l.wrap.b32 %r7799, %r7805, %r7804, %r7806; // end inline asm - ld.const.u32 %r4421, [matrix+2708]; // begin inline asm - dp4a.u32.u32 %r4420, %r4421, %r5766, %r4416; + shf.l.wrap.b32 %r7803, %r7804, %r7805, %r7806; // end inline asm - ld.const.u32 %r4425, [matrix+2712]; + mov.u32 %r7814, 41; // begin inline asm - dp4a.u32.u32 %r4424, %r4425, %r5770, %r4420; + shf.l.wrap.b32 %r7807, %r7813, %r7812, %r7814; // end inline asm - ld.const.u32 %r4429, [matrix+2716]; // begin inline asm - dp4a.u32.u32 %r4428, %r4429, %r5774, %r4424; + shf.l.wrap.b32 %r7811, %r7812, %r7813, %r7814; // end inline asm - ld.const.u32 %r4433, [matrix+2720]; + mov.u32 %r7822, 27; // begin inline asm - dp4a.u32.u32 %r4432, %r4433, %r5778, %r4428; + shf.l.wrap.b32 %r7815, %r7821, %r7820, %r7822; // end inline asm - ld.const.u32 %r4437, [matrix+2724]; // begin inline asm - dp4a.u32.u32 %r4436, %r4437, %r5782, %r4432; + shf.l.wrap.b32 %r7819, %r7820, %r7821, %r7822; // end inline asm - ld.const.u32 %r4441, [matrix+2728]; + mov.u32 %r7830, 14; // begin inline asm - dp4a.u32.u32 %r4440, %r4441, %r5786, %r4436; + shf.l.wrap.b32 %r7823, %r7829, %r7828, %r7830; // end inline asm - ld.const.u32 %r4445, [matrix+2732]; // begin inline asm - dp4a.u32.u32 %r4444, %r4445, %r5790, %r4440; + shf.l.wrap.b32 %r7827, %r7828, %r7829, %r7830; // end inline asm - ld.const.u32 %r4449, [matrix+2736]; + mov.u32 %r7838, 2; // begin inline asm - dp4a.u32.u32 %r4448, %r4449, %r5794, %r4444; + shf.l.wrap.b32 %r7831, %r7837, %r7836, %r7838; // end inline asm - ld.const.u32 %r4453, [matrix+2740]; // begin inline asm - dp4a.u32.u32 %r4452, %r4453, %r5798, %r4448; + shf.l.wrap.b32 %r7835, %r7836, %r7837, %r7838; // end inline asm - ld.const.u32 %r4457, [matrix+2744]; + mov.u32 %r7846, 55; // begin inline asm - dp4a.u32.u32 %r4456, %r4457, %r5802, %r4452; + shf.l.wrap.b32 %r7839, %r7845, %r7844, %r7846; // end inline asm - ld.const.u32 %r4461, [matrix+2748]; // begin inline asm - dp4a.u32.u32 %r4460, %r4461, %r5806, %r4456; + shf.l.wrap.b32 %r7843, %r7844, %r7845, %r7846; // end inline asm - ld.const.u32 %r4465, [matrix+2752]; + mov.u32 %r7854, 45; // begin inline asm - dp4a.u32.u32 %r4464, %r4465, %r5746, %r6244; + shf.l.wrap.b32 %r7847, %r7853, %r7852, %r7854; // end inline asm - ld.const.u32 %r4469, [matrix+2756]; // begin inline asm - dp4a.u32.u32 %r4468, %r4469, %r5750, %r4464; + shf.l.wrap.b32 %r7851, %r7852, %r7853, %r7854; // end inline asm - ld.const.u32 %r4473, [matrix+2760]; + mov.u32 %r7862, 36; // begin inline asm - dp4a.u32.u32 %r4472, %r4473, %r5754, %r4468; + shf.l.wrap.b32 %r7855, %r7861, %r7860, %r7862; // end inline asm - ld.const.u32 %r4477, [matrix+2764]; // begin inline asm - dp4a.u32.u32 %r4476, %r4477, %r5758, %r4472; + shf.l.wrap.b32 %r7859, %r7860, %r7861, %r7862; // end inline asm - ld.const.u32 %r4481, [matrix+2768]; + mov.u32 %r7870, 28; // begin inline asm - dp4a.u32.u32 %r4480, %r4481, %r5762, %r4476; + shf.l.wrap.b32 %r7863, %r7869, %r7868, %r7870; // end inline asm - ld.const.u32 %r4485, [matrix+2772]; // begin inline asm - dp4a.u32.u32 %r4484, %r4485, 
%r5766, %r4480; + shf.l.wrap.b32 %r7867, %r7868, %r7869, %r7870; // end inline asm - ld.const.u32 %r4489, [matrix+2776]; + mov.u32 %r7878, 21; // begin inline asm - dp4a.u32.u32 %r4488, %r4489, %r5770, %r4484; + shf.l.wrap.b32 %r7871, %r7877, %r7876, %r7878; // end inline asm - ld.const.u32 %r4493, [matrix+2780]; // begin inline asm - dp4a.u32.u32 %r4492, %r4493, %r5774, %r4488; + shf.l.wrap.b32 %r7875, %r7876, %r7877, %r7878; // end inline asm - ld.const.u32 %r4497, [matrix+2784]; + mov.u32 %r7886, 15; // begin inline asm - dp4a.u32.u32 %r4496, %r4497, %r5778, %r4492; + shf.l.wrap.b32 %r7879, %r7885, %r7884, %r7886; // end inline asm - ld.const.u32 %r4501, [matrix+2788]; // begin inline asm - dp4a.u32.u32 %r4500, %r4501, %r5782, %r4496; + shf.l.wrap.b32 %r7883, %r7884, %r7885, %r7886; // end inline asm - ld.const.u32 %r4505, [matrix+2792]; + mov.u32 %r7894, 10; // begin inline asm - dp4a.u32.u32 %r4504, %r4505, %r5786, %r4500; + shf.l.wrap.b32 %r7887, %r7893, %r7892, %r7894; // end inline asm - ld.const.u32 %r4509, [matrix+2796]; // begin inline asm - dp4a.u32.u32 %r4508, %r4509, %r5790, %r4504; + shf.l.wrap.b32 %r7891, %r7892, %r7893, %r7894; // end inline asm - ld.const.u32 %r4513, [matrix+2800]; + mov.u32 %r7902, 6; // begin inline asm - dp4a.u32.u32 %r4512, %r4513, %r5794, %r4508; + shf.l.wrap.b32 %r7895, %r7901, %r7900, %r7902; // end inline asm - ld.const.u32 %r4517, [matrix+2804]; // begin inline asm - dp4a.u32.u32 %r4516, %r4517, %r5798, %r4512; + shf.l.wrap.b32 %r7899, %r7900, %r7901, %r7902; // end inline asm - ld.const.u32 %r4521, [matrix+2808]; + mov.u32 %r7910, 3; // begin inline asm - dp4a.u32.u32 %r4520, %r4521, %r5802, %r4516; + shf.l.wrap.b32 %r7903, %r7909, %r7908, %r7910; // end inline asm - ld.const.u32 %r4525, [matrix+2812]; // begin inline asm - dp4a.u32.u32 %r4524, %r4525, %r5806, %r4520; + shf.l.wrap.b32 %r7907, %r7908, %r7909, %r7910; // end inline asm - shr.u32 %r6058, %r4460, 6; - and.b32 %r6059, %r6058, 240; - shr.u32 %r6060, %r4524, 10; - or.b32 %r6061, %r6060, %r6059; - cvt.u64.u32 %rd225, %r6061; - xor.b64 %rd226, %rd17, %rd225; - ld.const.u32 %r4529, [matrix+2816]; // begin inline asm - dp4a.u32.u32 %r4528, %r4529, %r5746, %r6244; + shf.l.wrap.b32 %r7911, %r7917, %r7916, %r7586; // end inline asm - ld.const.u32 %r4533, [matrix+2820]; // begin inline asm - dp4a.u32.u32 %r4532, %r4533, %r5750, %r4528; + shf.l.wrap.b32 %r7915, %r7916, %r7917, %r7586; // end inline asm - ld.const.u32 %r4537, [matrix+2824]; // begin inline asm - dp4a.u32.u32 %r4536, %r4537, %r5754, %r4532; + // chi + lop3.b32 %r7919, %r7954, %r7727, %r7775, 0xD2; + lop3.b32 %r7920, %r7957, %r7731, %r7779, 0xD2; // end inline asm - ld.const.u32 %r4541, [matrix+2828]; // begin inline asm - dp4a.u32.u32 %r4540, %r4541, %r5758, %r4536; + // chi + lop3.b32 %r29638, %r7727, %r7775, %r7871, 0xD2; + lop3.b32 %r29639, %r7731, %r7779, %r7875, 0xD2; // end inline asm - ld.const.u32 %r4545, [matrix+2832]; // begin inline asm - dp4a.u32.u32 %r4544, %r4545, %r5762, %r4540; + // chi + lop3.b32 %r29634, %r7775, %r7871, %r7823, 0xD2; + lop3.b32 %r29635, %r7779, %r7875, %r7827, 0xD2; // end inline asm - ld.const.u32 %r4549, [matrix+2836]; // begin inline asm - dp4a.u32.u32 %r4548, %r4549, %r5766, %r4544; + // chi + lop3.b32 %r29630, %r7871, %r7823, %r7954, 0xD2; + lop3.b32 %r29631, %r7875, %r7827, %r7957, 0xD2; // end inline asm - ld.const.u32 %r4553, [matrix+2840]; // begin inline asm - dp4a.u32.u32 %r4552, %r4553, %r5770, %r4548; + // chi + lop3.b32 %r29628, %r7823, %r7954, %r7727, 0xD2; + lop3.b32 %r29629, 
%r7827, %r7957, %r7731, 0xD2; // end inline asm - ld.const.u32 %r4557, [matrix+2844]; // begin inline asm - dp4a.u32.u32 %r4556, %r4557, %r5774, %r4552; + // chi + lop3.b32 %r29624, %r7863, %r7735, %r7903, 0xD2; + lop3.b32 %r29625, %r7867, %r7739, %r7907, 0xD2; // end inline asm - ld.const.u32 %r4561, [matrix+2848]; // begin inline asm - dp4a.u32.u32 %r4560, %r4561, %r5778, %r4556; + // chi + lop3.b32 %r29636, %r7735, %r7903, %r7847, 0xD2; + lop3.b32 %r29637, %r7739, %r7907, %r7851, 0xD2; // end inline asm - ld.const.u32 %r4565, [matrix+2852]; // begin inline asm - dp4a.u32.u32 %r4564, %r4565, %r5782, %r4560; + // chi + lop3.b32 %r29632, %r7903, %r7847, %r7743, 0xD2; + lop3.b32 %r29633, %r7907, %r7851, %r7747, 0xD2; // end inline asm - ld.const.u32 %r4569, [matrix+2856]; // begin inline asm - dp4a.u32.u32 %r4568, %r4569, %r5786, %r4564; + // chi + lop3.b32 %r29604, %r7847, %r7743, %r7863, 0xD2; + lop3.b32 %r29605, %r7851, %r7747, %r7867, 0xD2; // end inline asm - ld.const.u32 %r4573, [matrix+2860]; + st.local.v2.u32 [%rd55+88], {%r29604, %r29605}; // begin inline asm - dp4a.u32.u32 %r4572, %r4573, %r5790, %r4568; + // chi + lop3.b32 %r29596, %r7743, %r7863, %r7735, 0xD2; + lop3.b32 %r29597, %r7747, %r7867, %r7739, 0xD2; // end inline asm - ld.const.u32 %r4577, [matrix+2864]; + st.local.v2.u32 [%rd55+96], {%r29596, %r29597}; // begin inline asm - dp4a.u32.u32 %r4576, %r4577, %r5794, %r4572; + // chi + lop3.b32 %r29622, %r7911, %r7895, %r7783, 0xD2; + lop3.b32 %r29623, %r7915, %r7899, %r7787, 0xD2; // end inline asm - ld.const.u32 %r4581, [matrix+2868]; + st.local.v2.u32 [%rd55+104], {%r29622, %r29623}; // begin inline asm - dp4a.u32.u32 %r4580, %r4581, %r5798, %r4576; + // chi + lop3.b32 %r29616, %r7895, %r7783, %r7791, 0xD2; + lop3.b32 %r29617, %r7899, %r7787, %r7795, 0xD2; // end inline asm - ld.const.u32 %r4585, [matrix+2872]; + st.local.v2.u32 [%rd55+112], {%r29616, %r29617}; // begin inline asm - dp4a.u32.u32 %r4584, %r4585, %r5802, %r4580; + // chi + lop3.b32 %r29610, %r7783, %r7791, %r7759, 0xD2; + lop3.b32 %r29611, %r7787, %r7795, %r7763, 0xD2; // end inline asm - ld.const.u32 %r4589, [matrix+2876]; + st.local.v2.u32 [%rd55+120], {%r29610, %r29611}; // begin inline asm - dp4a.u32.u32 %r4588, %r4589, %r5806, %r4584; + // chi + lop3.b32 %r29602, %r7791, %r7759, %r7911, 0xD2; + lop3.b32 %r29603, %r7795, %r7763, %r7915, 0xD2; // end inline asm - ld.const.u32 %r4593, [matrix+2880]; + st.local.v2.u32 [%rd55+128], {%r29602, %r29603}; // begin inline asm - dp4a.u32.u32 %r4592, %r4593, %r5746, %r6244; + // chi + lop3.b32 %r29594, %r7759, %r7911, %r7895, 0xD2; + lop3.b32 %r29595, %r7763, %r7915, %r7899, 0xD2; // end inline asm - ld.const.u32 %r4597, [matrix+2884]; + st.local.v2.u32 [%rd55+136], {%r29594, %r29595}; // begin inline asm - dp4a.u32.u32 %r4596, %r4597, %r5750, %r4592; + // chi + lop3.b32 %r29620, %r7815, %r7855, %r7887, 0xD2; + lop3.b32 %r29621, %r7819, %r7859, %r7891, 0xD2; // end inline asm - ld.const.u32 %r4601, [matrix+2888]; + st.local.v2.u32 [%rd55+144], {%r29620, %r29621}; // begin inline asm - dp4a.u32.u32 %r4600, %r4601, %r5754, %r4596; + // chi + lop3.b32 %r29614, %r7855, %r7887, %r7879, 0xD2; + lop3.b32 %r29615, %r7859, %r7891, %r7883, 0xD2; // end inline asm - ld.const.u32 %r4605, [matrix+2892]; + st.local.v2.u32 [%rd55+152], {%r29614, %r29615}; // begin inline asm - dp4a.u32.u32 %r4604, %r4605, %r5758, %r4600; + // chi + lop3.b32 %r29608, %r7887, %r7879, %r7799, 0xD2; + lop3.b32 %r29609, %r7891, %r7883, %r7803, 0xD2; // end inline asm - ld.const.u32 %r4609, 
[matrix+2896]; + st.local.v2.u32 [%rd55+160], {%r29608, %r29609}; // begin inline asm - dp4a.u32.u32 %r4608, %r4609, %r5762, %r4604; + // chi + lop3.b32 %r29600, %r7879, %r7799, %r7815, 0xD2; + lop3.b32 %r29601, %r7883, %r7803, %r7819, 0xD2; // end inline asm - ld.const.u32 %r4613, [matrix+2900]; + st.local.v2.u32 [%rd55+168], {%r29600, %r29601}; // begin inline asm - dp4a.u32.u32 %r4612, %r4613, %r5766, %r4608; + // chi + lop3.b32 %r29592, %r7799, %r7815, %r7855, 0xD2; + lop3.b32 %r29593, %r7803, %r7819, %r7859, 0xD2; // end inline asm - ld.const.u32 %r4617, [matrix+2904]; + st.local.v2.u32 [%rd55+176], {%r29592, %r29593}; // begin inline asm - dp4a.u32.u32 %r4616, %r4617, %r5770, %r4612; + // chi + lop3.b32 %r29618, %r7767, %r7839, %r7751, 0xD2; + lop3.b32 %r29619, %r7771, %r7843, %r7755, 0xD2; // end inline asm - ld.const.u32 %r4621, [matrix+2908]; + st.local.v2.u32 [%rd55+184], {%r29618, %r29619}; // begin inline asm - dp4a.u32.u32 %r4620, %r4621, %r5774, %r4616; + // chi + lop3.b32 %r29612, %r7839, %r7751, %r7807, 0xD2; + lop3.b32 %r29613, %r7843, %r7755, %r7811, 0xD2; // end inline asm - ld.const.u32 %r4625, [matrix+2912]; + st.local.v2.u32 [%rd55+192], {%r29612, %r29613}; // begin inline asm - dp4a.u32.u32 %r4624, %r4625, %r5778, %r4620; + // chi + lop3.b32 %r29606, %r7751, %r7807, %r7831, 0xD2; + lop3.b32 %r29607, %r7755, %r7811, %r7835, 0xD2; // end inline asm - ld.const.u32 %r4629, [matrix+2916]; + st.local.v2.u32 [%rd55+200], {%r29606, %r29607}; // begin inline asm - dp4a.u32.u32 %r4628, %r4629, %r5782, %r4624; + // chi + lop3.b32 %r29598, %r7807, %r7831, %r7767, 0xD2; + lop3.b32 %r29599, %r7811, %r7835, %r7771, 0xD2; // end inline asm - ld.const.u32 %r4633, [matrix+2920]; + st.local.v2.u32 [%rd55+208], {%r29598, %r29599}; // begin inline asm - dp4a.u32.u32 %r4632, %r4633, %r5786, %r4628; + // chi + lop3.b32 %r29590, %r7831, %r7767, %r7839, 0xD2; + lop3.b32 %r29591, %r7835, %r7771, %r7843, 0xD2; // end inline asm - ld.const.u32 %r4637, [matrix+2924]; + st.local.v2.u32 [%rd55+216], {%r29590, %r29591}; + mul.wide.s32 %rd518, %r29640, 8; + add.s64 %rd517, %rd497, %rd518; // begin inline asm - dp4a.u32.u32 %r4636, %r4637, %r5790, %r4632; + ld.global.nc.v2.u32 {%r8119,%r8120}, [%rd517]; // end inline asm - ld.const.u32 %r4641, [matrix+2928]; + xor.b32 %r29626, %r7919, %r8119; + xor.b32 %r29627, %r7920, %r8120; + add.s32 %r29640, %r29640, 1; + setp.lt.u32 %p19, %r29640, 23; + @%p19 bra $L__BB2_25; + + mov.u32 %r29673, 0; + mov.u32 %r8230, 1; + st.local.v2.u32 [%rd55+32], {%r29638, %r29639}; + st.local.v2.u32 [%rd55+72], {%r29636, %r29637}; + st.local.v2.u32 [%rd55+40], {%r29634, %r29635}; + st.local.v2.u32 [%rd55+80], {%r29632, %r29633}; + st.local.v2.u32 [%rd55+48], {%r29630, %r29631}; + st.local.v2.u32 [%rd55+56], {%r29628, %r29629}; + st.local.v2.u32 [%rd55+24], {%r29626, %r29627}; // begin inline asm - dp4a.u32.u32 %r4640, %r4641, %r5794, %r4636; + // xor5 + lop3.b32 %r8131, %r29626, %r29624, %r29622, 0x96; + lop3.b32 %r8131, %r8131, %r29620, %r29618, 0x96; + lop3.b32 %r8132, %r29627, %r29625, %r29623, 0x96; + lop3.b32 %r8132, %r8132, %r29621, %r29619, 0x96; // end inline asm - ld.const.u32 %r4645, [matrix+2932]; // begin inline asm - dp4a.u32.u32 %r4644, %r4645, %r5798, %r4640; + // xor5 + lop3.b32 %r8143, %r29638, %r29636, %r29616, 0x96; + lop3.b32 %r8143, %r8143, %r29614, %r29612, 0x96; + lop3.b32 %r8144, %r29639, %r29637, %r29617, 0x96; + lop3.b32 %r8144, %r8144, %r29615, %r29613, 0x96; // end inline asm - ld.const.u32 %r4649, [matrix+2936]; // begin inline asm - dp4a.u32.u32 
%r4648, %r4649, %r5802, %r4644; + // xor5 + lop3.b32 %r8155, %r29634, %r29632, %r29610, 0x96; + lop3.b32 %r8155, %r8155, %r29608, %r29606, 0x96; + lop3.b32 %r8156, %r29635, %r29633, %r29611, 0x96; + lop3.b32 %r8156, %r8156, %r29609, %r29607, 0x96; // end inline asm - ld.const.u32 %r4653, [matrix+2940]; // begin inline asm - dp4a.u32.u32 %r4652, %r4653, %r5806, %r4648; + // xor5 + lop3.b32 %r8167, %r29630, %r29604, %r29602, 0x96; + lop3.b32 %r8167, %r8167, %r29600, %r29598, 0x96; + lop3.b32 %r8168, %r29631, %r29605, %r29603, 0x96; + lop3.b32 %r8168, %r8168, %r29601, %r29599, 0x96; // end inline asm - shr.u32 %r6062, %r4588, 6; - and.b32 %r6063, %r6062, 240; - shr.u32 %r6064, %r4652, 10; - or.b32 %r6065, %r6064, %r6063; - cvt.u64.u32 %rd227, %r6065; - xor.b64 %rd228, %rd18, %rd227; - ld.const.u32 %r4657, [matrix+2944]; // begin inline asm - dp4a.u32.u32 %r4656, %r4657, %r5746, %r6244; + // xor5 + lop3.b32 %r8179, %r29628, %r29596, %r29594, 0x96; + lop3.b32 %r8179, %r8179, %r29592, %r29590, 0x96; + lop3.b32 %r8180, %r29629, %r29597, %r29595, 0x96; + lop3.b32 %r8180, %r8180, %r29593, %r29591, 0x96; // end inline asm - ld.const.u32 %r4661, [matrix+2948]; // begin inline asm - dp4a.u32.u32 %r4660, %r4661, %r5750, %r4656; + shf.l.wrap.b32 %r8191, %r8144, %r8143, %r8230; // end inline asm - ld.const.u32 %r4665, [matrix+2952]; // begin inline asm - dp4a.u32.u32 %r4664, %r4665, %r5754, %r4660; + shf.l.wrap.b32 %r8195, %r8143, %r8144, %r8230; // end inline asm - ld.const.u32 %r4669, [matrix+2956]; + xor.b32 %r8370, %r8191, %r8179; + xor.b32 %r8371, %r8195, %r8180; + xor.b32 %r8338, %r29626, %r8370; + xor.b32 %r8341, %r29627, %r8371; + xor.b32 %r8301, %r29623, %r8371; + xor.b32 %r8300, %r29622, %r8370; + st.local.v2.u32 [%rd55+104], {%r8300, %r8301}; // begin inline asm - dp4a.u32.u32 %r4668, %r4669, %r5758, %r4664; + shf.l.wrap.b32 %r8199, %r8156, %r8155, %r8230; // end inline asm - ld.const.u32 %r4673, [matrix+2960]; // begin inline asm - dp4a.u32.u32 %r4672, %r4673, %r5762, %r4668; + shf.l.wrap.b32 %r8203, %r8155, %r8156, %r8230; // end inline asm - ld.const.u32 %r4677, [matrix+2964]; + xor.b32 %r8372, %r8199, %r8131; + xor.b32 %r8373, %r8203, %r8132; + xor.b32 %r8237, %r29636, %r8372; + xor.b32 %r8236, %r29637, %r8373; + xor.b32 %r8276, %r29615, %r8373; + xor.b32 %r8277, %r29614, %r8372; + st.local.v2.u32 [%rd55+152], {%r8277, %r8276}; // begin inline asm - dp4a.u32.u32 %r4676, %r4677, %r5766, %r4672; + shf.l.wrap.b32 %r8207, %r8168, %r8167, %r8230; // end inline asm - ld.const.u32 %r4681, [matrix+2968]; // begin inline asm - dp4a.u32.u32 %r4680, %r4681, %r5770, %r4676; + shf.l.wrap.b32 %r8211, %r8167, %r8168, %r8230; // end inline asm - ld.const.u32 %r4685, [matrix+2972]; + xor.b32 %r8374, %r8207, %r8143; + xor.b32 %r8375, %r8211, %r8144; + xor.b32 %r8260, %r29611, %r8375; + xor.b32 %r8261, %r29610, %r8374; + st.local.v2.u32 [%rd55+120], {%r8261, %r8260}; + xor.b32 %r8252, %r29607, %r8375; + xor.b32 %r8253, %r29606, %r8374; + st.local.v2.u32 [%rd55+200], {%r8253, %r8252}; // begin inline asm - dp4a.u32.u32 %r4684, %r4685, %r5774, %r4680; + shf.l.wrap.b32 %r8215, %r8180, %r8179, %r8230; // end inline asm - ld.const.u32 %r4689, [matrix+2976]; // begin inline asm - dp4a.u32.u32 %r4688, %r4689, %r5778, %r4684; + shf.l.wrap.b32 %r8219, %r8179, %r8180, %r8230; // end inline asm - ld.const.u32 %r4693, [matrix+2980]; + xor.b32 %r8376, %r8215, %r8155; + xor.b32 %r8377, %r8219, %r8156; + xor.b32 %r8284, %r29630, %r8376; + xor.b32 %r8285, %r29631, %r8377; + xor.b32 %r8293, %r29601, %r8377; + xor.b32 %r8292, 
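+ // [annotation] Theta step: each "xor5" block is two cascaded lop3.b32
+ // ops with immLut 0x96 (3-input XOR), giving the parity of one column
+ // of five lanes, computed separately for the low and high 32-bit
+ // halves. The shf.l.wrap.b32 pairs that follow rotate a 64-bit lane
+ // left by 1: one funnel shift produces the new high half, the swapped
+ // operand order produces the new low half.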
%r29600, %r8376; + st.local.v2.u32 [%rd55+168], {%r8292, %r8293}; // begin inline asm - dp4a.u32.u32 %r4692, %r4693, %r5782, %r4688; + shf.l.wrap.b32 %r8223, %r8132, %r8131, %r8230; // end inline asm - ld.const.u32 %r4697, [matrix+2984]; // begin inline asm - dp4a.u32.u32 %r4696, %r4697, %r5786, %r4692; + shf.l.wrap.b32 %r8227, %r8131, %r8132, %r8230; // end inline asm - ld.const.u32 %r4701, [matrix+2988]; + xor.b32 %r8378, %r8223, %r8167; + xor.b32 %r8379, %r8227, %r8168; + xor.b32 %r8244, %r29596, %r8378; + xor.b32 %r8245, %r29597, %r8379; + xor.b32 %r8269, %r29591, %r8379; + xor.b32 %r8268, %r29590, %r8378; + st.local.v2.u32 [%rd55+216], {%r8268, %r8269}; // begin inline asm - dp4a.u32.u32 %r4700, %r4701, %r5790, %r4696; + shf.l.wrap.b32 %r8231, %r8237, %r8236, %r7734; // end inline asm - ld.const.u32 %r4705, [matrix+2992]; // begin inline asm - dp4a.u32.u32 %r4704, %r4705, %r5794, %r4700; + shf.l.wrap.b32 %r8235, %r8236, %r8237, %r7734; // end inline asm - ld.const.u32 %r4709, [matrix+2996]; // begin inline asm - dp4a.u32.u32 %r4708, %r4709, %r5798, %r4704; + shf.l.wrap.b32 %r8239, %r8245, %r8244, %r7742; // end inline asm - ld.const.u32 %r4713, [matrix+3000]; // begin inline asm - dp4a.u32.u32 %r4712, %r4713, %r5802, %r4708; + shf.l.wrap.b32 %r8243, %r8244, %r8245, %r7742; // end inline asm - ld.const.u32 %r4717, [matrix+3004]; // begin inline asm - dp4a.u32.u32 %r4716, %r4717, %r5806, %r4712; + shf.l.wrap.b32 %r8251, %r8252, %r8253, %r7750; // end inline asm - ld.const.u32 %r4721, [matrix+3008]; // begin inline asm - dp4a.u32.u32 %r4720, %r4721, %r5746, %r6244; + shf.l.wrap.b32 %r8247, %r8253, %r8252, %r7750; // end inline asm - ld.const.u32 %r4725, [matrix+3012]; + st.local.v2.u32 [%rd55+96], {%r8247, %r8251}; // begin inline asm - dp4a.u32.u32 %r4724, %r4725, %r5750, %r4720; + shf.l.wrap.b32 %r8255, %r8261, %r8260, %r7782; // end inline asm - ld.const.u32 %r4729, [matrix+3016]; // begin inline asm - dp4a.u32.u32 %r4728, %r4729, %r5754, %r4724; + shf.l.wrap.b32 %r8259, %r8260, %r8261, %r7782; // end inline asm - ld.const.u32 %r4733, [matrix+3020]; // begin inline asm - dp4a.u32.u32 %r4732, %r4733, %r5758, %r4728; + shf.l.wrap.b32 %r8263, %r8269, %r8268, %r7830; // end inline asm - ld.const.u32 %r4737, [matrix+3024]; // begin inline asm - dp4a.u32.u32 %r4736, %r4737, %r5762, %r4732; + shf.l.wrap.b32 %r8267, %r8268, %r8269, %r7830; // end inline asm - ld.const.u32 %r4741, [matrix+3028]; // begin inline asm - dp4a.u32.u32 %r4740, %r4741, %r5766, %r4736; + shf.l.wrap.b32 %r8275, %r8276, %r8277, %r7854; // end inline asm - ld.const.u32 %r4745, [matrix+3032]; // begin inline asm - dp4a.u32.u32 %r4744, %r4745, %r5770, %r4740; + shf.l.wrap.b32 %r8271, %r8277, %r8276, %r7854; // end inline asm - ld.const.u32 %r4749, [matrix+3036]; + st.local.v2.u32 [%rd55+88], {%r8271, %r8275}; // begin inline asm - dp4a.u32.u32 %r4748, %r4749, %r5774, %r4744; + shf.l.wrap.b32 %r8279, %r8285, %r8284, %r7870; // end inline asm - ld.const.u32 %r4753, [matrix+3040]; // begin inline asm - dp4a.u32.u32 %r4752, %r4753, %r5778, %r4748; + shf.l.wrap.b32 %r8283, %r8284, %r8285, %r7870; // end inline asm - ld.const.u32 %r4757, [matrix+3044]; // begin inline asm - dp4a.u32.u32 %r4756, %r4757, %r5782, %r4752; + shf.l.wrap.b32 %r8287, %r8293, %r8292, %r7878; // end inline asm - ld.const.u32 %r4761, [matrix+3048]; // begin inline asm - dp4a.u32.u32 %r4760, %r4761, %r5786, %r4756; + shf.l.wrap.b32 %r8291, %r8292, %r8293, %r7878; // end inline asm - ld.const.u32 %r4765, [matrix+3052]; // begin inline asm - dp4a.u32.u32 
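+ // [annotation] Rho/pi: the funnel-shift pairs rotate each lane by its
+ // per-lane offset (held in the %r77xx registers set up earlier), and
+ // the scattered st.local offsets ([%rd55+96], +88, ...) realize the pi
+ // lane permutation in the local copy of the state.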
%r4764, %r4765, %r5790, %r4760; + shf.l.wrap.b32 %r8295, %r8301, %r8300, %r7910; // end inline asm - ld.const.u32 %r4769, [matrix+3056]; // begin inline asm - dp4a.u32.u32 %r4768, %r4769, %r5794, %r4764; + shf.l.wrap.b32 %r8299, %r8300, %r8301, %r7910; // end inline asm - ld.const.u32 %r4773, [matrix+3060]; // begin inline asm - dp4a.u32.u32 %r4772, %r4773, %r5798, %r4768; + // chi + lop3.b32 %r8303, %r8338, %r8231, %r8255, 0xD2; + lop3.b32 %r8304, %r8341, %r8235, %r8259, 0xD2; // end inline asm - ld.const.u32 %r4777, [matrix+3064]; // begin inline asm - dp4a.u32.u32 %r4776, %r4777, %r5802, %r4772; + // chi + lop3.b32 %r29773, %r8231, %r8255, %r8287, 0xD2; + lop3.b32 %r29774, %r8235, %r8259, %r8291, 0xD2; // end inline asm - ld.const.u32 %r4781, [matrix+3068]; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; // begin inline asm - dp4a.u32.u32 %r4780, %r4781, %r5806, %r4776; + // chi + lop3.b32 %r29769, %r8255, %r8287, %r8263, 0xD2; + lop3.b32 %r29770, %r8259, %r8291, %r8267, 0xD2; // end inline asm - shr.u32 %r6066, %r4716, 6; - and.b32 %r6067, %r6066, 240; - ld.const.u32 %r4785, [matrix+3072]; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; // begin inline asm - dp4a.u32.u32 %r4784, %r4785, %r5746, %r6244; + // chi + lop3.b32 %r29765, %r8287, %r8263, %r8338, 0xD2; + lop3.b32 %r29766, %r8291, %r8267, %r8341, 0xD2; // end inline asm - ld.const.u32 %r4789, [matrix+3076]; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; // begin inline asm - dp4a.u32.u32 %r4788, %r4789, %r5750, %r4784; + // chi + lop3.b32 %r29763, %r8263, %r8338, %r8231, 0xD2; + lop3.b32 %r29764, %r8267, %r8341, %r8235, 0xD2; // end inline asm - ld.const.u32 %r4793, [matrix+3080]; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; // begin inline asm - dp4a.u32.u32 %r4792, %r4793, %r5754, %r4788; + // chi + lop3.b32 %r29759, %r8279, %r8239, %r8295, 0xD2; + lop3.b32 %r29760, %r8283, %r8243, %r8299, 0xD2; // end inline asm - ld.const.u32 %r4797, [matrix+3084]; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; // begin inline asm - dp4a.u32.u32 %r4796, %r4797, %r5758, %r4792; + // chi + lop3.b32 %r29771, %r8239, %r8295, %r8271, 0xD2; + lop3.b32 %r29772, %r8243, %r8299, %r8275, 0xD2; // end inline asm - ld.const.u32 %r4801, [matrix+3088]; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; // begin inline asm - dp4a.u32.u32 %r4800, %r4801, %r5762, %r4796; + // chi + lop3.b32 %r29767, %r8295, %r8271, %r8247, 0xD2; + lop3.b32 %r29768, %r8299, %r8275, %r8251, 0xD2; // end inline asm - ld.const.u32 %r4805, [matrix+3092]; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; // begin inline asm - dp4a.u32.u32 %r4804, %r4805, %r5766, %r4800; + ld.global.nc.v2.u32 {%r8367,%r8368}, [%rd498]; // end inline asm - ld.const.u32 %r4809, [matrix+3096]; + xor.b32 %r29761, %r8303, %r8367; + xor.b32 %r29762, %r8304, %r8368; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + add.s64 %rd57, %rd55, 24; + add.s64 %rd58, %rd3, 24; + +$L__BB2_27: + shl.b32 %r8380, %r29673, 2; + cvt.u64.u32 %rd528, %r8380; + and.b64 %rd529, %rd528, 60; + add.s64 %rd530, %rd58, %rd529; + xor.b32 %r8381, %r30, %r29673; + mul.lo.s32 %r8382, %r8381, 16777619; + ld.local.u32 %r8383, [%rd530]; + xor.b32 %r8384, %r8382, %r8383; + mul.wide.u32 %rd531, %r8384, -954391867; + shr.u64 %rd532, %rd531, 32; + cvt.u32.u64 %r8385, %rd532; + sub.s32 %r8386, %r8384, %r8385; + shr.u32 %r8387, %r8386, 1; + add.s32 %r8388, %r8387, %r8385; + shr.u32 %r8389, %r8388, 20; + mul.lo.s32 %r8390, %r8389, 1179641; + sub.s32 %r8391, %r8384, %r8390; + mul.wide.u32 %rd533, %r8391, 64; + add.s64 %rd534, 
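+ // [annotation] $L__BB2_27 mixes dataset items into two 16-word states
+ // (local blocks at %rd3 and %rd55). The index math above is: seed xor
+ // counter, times the 32-bit FNV prime 16777619, then reduced mod
+ // 1179641 (presumably the FishHash light-cache item count) using a
+ // multiply-high reciprocal sequence instead of a hardware divide;
+ // mul.wide.u32 by 64 turns the item index into a byte offset, so the
+ // items fetched are 64-byte hash512 entries.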
%rd471, %rd533; + mul.lo.s32 %r8392, %r29710, 16777619; + ld.global.u32 %r8393, [%rd534]; + xor.b32 %r29710, %r8392, %r8393; + mul.lo.s32 %r8394, %r29711, 16777619; + ld.global.u32 %r8395, [%rd534+4]; + xor.b32 %r29711, %r8394, %r8395; + mul.lo.s32 %r8396, %r29722, 16777619; + ld.global.u32 %r8397, [%rd534+8]; + mul.lo.s32 %r8398, %r29723, 16777619; + ld.global.u32 %r8399, [%rd534+12]; + xor.b32 %r8400, %r8398, %r8399; + xor.b32 %r29722, %r8396, %r8397; + mov.b64 %rd535, {%r29722, %r8400}; + mul.lo.s32 %r8401, %r29718, 16777619; + ld.global.u32 %r8402, [%rd534+16]; + mul.lo.s32 %r8403, %r29719, 16777619; + ld.global.u32 %r8404, [%rd534+20]; + xor.b32 %r8405, %r8403, %r8404; + xor.b32 %r29718, %r8401, %r8402; + mov.b64 %rd536, {%r29718, %r8405}; + mul.lo.s32 %r8406, %r29714, 16777619; + ld.global.u32 %r8407, [%rd534+24]; + mul.lo.s32 %r8408, %r29715, 16777619; + ld.global.u32 %r8409, [%rd534+28]; + xor.b32 %r8410, %r8408, %r8409; + xor.b32 %r29714, %r8406, %r8407; + mov.b64 %rd537, {%r29714, %r8410}; + mul.lo.s32 %r8411, %r29712, 16777619; + ld.global.u32 %r8412, [%rd534+32]; + mul.lo.s32 %r8413, %r29713, 16777619; + ld.global.u32 %r8414, [%rd534+36]; + xor.b32 %r8415, %r8413, %r8414; + xor.b32 %r29712, %r8411, %r8412; + mov.b64 %rd538, {%r29712, %r8415}; + mul.lo.s32 %r8416, %r29708, 16777619; + ld.global.u32 %r8417, [%rd534+40]; + xor.b32 %r29708, %r8416, %r8417; + mul.lo.s32 %r8418, %r29709, 16777619; + ld.global.u32 %r8419, [%rd534+44]; + xor.b32 %r29709, %r8418, %r8419; + mul.lo.s32 %r8420, %r29720, 16777619; + ld.global.u32 %r8421, [%rd534+48]; + mul.lo.s32 %r8422, %r29721, 16777619; + ld.global.u32 %r8423, [%rd534+52]; + xor.b32 %r8424, %r8422, %r8423; + xor.b32 %r29720, %r8420, %r8421; + mov.b64 %rd539, {%r29720, %r8424}; + mul.lo.s32 %r8425, %r29716, 16777619; + ld.global.u32 %r8426, [%rd534+56]; + mul.lo.s32 %r8427, %r29717, 16777619; + ld.global.u32 %r8428, [%rd534+60]; + xor.b32 %r8429, %r8427, %r8428; + xor.b32 %r29716, %r8425, %r8426; + mov.b64 %rd540, {%r29716, %r8429}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; + st.local.v2.u32 [%rd3+32], {%r29722, %r8400}; + st.local.v2.u32 [%rd3+40], {%r29718, %r8405}; + st.local.v2.u32 [%rd3+48], {%r29714, %r8410}; + st.local.v2.u32 [%rd3+56], {%r29712, %r8415}; + st.local.v2.u32 [%rd3+64], {%r29708, %r29709}; + st.local.v2.u32 [%rd3+72], {%r29720, %r8424}; + st.local.v2.u32 [%rd3+80], {%r29716, %r8429}; + add.s64 %rd541, %rd57, %rd529; + xor.b32 %r8430, %r226, %r29673; + mul.lo.s32 %r8431, %r8430, 16777619; + ld.local.u32 %r8432, [%rd541]; + xor.b32 %r8433, %r8431, %r8432; + mul.wide.u32 %rd542, %r8433, -954391867; + shr.u64 %rd543, %rd542, 32; + cvt.u32.u64 %r8434, %rd543; + sub.s32 %r8435, %r8433, %r8434; + shr.u32 %r8436, %r8435, 1; + add.s32 %r8437, %r8436, %r8434; + shr.u32 %r8438, %r8437, 20; + mul.lo.s32 %r8439, %r8438, 1179641; + sub.s32 %r8440, %r8433, %r8439; + mul.wide.u32 %rd544, %r8440, 64; + add.s64 %rd545, %rd471, %rd544; + mul.lo.s32 %r8441, %r29761, 16777619; + ld.global.u32 %r8442, [%rd545]; + xor.b32 %r29761, %r8441, %r8442; + mul.lo.s32 %r8443, %r29762, 16777619; + ld.global.u32 %r8444, [%rd545+4]; + xor.b32 %r29762, %r8443, %r8444; + mul.lo.s32 %r8445, %r29773, 16777619; + ld.global.u32 %r8446, [%rd545+8]; + mul.lo.s32 %r8447, %r29774, 16777619; + ld.global.u32 %r8448, [%rd545+12]; + xor.b32 %r8449, %r8447, %r8448; + xor.b32 %r29773, %r8445, %r8446; + mov.b64 %rd546, {%r29773, %r8449}; + mul.lo.s32 %r8450, %r29769, 16777619; + ld.global.u32 %r8451, [%rd545+16]; + mul.lo.s32 %r8452, %r29770, 16777619; + 
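+ // [annotation] Per item, each state word is updated as
+ // w = (w * 16777619) ^ item_word (Ethash-style FNV mix), 16 words per
+ // 64-byte item; the mov.b64 packs rebuild 64-bit pairs for the local
+ // stores. The same sequence then repeats below for the second state.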
ld.global.u32 %r8453, [%rd545+20]; + xor.b32 %r8454, %r8452, %r8453; + xor.b32 %r29769, %r8450, %r8451; + mov.b64 %rd547, {%r29769, %r8454}; + mul.lo.s32 %r8455, %r29765, 16777619; + ld.global.u32 %r8456, [%rd545+24]; + mul.lo.s32 %r8457, %r29766, 16777619; + ld.global.u32 %r8458, [%rd545+28]; + xor.b32 %r8459, %r8457, %r8458; + xor.b32 %r29765, %r8455, %r8456; + mov.b64 %rd548, {%r29765, %r8459}; + mul.lo.s32 %r8460, %r29763, 16777619; + ld.global.u32 %r8461, [%rd545+32]; + mul.lo.s32 %r8462, %r29764, 16777619; + ld.global.u32 %r8463, [%rd545+36]; + xor.b32 %r8464, %r8462, %r8463; + xor.b32 %r29763, %r8460, %r8461; + mov.b64 %rd549, {%r29763, %r8464}; + mul.lo.s32 %r8465, %r29759, 16777619; + ld.global.u32 %r8466, [%rd545+40]; + xor.b32 %r29759, %r8465, %r8466; + mul.lo.s32 %r8467, %r29760, 16777619; + ld.global.u32 %r8468, [%rd545+44]; + xor.b32 %r29760, %r8467, %r8468; + mul.lo.s32 %r8469, %r29771, 16777619; + ld.global.u32 %r8470, [%rd545+48]; + mul.lo.s32 %r8471, %r29772, 16777619; + ld.global.u32 %r8472, [%rd545+52]; + xor.b32 %r8473, %r8471, %r8472; + xor.b32 %r29771, %r8469, %r8470; + mov.b64 %rd550, {%r29771, %r8473}; + mul.lo.s32 %r8474, %r29767, 16777619; + ld.global.u32 %r8475, [%rd545+56]; + mul.lo.s32 %r8476, %r29768, 16777619; + ld.global.u32 %r8477, [%rd545+60]; + xor.b32 %r8478, %r8476, %r8477; + xor.b32 %r29767, %r8474, %r8475; + mov.b64 %rd551, {%r29767, %r8478}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; + st.local.v2.u32 [%rd55+32], {%r29773, %r8449}; + st.local.v2.u32 [%rd55+40], {%r29769, %r8454}; + st.local.v2.u32 [%rd55+48], {%r29765, %r8459}; + st.local.v2.u32 [%rd55+56], {%r29763, %r8464}; + st.local.v2.u32 [%rd55+64], {%r29759, %r29760}; + st.local.v2.u32 [%rd55+72], {%r29771, %r8473}; + st.local.v2.u32 [%rd55+80], {%r29767, %r8478}; + add.s32 %r29673, %r29673, 1; + setp.lt.u32 %p20, %r29673, 512; + shr.u64 %rd552, %rd535, 32; + cvt.u32.u64 %r29723, %rd552; + shr.u64 %rd553, %rd536, 32; + cvt.u32.u64 %r29719, %rd553; + shr.u64 %rd554, %rd537, 32; + cvt.u32.u64 %r29715, %rd554; + shr.u64 %rd555, %rd538, 32; + cvt.u32.u64 %r29713, %rd555; + shr.u64 %rd556, %rd539, 32; + cvt.u32.u64 %r29721, %rd556; + shr.u64 %rd557, %rd540, 32; + cvt.u32.u64 %r29717, %rd557; + shr.u64 %rd558, %rd546, 32; + cvt.u32.u64 %r29774, %rd558; + shr.u64 %rd559, %rd547, 32; + cvt.u32.u64 %r29770, %rd559; + shr.u64 %rd560, %rd548, 32; + cvt.u32.u64 %r29766, %rd560; + shr.u64 %rd561, %rd549, 32; + cvt.u32.u64 %r29764, %rd561; + shr.u64 %rd562, %rd550, 32; + cvt.u32.u64 %r29772, %rd562; + shr.u64 %rd563, %rd551, 32; + cvt.u32.u64 %r29768, %rd563; + @%p20 bra $L__BB2_27; + + mov.u32 %r29674, 0; + st.local.v2.u32 [%rd3+96], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+104], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+112], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+120], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+128], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+136], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+144], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+152], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+160], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+168], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+176], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+184], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+192], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+200], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+208], {%r29674, %r29674}; + st.local.v2.u32 [%rd3+216], {%r29674, %r29674}; + mov.u32 %r29689, -2147483648; + mov.u32 %r8493, 1; + st.local.v2.u32 [%rd3+88], {%r8493, %r29689}; + mov.u32 
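+ // [annotation] The mix loop runs 512 iterations (setp.lt.u32 ... 512).
+ // On exit, the sponge state above the 64-byte message is zeroed and
+ // lane 8 (byte offset 88 = state base +24 plus 8*8) is set to
+ // 0x8000000000000001, consistent with Keccak pad10*1 for a 64-byte
+ // input at a 72-byte rate.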
%r29675, %r29674; + mov.u32 %r29676, %r29674; + mov.u32 %r29677, %r29674; + mov.u32 %r29678, %r29674; + mov.u32 %r29679, %r29674; + mov.u32 %r29680, %r29674; + mov.u32 %r29681, %r29674; + mov.u32 %r29682, %r29674; + mov.u32 %r29683, %r29674; + mov.u32 %r29684, %r29674; + mov.u32 %r29685, %r29674; + mov.u32 %r29686, %r29674; + mov.u32 %r29687, %r29674; + mov.u32 %r29688, %r8493; + mov.u32 %r29690, %r29674; + mov.u32 %r29691, %r29674; + mov.u32 %r29692, %r29674; + mov.u32 %r29693, %r29674; + mov.u32 %r29694, %r29674; + mov.u32 %r29695, %r29674; + mov.u32 %r29696, %r29674; + mov.u32 %r29697, %r29674; + mov.u32 %r29698, %r29674; + mov.u32 %r29699, %r29674; + mov.u32 %r29700, %r29674; + mov.u32 %r29701, %r29674; + mov.u32 %r29702, %r29674; + mov.u32 %r29703, %r29674; + mov.u32 %r29704, %r29674; + mov.u32 %r29705, %r29674; + mov.u32 %r29706, %r29674; + mov.u32 %r29707, %r29674; + mov.u32 %r29724, %r29674; + +$L__BB2_29: // begin inline asm - dp4a.u32.u32 %r4808, %r4809, %r5770, %r4804; + // xor5 + lop3.b32 %r8520, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r8520, %r8520, %r29704, %r29702, 0x96; + lop3.b32 %r8521, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r8521, %r8521, %r29705, %r29703, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8532, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r8532, %r8532, %r29698, %r29696, 0x96; + lop3.b32 %r8533, %r29723, %r29721, %r29701, 0x96; + lop3.b32 %r8533, %r8533, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r4813, [matrix+3100]; // begin inline asm - dp4a.u32.u32 %r4812, %r4813, %r5774, %r4808; + // xor5 + lop3.b32 %r8544, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r8544, %r8544, %r29692, %r29690, 0x96; + lop3.b32 %r8545, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r8545, %r8545, %r29693, %r29691, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r8556, %r29714, %r29688, %r29686, 0x96; + lop3.b32 %r8556, %r8556, %r29684, %r29682, 0x96; + lop3.b32 %r8557, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r8557, %r8557, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r4817, [matrix+3104]; // begin inline asm - dp4a.u32.u32 %r4816, %r4817, %r5778, %r4812; + // xor5 + lop3.b32 %r8568, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r8568, %r8568, %r29676, %r29674, 0x96; + lop3.b32 %r8569, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r8569, %r8569, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r4821, [matrix+3108]; // begin inline asm - dp4a.u32.u32 %r4820, %r4821, %r5782, %r4816; + shf.l.wrap.b32 %r8580, %r8533, %r8532, %r8493; // end inline asm - ld.const.u32 %r4825, [matrix+3112]; // begin inline asm - dp4a.u32.u32 %r4824, %r4825, %r5786, %r4820; + shf.l.wrap.b32 %r8584, %r8532, %r8533, %r8493; // end inline asm - ld.const.u32 %r4829, [matrix+3116]; + xor.b32 %r9014, %r8580, %r8568; + xor.b32 %r9015, %r8584, %r8569; + xor.b32 %r8847, %r29710, %r9014; + xor.b32 %r8850, %r29711, %r9015; + xor.b32 %r8754, %r29708, %r9014; + xor.b32 %r8753, %r29709, %r9015; + xor.b32 %r8801, %r29706, %r9014; + xor.b32 %r8802, %r29707, %r9015; + xor.b32 %r8706, %r29704, %r9014; + xor.b32 %r8705, %r29705, %r9015; + xor.b32 %r8657, %r29702, %r9014; + xor.b32 %r8658, %r29703, %r9015; // begin inline asm - dp4a.u32.u32 %r4828, %r4829, %r5790, %r4824; + shf.l.wrap.b32 %r8588, %r8545, %r8544, %r8493; // end inline asm - ld.const.u32 %r4833, [matrix+3120]; // begin inline asm - dp4a.u32.u32 %r4832, %r4833, %r5794, %r4828; + shf.l.wrap.b32 %r8592, %r8544, %r8545, %r8493; // end inline asm - ld.const.u32 %r4837, 
[matrix+3124]; + xor.b32 %r9016, %r8588, %r8520; + xor.b32 %r9017, %r8592, %r8521; + xor.b32 %r8809, %r29722, %r9016; + xor.b32 %r8810, %r29723, %r9017; + xor.b32 %r8626, %r29720, %r9016; + xor.b32 %r8625, %r29721, %r9017; + xor.b32 %r8785, %r29700, %r9016; + xor.b32 %r8786, %r29701, %r9017; + xor.b32 %r8746, %r29698, %r9016; + xor.b32 %r8745, %r29699, %r9017; + xor.b32 %r8729, %r29696, %r9016; + xor.b32 %r8730, %r29697, %r9017; // begin inline asm - dp4a.u32.u32 %r4836, %r4837, %r5798, %r4832; + shf.l.wrap.b32 %r8596, %r8557, %r8556, %r8493; // end inline asm - ld.const.u32 %r4841, [matrix+3128]; // begin inline asm - dp4a.u32.u32 %r4840, %r4841, %r5802, %r4836; + shf.l.wrap.b32 %r8600, %r8556, %r8557, %r8493; // end inline asm - ld.const.u32 %r4845, [matrix+3132]; + xor.b32 %r9018, %r8596, %r8532; + xor.b32 %r9019, %r8600, %r8533; + xor.b32 %r8666, %r29718, %r9018; + xor.b32 %r8665, %r29719, %r9019; + xor.b32 %r8793, %r29716, %r9018; + xor.b32 %r8794, %r29717, %r9019; + xor.b32 %r8674, %r29694, %r9018; + xor.b32 %r8673, %r29695, %r9019; + xor.b32 %r8777, %r29692, %r9018; + xor.b32 %r8778, %r29693, %r9019; + xor.b32 %r8642, %r29690, %r9018; + xor.b32 %r8641, %r29691, %r9019; // begin inline asm - dp4a.u32.u32 %r4844, %r4845, %r5806, %r4840; + shf.l.wrap.b32 %r8604, %r8569, %r8568, %r8493; // end inline asm - ld.const.u32 %r4849, [matrix+3136]; // begin inline asm - dp4a.u32.u32 %r4848, %r4849, %r5746, %r6244; + shf.l.wrap.b32 %r8608, %r8568, %r8569, %r8493; // end inline asm - ld.const.u32 %r4853, [matrix+3140]; + xor.b32 %r9020, %r8604, %r8544; + xor.b32 %r9021, %r8608, %r8545; + xor.b32 %r8761, %r29714, %r9020; + xor.b32 %r8762, %r29715, %r9021; + xor.b32 %r8738, %r29688, %r9020; + xor.b32 %r8737, %r29689, %r9021; + xor.b32 %r8681, %r29686, %r9020; + xor.b32 %r8682, %r29687, %r9021; + xor.b32 %r8769, %r29684, %r9020; + xor.b32 %r8770, %r29685, %r9021; + xor.b32 %r8698, %r29682, %r9020; + xor.b32 %r8697, %r29683, %r9021; // begin inline asm - dp4a.u32.u32 %r4852, %r4853, %r5750, %r4848; + shf.l.wrap.b32 %r8612, %r8521, %r8520, %r8493; // end inline asm - ld.const.u32 %r4857, [matrix+3144]; // begin inline asm - dp4a.u32.u32 %r4856, %r4857, %r5754, %r4852; + shf.l.wrap.b32 %r8616, %r8520, %r8521, %r8493; // end inline asm - ld.const.u32 %r4861, [matrix+3148]; + xor.b32 %r9022, %r8612, %r8556; + xor.b32 %r9023, %r8616, %r8557; + xor.b32 %r8713, %r29712, %r9022; + xor.b32 %r8714, %r29713, %r9023; + xor.b32 %r8633, %r29680, %r9022; + xor.b32 %r8634, %r29681, %r9023; + xor.b32 %r8650, %r29678, %r9022; + xor.b32 %r8649, %r29679, %r9023; + xor.b32 %r8689, %r29676, %r9022; + xor.b32 %r8690, %r29677, %r9023; + xor.b32 %r8721, %r29674, %r9022; + xor.b32 %r8722, %r29675, %r9023; + mov.u32 %r8627, 44; // begin inline asm - dp4a.u32.u32 %r4860, %r4861, %r5758, %r4856; + shf.l.wrap.b32 %r8620, %r8626, %r8625, %r8627; // end inline asm - ld.const.u32 %r4865, [matrix+3152]; // begin inline asm - dp4a.u32.u32 %r4864, %r4865, %r5762, %r4860; + shf.l.wrap.b32 %r8624, %r8625, %r8626, %r8627; // end inline asm - ld.const.u32 %r4869, [matrix+3156]; + mov.u32 %r8635, 20; // begin inline asm - dp4a.u32.u32 %r4868, %r4869, %r5766, %r4864; + shf.l.wrap.b32 %r8628, %r8634, %r8633, %r8635; // end inline asm - ld.const.u32 %r4873, [matrix+3160]; // begin inline asm - dp4a.u32.u32 %r4872, %r4873, %r5770, %r4868; + shf.l.wrap.b32 %r8632, %r8633, %r8634, %r8635; // end inline asm - ld.const.u32 %r4877, [matrix+3164]; + mov.u32 %r8643, 61; // begin inline asm - dp4a.u32.u32 %r4876, %r4877, %r5774, %r4872; + 
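+ // [annotation] The immediates loaded in this stretch (44, 20, 61, 39,
+ // 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10,
+ // 6, 3, plus the 1 kept in a register) are the Keccak rho rotation
+ // offsets, one per lane, each applied with a pair of 32-bit funnel
+ // shifts.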
shf.l.wrap.b32 %r8636, %r8642, %r8641, %r8643; // end inline asm - ld.const.u32 %r4881, [matrix+3168]; // begin inline asm - dp4a.u32.u32 %r4880, %r4881, %r5778, %r4876; + shf.l.wrap.b32 %r8640, %r8641, %r8642, %r8643; // end inline asm - ld.const.u32 %r4885, [matrix+3172]; + mov.u32 %r8651, 39; // begin inline asm - dp4a.u32.u32 %r4884, %r4885, %r5782, %r4880; + shf.l.wrap.b32 %r8644, %r8650, %r8649, %r8651; // end inline asm - ld.const.u32 %r4889, [matrix+3176]; // begin inline asm - dp4a.u32.u32 %r4888, %r4889, %r5786, %r4884; + shf.l.wrap.b32 %r8648, %r8649, %r8650, %r8651; // end inline asm - ld.const.u32 %r4893, [matrix+3180]; + mov.u32 %r8659, 18; // begin inline asm - dp4a.u32.u32 %r4892, %r4893, %r5790, %r4888; + shf.l.wrap.b32 %r8652, %r8658, %r8657, %r8659; // end inline asm - ld.const.u32 %r4897, [matrix+3184]; // begin inline asm - dp4a.u32.u32 %r4896, %r4897, %r5794, %r4892; + shf.l.wrap.b32 %r8656, %r8657, %r8658, %r8659; // end inline asm - ld.const.u32 %r4901, [matrix+3188]; + mov.u32 %r8667, 62; // begin inline asm - dp4a.u32.u32 %r4900, %r4901, %r5798, %r4896; + shf.l.wrap.b32 %r8660, %r8666, %r8665, %r8667; // end inline asm - ld.const.u32 %r4905, [matrix+3192]; // begin inline asm - dp4a.u32.u32 %r4904, %r4905, %r5802, %r4900; + shf.l.wrap.b32 %r8664, %r8665, %r8666, %r8667; // end inline asm - ld.const.u32 %r4909, [matrix+3196]; + mov.u32 %r8675, 43; // begin inline asm - dp4a.u32.u32 %r4908, %r4909, %r5806, %r4904; + shf.l.wrap.b32 %r8668, %r8674, %r8673, %r8675; // end inline asm - shr.u32 %r6068, %r4844, 6; - and.b32 %r6069, %r6068, 240; - shr.u32 %r6070, %r4908, 10; - and.b32 %r6071, %r6070, 255; - or.b32 %r6072, %r6071, %r6069; - cvt.u64.u32 %rd229, %r6072; - ld.const.u32 %r4913, [matrix+3200]; // begin inline asm - dp4a.u32.u32 %r4912, %r4913, %r5746, %r6244; + shf.l.wrap.b32 %r8672, %r8673, %r8674, %r8675; // end inline asm - ld.const.u32 %r4917, [matrix+3204]; + mov.u32 %r8683, 25; // begin inline asm - dp4a.u32.u32 %r4916, %r4917, %r5750, %r4912; + shf.l.wrap.b32 %r8676, %r8682, %r8681, %r8683; // end inline asm - ld.const.u32 %r4921, [matrix+3208]; // begin inline asm - dp4a.u32.u32 %r4920, %r4921, %r5754, %r4916; + shf.l.wrap.b32 %r8680, %r8681, %r8682, %r8683; // end inline asm - ld.const.u32 %r4925, [matrix+3212]; + mov.u32 %r8691, 8; // begin inline asm - dp4a.u32.u32 %r4924, %r4925, %r5758, %r4920; + shf.l.wrap.b32 %r8684, %r8690, %r8689, %r8691; // end inline asm - ld.const.u32 %r4929, [matrix+3216]; // begin inline asm - dp4a.u32.u32 %r4928, %r4929, %r5762, %r4924; + shf.l.wrap.b32 %r8688, %r8689, %r8690, %r8691; // end inline asm - ld.const.u32 %r4933, [matrix+3220]; + mov.u32 %r8699, 56; // begin inline asm - dp4a.u32.u32 %r4932, %r4933, %r5766, %r4928; + shf.l.wrap.b32 %r8692, %r8698, %r8697, %r8699; // end inline asm - ld.const.u32 %r4937, [matrix+3224]; // begin inline asm - dp4a.u32.u32 %r4936, %r4937, %r5770, %r4932; + shf.l.wrap.b32 %r8696, %r8697, %r8698, %r8699; // end inline asm - ld.const.u32 %r4941, [matrix+3228]; + mov.u32 %r8707, 41; // begin inline asm - dp4a.u32.u32 %r4940, %r4941, %r5774, %r4936; + shf.l.wrap.b32 %r8700, %r8706, %r8705, %r8707; // end inline asm - ld.const.u32 %r4945, [matrix+3232]; // begin inline asm - dp4a.u32.u32 %r4944, %r4945, %r5778, %r4940; + shf.l.wrap.b32 %r8704, %r8705, %r8706, %r8707; // end inline asm - ld.const.u32 %r4949, [matrix+3236]; + mov.u32 %r8715, 27; // begin inline asm - dp4a.u32.u32 %r4948, %r4949, %r5782, %r4944; + shf.l.wrap.b32 %r8708, %r8714, %r8713, %r8715; // end inline asm - 
ld.const.u32 %r4953, [matrix+3240]; // begin inline asm - dp4a.u32.u32 %r4952, %r4953, %r5786, %r4948; + shf.l.wrap.b32 %r8712, %r8713, %r8714, %r8715; // end inline asm - ld.const.u32 %r4957, [matrix+3244]; + mov.u32 %r8723, 14; // begin inline asm - dp4a.u32.u32 %r4956, %r4957, %r5790, %r4952; + shf.l.wrap.b32 %r8716, %r8722, %r8721, %r8723; // end inline asm - ld.const.u32 %r4961, [matrix+3248]; // begin inline asm - dp4a.u32.u32 %r4960, %r4961, %r5794, %r4956; + shf.l.wrap.b32 %r8720, %r8721, %r8722, %r8723; // end inline asm - ld.const.u32 %r4965, [matrix+3252]; + mov.u32 %r8731, 2; // begin inline asm - dp4a.u32.u32 %r4964, %r4965, %r5798, %r4960; + shf.l.wrap.b32 %r8724, %r8730, %r8729, %r8731; // end inline asm - ld.const.u32 %r4969, [matrix+3256]; // begin inline asm - dp4a.u32.u32 %r4968, %r4969, %r5802, %r4964; + shf.l.wrap.b32 %r8728, %r8729, %r8730, %r8731; // end inline asm - ld.const.u32 %r4973, [matrix+3260]; + mov.u32 %r8739, 55; // begin inline asm - dp4a.u32.u32 %r4972, %r4973, %r5806, %r4968; + shf.l.wrap.b32 %r8732, %r8738, %r8737, %r8739; // end inline asm - ld.const.u32 %r4977, [matrix+3264]; // begin inline asm - dp4a.u32.u32 %r4976, %r4977, %r5746, %r6244; + shf.l.wrap.b32 %r8736, %r8737, %r8738, %r8739; // end inline asm - ld.const.u32 %r4981, [matrix+3268]; + mov.u32 %r8747, 45; // begin inline asm - dp4a.u32.u32 %r4980, %r4981, %r5750, %r4976; + shf.l.wrap.b32 %r8740, %r8746, %r8745, %r8747; // end inline asm - ld.const.u32 %r4985, [matrix+3272]; // begin inline asm - dp4a.u32.u32 %r4984, %r4985, %r5754, %r4980; + shf.l.wrap.b32 %r8744, %r8745, %r8746, %r8747; // end inline asm - ld.const.u32 %r4989, [matrix+3276]; + mov.u32 %r8755, 36; // begin inline asm - dp4a.u32.u32 %r4988, %r4989, %r5758, %r4984; + shf.l.wrap.b32 %r8748, %r8754, %r8753, %r8755; // end inline asm - ld.const.u32 %r4993, [matrix+3280]; // begin inline asm - dp4a.u32.u32 %r4992, %r4993, %r5762, %r4988; + shf.l.wrap.b32 %r8752, %r8753, %r8754, %r8755; // end inline asm - ld.const.u32 %r4997, [matrix+3284]; + mov.u32 %r8763, 28; // begin inline asm - dp4a.u32.u32 %r4996, %r4997, %r5766, %r4992; + shf.l.wrap.b32 %r8756, %r8762, %r8761, %r8763; // end inline asm - ld.const.u32 %r5001, [matrix+3288]; // begin inline asm - dp4a.u32.u32 %r5000, %r5001, %r5770, %r4996; + shf.l.wrap.b32 %r8760, %r8761, %r8762, %r8763; // end inline asm - ld.const.u32 %r5005, [matrix+3292]; + mov.u32 %r8771, 21; // begin inline asm - dp4a.u32.u32 %r5004, %r5005, %r5774, %r5000; + shf.l.wrap.b32 %r8764, %r8770, %r8769, %r8771; // end inline asm - ld.const.u32 %r5009, [matrix+3296]; // begin inline asm - dp4a.u32.u32 %r5008, %r5009, %r5778, %r5004; + shf.l.wrap.b32 %r8768, %r8769, %r8770, %r8771; // end inline asm - ld.const.u32 %r5013, [matrix+3300]; + mov.u32 %r8779, 15; // begin inline asm - dp4a.u32.u32 %r5012, %r5013, %r5782, %r5008; + shf.l.wrap.b32 %r8772, %r8778, %r8777, %r8779; // end inline asm - ld.const.u32 %r5017, [matrix+3304]; // begin inline asm - dp4a.u32.u32 %r5016, %r5017, %r5786, %r5012; + shf.l.wrap.b32 %r8776, %r8777, %r8778, %r8779; // end inline asm - ld.const.u32 %r5021, [matrix+3308]; + mov.u32 %r8787, 10; // begin inline asm - dp4a.u32.u32 %r5020, %r5021, %r5790, %r5016; + shf.l.wrap.b32 %r8780, %r8786, %r8785, %r8787; // end inline asm - ld.const.u32 %r5025, [matrix+3312]; // begin inline asm - dp4a.u32.u32 %r5024, %r5025, %r5794, %r5020; + shf.l.wrap.b32 %r8784, %r8785, %r8786, %r8787; // end inline asm - ld.const.u32 %r5029, [matrix+3316]; + mov.u32 %r8795, 6; // begin inline asm - 
dp4a.u32.u32 %r5028, %r5029, %r5798, %r5024; + shf.l.wrap.b32 %r8788, %r8794, %r8793, %r8795; // end inline asm - ld.const.u32 %r5033, [matrix+3320]; // begin inline asm - dp4a.u32.u32 %r5032, %r5033, %r5802, %r5028; + shf.l.wrap.b32 %r8792, %r8793, %r8794, %r8795; // end inline asm - ld.const.u32 %r5037, [matrix+3324]; + mov.u32 %r8803, 3; // begin inline asm - dp4a.u32.u32 %r5036, %r5037, %r5806, %r5032; + shf.l.wrap.b32 %r8796, %r8802, %r8801, %r8803; // end inline asm - shr.u32 %r6073, %r4972, 6; - and.b32 %r6074, %r6073, 240; - shr.u32 %r6075, %r5036, 10; - or.b32 %r6076, %r6075, %r6074; - cvt.u64.u32 %rd230, %r6076; - xor.b64 %rd231, %rd201, %rd230; - and.b64 %rd232, %rd9, 255; - xor.b64 %rd233, %rd232, %rd229; - ld.const.u32 %r5041, [matrix+3328]; // begin inline asm - dp4a.u32.u32 %r5040, %r5041, %r5746, %r6244; + shf.l.wrap.b32 %r8800, %r8801, %r8802, %r8803; // end inline asm - ld.const.u32 %r5045, [matrix+3332]; // begin inline asm - dp4a.u32.u32 %r5044, %r5045, %r5750, %r5040; + shf.l.wrap.b32 %r8804, %r8810, %r8809, %r8493; // end inline asm - ld.const.u32 %r5049, [matrix+3336]; // begin inline asm - dp4a.u32.u32 %r5048, %r5049, %r5754, %r5044; + shf.l.wrap.b32 %r8808, %r8809, %r8810, %r8493; // end inline asm - ld.const.u32 %r5053, [matrix+3340]; // begin inline asm - dp4a.u32.u32 %r5052, %r5053, %r5758, %r5048; + // chi + lop3.b32 %r8812, %r8847, %r8620, %r8668, 0xD2; + lop3.b32 %r8813, %r8850, %r8624, %r8672, 0xD2; // end inline asm - ld.const.u32 %r5057, [matrix+3344]; // begin inline asm - dp4a.u32.u32 %r5056, %r5057, %r5762, %r5052; + // chi + lop3.b32 %r29722, %r8620, %r8668, %r8764, 0xD2; + lop3.b32 %r29723, %r8624, %r8672, %r8768, 0xD2; // end inline asm - ld.const.u32 %r5061, [matrix+3348]; // begin inline asm - dp4a.u32.u32 %r5060, %r5061, %r5766, %r5056; + // chi + lop3.b32 %r29718, %r8668, %r8764, %r8716, 0xD2; + lop3.b32 %r29719, %r8672, %r8768, %r8720, 0xD2; // end inline asm - ld.const.u32 %r5065, [matrix+3352]; // begin inline asm - dp4a.u32.u32 %r5064, %r5065, %r5770, %r5060; + // chi + lop3.b32 %r29714, %r8764, %r8716, %r8847, 0xD2; + lop3.b32 %r29715, %r8768, %r8720, %r8850, 0xD2; // end inline asm - ld.const.u32 %r5069, [matrix+3356]; // begin inline asm - dp4a.u32.u32 %r5068, %r5069, %r5774, %r5064; + // chi + lop3.b32 %r29712, %r8716, %r8847, %r8620, 0xD2; + lop3.b32 %r29713, %r8720, %r8850, %r8624, 0xD2; // end inline asm - ld.const.u32 %r5073, [matrix+3360]; // begin inline asm - dp4a.u32.u32 %r5072, %r5073, %r5778, %r5068; + // chi + lop3.b32 %r29708, %r8756, %r8628, %r8796, 0xD2; + lop3.b32 %r29709, %r8760, %r8632, %r8800, 0xD2; // end inline asm - ld.const.u32 %r5077, [matrix+3364]; // begin inline asm - dp4a.u32.u32 %r5076, %r5077, %r5782, %r5072; + // chi + lop3.b32 %r29720, %r8628, %r8796, %r8740, 0xD2; + lop3.b32 %r29721, %r8632, %r8800, %r8744, 0xD2; // end inline asm - ld.const.u32 %r5081, [matrix+3368]; // begin inline asm - dp4a.u32.u32 %r5080, %r5081, %r5786, %r5076; + // chi + lop3.b32 %r29716, %r8796, %r8740, %r8636, 0xD2; + lop3.b32 %r29717, %r8800, %r8744, %r8640, 0xD2; // end inline asm - ld.const.u32 %r5085, [matrix+3372]; // begin inline asm - dp4a.u32.u32 %r5084, %r5085, %r5790, %r5080; + // chi + lop3.b32 %r29688, %r8740, %r8636, %r8756, 0xD2; + lop3.b32 %r29689, %r8744, %r8640, %r8760, 0xD2; // end inline asm - ld.const.u32 %r5089, [matrix+3376]; + st.local.v2.u32 [%rd3+88], {%r29688, %r29689}; // begin inline asm - dp4a.u32.u32 %r5088, %r5089, %r5794, %r5084; + // chi + lop3.b32 %r29680, %r8636, %r8756, %r8628, 0xD2; + lop3.b32 
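+ // [annotation] Chi again (lop3 with immLut 0xD2), writing the new
+ // state both into the loop-carried %r297xx registers and, via st.local
+ // at pi-permuted offsets, back into the local state block at %rd3.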
%r29681, %r8640, %r8760, %r8632, 0xD2; // end inline asm - ld.const.u32 %r5093, [matrix+3380]; + st.local.v2.u32 [%rd3+96], {%r29680, %r29681}; // begin inline asm - dp4a.u32.u32 %r5092, %r5093, %r5798, %r5088; + // chi + lop3.b32 %r29706, %r8804, %r8788, %r8676, 0xD2; + lop3.b32 %r29707, %r8808, %r8792, %r8680, 0xD2; // end inline asm - ld.const.u32 %r5097, [matrix+3384]; + st.local.v2.u32 [%rd3+104], {%r29706, %r29707}; // begin inline asm - dp4a.u32.u32 %r5096, %r5097, %r5802, %r5092; + // chi + lop3.b32 %r29700, %r8788, %r8676, %r8684, 0xD2; + lop3.b32 %r29701, %r8792, %r8680, %r8688, 0xD2; // end inline asm - ld.const.u32 %r5101, [matrix+3388]; + st.local.v2.u32 [%rd3+112], {%r29700, %r29701}; // begin inline asm - dp4a.u32.u32 %r5100, %r5101, %r5806, %r5096; + // chi + lop3.b32 %r29694, %r8676, %r8684, %r8652, 0xD2; + lop3.b32 %r29695, %r8680, %r8688, %r8656, 0xD2; // end inline asm - ld.const.u32 %r5105, [matrix+3392]; + st.local.v2.u32 [%rd3+120], {%r29694, %r29695}; // begin inline asm - dp4a.u32.u32 %r5104, %r5105, %r5746, %r6244; + // chi + lop3.b32 %r29686, %r8684, %r8652, %r8804, 0xD2; + lop3.b32 %r29687, %r8688, %r8656, %r8808, 0xD2; // end inline asm - ld.const.u32 %r5109, [matrix+3396]; + st.local.v2.u32 [%rd3+128], {%r29686, %r29687}; // begin inline asm - dp4a.u32.u32 %r5108, %r5109, %r5750, %r5104; + // chi + lop3.b32 %r29678, %r8652, %r8804, %r8788, 0xD2; + lop3.b32 %r29679, %r8656, %r8808, %r8792, 0xD2; // end inline asm - ld.const.u32 %r5113, [matrix+3400]; + st.local.v2.u32 [%rd3+136], {%r29678, %r29679}; // begin inline asm - dp4a.u32.u32 %r5112, %r5113, %r5754, %r5108; + // chi + lop3.b32 %r29704, %r8708, %r8748, %r8780, 0xD2; + lop3.b32 %r29705, %r8712, %r8752, %r8784, 0xD2; // end inline asm - ld.const.u32 %r5117, [matrix+3404]; + st.local.v2.u32 [%rd3+144], {%r29704, %r29705}; // begin inline asm - dp4a.u32.u32 %r5116, %r5117, %r5758, %r5112; + // chi + lop3.b32 %r29698, %r8748, %r8780, %r8772, 0xD2; + lop3.b32 %r29699, %r8752, %r8784, %r8776, 0xD2; // end inline asm - ld.const.u32 %r5121, [matrix+3408]; + st.local.v2.u32 [%rd3+152], {%r29698, %r29699}; // begin inline asm - dp4a.u32.u32 %r5120, %r5121, %r5762, %r5116; + // chi + lop3.b32 %r29692, %r8780, %r8772, %r8692, 0xD2; + lop3.b32 %r29693, %r8784, %r8776, %r8696, 0xD2; // end inline asm - ld.const.u32 %r5125, [matrix+3412]; + st.local.v2.u32 [%rd3+160], {%r29692, %r29693}; // begin inline asm - dp4a.u32.u32 %r5124, %r5125, %r5766, %r5120; + // chi + lop3.b32 %r29684, %r8772, %r8692, %r8708, 0xD2; + lop3.b32 %r29685, %r8776, %r8696, %r8712, 0xD2; // end inline asm - ld.const.u32 %r5129, [matrix+3416]; + st.local.v2.u32 [%rd3+168], {%r29684, %r29685}; // begin inline asm - dp4a.u32.u32 %r5128, %r5129, %r5770, %r5124; + // chi + lop3.b32 %r29676, %r8692, %r8708, %r8748, 0xD2; + lop3.b32 %r29677, %r8696, %r8712, %r8752, 0xD2; // end inline asm - ld.const.u32 %r5133, [matrix+3420]; + st.local.v2.u32 [%rd3+176], {%r29676, %r29677}; // begin inline asm - dp4a.u32.u32 %r5132, %r5133, %r5774, %r5128; + // chi + lop3.b32 %r29702, %r8660, %r8732, %r8644, 0xD2; + lop3.b32 %r29703, %r8664, %r8736, %r8648, 0xD2; // end inline asm - ld.const.u32 %r5137, [matrix+3424]; + st.local.v2.u32 [%rd3+184], {%r29702, %r29703}; // begin inline asm - dp4a.u32.u32 %r5136, %r5137, %r5778, %r5132; + // chi + lop3.b32 %r29696, %r8732, %r8644, %r8700, 0xD2; + lop3.b32 %r29697, %r8736, %r8648, %r8704, 0xD2; // end inline asm - ld.const.u32 %r5141, [matrix+3428]; + st.local.v2.u32 [%rd3+192], {%r29696, %r29697}; // begin inline asm - 
dp4a.u32.u32 %r5140, %r5141, %r5782, %r5136; + // chi + lop3.b32 %r29690, %r8644, %r8700, %r8724, 0xD2; + lop3.b32 %r29691, %r8648, %r8704, %r8728, 0xD2; // end inline asm - ld.const.u32 %r5145, [matrix+3432]; + st.local.v2.u32 [%rd3+200], {%r29690, %r29691}; // begin inline asm - dp4a.u32.u32 %r5144, %r5145, %r5786, %r5140; + // chi + lop3.b32 %r29682, %r8700, %r8724, %r8660, 0xD2; + lop3.b32 %r29683, %r8704, %r8728, %r8664, 0xD2; // end inline asm - ld.const.u32 %r5149, [matrix+3436]; + st.local.v2.u32 [%rd3+208], {%r29682, %r29683}; // begin inline asm - dp4a.u32.u32 %r5148, %r5149, %r5790, %r5144; + // chi + lop3.b32 %r29674, %r8724, %r8660, %r8732, 0xD2; + lop3.b32 %r29675, %r8728, %r8664, %r8736, 0xD2; // end inline asm - ld.const.u32 %r5153, [matrix+3440]; + st.local.v2.u32 [%rd3+216], {%r29674, %r29675}; + mul.wide.s32 %rd565, %r29724, 8; + add.s64 %rd564, %rd497, %rd565; // begin inline asm - dp4a.u32.u32 %r5152, %r5153, %r5794, %r5148; + ld.global.nc.v2.u32 {%r9012,%r9013}, [%rd564]; // end inline asm - ld.const.u32 %r5157, [matrix+3444]; + xor.b32 %r29710, %r8812, %r9012; + xor.b32 %r29711, %r8813, %r9013; + add.s32 %r29724, %r29724, 1; + setp.lt.u32 %p21, %r29724, 23; + @%p21 bra $L__BB2_29; + + st.local.v2.u32 [%rd3+32], {%r29722, %r29723}; + st.local.v2.u32 [%rd3+72], {%r29720, %r29721}; + st.local.v2.u32 [%rd3+40], {%r29718, %r29719}; + st.local.v2.u32 [%rd3+80], {%r29716, %r29717}; + st.local.v2.u32 [%rd3+48], {%r29714, %r29715}; + st.local.v2.u32 [%rd3+56], {%r29712, %r29713}; + st.local.v2.u32 [%rd3+24], {%r29710, %r29711}; // begin inline asm - dp4a.u32.u32 %r5156, %r5157, %r5798, %r5152; + // xor5 + lop3.b32 %r9024, %r29710, %r29708, %r29706, 0x96; + lop3.b32 %r9024, %r9024, %r29704, %r29702, 0x96; + lop3.b32 %r9025, %r29711, %r29709, %r29707, 0x96; + lop3.b32 %r9025, %r9025, %r29705, %r29703, 0x96; // end inline asm - ld.const.u32 %r5161, [matrix+3448]; // begin inline asm - dp4a.u32.u32 %r5160, %r5161, %r5802, %r5156; + // xor5 + lop3.b32 %r9036, %r29722, %r29720, %r29700, 0x96; + lop3.b32 %r9036, %r9036, %r29698, %r29696, 0x96; + lop3.b32 %r9037, %r29723, %r29721, %r29701, 0x96; + lop3.b32 %r9037, %r9037, %r29699, %r29697, 0x96; // end inline asm - ld.const.u32 %r5165, [matrix+3452]; // begin inline asm - dp4a.u32.u32 %r5164, %r5165, %r5806, %r5160; + // xor5 + lop3.b32 %r9048, %r29718, %r29716, %r29694, 0x96; + lop3.b32 %r9048, %r9048, %r29692, %r29690, 0x96; + lop3.b32 %r9049, %r29719, %r29717, %r29695, 0x96; + lop3.b32 %r9049, %r9049, %r29693, %r29691, 0x96; // end inline asm - shr.u32 %r6077, %r5100, 6; - and.b32 %r6078, %r6077, 240; - shr.u32 %r6079, %r5164, 10; - or.b32 %r6080, %r6079, %r6078; - cvt.u64.u32 %rd234, %r6080; - xor.b64 %rd235, %rd202, %rd234; - ld.const.u32 %r5169, [matrix+3456]; // begin inline asm - dp4a.u32.u32 %r5168, %r5169, %r5746, %r6244; + // xor5 + lop3.b32 %r9060, %r29714, %r29688, %r29686, 0x96; + lop3.b32 %r9060, %r9060, %r29684, %r29682, 0x96; + lop3.b32 %r9061, %r29715, %r29689, %r29687, 0x96; + lop3.b32 %r9061, %r9061, %r29685, %r29683, 0x96; // end inline asm - ld.const.u32 %r5173, [matrix+3460]; // begin inline asm - dp4a.u32.u32 %r5172, %r5173, %r5750, %r5168; + // xor5 + lop3.b32 %r9072, %r29712, %r29680, %r29678, 0x96; + lop3.b32 %r9072, %r9072, %r29676, %r29674, 0x96; + lop3.b32 %r9073, %r29713, %r29681, %r29679, 0x96; + lop3.b32 %r9073, %r9073, %r29677, %r29675, 0x96; // end inline asm - ld.const.u32 %r5177, [matrix+3464]; + mov.u32 %r9276, 1; // begin inline asm - dp4a.u32.u32 %r5176, %r5177, %r5754, %r5172; + shf.l.wrap.b32 
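+ // [annotation] $L__BB2_29 above iterates rounds 0-22 for the first
+ // state; below, the 24th round is unrolled, with its round constant
+ // loaded from [%rd498] rather than indexed by the round counter.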
%r9084, %r9037, %r9036, %r9276; // end inline asm - ld.const.u32 %r5181, [matrix+3468]; // begin inline asm - dp4a.u32.u32 %r5180, %r5181, %r5758, %r5176; + shf.l.wrap.b32 %r9088, %r9036, %r9037, %r9276; // end inline asm - ld.const.u32 %r5185, [matrix+3472]; + xor.b32 %r9303, %r9084, %r9072; + xor.b32 %r9304, %r9088, %r9073; + xor.b32 %r9231, %r29710, %r9303; + xor.b32 %r9234, %r29711, %r9304; + xor.b32 %r9194, %r29707, %r9304; + xor.b32 %r9193, %r29706, %r9303; + st.local.v2.u32 [%rd3+104], {%r9193, %r9194}; // begin inline asm - dp4a.u32.u32 %r5184, %r5185, %r5762, %r5180; + shf.l.wrap.b32 %r9092, %r9049, %r9048, %r9276; // end inline asm - ld.const.u32 %r5189, [matrix+3476]; // begin inline asm - dp4a.u32.u32 %r5188, %r5189, %r5766, %r5184; + shf.l.wrap.b32 %r9096, %r9048, %r9049, %r9276; // end inline asm - ld.const.u32 %r5193, [matrix+3480]; + xor.b32 %r9305, %r9092, %r9024; + xor.b32 %r9306, %r9096, %r9025; + xor.b32 %r9130, %r29720, %r9305; + xor.b32 %r9129, %r29721, %r9306; + xor.b32 %r9169, %r29699, %r9306; + xor.b32 %r9170, %r29698, %r9305; + st.local.v2.u32 [%rd3+152], {%r9170, %r9169}; // begin inline asm - dp4a.u32.u32 %r5192, %r5193, %r5770, %r5188; + shf.l.wrap.b32 %r9100, %r9061, %r9060, %r9276; // end inline asm - ld.const.u32 %r5197, [matrix+3484]; // begin inline asm - dp4a.u32.u32 %r5196, %r5197, %r5774, %r5192; + shf.l.wrap.b32 %r9104, %r9060, %r9061, %r9276; // end inline asm - ld.const.u32 %r5201, [matrix+3488]; + xor.b32 %r9307, %r9100, %r9036; + xor.b32 %r9308, %r9104, %r9037; + xor.b32 %r9153, %r29695, %r9308; + xor.b32 %r9154, %r29694, %r9307; + st.local.v2.u32 [%rd3+120], {%r9154, %r9153}; + xor.b32 %r9145, %r29691, %r9308; + xor.b32 %r9146, %r29690, %r9307; + st.local.v2.u32 [%rd3+200], {%r9146, %r9145}; // begin inline asm - dp4a.u32.u32 %r5200, %r5201, %r5778, %r5196; + shf.l.wrap.b32 %r9108, %r9073, %r9072, %r9276; // end inline asm - ld.const.u32 %r5205, [matrix+3492]; // begin inline asm - dp4a.u32.u32 %r5204, %r5205, %r5782, %r5200; + shf.l.wrap.b32 %r9112, %r9072, %r9073, %r9276; // end inline asm - ld.const.u32 %r5209, [matrix+3496]; + xor.b32 %r9309, %r9108, %r9048; + xor.b32 %r9310, %r9112, %r9049; + xor.b32 %r9177, %r29714, %r9309; + xor.b32 %r9178, %r29715, %r9310; + xor.b32 %r9186, %r29685, %r9310; + xor.b32 %r9185, %r29684, %r9309; + st.local.v2.u32 [%rd3+168], {%r9185, %r9186}; // begin inline asm - dp4a.u32.u32 %r5208, %r5209, %r5786, %r5204; + shf.l.wrap.b32 %r9116, %r9025, %r9024, %r9276; // end inline asm - ld.const.u32 %r5213, [matrix+3500]; // begin inline asm - dp4a.u32.u32 %r5212, %r5213, %r5790, %r5208; + shf.l.wrap.b32 %r9120, %r9024, %r9025, %r9276; // end inline asm - ld.const.u32 %r5217, [matrix+3504]; + xor.b32 %r9311, %r9116, %r9060; + xor.b32 %r9312, %r9120, %r9061; + xor.b32 %r9137, %r29680, %r9311; + xor.b32 %r9138, %r29681, %r9312; + xor.b32 %r9162, %r29675, %r9312; + xor.b32 %r9161, %r29674, %r9311; + st.local.v2.u32 [%rd3+216], {%r9161, %r9162}; // begin inline asm - dp4a.u32.u32 %r5216, %r5217, %r5794, %r5212; + shf.l.wrap.b32 %r9124, %r9130, %r9129, %r8627; // end inline asm - ld.const.u32 %r5221, [matrix+3508]; // begin inline asm - dp4a.u32.u32 %r5220, %r5221, %r5798, %r5216; + shf.l.wrap.b32 %r9128, %r9129, %r9130, %r8627; // end inline asm - ld.const.u32 %r5225, [matrix+3512]; // begin inline asm - dp4a.u32.u32 %r5224, %r5225, %r5802, %r5220; + shf.l.wrap.b32 %r9132, %r9138, %r9137, %r8635; // end inline asm - ld.const.u32 %r5229, [matrix+3516]; // begin inline asm - dp4a.u32.u32 %r5228, %r5229, %r5806, %r5224; + 
shf.l.wrap.b32 %r9136, %r9137, %r9138, %r8635; // end inline asm - ld.const.u32 %r5233, [matrix+3520]; // begin inline asm - dp4a.u32.u32 %r5232, %r5233, %r5746, %r6244; + shf.l.wrap.b32 %r9144, %r9145, %r9146, %r8643; // end inline asm - ld.const.u32 %r5237, [matrix+3524]; // begin inline asm - dp4a.u32.u32 %r5236, %r5237, %r5750, %r5232; + shf.l.wrap.b32 %r9140, %r9146, %r9145, %r8643; // end inline asm - ld.const.u32 %r5241, [matrix+3528]; + st.local.v2.u32 [%rd3+96], {%r9140, %r9144}; // begin inline asm - dp4a.u32.u32 %r5240, %r5241, %r5754, %r5236; + shf.l.wrap.b32 %r9148, %r9154, %r9153, %r8675; // end inline asm - ld.const.u32 %r5245, [matrix+3532]; // begin inline asm - dp4a.u32.u32 %r5244, %r5245, %r5758, %r5240; + shf.l.wrap.b32 %r9152, %r9153, %r9154, %r8675; // end inline asm - ld.const.u32 %r5249, [matrix+3536]; // begin inline asm - dp4a.u32.u32 %r5248, %r5249, %r5762, %r5244; + shf.l.wrap.b32 %r9156, %r9162, %r9161, %r8723; // end inline asm - ld.const.u32 %r5253, [matrix+3540]; // begin inline asm - dp4a.u32.u32 %r5252, %r5253, %r5766, %r5248; + shf.l.wrap.b32 %r9160, %r9161, %r9162, %r8723; // end inline asm - ld.const.u32 %r5257, [matrix+3544]; // begin inline asm - dp4a.u32.u32 %r5256, %r5257, %r5770, %r5252; + shf.l.wrap.b32 %r9168, %r9169, %r9170, %r8747; // end inline asm - ld.const.u32 %r5261, [matrix+3548]; // begin inline asm - dp4a.u32.u32 %r5260, %r5261, %r5774, %r5256; + shf.l.wrap.b32 %r9164, %r9170, %r9169, %r8747; // end inline asm - ld.const.u32 %r5265, [matrix+3552]; + st.local.v2.u32 [%rd3+88], {%r9164, %r9168}; // begin inline asm - dp4a.u32.u32 %r5264, %r5265, %r5778, %r5260; + shf.l.wrap.b32 %r9172, %r9178, %r9177, %r8763; // end inline asm - ld.const.u32 %r5269, [matrix+3556]; // begin inline asm - dp4a.u32.u32 %r5268, %r5269, %r5782, %r5264; + shf.l.wrap.b32 %r9176, %r9177, %r9178, %r8763; // end inline asm - ld.const.u32 %r5273, [matrix+3560]; // begin inline asm - dp4a.u32.u32 %r5272, %r5273, %r5786, %r5268; + shf.l.wrap.b32 %r9180, %r9186, %r9185, %r8771; // end inline asm - ld.const.u32 %r5277, [matrix+3564]; // begin inline asm - dp4a.u32.u32 %r5276, %r5277, %r5790, %r5272; + shf.l.wrap.b32 %r9184, %r9185, %r9186, %r8771; // end inline asm - ld.const.u32 %r5281, [matrix+3568]; // begin inline asm - dp4a.u32.u32 %r5280, %r5281, %r5794, %r5276; + shf.l.wrap.b32 %r9188, %r9194, %r9193, %r8803; // end inline asm - ld.const.u32 %r5285, [matrix+3572]; // begin inline asm - dp4a.u32.u32 %r5284, %r5285, %r5798, %r5280; + shf.l.wrap.b32 %r9192, %r9193, %r9194, %r8803; // end inline asm - ld.const.u32 %r5289, [matrix+3576]; // begin inline asm - dp4a.u32.u32 %r5288, %r5289, %r5802, %r5284; + // chi + lop3.b32 %r9196, %r9231, %r9124, %r9148, 0xD2; + lop3.b32 %r9197, %r9234, %r9128, %r9152, 0xD2; // end inline asm - ld.const.u32 %r5293, [matrix+3580]; // begin inline asm - dp4a.u32.u32 %r5292, %r5293, %r5806, %r5288; + // chi + lop3.b32 %r9204, %r9124, %r9148, %r9180, 0xD2; + lop3.b32 %r9205, %r9128, %r9152, %r9184, 0xD2; // end inline asm - shr.u32 %r6081, %r5228, 6; - and.b32 %r6082, %r6081, 240; - shr.u32 %r6083, %r5292, 10; - or.b32 %r6084, %r6083, %r6082; - cvt.u64.u32 %rd236, %r6084; - xor.b64 %rd237, %rd203, %rd236; - ld.const.u32 %r5297, [matrix+3584]; + st.local.v2.u32 [%rd3+32], {%r9204, %r9205}; // begin inline asm - dp4a.u32.u32 %r5296, %r5297, %r5746, %r6244; + // chi + lop3.b32 %r9212, %r9148, %r9180, %r9156, 0xD2; + lop3.b32 %r9213, %r9152, %r9184, %r9160, 0xD2; // end inline asm - ld.const.u32 %r5301, [matrix+3588]; + st.local.v2.u32 
[%rd3+40], {%r9212, %r9213}; // begin inline asm - dp4a.u32.u32 %r5300, %r5301, %r5750, %r5296; + // chi + lop3.b32 %r9220, %r9180, %r9156, %r9231, 0xD2; + lop3.b32 %r9221, %r9184, %r9160, %r9234, 0xD2; // end inline asm - ld.const.u32 %r5305, [matrix+3592]; + st.local.v2.u32 [%rd3+48], {%r9220, %r9221}; // begin inline asm - dp4a.u32.u32 %r5304, %r5305, %r5754, %r5300; + // chi + lop3.b32 %r9228, %r9156, %r9231, %r9124, 0xD2; + lop3.b32 %r9229, %r9160, %r9234, %r9128, 0xD2; // end inline asm - ld.const.u32 %r5309, [matrix+3596]; + st.local.v2.u32 [%rd3+56], {%r9228, %r9229}; // begin inline asm - dp4a.u32.u32 %r5308, %r5309, %r5758, %r5304; + // chi + lop3.b32 %r9236, %r9172, %r9132, %r9188, 0xD2; + lop3.b32 %r9237, %r9176, %r9136, %r9192, 0xD2; // end inline asm - ld.const.u32 %r5313, [matrix+3600]; + st.local.v2.u32 [%rd3+64], {%r9236, %r9237}; // begin inline asm - dp4a.u32.u32 %r5312, %r5313, %r5762, %r5308; + // chi + lop3.b32 %r9244, %r9132, %r9188, %r9164, 0xD2; + lop3.b32 %r9245, %r9136, %r9192, %r9168, 0xD2; // end inline asm - ld.const.u32 %r5317, [matrix+3604]; + st.local.v2.u32 [%rd3+72], {%r9244, %r9245}; // begin inline asm - dp4a.u32.u32 %r5316, %r5317, %r5766, %r5312; + // chi + lop3.b32 %r9252, %r9188, %r9164, %r9140, 0xD2; + lop3.b32 %r9253, %r9192, %r9168, %r9144, 0xD2; // end inline asm - ld.const.u32 %r5321, [matrix+3608]; + st.local.v2.u32 [%rd3+80], {%r9252, %r9253}; // begin inline asm - dp4a.u32.u32 %r5320, %r5321, %r5770, %r5316; + ld.global.nc.v2.u32 {%r9260,%r9261}, [%rd498]; // end inline asm - ld.const.u32 %r5325, [matrix+3612]; + xor.b32 %r9313, %r9197, %r9261; + xor.b32 %r9314, %r9196, %r9260; + mov.b64 %rd1261, {%r9314, %r9313}; + mov.b64 %rd1262, {%r9204, %r9205}; + mov.b64 %rd1263, {%r9212, %r9213}; + mov.b64 %rd62, {%r9220, %r9221}; + mov.b64 %rd1264, {%r9228, %r9229}; + mov.b64 %rd64, {%r9236, %r9237}; + mov.b64 %rd65, {%r9244, %r9245}; + mov.b64 %rd66, {%r9252, %r9253}; + mov.u32 %r29725, 0; + st.local.v2.u32 [%rd3+24], {%r9314, %r9313}; + st.local.v2.u32 [%rd55+96], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+104], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+112], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+120], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+128], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+136], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+144], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+152], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+160], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+168], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+176], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+184], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+192], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+200], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+208], {%r29725, %r29725}; + st.local.v2.u32 [%rd55+216], {%r29725, %r29725}; + mov.u32 %r29740, -2147483648; + st.local.v2.u32 [%rd55+88], {%r9276, %r29740}; + mov.u32 %r29726, %r29725; + mov.u32 %r29727, %r29725; + mov.u32 %r29728, %r29725; + mov.u32 %r29729, %r29725; + mov.u32 %r29730, %r29725; + mov.u32 %r29731, %r29725; + mov.u32 %r29732, %r29725; + mov.u32 %r29733, %r29725; + mov.u32 %r29734, %r29725; + mov.u32 %r29735, %r29725; + mov.u32 %r29736, %r29725; + mov.u32 %r29737, %r29725; + mov.u32 %r29738, %r29725; + mov.u32 %r29739, %r9276; + mov.u32 %r29741, %r29725; + mov.u32 %r29742, %r29725; + mov.u32 %r29743, %r29725; + mov.u32 %r29744, %r29725; + mov.u32 %r29745, %r29725; + mov.u32 %r29746, %r29725; + mov.u32 %r29747, %r29725; + mov.u32 %r29748, %r29725; + mov.u32 %r29749, %r29725; + 
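+ // [annotation] The first state's final lanes are packed into the
+ // %rd126x pairs; the second state (the %rd55 block) then gets the same
+ // zero / pad10*1 initialization, and $L__BB2_31 repeats the 24-round
+ // permutation for it.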
mov.u32 %r29750, %r29725; + mov.u32 %r29751, %r29725; + mov.u32 %r29752, %r29725; + mov.u32 %r29753, %r29725; + mov.u32 %r29754, %r29725; + mov.u32 %r29755, %r29725; + mov.u32 %r29756, %r29725; + mov.u32 %r29757, %r29725; + mov.u32 %r29758, %r29725; + mov.u32 %r29775, %r29725; + +$L__BB2_31: // begin inline asm - dp4a.u32.u32 %r5324, %r5325, %r5774, %r5320; + // xor5 + lop3.b32 %r9315, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9315, %r9315, %r29755, %r29753, 0x96; + lop3.b32 %r9316, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9316, %r9316, %r29756, %r29754, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9327, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9327, %r9327, %r29749, %r29747, 0x96; + lop3.b32 %r9328, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9328, %r9328, %r29750, %r29748, 0x96; // end inline asm - ld.const.u32 %r5329, [matrix+3616]; // begin inline asm - dp4a.u32.u32 %r5328, %r5329, %r5778, %r5324; + // xor5 + lop3.b32 %r9339, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9339, %r9339, %r29743, %r29741, 0x96; + lop3.b32 %r9340, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9340, %r9340, %r29744, %r29742, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r9351, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9351, %r9351, %r29735, %r29733, 0x96; + lop3.b32 %r9352, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9352, %r9352, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5333, [matrix+3620]; // begin inline asm - dp4a.u32.u32 %r5332, %r5333, %r5782, %r5328; + // xor5 + lop3.b32 %r9363, %r29763, %r29731, %r29729, 0x96; + lop3.b32 %r9363, %r9363, %r29727, %r29725, 0x96; + lop3.b32 %r9364, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9364, %r9364, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5337, [matrix+3624]; // begin inline asm - dp4a.u32.u32 %r5336, %r5337, %r5786, %r5332; + shf.l.wrap.b32 %r9375, %r9328, %r9327, %r9276; // end inline asm - ld.const.u32 %r5341, [matrix+3628]; // begin inline asm - dp4a.u32.u32 %r5340, %r5341, %r5790, %r5336; + shf.l.wrap.b32 %r9379, %r9327, %r9328, %r9276; // end inline asm - ld.const.u32 %r5345, [matrix+3632]; + xor.b32 %r9809, %r9375, %r9363; + xor.b32 %r9810, %r9379, %r9364; + xor.b32 %r9642, %r29761, %r9809; + xor.b32 %r9645, %r29762, %r9810; + xor.b32 %r9549, %r29759, %r9809; + xor.b32 %r9548, %r29760, %r9810; + xor.b32 %r9596, %r29757, %r9809; + xor.b32 %r9597, %r29758, %r9810; + xor.b32 %r9501, %r29755, %r9809; + xor.b32 %r9500, %r29756, %r9810; + xor.b32 %r9452, %r29753, %r9809; + xor.b32 %r9453, %r29754, %r9810; // begin inline asm - dp4a.u32.u32 %r5344, %r5345, %r5794, %r5340; + shf.l.wrap.b32 %r9383, %r9340, %r9339, %r9276; // end inline asm - ld.const.u32 %r5349, [matrix+3636]; // begin inline asm - dp4a.u32.u32 %r5348, %r5349, %r5798, %r5344; + shf.l.wrap.b32 %r9387, %r9339, %r9340, %r9276; // end inline asm - ld.const.u32 %r5353, [matrix+3640]; + xor.b32 %r9811, %r9383, %r9315; + xor.b32 %r9812, %r9387, %r9316; + xor.b32 %r9604, %r29773, %r9811; + xor.b32 %r9605, %r29774, %r9812; + xor.b32 %r9421, %r29771, %r9811; + xor.b32 %r9420, %r29772, %r9812; + xor.b32 %r9580, %r29751, %r9811; + xor.b32 %r9581, %r29752, %r9812; + xor.b32 %r9541, %r29749, %r9811; + xor.b32 %r9540, %r29750, %r9812; + xor.b32 %r9524, %r29747, %r9811; + xor.b32 %r9525, %r29748, %r9812; // begin inline asm - dp4a.u32.u32 %r5352, %r5353, %r5802, %r5348; + shf.l.wrap.b32 %r9391, %r9352, %r9351, %r9276; // end inline asm - ld.const.u32 %r5357, [matrix+3644]; // begin inline asm - 
dp4a.u32.u32 %r5356, %r5357, %r5806, %r5352; + shf.l.wrap.b32 %r9395, %r9351, %r9352, %r9276; // end inline asm - ld.const.u32 %r5361, [matrix+3648]; + xor.b32 %r9813, %r9391, %r9327; + xor.b32 %r9814, %r9395, %r9328; + xor.b32 %r9461, %r29769, %r9813; + xor.b32 %r9460, %r29770, %r9814; + xor.b32 %r9588, %r29767, %r9813; + xor.b32 %r9589, %r29768, %r9814; + xor.b32 %r9469, %r29745, %r9813; + xor.b32 %r9468, %r29746, %r9814; + xor.b32 %r9572, %r29743, %r9813; + xor.b32 %r9573, %r29744, %r9814; + xor.b32 %r9437, %r29741, %r9813; + xor.b32 %r9436, %r29742, %r9814; // begin inline asm - dp4a.u32.u32 %r5360, %r5361, %r5746, %r6244; + shf.l.wrap.b32 %r9399, %r9364, %r9363, %r9276; // end inline asm - ld.const.u32 %r5365, [matrix+3652]; // begin inline asm - dp4a.u32.u32 %r5364, %r5365, %r5750, %r5360; + shf.l.wrap.b32 %r9403, %r9363, %r9364, %r9276; // end inline asm - ld.const.u32 %r5369, [matrix+3656]; + xor.b32 %r9815, %r9399, %r9339; + xor.b32 %r9816, %r9403, %r9340; + xor.b32 %r9556, %r29765, %r9815; + xor.b32 %r9557, %r29766, %r9816; + xor.b32 %r9533, %r29739, %r9815; + xor.b32 %r9532, %r29740, %r9816; + xor.b32 %r9476, %r29737, %r9815; + xor.b32 %r9477, %r29738, %r9816; + xor.b32 %r9564, %r29735, %r9815; + xor.b32 %r9565, %r29736, %r9816; + xor.b32 %r9493, %r29733, %r9815; + xor.b32 %r9492, %r29734, %r9816; // begin inline asm - dp4a.u32.u32 %r5368, %r5369, %r5754, %r5364; + shf.l.wrap.b32 %r9407, %r9316, %r9315, %r9276; // end inline asm - ld.const.u32 %r5373, [matrix+3660]; // begin inline asm - dp4a.u32.u32 %r5372, %r5373, %r5758, %r5368; + shf.l.wrap.b32 %r9411, %r9315, %r9316, %r9276; // end inline asm - ld.const.u32 %r5377, [matrix+3664]; + xor.b32 %r9817, %r9407, %r9351; + xor.b32 %r9818, %r9411, %r9352; + xor.b32 %r9508, %r29763, %r9817; + xor.b32 %r9509, %r29764, %r9818; + xor.b32 %r9428, %r29731, %r9817; + xor.b32 %r9429, %r29732, %r9818; + xor.b32 %r9445, %r29729, %r9817; + xor.b32 %r9444, %r29730, %r9818; + xor.b32 %r9484, %r29727, %r9817; + xor.b32 %r9485, %r29728, %r9818; + xor.b32 %r9516, %r29725, %r9817; + xor.b32 %r9517, %r29726, %r9818; + mov.u32 %r9422, 44; // begin inline asm - dp4a.u32.u32 %r5376, %r5377, %r5762, %r5372; + shf.l.wrap.b32 %r9415, %r9421, %r9420, %r9422; // end inline asm - ld.const.u32 %r5381, [matrix+3668]; // begin inline asm - dp4a.u32.u32 %r5380, %r5381, %r5766, %r5376; + shf.l.wrap.b32 %r9419, %r9420, %r9421, %r9422; // end inline asm - ld.const.u32 %r5385, [matrix+3672]; + mov.u32 %r9430, 20; // begin inline asm - dp4a.u32.u32 %r5384, %r5385, %r5770, %r5380; + shf.l.wrap.b32 %r9423, %r9429, %r9428, %r9430; // end inline asm - ld.const.u32 %r5389, [matrix+3676]; // begin inline asm - dp4a.u32.u32 %r5388, %r5389, %r5774, %r5384; + shf.l.wrap.b32 %r9427, %r9428, %r9429, %r9430; // end inline asm - ld.const.u32 %r5393, [matrix+3680]; + mov.u32 %r9438, 61; // begin inline asm - dp4a.u32.u32 %r5392, %r5393, %r5778, %r5388; + shf.l.wrap.b32 %r9431, %r9437, %r9436, %r9438; // end inline asm - ld.const.u32 %r5397, [matrix+3684]; // begin inline asm - dp4a.u32.u32 %r5396, %r5397, %r5782, %r5392; + shf.l.wrap.b32 %r9435, %r9436, %r9437, %r9438; // end inline asm - ld.const.u32 %r5401, [matrix+3688]; + mov.u32 %r9446, 39; // begin inline asm - dp4a.u32.u32 %r5400, %r5401, %r5786, %r5396; + shf.l.wrap.b32 %r9439, %r9445, %r9444, %r9446; // end inline asm - ld.const.u32 %r5405, [matrix+3692]; // begin inline asm - dp4a.u32.u32 %r5404, %r5405, %r5790, %r5400; + shf.l.wrap.b32 %r9443, %r9444, %r9445, %r9446; // end inline asm - ld.const.u32 %r5409, 
[matrix+3696]; + mov.u32 %r9454, 18; // begin inline asm - dp4a.u32.u32 %r5408, %r5409, %r5794, %r5404; + shf.l.wrap.b32 %r9447, %r9453, %r9452, %r9454; // end inline asm - ld.const.u32 %r5413, [matrix+3700]; // begin inline asm - dp4a.u32.u32 %r5412, %r5413, %r5798, %r5408; + shf.l.wrap.b32 %r9451, %r9452, %r9453, %r9454; // end inline asm - ld.const.u32 %r5417, [matrix+3704]; + mov.u32 %r9462, 62; // begin inline asm - dp4a.u32.u32 %r5416, %r5417, %r5802, %r5412; + shf.l.wrap.b32 %r9455, %r9461, %r9460, %r9462; // end inline asm - ld.const.u32 %r5421, [matrix+3708]; // begin inline asm - dp4a.u32.u32 %r5420, %r5421, %r5806, %r5416; + shf.l.wrap.b32 %r9459, %r9460, %r9461, %r9462; // end inline asm - shr.u32 %r6085, %r5356, 6; - and.b32 %r6086, %r6085, 240; - shr.u32 %r6087, %r5420, 10; - or.b32 %r6088, %r6087, %r6086; - cvt.u64.u32 %rd238, %r6088; - xor.b64 %rd239, %rd204, %rd238; - ld.const.u32 %r5425, [matrix+3712]; + mov.u32 %r9470, 43; // begin inline asm - dp4a.u32.u32 %r5424, %r5425, %r5746, %r6244; + shf.l.wrap.b32 %r9463, %r9469, %r9468, %r9470; // end inline asm - ld.const.u32 %r5429, [matrix+3716]; // begin inline asm - dp4a.u32.u32 %r5428, %r5429, %r5750, %r5424; + shf.l.wrap.b32 %r9467, %r9468, %r9469, %r9470; // end inline asm - ld.const.u32 %r5433, [matrix+3720]; + mov.u32 %r9478, 25; // begin inline asm - dp4a.u32.u32 %r5432, %r5433, %r5754, %r5428; + shf.l.wrap.b32 %r9471, %r9477, %r9476, %r9478; // end inline asm - ld.const.u32 %r5437, [matrix+3724]; // begin inline asm - dp4a.u32.u32 %r5436, %r5437, %r5758, %r5432; + shf.l.wrap.b32 %r9475, %r9476, %r9477, %r9478; // end inline asm - ld.const.u32 %r5441, [matrix+3728]; + mov.u32 %r9486, 8; // begin inline asm - dp4a.u32.u32 %r5440, %r5441, %r5762, %r5436; + shf.l.wrap.b32 %r9479, %r9485, %r9484, %r9486; // end inline asm - ld.const.u32 %r5445, [matrix+3732]; // begin inline asm - dp4a.u32.u32 %r5444, %r5445, %r5766, %r5440; + shf.l.wrap.b32 %r9483, %r9484, %r9485, %r9486; // end inline asm - ld.const.u32 %r5449, [matrix+3736]; + mov.u32 %r9494, 56; // begin inline asm - dp4a.u32.u32 %r5448, %r5449, %r5770, %r5444; + shf.l.wrap.b32 %r9487, %r9493, %r9492, %r9494; // end inline asm - ld.const.u32 %r5453, [matrix+3740]; // begin inline asm - dp4a.u32.u32 %r5452, %r5453, %r5774, %r5448; + shf.l.wrap.b32 %r9491, %r9492, %r9493, %r9494; // end inline asm - ld.const.u32 %r5457, [matrix+3744]; + mov.u32 %r9502, 41; // begin inline asm - dp4a.u32.u32 %r5456, %r5457, %r5778, %r5452; + shf.l.wrap.b32 %r9495, %r9501, %r9500, %r9502; // end inline asm - ld.const.u32 %r5461, [matrix+3748]; // begin inline asm - dp4a.u32.u32 %r5460, %r5461, %r5782, %r5456; + shf.l.wrap.b32 %r9499, %r9500, %r9501, %r9502; // end inline asm - ld.const.u32 %r5465, [matrix+3752]; + mov.u32 %r9510, 27; // begin inline asm - dp4a.u32.u32 %r5464, %r5465, %r5786, %r5460; + shf.l.wrap.b32 %r9503, %r9509, %r9508, %r9510; // end inline asm - ld.const.u32 %r5469, [matrix+3756]; // begin inline asm - dp4a.u32.u32 %r5468, %r5469, %r5790, %r5464; + shf.l.wrap.b32 %r9507, %r9508, %r9509, %r9510; // end inline asm - ld.const.u32 %r5473, [matrix+3760]; + mov.u32 %r9518, 14; // begin inline asm - dp4a.u32.u32 %r5472, %r5473, %r5794, %r5468; + shf.l.wrap.b32 %r9511, %r9517, %r9516, %r9518; // end inline asm - ld.const.u32 %r5477, [matrix+3764]; // begin inline asm - dp4a.u32.u32 %r5476, %r5477, %r5798, %r5472; + shf.l.wrap.b32 %r9515, %r9516, %r9517, %r9518; // end inline asm - ld.const.u32 %r5481, [matrix+3768]; + mov.u32 %r9526, 2; // begin inline asm - dp4a.u32.u32 
%r5480, %r5481, %r5802, %r5476; + shf.l.wrap.b32 %r9519, %r9525, %r9524, %r9526; // end inline asm - ld.const.u32 %r5485, [matrix+3772]; // begin inline asm - dp4a.u32.u32 %r5484, %r5485, %r5806, %r5480; + shf.l.wrap.b32 %r9523, %r9524, %r9525, %r9526; // end inline asm - ld.const.u32 %r5489, [matrix+3776]; + mov.u32 %r9534, 55; // begin inline asm - dp4a.u32.u32 %r5488, %r5489, %r5746, %r6244; + shf.l.wrap.b32 %r9527, %r9533, %r9532, %r9534; // end inline asm - ld.const.u32 %r5493, [matrix+3780]; // begin inline asm - dp4a.u32.u32 %r5492, %r5493, %r5750, %r5488; + shf.l.wrap.b32 %r9531, %r9532, %r9533, %r9534; // end inline asm - ld.const.u32 %r5497, [matrix+3784]; + mov.u32 %r9542, 45; // begin inline asm - dp4a.u32.u32 %r5496, %r5497, %r5754, %r5492; + shf.l.wrap.b32 %r9535, %r9541, %r9540, %r9542; // end inline asm - ld.const.u32 %r5501, [matrix+3788]; // begin inline asm - dp4a.u32.u32 %r5500, %r5501, %r5758, %r5496; + shf.l.wrap.b32 %r9539, %r9540, %r9541, %r9542; // end inline asm - ld.const.u32 %r5505, [matrix+3792]; + mov.u32 %r9550, 36; // begin inline asm - dp4a.u32.u32 %r5504, %r5505, %r5762, %r5500; + shf.l.wrap.b32 %r9543, %r9549, %r9548, %r9550; // end inline asm - ld.const.u32 %r5509, [matrix+3796]; // begin inline asm - dp4a.u32.u32 %r5508, %r5509, %r5766, %r5504; + shf.l.wrap.b32 %r9547, %r9548, %r9549, %r9550; // end inline asm - ld.const.u32 %r5513, [matrix+3800]; + mov.u32 %r9558, 28; // begin inline asm - dp4a.u32.u32 %r5512, %r5513, %r5770, %r5508; + shf.l.wrap.b32 %r9551, %r9557, %r9556, %r9558; // end inline asm - ld.const.u32 %r5517, [matrix+3804]; // begin inline asm - dp4a.u32.u32 %r5516, %r5517, %r5774, %r5512; + shf.l.wrap.b32 %r9555, %r9556, %r9557, %r9558; // end inline asm - ld.const.u32 %r5521, [matrix+3808]; + mov.u32 %r9566, 21; // begin inline asm - dp4a.u32.u32 %r5520, %r5521, %r5778, %r5516; + shf.l.wrap.b32 %r9559, %r9565, %r9564, %r9566; // end inline asm - ld.const.u32 %r5525, [matrix+3812]; // begin inline asm - dp4a.u32.u32 %r5524, %r5525, %r5782, %r5520; + shf.l.wrap.b32 %r9563, %r9564, %r9565, %r9566; // end inline asm - ld.const.u32 %r5529, [matrix+3816]; + mov.u32 %r9574, 15; // begin inline asm - dp4a.u32.u32 %r5528, %r5529, %r5786, %r5524; + shf.l.wrap.b32 %r9567, %r9573, %r9572, %r9574; // end inline asm - ld.const.u32 %r5533, [matrix+3820]; // begin inline asm - dp4a.u32.u32 %r5532, %r5533, %r5790, %r5528; + shf.l.wrap.b32 %r9571, %r9572, %r9573, %r9574; // end inline asm - ld.const.u32 %r5537, [matrix+3824]; + mov.u32 %r9582, 10; // begin inline asm - dp4a.u32.u32 %r5536, %r5537, %r5794, %r5532; + shf.l.wrap.b32 %r9575, %r9581, %r9580, %r9582; // end inline asm - ld.const.u32 %r5541, [matrix+3828]; // begin inline asm - dp4a.u32.u32 %r5540, %r5541, %r5798, %r5536; + shf.l.wrap.b32 %r9579, %r9580, %r9581, %r9582; // end inline asm - ld.const.u32 %r5545, [matrix+3832]; + mov.u32 %r9590, 6; // begin inline asm - dp4a.u32.u32 %r5544, %r5545, %r5802, %r5540; + shf.l.wrap.b32 %r9583, %r9589, %r9588, %r9590; // end inline asm - ld.const.u32 %r5549, [matrix+3836]; // begin inline asm - dp4a.u32.u32 %r5548, %r5549, %r5806, %r5544; + shf.l.wrap.b32 %r9587, %r9588, %r9589, %r9590; // end inline asm - shr.u32 %r6089, %r5484, 6; - and.b32 %r6090, %r6089, 240; - shr.u32 %r6091, %r5548, 10; - or.b32 %r6092, %r6091, %r6090; - cvt.u64.u32 %rd240, %r6092; - xor.b64 %rd241, %rd206, %rd240; - ld.const.u32 %r5553, [matrix+3840]; + mov.u32 %r9598, 3; // begin inline asm - dp4a.u32.u32 %r5552, %r5553, %r5746, %r6244; + shf.l.wrap.b32 %r9591, %r9597, %r9596, 
%r9598; // end inline asm - ld.const.u32 %r5557, [matrix+3844]; // begin inline asm - dp4a.u32.u32 %r5556, %r5557, %r5750, %r5552; + shf.l.wrap.b32 %r9595, %r9596, %r9597, %r9598; // end inline asm - ld.const.u32 %r5561, [matrix+3848]; // begin inline asm - dp4a.u32.u32 %r5560, %r5561, %r5754, %r5556; + shf.l.wrap.b32 %r9599, %r9605, %r9604, %r9276; // end inline asm - ld.const.u32 %r5565, [matrix+3852]; // begin inline asm - dp4a.u32.u32 %r5564, %r5565, %r5758, %r5560; + shf.l.wrap.b32 %r9603, %r9604, %r9605, %r9276; // end inline asm - ld.const.u32 %r5569, [matrix+3856]; // begin inline asm - dp4a.u32.u32 %r5568, %r5569, %r5762, %r5564; + // chi + lop3.b32 %r9607, %r9642, %r9415, %r9463, 0xD2; + lop3.b32 %r9608, %r9645, %r9419, %r9467, 0xD2; // end inline asm - ld.const.u32 %r5573, [matrix+3860]; // begin inline asm - dp4a.u32.u32 %r5572, %r5573, %r5766, %r5568; + // chi + lop3.b32 %r29773, %r9415, %r9463, %r9559, 0xD2; + lop3.b32 %r29774, %r9419, %r9467, %r9563, 0xD2; // end inline asm - ld.const.u32 %r5577, [matrix+3864]; // begin inline asm - dp4a.u32.u32 %r5576, %r5577, %r5770, %r5572; + // chi + lop3.b32 %r29769, %r9463, %r9559, %r9511, 0xD2; + lop3.b32 %r29770, %r9467, %r9563, %r9515, 0xD2; // end inline asm - ld.const.u32 %r5581, [matrix+3868]; // begin inline asm - dp4a.u32.u32 %r5580, %r5581, %r5774, %r5576; + // chi + lop3.b32 %r29765, %r9559, %r9511, %r9642, 0xD2; + lop3.b32 %r29766, %r9563, %r9515, %r9645, 0xD2; // end inline asm - ld.const.u32 %r5585, [matrix+3872]; // begin inline asm - dp4a.u32.u32 %r5584, %r5585, %r5778, %r5580; + // chi + lop3.b32 %r29763, %r9511, %r9642, %r9415, 0xD2; + lop3.b32 %r29764, %r9515, %r9645, %r9419, 0xD2; // end inline asm - ld.const.u32 %r5589, [matrix+3876]; // begin inline asm - dp4a.u32.u32 %r5588, %r5589, %r5782, %r5584; + // chi + lop3.b32 %r29759, %r9551, %r9423, %r9591, 0xD2; + lop3.b32 %r29760, %r9555, %r9427, %r9595, 0xD2; // end inline asm - ld.const.u32 %r5593, [matrix+3880]; // begin inline asm - dp4a.u32.u32 %r5592, %r5593, %r5786, %r5588; + // chi + lop3.b32 %r29771, %r9423, %r9591, %r9535, 0xD2; + lop3.b32 %r29772, %r9427, %r9595, %r9539, 0xD2; // end inline asm - ld.const.u32 %r5597, [matrix+3884]; // begin inline asm - dp4a.u32.u32 %r5596, %r5597, %r5790, %r5592; + // chi + lop3.b32 %r29767, %r9591, %r9535, %r9431, 0xD2; + lop3.b32 %r29768, %r9595, %r9539, %r9435, 0xD2; // end inline asm - ld.const.u32 %r5601, [matrix+3888]; // begin inline asm - dp4a.u32.u32 %r5600, %r5601, %r5794, %r5596; + // chi + lop3.b32 %r29739, %r9535, %r9431, %r9551, 0xD2; + lop3.b32 %r29740, %r9539, %r9435, %r9555, 0xD2; // end inline asm - ld.const.u32 %r5605, [matrix+3892]; + st.local.v2.u32 [%rd55+88], {%r29739, %r29740}; // begin inline asm - dp4a.u32.u32 %r5604, %r5605, %r5798, %r5600; + // chi + lop3.b32 %r29731, %r9431, %r9551, %r9423, 0xD2; + lop3.b32 %r29732, %r9435, %r9555, %r9427, 0xD2; // end inline asm - ld.const.u32 %r5609, [matrix+3896]; + st.local.v2.u32 [%rd55+96], {%r29731, %r29732}; // begin inline asm - dp4a.u32.u32 %r5608, %r5609, %r5802, %r5604; + // chi + lop3.b32 %r29757, %r9599, %r9583, %r9471, 0xD2; + lop3.b32 %r29758, %r9603, %r9587, %r9475, 0xD2; // end inline asm - ld.const.u32 %r5613, [matrix+3900]; + st.local.v2.u32 [%rd55+104], {%r29757, %r29758}; // begin inline asm - dp4a.u32.u32 %r5612, %r5613, %r5806, %r5608; + // chi + lop3.b32 %r29751, %r9583, %r9471, %r9479, 0xD2; + lop3.b32 %r29752, %r9587, %r9475, %r9483, 0xD2; // end inline asm - ld.const.u32 %r5617, [matrix+3904]; + st.local.v2.u32 [%rd55+112], {%r29751, 
%r29752}; // begin inline asm - dp4a.u32.u32 %r5616, %r5617, %r5746, %r6244; + // chi + lop3.b32 %r29745, %r9471, %r9479, %r9447, 0xD2; + lop3.b32 %r29746, %r9475, %r9483, %r9451, 0xD2; // end inline asm - ld.const.u32 %r5621, [matrix+3908]; + st.local.v2.u32 [%rd55+120], {%r29745, %r29746}; // begin inline asm - dp4a.u32.u32 %r5620, %r5621, %r5750, %r5616; + // chi + lop3.b32 %r29737, %r9479, %r9447, %r9599, 0xD2; + lop3.b32 %r29738, %r9483, %r9451, %r9603, 0xD2; // end inline asm - ld.const.u32 %r5625, [matrix+3912]; + st.local.v2.u32 [%rd55+128], {%r29737, %r29738}; // begin inline asm - dp4a.u32.u32 %r5624, %r5625, %r5754, %r5620; + // chi + lop3.b32 %r29729, %r9447, %r9599, %r9583, 0xD2; + lop3.b32 %r29730, %r9451, %r9603, %r9587, 0xD2; // end inline asm - ld.const.u32 %r5629, [matrix+3916]; + st.local.v2.u32 [%rd55+136], {%r29729, %r29730}; // begin inline asm - dp4a.u32.u32 %r5628, %r5629, %r5758, %r5624; + // chi + lop3.b32 %r29755, %r9503, %r9543, %r9575, 0xD2; + lop3.b32 %r29756, %r9507, %r9547, %r9579, 0xD2; // end inline asm - ld.const.u32 %r5633, [matrix+3920]; + st.local.v2.u32 [%rd55+144], {%r29755, %r29756}; // begin inline asm - dp4a.u32.u32 %r5632, %r5633, %r5762, %r5628; + // chi + lop3.b32 %r29749, %r9543, %r9575, %r9567, 0xD2; + lop3.b32 %r29750, %r9547, %r9579, %r9571, 0xD2; // end inline asm - ld.const.u32 %r5637, [matrix+3924]; + st.local.v2.u32 [%rd55+152], {%r29749, %r29750}; // begin inline asm - dp4a.u32.u32 %r5636, %r5637, %r5766, %r5632; + // chi + lop3.b32 %r29743, %r9575, %r9567, %r9487, 0xD2; + lop3.b32 %r29744, %r9579, %r9571, %r9491, 0xD2; // end inline asm - ld.const.u32 %r5641, [matrix+3928]; + st.local.v2.u32 [%rd55+160], {%r29743, %r29744}; // begin inline asm - dp4a.u32.u32 %r5640, %r5641, %r5770, %r5636; + // chi + lop3.b32 %r29735, %r9567, %r9487, %r9503, 0xD2; + lop3.b32 %r29736, %r9571, %r9491, %r9507, 0xD2; // end inline asm - ld.const.u32 %r5645, [matrix+3932]; + st.local.v2.u32 [%rd55+168], {%r29735, %r29736}; // begin inline asm - dp4a.u32.u32 %r5644, %r5645, %r5774, %r5640; + // chi + lop3.b32 %r29727, %r9487, %r9503, %r9543, 0xD2; + lop3.b32 %r29728, %r9491, %r9507, %r9547, 0xD2; // end inline asm - ld.const.u32 %r5649, [matrix+3936]; + st.local.v2.u32 [%rd55+176], {%r29727, %r29728}; // begin inline asm - dp4a.u32.u32 %r5648, %r5649, %r5778, %r5644; + // chi + lop3.b32 %r29753, %r9455, %r9527, %r9439, 0xD2; + lop3.b32 %r29754, %r9459, %r9531, %r9443, 0xD2; // end inline asm - ld.const.u32 %r5653, [matrix+3940]; + st.local.v2.u32 [%rd55+184], {%r29753, %r29754}; // begin inline asm - dp4a.u32.u32 %r5652, %r5653, %r5782, %r5648; + // chi + lop3.b32 %r29747, %r9527, %r9439, %r9495, 0xD2; + lop3.b32 %r29748, %r9531, %r9443, %r9499, 0xD2; // end inline asm - ld.const.u32 %r5657, [matrix+3944]; + st.local.v2.u32 [%rd55+192], {%r29747, %r29748}; // begin inline asm - dp4a.u32.u32 %r5656, %r5657, %r5786, %r5652; + // chi + lop3.b32 %r29741, %r9439, %r9495, %r9519, 0xD2; + lop3.b32 %r29742, %r9443, %r9499, %r9523, 0xD2; // end inline asm - ld.const.u32 %r5661, [matrix+3948]; + st.local.v2.u32 [%rd55+200], {%r29741, %r29742}; // begin inline asm - dp4a.u32.u32 %r5660, %r5661, %r5790, %r5656; + // chi + lop3.b32 %r29733, %r9495, %r9519, %r9455, 0xD2; + lop3.b32 %r29734, %r9499, %r9523, %r9459, 0xD2; // end inline asm - ld.const.u32 %r5665, [matrix+3952]; + st.local.v2.u32 [%rd55+208], {%r29733, %r29734}; // begin inline asm - dp4a.u32.u32 %r5664, %r5665, %r5794, %r5660; + // chi + lop3.b32 %r29725, %r9519, %r9455, %r9527, 0xD2; + lop3.b32 %r29726, 
%r9523, %r9459, %r9531, 0xD2; // end inline asm - ld.const.u32 %r5669, [matrix+3956]; + st.local.v2.u32 [%rd55+216], {%r29725, %r29726}; + mul.wide.s32 %rd572, %r29775, 8; + add.s64 %rd571, %rd497, %rd572; // begin inline asm - dp4a.u32.u32 %r5668, %r5669, %r5798, %r5664; + ld.global.nc.v2.u32 {%r9807,%r9808}, [%rd571]; // end inline asm - ld.const.u32 %r5673, [matrix+3960]; + xor.b32 %r29761, %r9607, %r9807; + xor.b32 %r29762, %r9608, %r9808; + add.s32 %r29775, %r29775, 1; + setp.lt.u32 %p22, %r29775, 23; + @%p22 bra $L__BB2_31; + + mov.u32 %r9918, 1; + st.local.v2.u32 [%rd55+32], {%r29773, %r29774}; + st.local.v2.u32 [%rd55+72], {%r29771, %r29772}; + st.local.v2.u32 [%rd55+40], {%r29769, %r29770}; + st.local.v2.u32 [%rd55+80], {%r29767, %r29768}; + st.local.v2.u32 [%rd55+48], {%r29765, %r29766}; + st.local.v2.u32 [%rd55+56], {%r29763, %r29764}; + st.local.v2.u32 [%rd55+24], {%r29761, %r29762}; // begin inline asm - dp4a.u32.u32 %r5672, %r5673, %r5802, %r5668; + // xor5 + lop3.b32 %r9819, %r29761, %r29759, %r29757, 0x96; + lop3.b32 %r9819, %r9819, %r29755, %r29753, 0x96; + lop3.b32 %r9820, %r29762, %r29760, %r29758, 0x96; + lop3.b32 %r9820, %r9820, %r29756, %r29754, 0x96; // end inline asm - ld.const.u32 %r5677, [matrix+3964]; // begin inline asm - dp4a.u32.u32 %r5676, %r5677, %r5806, %r5672; + // xor5 + lop3.b32 %r9831, %r29773, %r29771, %r29751, 0x96; + lop3.b32 %r9831, %r9831, %r29749, %r29747, 0x96; + lop3.b32 %r9832, %r29774, %r29772, %r29752, 0x96; + lop3.b32 %r9832, %r9832, %r29750, %r29748, 0x96; // end inline asm - shr.u32 %r6093, %r5612, 6; - and.b32 %r6094, %r6093, 240; - shr.u32 %r6095, %r5676, 10; - or.b32 %r6096, %r6095, %r6094; - cvt.u64.u32 %rd242, %r6096; - xor.b64 %rd243, %rd208, %rd242; - ld.const.u32 %r5681, [matrix+3968]; // begin inline asm - dp4a.u32.u32 %r5680, %r5681, %r5746, %r6244; + // xor5 + lop3.b32 %r9843, %r29769, %r29767, %r29745, 0x96; + lop3.b32 %r9843, %r9843, %r29743, %r29741, 0x96; + lop3.b32 %r9844, %r29770, %r29768, %r29746, 0x96; + lop3.b32 %r9844, %r9844, %r29744, %r29742, 0x96; // end inline asm - ld.const.u32 %r5685, [matrix+3972]; // begin inline asm - dp4a.u32.u32 %r5684, %r5685, %r5750, %r5680; + // xor5 + lop3.b32 %r9855, %r29765, %r29739, %r29737, 0x96; + lop3.b32 %r9855, %r9855, %r29735, %r29733, 0x96; + lop3.b32 %r9856, %r29766, %r29740, %r29738, 0x96; + lop3.b32 %r9856, %r9856, %r29736, %r29734, 0x96; // end inline asm - ld.const.u32 %r5689, [matrix+3976]; // begin inline asm - dp4a.u32.u32 %r5688, %r5689, %r5754, %r5684; + // xor5 + lop3.b32 %r9867, %r29763, %r29731, %r29729, 0x96; + lop3.b32 %r9867, %r9867, %r29727, %r29725, 0x96; + lop3.b32 %r9868, %r29764, %r29732, %r29730, 0x96; + lop3.b32 %r9868, %r9868, %r29728, %r29726, 0x96; // end inline asm - ld.const.u32 %r5693, [matrix+3980]; // begin inline asm - dp4a.u32.u32 %r5692, %r5693, %r5758, %r5688; + shf.l.wrap.b32 %r9879, %r9832, %r9831, %r9918; // end inline asm - ld.const.u32 %r5697, [matrix+3984]; // begin inline asm - dp4a.u32.u32 %r5696, %r5697, %r5762, %r5692; + shf.l.wrap.b32 %r9883, %r9831, %r9832, %r9918; // end inline asm - ld.const.u32 %r5701, [matrix+3988]; + xor.b32 %r10057, %r9879, %r9867; + xor.b32 %r10058, %r9883, %r9868; + xor.b32 %r10026, %r29761, %r10057; + xor.b32 %r10029, %r29762, %r10058; + xor.b32 %r9989, %r29758, %r10058; + xor.b32 %r9988, %r29757, %r10057; + st.local.v2.u32 [%rd55+104], {%r9988, %r9989}; // begin inline asm - dp4a.u32.u32 %r5700, %r5701, %r5766, %r5696; + shf.l.wrap.b32 %r9887, %r9844, %r9843, %r9918; // end inline asm - ld.const.u32 
%r5705, [matrix+3992]; // begin inline asm - dp4a.u32.u32 %r5704, %r5705, %r5770, %r5700; + shf.l.wrap.b32 %r9891, %r9843, %r9844, %r9918; // end inline asm - ld.const.u32 %r5709, [matrix+3996]; + xor.b32 %r10059, %r9887, %r9819; + xor.b32 %r10060, %r9891, %r9820; + xor.b32 %r9925, %r29771, %r10059; + xor.b32 %r9924, %r29772, %r10060; + xor.b32 %r9964, %r29750, %r10060; + xor.b32 %r9965, %r29749, %r10059; + st.local.v2.u32 [%rd55+152], {%r9965, %r9964}; // begin inline asm - dp4a.u32.u32 %r5708, %r5709, %r5774, %r5704; + shf.l.wrap.b32 %r9895, %r9856, %r9855, %r9918; // end inline asm - ld.const.u32 %r5713, [matrix+4000]; // begin inline asm - dp4a.u32.u32 %r5712, %r5713, %r5778, %r5708; + shf.l.wrap.b32 %r9899, %r9855, %r9856, %r9918; // end inline asm - ld.const.u32 %r5717, [matrix+4004]; + xor.b32 %r10061, %r9895, %r9831; + xor.b32 %r10062, %r9899, %r9832; + xor.b32 %r9948, %r29746, %r10062; + xor.b32 %r9949, %r29745, %r10061; + st.local.v2.u32 [%rd55+120], {%r9949, %r9948}; + xor.b32 %r9940, %r29742, %r10062; + xor.b32 %r9941, %r29741, %r10061; + st.local.v2.u32 [%rd55+200], {%r9941, %r9940}; // begin inline asm - dp4a.u32.u32 %r5716, %r5717, %r5782, %r5712; + shf.l.wrap.b32 %r9903, %r9868, %r9867, %r9918; // end inline asm - ld.const.u32 %r5721, [matrix+4008]; // begin inline asm - dp4a.u32.u32 %r5720, %r5721, %r5786, %r5716; + shf.l.wrap.b32 %r9907, %r9867, %r9868, %r9918; // end inline asm - ld.const.u32 %r5725, [matrix+4012]; + xor.b32 %r10063, %r9903, %r9843; + xor.b32 %r10064, %r9907, %r9844; + xor.b32 %r9972, %r29765, %r10063; + xor.b32 %r9973, %r29766, %r10064; + xor.b32 %r9981, %r29736, %r10064; + xor.b32 %r9980, %r29735, %r10063; + st.local.v2.u32 [%rd55+168], {%r9980, %r9981}; // begin inline asm - dp4a.u32.u32 %r5724, %r5725, %r5790, %r5720; + shf.l.wrap.b32 %r9911, %r9820, %r9819, %r9918; // end inline asm - ld.const.u32 %r5729, [matrix+4016]; // begin inline asm - dp4a.u32.u32 %r5728, %r5729, %r5794, %r5724; + shf.l.wrap.b32 %r9915, %r9819, %r9820, %r9918; // end inline asm - ld.const.u32 %r5733, [matrix+4020]; + xor.b32 %r10065, %r9911, %r9855; + xor.b32 %r10066, %r9915, %r9856; + xor.b32 %r9932, %r29731, %r10065; + xor.b32 %r9933, %r29732, %r10066; + xor.b32 %r9957, %r29726, %r10066; + xor.b32 %r9956, %r29725, %r10065; + st.local.v2.u32 [%rd55+216], {%r9956, %r9957}; // begin inline asm - dp4a.u32.u32 %r5732, %r5733, %r5798, %r5728; + shf.l.wrap.b32 %r9919, %r9925, %r9924, %r9422; // end inline asm - ld.const.u32 %r5737, [matrix+4024]; // begin inline asm - dp4a.u32.u32 %r5736, %r5737, %r5802, %r5732; + shf.l.wrap.b32 %r9923, %r9924, %r9925, %r9422; // end inline asm - ld.const.u32 %r5741, [matrix+4028]; // begin inline asm - dp4a.u32.u32 %r5740, %r5741, %r5806, %r5736; + shf.l.wrap.b32 %r9927, %r9933, %r9932, %r9430; // end inline asm - ld.const.u32 %r5745, [matrix+4032]; // begin inline asm - dp4a.u32.u32 %r5744, %r5745, %r5746, %r6244; + shf.l.wrap.b32 %r9931, %r9932, %r9933, %r9430; // end inline asm - ld.const.u32 %r5749, [matrix+4036]; // begin inline asm - dp4a.u32.u32 %r5748, %r5749, %r5750, %r5744; + shf.l.wrap.b32 %r9939, %r9940, %r9941, %r9438; // end inline asm - ld.const.u32 %r5753, [matrix+4040]; // begin inline asm - dp4a.u32.u32 %r5752, %r5753, %r5754, %r5748; + shf.l.wrap.b32 %r9935, %r9941, %r9940, %r9438; // end inline asm - ld.const.u32 %r5757, [matrix+4044]; + st.local.v2.u32 [%rd55+96], {%r9935, %r9939}; // begin inline asm - dp4a.u32.u32 %r5756, %r5757, %r5758, %r5752; + shf.l.wrap.b32 %r9943, %r9949, %r9948, %r9470; // end inline asm - 
ld.const.u32 %r5761, [matrix+4048]; // begin inline asm - dp4a.u32.u32 %r5760, %r5761, %r5762, %r5756; + shf.l.wrap.b32 %r9947, %r9948, %r9949, %r9470; // end inline asm - ld.const.u32 %r5765, [matrix+4052]; // begin inline asm - dp4a.u32.u32 %r5764, %r5765, %r5766, %r5760; + shf.l.wrap.b32 %r9951, %r9957, %r9956, %r9518; // end inline asm - ld.const.u32 %r5769, [matrix+4056]; // begin inline asm - dp4a.u32.u32 %r5768, %r5769, %r5770, %r5764; + shf.l.wrap.b32 %r9955, %r9956, %r9957, %r9518; // end inline asm - ld.const.u32 %r5773, [matrix+4060]; // begin inline asm - dp4a.u32.u32 %r5772, %r5773, %r5774, %r5768; + shf.l.wrap.b32 %r9963, %r9964, %r9965, %r9542; // end inline asm - ld.const.u32 %r5777, [matrix+4064]; // begin inline asm - dp4a.u32.u32 %r5776, %r5777, %r5778, %r5772; + shf.l.wrap.b32 %r9959, %r9965, %r9964, %r9542; // end inline asm - ld.const.u32 %r5781, [matrix+4068]; + st.local.v2.u32 [%rd55+88], {%r9959, %r9963}; // begin inline asm - dp4a.u32.u32 %r5780, %r5781, %r5782, %r5776; + shf.l.wrap.b32 %r9967, %r9973, %r9972, %r9558; // end inline asm - ld.const.u32 %r5785, [matrix+4072]; // begin inline asm - dp4a.u32.u32 %r5784, %r5785, %r5786, %r5780; + shf.l.wrap.b32 %r9971, %r9972, %r9973, %r9558; // end inline asm - ld.const.u32 %r5789, [matrix+4076]; // begin inline asm - dp4a.u32.u32 %r5788, %r5789, %r5790, %r5784; + shf.l.wrap.b32 %r9975, %r9981, %r9980, %r9566; // end inline asm - ld.const.u32 %r5793, [matrix+4080]; // begin inline asm - dp4a.u32.u32 %r5792, %r5793, %r5794, %r5788; + shf.l.wrap.b32 %r9979, %r9980, %r9981, %r9566; // end inline asm - ld.const.u32 %r5797, [matrix+4084]; // begin inline asm - dp4a.u32.u32 %r5796, %r5797, %r5798, %r5792; + shf.l.wrap.b32 %r9983, %r9989, %r9988, %r9598; // end inline asm - ld.const.u32 %r5801, [matrix+4088]; // begin inline asm - dp4a.u32.u32 %r5800, %r5801, %r5802, %r5796; + shf.l.wrap.b32 %r9987, %r9988, %r9989, %r9598; // end inline asm - ld.const.u32 %r5805, [matrix+4092]; // begin inline asm - dp4a.u32.u32 %r5804, %r5805, %r5806, %r5800; + // chi + lop3.b32 %r9991, %r10026, %r9919, %r9943, 0xD2; + lop3.b32 %r9992, %r10029, %r9923, %r9947, 0xD2; // end inline asm - shr.u32 %r6097, %r5740, 6; - and.b32 %r6098, %r6097, 240; - shr.u32 %r6099, %r5804, 10; - or.b32 %r6100, %r6099, %r6098; - cvt.u64.u32 %rd244, %r6100; - xor.b64 %rd245, %rd210, %rd244; - shl.b32 %r6101, %r5985, 24; - cvt.u64.u32 %rd246, %r6101; - shl.b32 %r6102, %r5980, 16; - and.b32 %r6103, %r6102, 16711680; - cvt.u64.u32 %rd247, %r6103; - shl.b32 %r6104, %r5975, 8; - and.b32 %r6105, %r6104, 65280; - cvt.u64.u32 %rd248, %r6105; - and.b32 %r6106, %r5970, 255; - cvt.u64.u32 %rd249, %r6106; - shl.b32 %r6107, %r6019, 24; - cvt.u64.u32 %rd250, %r6107; - shl.b32 %r6108, %r6014, 16; - and.b32 %r6109, %r6108, 16711680; - cvt.u64.u32 %rd251, %r6109; - shl.b32 %r6110, %r6009, 8; - and.b32 %r6111, %r6110, 65280; - cvt.u64.u32 %rd252, %r6111; - and.b32 %r6112, %r6004, 255; - cvt.u64.u32 %rd253, %r6112; - shl.b32 %r6113, %r6053, 24; - cvt.u64.u32 %rd254, %r6113; - shl.b32 %r6114, %r6048, 16; - and.b32 %r6115, %r6114, 16711680; - cvt.u64.u32 %rd255, %r6115; - shl.b32 %r6116, %r6043, 8; - and.b32 %r6117, %r6116, 65280; - cvt.u64.u32 %rd256, %r6117; - and.b32 %r6118, %r6038, 255; - cvt.u64.u32 %rd257, %r6118; - shr.u32 %r6119, %r2732, 10; - or.b32 %r6120, %r6119, %r5999; - xor.b32 %r6121, %r10, %r6120; - cvt.u64.u32 %rd258, %r6121; - shl.b64 %rd259, %rd258, 56; - shl.b64 %rd260, %rd216, 48; - and.b64 %rd261, %rd260, 71776119061217280; - or.b64 %rd262, %rd259, %rd261; - 
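Annotation: the `-` lines in this hunk strip out the old heavyhash stage — 16-step dp4a.u32.u32 chains against ld.const.u32 words of `matrix`, with a shr/and/or tail packing two row sums into one output byte. Below is a minimal CUDA sketch of what one such byte computation appears to do, assuming hypothetical names and a 64x64 matrix of 4-bit entries stored one per byte (16 packed u32 words per row); it is a reconstruction for the reader, not the kernel's actual source.

#include <cstdint>

// Reconstruction of the removed dp4a chains: two matrix rows are dotted
// against the 64 hash nibbles, then the top 4 bits of each 14-bit sum are
// packed into one byte, matching the "shr 6 / and 240 / shr 10 / or" tail
// above. __dp4a(a, b, c) is a per-byte multiply-accumulate (sm_61+).
__device__ uint8_t heavyhash_byte(const uint32_t* row0,     // matrix row 2i, 16 packed words
                                  const uint32_t* row1,     // matrix row 2i+1
                                  const uint32_t* nibbles)  // hash as 64 nibbles, 4 per word
{
    uint32_t s0 = 0, s1 = 0;
    for (int k = 0; k < 16; ++k) {
        s0 = __dp4a(row0[k], nibbles[k], s0);
        s1 = __dp4a(row1[k], nibbles[k], s1);
    }
    return (uint8_t)(((s0 >> 6) & 0xF0) | (s1 >> 10));  // ((s0>>10)<<4) | (s1>>10)
}

The replacement `+` lines carry no matrix product at all; they are straight Keccak-f[1600] round code, which the rest of this hunk spells out.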
shl.b64 %rd263, %rd214, 40; - and.b64 %rd264, %rd263, 280375465082880; - or.b64 %rd265, %rd262, %rd264; - shl.b64 %rd266, %rd212, 32; - and.b64 %rd267, %rd266, 1095216660480; - or.b64 %rd268, %rd265, %rd267; - or.b64 %rd269, %rd268, %rd246; - or.b64 %rd270, %rd269, %rd247; - or.b64 %rd271, %rd270, %rd248; - or.b64 %rd272, %rd271, %rd249; - xor.b64 %rd73, %rd272, 4239941492252378377; - shr.u32 %r6122, %r3756, 10; - or.b32 %r6123, %r6122, %r6033; - xor.b32 %r6124, %r12, %r6123; - cvt.u64.u32 %rd273, %r6124; - shl.b64 %rd274, %rd273, 56; - shl.b64 %rd275, %rd222, 48; - and.b64 %rd276, %rd275, 71776119061217280; - or.b64 %rd277, %rd274, %rd276; - shl.b64 %rd278, %rd220, 40; - and.b64 %rd279, %rd278, 280375465082880; - or.b64 %rd280, %rd277, %rd279; - shl.b64 %rd281, %rd218, 32; - and.b64 %rd282, %rd281, 1095216660480; - or.b64 %rd283, %rd280, %rd282; - or.b64 %rd284, %rd283, %rd250; - or.b64 %rd285, %rd284, %rd251; - or.b64 %rd286, %rd285, %rd252; - or.b64 %rd287, %rd286, %rd253; - xor.b64 %rd484, %rd287, 8746723911537738262; - shr.u32 %r6125, %r4780, 10; - or.b32 %r6126, %r6125, %r6067; - xor.b32 %r6127, %r14, %r6126; - cvt.u64.u32 %rd288, %r6127; - shl.b64 %rd289, %rd288, 56; - shl.b64 %rd290, %rd228, 48; - and.b64 %rd291, %rd290, 71776119061217280; - or.b64 %rd292, %rd289, %rd291; - shl.b64 %rd293, %rd226, 40; - and.b64 %rd294, %rd293, 280375465082880; - or.b64 %rd295, %rd292, %rd294; - shl.b64 %rd296, %rd224, 32; - and.b64 %rd297, %rd296, 1095216660480; - or.b64 %rd298, %rd295, %rd297; - or.b64 %rd299, %rd298, %rd254; - or.b64 %rd300, %rd299, %rd255; - or.b64 %rd301, %rd300, %rd256; - or.b64 %rd302, %rd301, %rd257; - xor.b64 %rd479, %rd302, 8796936657246353646; - shl.b64 %rd303, %rd245, 56; - shl.b64 %rd304, %rd243, 48; - and.b64 %rd305, %rd304, 71776119061217280; - or.b64 %rd306, %rd303, %rd305; - shl.b64 %rd307, %rd241, 40; - and.b64 %rd308, %rd307, 280375465082880; - or.b64 %rd309, %rd306, %rd308; - shl.b64 %rd310, %rd239, 32; - and.b64 %rd311, %rd310, 1095216660480; - or.b64 %rd312, %rd309, %rd311; - shl.b64 %rd313, %rd237, 24; - and.b64 %rd314, %rd313, 4278190080; - or.b64 %rd315, %rd312, %rd314; - shl.b64 %rd316, %rd235, 16; - and.b64 %rd317, %rd316, 16711680; - shl.b64 %rd318, %rd231, 8; - and.b64 %rd319, %rd318, 65280; - or.b64 %rd320, %rd315, %rd317; - or.b64 %rd321, %rd320, %rd319; - or.b64 %rd322, %rd321, %rd233; - xor.b64 %rd474, %rd322, 1272090201925444760; - mov.u64 %rd488, 8270816933120786537; - mov.u64 %rd487, -850687345431043546; - mov.u64 %rd486, 8596393687355028144; - mov.u64 %rd485, -4073852189716399785; - mov.u64 %rd483, -4539347866060507718; - mov.u64 %rd482, -3233781605604422593; - mov.u64 %rd481, 570094237299545110; - mov.u64 %rd480, 5171152063242093102; - mov.u64 %rd478, 6782861118970774626; - mov.u64 %rd477, 7812475424661425213; - mov.u64 %rd476, 9119540418498120711; - mov.u64 %rd475, -7873636174015165430; - mov.u64 %rd473, -9207053471590684088; - mov.u64 %rd472, 3370482334374859748; - mov.u64 %rd471, -1544774801229058759; - mov.u64 %rd470, 6096431547456407061; - mov.u64 %rd469, -1792185402154627366; - mov.u64 %rd468, -6864424130110145268; - mov.u64 %rd467, 5690099369266491460; - mov.u64 %rd466, -5074726839974049192; - mov.u64 %rd465, 1592359455985097269; - mov.u64 %rd464, RC; - -$L__BB0_9: - xor.b64 %rd323, %rd488, %rd73; - xor.b64 %rd324, %rd323, %rd487; - xor.b64 %rd325, %rd324, %rd486; - xor.b64 %rd326, %rd325, %rd485; - xor.b64 %rd327, %rd483, %rd484; - xor.b64 %rd328, %rd327, %rd482; - xor.b64 %rd329, %rd328, %rd481; - xor.b64 %rd330, %rd329, %rd480; - 
xor.b64 %rd331, %rd478, %rd479; - xor.b64 %rd332, %rd331, %rd477; - xor.b64 %rd333, %rd332, %rd476; - xor.b64 %rd334, %rd333, %rd475; - xor.b64 %rd335, %rd473, %rd474; - xor.b64 %rd336, %rd335, %rd472; - xor.b64 %rd337, %rd336, %rd471; - xor.b64 %rd338, %rd337, %rd470; - xor.b64 %rd339, %rd468, %rd469; - xor.b64 %rd340, %rd339, %rd467; - xor.b64 %rd341, %rd340, %rd466; - xor.b64 %rd342, %rd341, %rd465; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6128}, %rd330; - } - { - .reg .b32 %dummy; - mov.b64 {%r6129,%dummy}, %rd330; - } - shf.l.wrap.b32 %r6130, %r6129, %r6128, 1; - shf.l.wrap.b32 %r6131, %r6128, %r6129, 1; - mov.b64 %rd343, {%r6131, %r6130}; - xor.b64 %rd344, %rd342, %rd343; - xor.b64 %rd345, %rd344, %rd73; - xor.b64 %rd346, %rd488, %rd344; - xor.b64 %rd347, %rd487, %rd344; - xor.b64 %rd348, %rd486, %rd344; - xor.b64 %rd349, %rd485, %rd344; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6132}, %rd334; - } - { - .reg .b32 %dummy; - mov.b64 {%r6133,%dummy}, %rd334; - } - shf.l.wrap.b32 %r6134, %r6133, %r6132, 1; - shf.l.wrap.b32 %r6135, %r6132, %r6133, 1; - mov.b64 %rd350, {%r6135, %r6134}; - xor.b64 %rd351, %rd350, %rd326; - xor.b64 %rd352, %rd484, %rd351; - xor.b64 %rd353, %rd483, %rd351; - xor.b64 %rd354, %rd482, %rd351; - xor.b64 %rd355, %rd481, %rd351; - xor.b64 %rd356, %rd480, %rd351; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6136}, %rd338; - } - { - .reg .b32 %dummy; - mov.b64 {%r6137,%dummy}, %rd338; - } - shf.l.wrap.b32 %r6138, %r6137, %r6136, 1; - shf.l.wrap.b32 %r6139, %r6136, %r6137, 1; - mov.b64 %rd357, {%r6139, %r6138}; - xor.b64 %rd358, %rd357, %rd330; - xor.b64 %rd359, %rd479, %rd358; - xor.b64 %rd360, %rd478, %rd358; - xor.b64 %rd361, %rd477, %rd358; - xor.b64 %rd362, %rd476, %rd358; - xor.b64 %rd363, %rd475, %rd358; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6140}, %rd342; - } - { - .reg .b32 %dummy; - mov.b64 {%r6141,%dummy}, %rd342; - } - shf.l.wrap.b32 %r6142, %r6141, %r6140, 1; - shf.l.wrap.b32 %r6143, %r6140, %r6141, 1; - mov.b64 %rd364, {%r6143, %r6142}; - xor.b64 %rd365, %rd364, %rd334; - xor.b64 %rd366, %rd474, %rd365; - xor.b64 %rd367, %rd473, %rd365; - xor.b64 %rd368, %rd472, %rd365; - xor.b64 %rd369, %rd471, %rd365; - xor.b64 %rd370, %rd470, %rd365; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6144}, %rd326; - } - { - .reg .b32 %dummy; - mov.b64 {%r6145,%dummy}, %rd326; - } - shf.l.wrap.b32 %r6146, %r6145, %r6144, 1; - shf.l.wrap.b32 %r6147, %r6144, %r6145, 1; - mov.b64 %rd371, {%r6147, %r6146}; - xor.b64 %rd372, %rd338, %rd371; - xor.b64 %rd373, %rd469, %rd372; - xor.b64 %rd374, %rd468, %rd372; - xor.b64 %rd375, %rd467, %rd372; - xor.b64 %rd376, %rd466, %rd372; - xor.b64 %rd377, %rd465, %rd372; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6148}, %rd352; - } - { - .reg .b32 %dummy; - mov.b64 {%r6149,%dummy}, %rd352; - } - shf.l.wrap.b32 %r6150, %r6149, %r6148, 1; - shf.l.wrap.b32 %r6151, %r6148, %r6149, 1; - mov.b64 %rd378, {%r6151, %r6150}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6152}, %rd347; - } - { - .reg .b32 %dummy; - mov.b64 {%r6153,%dummy}, %rd347; - } - shf.l.wrap.b32 %r6154, %r6153, %r6152, 3; - shf.l.wrap.b32 %r6155, %r6152, %r6153, 3; - mov.b64 %rd379, {%r6155, %r6154}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6156}, %rd360; - } - { - .reg .b32 %dummy; - mov.b64 {%r6157,%dummy}, %rd360; - } - shf.l.wrap.b32 %r6158, %r6157, %r6156, 6; - shf.l.wrap.b32 %r6159, %r6156, %r6157, 6; - mov.b64 %rd380, {%r6159, %r6158}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6160}, %rd354; - } - { - .reg .b32 %dummy; - mov.b64 {%r6161,%dummy}, %rd354; - } - 
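Annotation: both the removed 64-bit round (the mov.b64 {lo,hi} splits plus shf.l.wrap/shf.r.wrap pairs here) and the added 32-bit-pair round implement Keccak's rho rotations the same way underneath: a 64-bit rotate is two funnel shifts on the word halves. A small sketch of the idiom, as an illustration rather than the kernel's source:

#include <cstdint>

// 64-bit rotate-left from two 32-bit funnel shifts, the pattern behind every
// shf.l.wrap.b32 pair in this hunk. __funnelshift_l(lo, hi, s) returns the
// high word of (hi:lo) << (s & 31), i.e. (hi << s) | (lo >> (32 - s)).
__device__ uint64_t rotl64(uint64_t x, unsigned r)  // shown for 1 <= r <= 31
{
    uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
    uint32_t nhi = __funnelshift_l(lo, hi, r);
    uint32_t nlo = __funnelshift_l(hi, lo, r);
    return ((uint64_t)nhi << 32) | nlo;
}

For rho offsets past 32 (e.g. 44, 61, 56 in the added code) the halves are swapped first; since .wrap reduces the shift modulo 32, the PTX presumably bakes that swap into operand order instead of emitting extra moves.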
shf.l.wrap.b32 %r6162, %r6161, %r6160, 10; - shf.l.wrap.b32 %r6163, %r6160, %r6161, 10; - mov.b64 %rd381, {%r6163, %r6162}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6164}, %rd362; - } - { - .reg .b32 %dummy; - mov.b64 {%r6165,%dummy}, %rd362; - } - shf.l.wrap.b32 %r6166, %r6165, %r6164, 15; - shf.l.wrap.b32 %r6167, %r6164, %r6165, 15; - mov.b64 %rd382, {%r6167, %r6166}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6168}, %rd369; - } - { - .reg .b32 %dummy; - mov.b64 {%r6169,%dummy}, %rd369; - } - shf.l.wrap.b32 %r6170, %r6169, %r6168, 21; - shf.l.wrap.b32 %r6171, %r6168, %r6169, 21; - mov.b64 %rd383, {%r6171, %r6170}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6172}, %rd366; - } - { - .reg .b32 %dummy; - mov.b64 {%r6173,%dummy}, %rd366; - } - shf.l.wrap.b32 %r6174, %r6173, %r6172, 28; - shf.l.wrap.b32 %r6175, %r6172, %r6173, 28; - mov.b64 %rd384, {%r6175, %r6174}; - { - .reg .b32 %dummy; - mov.b64 {%r6176,%dummy}, %rd346; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6177}, %rd346; - } - shf.r.wrap.b32 %r6178, %r6177, %r6176, 28; - shf.r.wrap.b32 %r6179, %r6176, %r6177, 28; - mov.b64 %rd385, {%r6179, %r6178}; - { - .reg .b32 %dummy; - mov.b64 {%r6180,%dummy}, %rd355; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6181}, %rd355; - } - shf.r.wrap.b32 %r6182, %r6181, %r6180, 19; - shf.r.wrap.b32 %r6183, %r6180, %r6181, 19; - mov.b64 %rd386, {%r6183, %r6182}; - { - .reg .b32 %dummy; - mov.b64 {%r6184,%dummy}, %rd367; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6185}, %rd367; - } - shf.r.wrap.b32 %r6186, %r6185, %r6184, 9; - shf.r.wrap.b32 %r6187, %r6184, %r6185, 9; - mov.b64 %rd387, {%r6187, %r6186}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6188}, %rd356; - } - { - .reg .b32 %dummy; - mov.b64 {%r6189,%dummy}, %rd356; - } - shf.l.wrap.b32 %r6190, %r6189, %r6188, 2; - shf.l.wrap.b32 %r6191, %r6188, %r6189, 2; - mov.b64 %rd388, {%r6191, %r6190}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6192}, %rd377; - } - { - .reg .b32 %dummy; - mov.b64 {%r6193,%dummy}, %rd377; - } - shf.l.wrap.b32 %r6194, %r6193, %r6192, 14; - shf.l.wrap.b32 %r6195, %r6192, %r6193, 14; - mov.b64 %rd389, {%r6195, %r6194}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6196}, %rd373; - } - { - .reg .b32 %dummy; - mov.b64 {%r6197,%dummy}, %rd373; - } - shf.l.wrap.b32 %r6198, %r6197, %r6196, 27; - shf.l.wrap.b32 %r6199, %r6196, %r6197, 27; - mov.b64 %rd390, {%r6199, %r6198}; - { - .reg .b32 %dummy; - mov.b64 {%r6200,%dummy}, %rd348; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6201}, %rd348; - } - shf.r.wrap.b32 %r6202, %r6201, %r6200, 23; - shf.r.wrap.b32 %r6203, %r6200, %r6201, 23; - mov.b64 %rd391, {%r6203, %r6202}; - { - .reg .b32 %dummy; - mov.b64 {%r6204,%dummy}, %rd370; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6205}, %rd370; - } - shf.r.wrap.b32 %r6206, %r6205, %r6204, 8; - shf.r.wrap.b32 %r6207, %r6204, %r6205, 8; - mov.b64 %rd392, {%r6207, %r6206}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6208}, %rd376; - } - { - .reg .b32 %dummy; - mov.b64 {%r6209,%dummy}, %rd376; - } - shf.l.wrap.b32 %r6210, %r6209, %r6208, 8; - shf.l.wrap.b32 %r6211, %r6208, %r6209, 8; - mov.b64 %rd393, {%r6211, %r6210}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6212}, %rd368; - } - { - .reg .b32 %dummy; - mov.b64 {%r6213,%dummy}, %rd368; - } - shf.l.wrap.b32 %r6214, %r6213, %r6212, 25; - shf.l.wrap.b32 %r6215, %r6212, %r6213, 25; - mov.b64 %rd394, {%r6215, %r6214}; - { - .reg .b32 %dummy; - mov.b64 {%r6216,%dummy}, %rd361; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6217}, %rd361; - } - shf.r.wrap.b32 %r6218, %r6217, 
%r6216, 21; - shf.r.wrap.b32 %r6219, %r6216, %r6217, 21; - mov.b64 %rd395, {%r6219, %r6218}; - { - .reg .b32 %dummy; - mov.b64 {%r6220,%dummy}, %rd359; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6221}, %rd359; - } - shf.r.wrap.b32 %r6222, %r6221, %r6220, 2; - shf.r.wrap.b32 %r6223, %r6220, %r6221, 2; - mov.b64 %rd396, {%r6223, %r6222}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6224}, %rd349; - } - { - .reg .b32 %dummy; - mov.b64 {%r6225,%dummy}, %rd349; - } - shf.l.wrap.b32 %r6226, %r6225, %r6224, 18; - shf.l.wrap.b32 %r6227, %r6224, %r6225, 18; - mov.b64 %rd397, {%r6227, %r6226}; - { - .reg .b32 %dummy; - mov.b64 {%r6228,%dummy}, %rd375; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6229}, %rd375; - } - shf.r.wrap.b32 %r6230, %r6229, %r6228, 25; - shf.r.wrap.b32 %r6231, %r6228, %r6229, 25; - mov.b64 %rd398, {%r6231, %r6230}; - { - .reg .b32 %dummy; - mov.b64 {%r6232,%dummy}, %rd363; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6233}, %rd363; - } - shf.r.wrap.b32 %r6234, %r6233, %r6232, 3; - shf.r.wrap.b32 %r6235, %r6232, %r6233, 3; - mov.b64 %rd399, {%r6235, %r6234}; - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6236}, %rd374; - } - { - .reg .b32 %dummy; - mov.b64 {%r6237,%dummy}, %rd374; - } - shf.l.wrap.b32 %r6238, %r6237, %r6236, 20; - shf.l.wrap.b32 %r6239, %r6236, %r6237, 20; - mov.b64 %rd400, {%r6239, %r6238}; - { - .reg .b32 %dummy; - mov.b64 {%r6240,%dummy}, %rd353; - } - { - .reg .b32 %dummy; - mov.b64 {%dummy,%r6241}, %rd353; - } - shf.r.wrap.b32 %r6242, %r6241, %r6240, 20; - shf.r.wrap.b32 %r6243, %r6240, %r6241, 20; - mov.b64 %rd401, {%r6243, %r6242}; - not.b64 %rd402, %rd401; - and.b64 %rd403, %rd395, %rd402; - xor.b64 %rd404, %rd403, %rd345; - not.b64 %rd405, %rd395; - and.b64 %rd406, %rd383, %rd405; - xor.b64 %rd484, %rd406, %rd401; - not.b64 %rd407, %rd383; - and.b64 %rd408, %rd389, %rd407; - xor.b64 %rd479, %rd408, %rd395; - not.b64 %rd409, %rd389; - and.b64 %rd410, %rd345, %rd409; - xor.b64 %rd474, %rd410, %rd383; - not.b64 %rd411, %rd345; - and.b64 %rd412, %rd401, %rd411; - xor.b64 %rd469, %rd389, %rd412; - not.b64 %rd413, %rd400; - and.b64 %rd414, %rd379, %rd413; - xor.b64 %rd488, %rd414, %rd384; - not.b64 %rd415, %rd379; - and.b64 %rd416, %rd386, %rd415; - xor.b64 %rd483, %rd416, %rd400; - not.b64 %rd417, %rd386; - and.b64 %rd418, %rd399, %rd417; - xor.b64 %rd478, %rd418, %rd379; - not.b64 %rd419, %rd399; - and.b64 %rd420, %rd384, %rd419; - xor.b64 %rd473, %rd420, %rd386; - not.b64 %rd421, %rd384; - and.b64 %rd422, %rd400, %rd421; - xor.b64 %rd468, %rd399, %rd422; - not.b64 %rd423, %rd380; - and.b64 %rd424, %rd394, %rd423; - xor.b64 %rd487, %rd424, %rd378; - not.b64 %rd425, %rd394; - and.b64 %rd426, %rd393, %rd425; - xor.b64 %rd482, %rd426, %rd380; - not.b64 %rd427, %rd393; - and.b64 %rd428, %rd397, %rd427; - xor.b64 %rd477, %rd428, %rd394; - not.b64 %rd429, %rd397; - and.b64 %rd430, %rd378, %rd429; - xor.b64 %rd472, %rd430, %rd393; - not.b64 %rd431, %rd378; - and.b64 %rd432, %rd380, %rd431; - xor.b64 %rd467, %rd397, %rd432; - not.b64 %rd433, %rd385; - and.b64 %rd434, %rd381, %rd433; - xor.b64 %rd486, %rd434, %rd390; - not.b64 %rd435, %rd381; - and.b64 %rd436, %rd382, %rd435; - xor.b64 %rd481, %rd436, %rd385; - not.b64 %rd437, %rd382; - and.b64 %rd438, %rd392, %rd437; - xor.b64 %rd476, %rd438, %rd381; - not.b64 %rd439, %rd392; - and.b64 %rd440, %rd390, %rd439; - xor.b64 %rd471, %rd440, %rd382; - not.b64 %rd441, %rd390; - and.b64 %rd442, %rd385, %rd441; - xor.b64 %rd466, %rd392, %rd442; - not.b64 %rd443, %rd387; - and.b64 %rd444, %rd398, %rd443; - xor.b64 
%rd485, %rd444, %rd396; - not.b64 %rd445, %rd398; - and.b64 %rd446, %rd391, %rd445; - xor.b64 %rd480, %rd446, %rd387; - not.b64 %rd447, %rd391; - and.b64 %rd448, %rd388, %rd447; - xor.b64 %rd475, %rd448, %rd398; - not.b64 %rd449, %rd388; - and.b64 %rd450, %rd396, %rd449; - xor.b64 %rd470, %rd450, %rd391; - not.b64 %rd451, %rd396; - and.b64 %rd452, %rd387, %rd451; - xor.b64 %rd465, %rd388, %rd452; - ld.global.nc.u64 %rd453, [%rd464]; - xor.b64 %rd73, %rd404, %rd453; - add.s64 %rd464, %rd464, 8; - add.s32 %r6244, %r6244, 1; - setp.ne.s32 %p11, %r6244, 24; - @%p11 bra $L__BB0_9; - - ld.const.u64 %rd75, [target+24]; - setp.eq.s64 %p12, %rd474, %rd75; - @%p12 bra $L__BB0_12; - bra.uni $L__BB0_11; + // begin inline asm + // chi + lop3.b32 %r9999, %r9919, %r9943, %r9975, 0xD2; + lop3.b32 %r10000, %r9923, %r9947, %r9979, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+32], {%r9999, %r10000}; + // begin inline asm + // chi + lop3.b32 %r10007, %r9943, %r9975, %r9951, 0xD2; + lop3.b32 %r10008, %r9947, %r9979, %r9955, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+40], {%r10007, %r10008}; + // begin inline asm + // chi + lop3.b32 %r10015, %r9975, %r9951, %r10026, 0xD2; + lop3.b32 %r10016, %r9979, %r9955, %r10029, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+48], {%r10015, %r10016}; + // begin inline asm + // chi + lop3.b32 %r10023, %r9951, %r10026, %r9919, 0xD2; + lop3.b32 %r10024, %r9955, %r10029, %r9923, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+56], {%r10023, %r10024}; + // begin inline asm + // chi + lop3.b32 %r10031, %r9967, %r9927, %r9983, 0xD2; + lop3.b32 %r10032, %r9971, %r9931, %r9987, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+64], {%r10031, %r10032}; + // begin inline asm + // chi + lop3.b32 %r10039, %r9927, %r9983, %r9959, 0xD2; + lop3.b32 %r10040, %r9931, %r9987, %r9963, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+72], {%r10039, %r10040}; + // begin inline asm + // chi + lop3.b32 %r10047, %r9983, %r9959, %r9935, 0xD2; + lop3.b32 %r10048, %r9987, %r9963, %r9939, 0xD2; + // end inline asm + st.local.v2.u32 [%rd55+80], {%r10047, %r10048}; + // begin inline asm + ld.global.nc.v2.u32 {%r10055,%r10056}, [%rd498]; + // end inline asm + xor.b32 %r10067, %r9992, %r10056; + xor.b32 %r10068, %r9991, %r10055; + st.local.v2.u32 [%rd55+24], {%r10068, %r10067}; + st.global.u64 [%rd36], %rd1261; + st.global.u64 [%rd36+8], %rd1262; + st.global.u64 [%rd36+16], %rd1263; + st.global.u64 [%rd36+24], %rd62; + st.global.u64 [%rd36+32], %rd1264; + st.global.u64 [%rd36+40], %rd64; + st.global.u64 [%rd36+48], %rd65; + st.global.u64 [%rd36+56], %rd66; + st.global.v2.u32 [%rd36+64], {%r10068, %r10067}; + st.global.v2.u32 [%rd36+72], {%r9999, %r10000}; + st.global.v2.u32 [%rd36+80], {%r10007, %r10008}; + st.global.v2.u32 [%rd36+88], {%r10015, %r10016}; + st.global.v2.u32 [%rd36+96], {%r10023, %r10024}; + st.global.v2.u32 [%rd36+104], {%r10031, %r10032}; + st.global.v2.u32 [%rd36+112], {%r10039, %r10040}; + st.global.v2.u32 [%rd36+120], {%r10047, %r10048}; + +$L__BB2_44: + shl.b32 %r1678, %r25, 1; + mul.wide.u32 %rd678, %r1678, -954391867; + shr.u64 %rd679, %rd678, 32; + cvt.u32.u64 %r13353, %rd679; + sub.s32 %r13354, %r1678, %r13353; + shr.u32 %r13355, %r13354, 1; + add.s32 %r13356, %r13355, %r13353; + shr.u32 %r13357, %r13356, 20; + mul.lo.s32 %r13358, %r13357, 1179641; + sub.s32 %r13359, %r1678, %r13358; + mul.wide.u32 %rd681, %r13359, 64; + add.s64 %rd128, %rd471, %rd681; + or.b32 %r1679, %r1678, 1; + mul.wide.u32 %rd682, %r1679, -954391867; + shr.u64 %rd683, %rd682, 32; + 
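Annotation: $L__BB2_44 opens the added lookup phase with a divide-free modulo — the magic constant -954391867 (0xC71D22C5) is a multiply-high reciprocal of 1179641, and the sub/shr/add/shr tail is the standard round-up fixup. A sketch with the constants copied from the PTX above; the indexed table's shape (1179641 items of 64 bytes) is inferred from the surrounding arithmetic, not confirmed by the source.

#include <cstdint>

// Reduce n modulo 1179641 without a divide, exactly as the mul.wide.u32 /
// shr / sub / add sequence above does. The magic pair (0xC71D22C5, shift 20
// after the halving step) satisfies the usual Granlund-Montgomery bound, so
// q equals n / 1179641 for every 32-bit n.
__device__ uint32_t mod_1179641(uint32_t n)
{
    uint32_t t = __umulhi(n, 0xC71D22C5u);    // -954391867 in the PTX
    uint32_t q = (t + ((n - t) >> 1)) >> 20;  // n / 1179641
    return n - q * 1179641u;                  // remainder, scaled by 64 to index the table
}

Both 2*%r25 and 2*%r25 + 1 go through this reduction, giving each thread two 64-byte entries (%rd128 and %rd129) out of the same table, as the code resuming below computes for the odd index.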
cvt.u32.u64 %r13360, %rd683; + sub.s32 %r13361, %r1679, %r13360; + shr.u32 %r13362, %r13361, 1; + add.s32 %r13363, %r13362, %r13360; + shr.u32 %r13364, %r13363, 20; + mul.lo.s32 %r13365, %r13364, 1179641; + sub.s32 %r13366, %r1679, %r13365; + mul.wide.u32 %rd684, %r13366, 64; + add.s64 %rd129, %rd471, %rd684; + @%p16 bra $L__BB2_58; + + cvta.to.global.u64 %rd685, %rd353; + mul.wide.u32 %rd686, %r25, 128; + add.s64 %rd130, %rd685, %rd686; + ld.global.u64 %rd1265, [%rd130]; + setp.eq.s64 %p29, %rd1265, 0; + @%p29 bra $L__BB2_47; + + ld.global.u64 %rd1268, [%rd130+32]; + ld.global.u64 %rd1267, [%rd130+16]; + ld.global.u64 %rd1266, [%rd130+8]; + bra.uni $L__BB2_69; + +$L__BB2_58: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd788, 1179641; + st.local.u64 [%rd3+8], %rd788; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd789, [%rd128]; + ld.global.u64 %rd790, [%rd128+8]; + ld.global.u64 %rd791, [%rd128+16]; + ld.global.u64 %rd792, [%rd128+24]; + ld.global.u64 %rd793, [%rd128+32]; + ld.global.u64 %rd794, [%rd128+40]; + ld.global.u64 %rd795, [%rd128+48]; + ld.global.u64 %rd796, [%rd128+56]; + st.local.u64 [%rd3+24], %rd789; + st.local.u64 [%rd3+32], %rd790; + st.local.u64 [%rd3+40], %rd791; + st.local.u64 [%rd3+48], %rd792; + st.local.u64 [%rd3+56], %rd793; + st.local.u64 [%rd3+64], %rd794; + st.local.u64 [%rd3+72], %rd795; + st.local.u64 [%rd3+80], %rd796; + cvt.u32.u64 %r16692, %rd789; + xor.b32 %r16693, %r1678, %r16692; + st.local.u32 [%rd3+24], %r16693; + mov.u32 %r30250, 0; + st.local.v2.u32 [%rd3+96], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+104], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+112], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+120], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+128], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+136], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+144], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+152], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+160], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+168], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+176], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+184], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+192], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+200], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+208], {%r30250, %r30250}; + st.local.v2.u32 [%rd3+216], {%r30250, %r30250}; + mov.u32 %r30265, -2147483648; + mov.u32 %r16665, 1; + st.local.v2.u32 [%rd3+88], {%r16665, %r30265}; + ld.local.v2.u32 {%r30286, %r30287}, [%rd3+24]; + mov.b64 {%r30284, %r30285}, %rd794; + shr.u64 %rd797, %rd790, 32; + cvt.u32.u64 %r30298, %rd790; + cvt.u32.u64 %r30299, %rd797; + shr.u64 %rd798, %rd795, 32; + cvt.u32.u64 %r30296, %rd795; + cvt.u32.u64 %r30297, %rd798; + shr.u64 %rd799, %rd791, 32; + cvt.u32.u64 %r30294, %rd791; + cvt.u32.u64 %r30295, %rd799; + shr.u64 %rd800, %rd796, 32; + cvt.u32.u64 %r30292, %rd796; + cvt.u32.u64 %r30293, %rd800; + shr.u64 %rd801, %rd792, 32; + cvt.u32.u64 %r30290, %rd792; + cvt.u32.u64 %r30291, %rd801; + shr.u64 %rd802, %rd793, 32; + cvt.u32.u64 %r30288, %rd793; + cvt.u32.u64 %r30289, %rd802; + mov.u32 %r30251, %r30250; + mov.u32 %r30252, %r30250; + mov.u32 %r30253, %r30250; + mov.u32 %r30254, %r30250; + mov.u32 %r30255, %r30250; + mov.u32 %r30256, %r30250; + mov.u32 %r30257, %r30250; + mov.u32 %r30258, %r30250; + mov.u32 %r30259, %r30250; + mov.u32 %r30260, %r30250; + mov.u32 %r30261, %r30250; + mov.u32 %r30262, %r30250; + mov.u32 %r30263, %r30250; + mov.u32 %r30264, %r16665; + mov.u32 %r30266, %r30250; + mov.u32 %r30267, %r30250; + mov.u32 %r30268, %r30250; + mov.u32 %r30269, %r30250; + mov.u32 
%r30270, %r30250; + mov.u32 %r30271, %r30250; + mov.u32 %r30272, %r30250; + mov.u32 %r30273, %r30250; + mov.u32 %r30274, %r30250; + mov.u32 %r30275, %r30250; + mov.u32 %r30276, %r30250; + mov.u32 %r30277, %r30250; + mov.u32 %r30278, %r30250; + mov.u32 %r30279, %r30250; + mov.u32 %r30280, %r30250; + mov.u32 %r30281, %r30250; + mov.u32 %r30282, %r30250; + mov.u32 %r30283, %r30250; + mov.u32 %r30300, %r30250; + +$L__BB2_59: + // begin inline asm + // xor5 + lop3.b32 %r16696, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r16696, %r16696, %r30280, %r30278, 0x96; + lop3.b32 %r16697, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r16697, %r16697, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16708, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r16708, %r16708, %r30274, %r30272, 0x96; + lop3.b32 %r16709, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r16709, %r16709, %r30275, %r30273, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16720, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r16720, %r16720, %r30268, %r30266, 0x96; + lop3.b32 %r16721, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r16721, %r16721, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16732, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r16732, %r16732, %r30260, %r30258, 0x96; + lop3.b32 %r16733, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r16733, %r16733, %r30261, %r30259, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16744, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r16744, %r16744, %r30252, %r30250, 0x96; + lop3.b32 %r16745, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r16745, %r16745, %r30253, %r30251, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16756, %r16709, %r16708, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16760, %r16708, %r16709, %r16665; + // end inline asm + xor.b32 %r17190, %r16756, %r16744; + xor.b32 %r17191, %r16760, %r16745; + xor.b32 %r17023, %r30286, %r17190; + xor.b32 %r17026, %r30287, %r17191; + xor.b32 %r16930, %r30284, %r17190; + xor.b32 %r16929, %r30285, %r17191; + xor.b32 %r16977, %r30282, %r17190; + xor.b32 %r16978, %r30283, %r17191; + xor.b32 %r16882, %r30280, %r17190; + xor.b32 %r16881, %r30281, %r17191; + xor.b32 %r16833, %r30278, %r17190; + xor.b32 %r16834, %r30279, %r17191; + // begin inline asm + shf.l.wrap.b32 %r16764, %r16721, %r16720, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16768, %r16720, %r16721, %r16665; + // end inline asm + xor.b32 %r17192, %r16764, %r16696; + xor.b32 %r17193, %r16768, %r16697; + xor.b32 %r16985, %r30298, %r17192; + xor.b32 %r16986, %r30299, %r17193; + xor.b32 %r16802, %r30296, %r17192; + xor.b32 %r16801, %r30297, %r17193; + xor.b32 %r16961, %r30276, %r17192; + xor.b32 %r16962, %r30277, %r17193; + xor.b32 %r16922, %r30274, %r17192; + xor.b32 %r16921, %r30275, %r17193; + xor.b32 %r16905, %r30272, %r17192; + xor.b32 %r16906, %r30273, %r17193; + // begin inline asm + shf.l.wrap.b32 %r16772, %r16733, %r16732, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16776, %r16732, %r16733, %r16665; + // end inline asm + xor.b32 %r17194, %r16772, %r16708; + xor.b32 %r17195, %r16776, %r16709; + xor.b32 %r16842, %r30294, %r17194; + xor.b32 %r16841, %r30295, %r17195; + xor.b32 %r16969, %r30292, %r17194; + xor.b32 %r16970, %r30293, %r17195; + xor.b32 %r16850, %r30270, %r17194; + xor.b32 %r16849, %r30271, %r17195; + xor.b32 %r16953, %r30268, %r17194; + xor.b32 %r16954, 
%r30269, %r17195; + xor.b32 %r16818, %r30266, %r17194; + xor.b32 %r16817, %r30267, %r17195; + // begin inline asm + shf.l.wrap.b32 %r16780, %r16745, %r16744, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16784, %r16744, %r16745, %r16665; + // end inline asm + xor.b32 %r17196, %r16780, %r16720; + xor.b32 %r17197, %r16784, %r16721; + xor.b32 %r16937, %r30290, %r17196; + xor.b32 %r16938, %r30291, %r17197; + xor.b32 %r16914, %r30264, %r17196; + xor.b32 %r16913, %r30265, %r17197; + xor.b32 %r16857, %r30262, %r17196; + xor.b32 %r16858, %r30263, %r17197; + xor.b32 %r16945, %r30260, %r17196; + xor.b32 %r16946, %r30261, %r17197; + xor.b32 %r16874, %r30258, %r17196; + xor.b32 %r16873, %r30259, %r17197; + // begin inline asm + shf.l.wrap.b32 %r16788, %r16697, %r16696, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16792, %r16696, %r16697, %r16665; + // end inline asm + xor.b32 %r17198, %r16788, %r16732; + xor.b32 %r17199, %r16792, %r16733; + xor.b32 %r16889, %r30288, %r17198; + xor.b32 %r16890, %r30289, %r17199; + xor.b32 %r16809, %r30256, %r17198; + xor.b32 %r16810, %r30257, %r17199; + xor.b32 %r16826, %r30254, %r17198; + xor.b32 %r16825, %r30255, %r17199; + xor.b32 %r16865, %r30252, %r17198; + xor.b32 %r16866, %r30253, %r17199; + xor.b32 %r16897, %r30250, %r17198; + xor.b32 %r16898, %r30251, %r17199; + mov.u32 %r16803, 44; + // begin inline asm + shf.l.wrap.b32 %r16796, %r16802, %r16801, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16800, %r16801, %r16802, %r16803; + // end inline asm + mov.u32 %r16811, 20; + // begin inline asm + shf.l.wrap.b32 %r16804, %r16810, %r16809, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16808, %r16809, %r16810, %r16811; + // end inline asm + mov.u32 %r16819, 61; + // begin inline asm + shf.l.wrap.b32 %r16812, %r16818, %r16817, %r16819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16816, %r16817, %r16818, %r16819; + // end inline asm + mov.u32 %r16827, 39; + // begin inline asm + shf.l.wrap.b32 %r16820, %r16826, %r16825, %r16827; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16824, %r16825, %r16826, %r16827; + // end inline asm + mov.u32 %r16835, 18; + // begin inline asm + shf.l.wrap.b32 %r16828, %r16834, %r16833, %r16835; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16832, %r16833, %r16834, %r16835; + // end inline asm + mov.u32 %r16843, 62; + // begin inline asm + shf.l.wrap.b32 %r16836, %r16842, %r16841, %r16843; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16840, %r16841, %r16842, %r16843; + // end inline asm + mov.u32 %r16851, 43; + // begin inline asm + shf.l.wrap.b32 %r16844, %r16850, %r16849, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16848, %r16849, %r16850, %r16851; + // end inline asm + mov.u32 %r16859, 25; + // begin inline asm + shf.l.wrap.b32 %r16852, %r16858, %r16857, %r16859; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16856, %r16857, %r16858, %r16859; + // end inline asm + mov.u32 %r16867, 8; + // begin inline asm + shf.l.wrap.b32 %r16860, %r16866, %r16865, %r16867; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16864, %r16865, %r16866, %r16867; + // end inline asm + mov.u32 %r16875, 56; + // begin inline asm + shf.l.wrap.b32 %r16868, %r16874, %r16873, %r16875; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16872, %r16873, %r16874, %r16875; + // end inline asm + mov.u32 %r16883, 41; + // begin inline asm + shf.l.wrap.b32 
%r16876, %r16882, %r16881, %r16883; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16880, %r16881, %r16882, %r16883; + // end inline asm + mov.u32 %r16891, 27; + // begin inline asm + shf.l.wrap.b32 %r16884, %r16890, %r16889, %r16891; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16888, %r16889, %r16890, %r16891; + // end inline asm + mov.u32 %r16899, 14; + // begin inline asm + shf.l.wrap.b32 %r16892, %r16898, %r16897, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16896, %r16897, %r16898, %r16899; + // end inline asm + mov.u32 %r16907, 2; + // begin inline asm + shf.l.wrap.b32 %r16900, %r16906, %r16905, %r16907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16904, %r16905, %r16906, %r16907; + // end inline asm + mov.u32 %r16915, 55; + // begin inline asm + shf.l.wrap.b32 %r16908, %r16914, %r16913, %r16915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16912, %r16913, %r16914, %r16915; + // end inline asm + mov.u32 %r16923, 45; + // begin inline asm + shf.l.wrap.b32 %r16916, %r16922, %r16921, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16920, %r16921, %r16922, %r16923; + // end inline asm + mov.u32 %r16931, 36; + // begin inline asm + shf.l.wrap.b32 %r16924, %r16930, %r16929, %r16931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16928, %r16929, %r16930, %r16931; + // end inline asm + mov.u32 %r16939, 28; + // begin inline asm + shf.l.wrap.b32 %r16932, %r16938, %r16937, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16936, %r16937, %r16938, %r16939; + // end inline asm + mov.u32 %r16947, 21; + // begin inline asm + shf.l.wrap.b32 %r16940, %r16946, %r16945, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16944, %r16945, %r16946, %r16947; + // end inline asm + mov.u32 %r16955, 15; + // begin inline asm + shf.l.wrap.b32 %r16948, %r16954, %r16953, %r16955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16952, %r16953, %r16954, %r16955; + // end inline asm + mov.u32 %r16963, 10; + // begin inline asm + shf.l.wrap.b32 %r16956, %r16962, %r16961, %r16963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16960, %r16961, %r16962, %r16963; + // end inline asm + mov.u32 %r16971, 6; + // begin inline asm + shf.l.wrap.b32 %r16964, %r16970, %r16969, %r16971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16968, %r16969, %r16970, %r16971; + // end inline asm + mov.u32 %r16979, 3; + // begin inline asm + shf.l.wrap.b32 %r16972, %r16978, %r16977, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16976, %r16977, %r16978, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16980, %r16986, %r16985, %r16665; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16984, %r16985, %r16986, %r16665; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16988, %r17023, %r16796, %r16844, 0xD2; + lop3.b32 %r16989, %r17026, %r16800, %r16848, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30298, %r16796, %r16844, %r16940, 0xD2; + lop3.b32 %r30299, %r16800, %r16848, %r16944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30294, %r16844, %r16940, %r16892, 0xD2; + lop3.b32 %r30295, %r16848, %r16944, %r16896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30290, %r16940, %r16892, %r17023, 0xD2; + lop3.b32 %r30291, %r16944, %r16896, %r17026, 0xD2; + // end inline asm + // begin inline asm + // chi + 
lop3.b32 %r30288, %r16892, %r17023, %r16796, 0xD2; + lop3.b32 %r30289, %r16896, %r17026, %r16800, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30284, %r16932, %r16804, %r16972, 0xD2; + lop3.b32 %r30285, %r16936, %r16808, %r16976, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30296, %r16804, %r16972, %r16916, 0xD2; + lop3.b32 %r30297, %r16808, %r16976, %r16920, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30292, %r16972, %r16916, %r16812, 0xD2; + lop3.b32 %r30293, %r16976, %r16920, %r16816, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30264, %r16916, %r16812, %r16932, 0xD2; + lop3.b32 %r30265, %r16920, %r16816, %r16936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30264, %r30265}; + // begin inline asm + // chi + lop3.b32 %r30256, %r16812, %r16932, %r16804, 0xD2; + lop3.b32 %r30257, %r16816, %r16936, %r16808, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30256, %r30257}; + // begin inline asm + // chi + lop3.b32 %r30282, %r16980, %r16964, %r16852, 0xD2; + lop3.b32 %r30283, %r16984, %r16968, %r16856, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30282, %r30283}; + // begin inline asm + // chi + lop3.b32 %r30276, %r16964, %r16852, %r16860, 0xD2; + lop3.b32 %r30277, %r16968, %r16856, %r16864, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30276, %r30277}; + // begin inline asm + // chi + lop3.b32 %r30270, %r16852, %r16860, %r16828, 0xD2; + lop3.b32 %r30271, %r16856, %r16864, %r16832, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30270, %r30271}; + // begin inline asm + // chi + lop3.b32 %r30262, %r16860, %r16828, %r16980, 0xD2; + lop3.b32 %r30263, %r16864, %r16832, %r16984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30262, %r30263}; + // begin inline asm + // chi + lop3.b32 %r30254, %r16828, %r16980, %r16964, 0xD2; + lop3.b32 %r30255, %r16832, %r16984, %r16968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30254, %r30255}; + // begin inline asm + // chi + lop3.b32 %r30280, %r16884, %r16924, %r16956, 0xD2; + lop3.b32 %r30281, %r16888, %r16928, %r16960, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30280, %r30281}; + // begin inline asm + // chi + lop3.b32 %r30274, %r16924, %r16956, %r16948, 0xD2; + lop3.b32 %r30275, %r16928, %r16960, %r16952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30274, %r30275}; + // begin inline asm + // chi + lop3.b32 %r30268, %r16956, %r16948, %r16868, 0xD2; + lop3.b32 %r30269, %r16960, %r16952, %r16872, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30268, %r30269}; + // begin inline asm + // chi + lop3.b32 %r30260, %r16948, %r16868, %r16884, 0xD2; + lop3.b32 %r30261, %r16952, %r16872, %r16888, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30260, %r30261}; + // begin inline asm + // chi + lop3.b32 %r30252, %r16868, %r16884, %r16924, 0xD2; + lop3.b32 %r30253, %r16872, %r16888, %r16928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30252, %r30253}; + // begin inline asm + // chi + lop3.b32 %r30278, %r16836, %r16908, %r16820, 0xD2; + lop3.b32 %r30279, %r16840, %r16912, %r16824, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30278, %r30279}; + // begin inline asm + // chi + lop3.b32 %r30272, %r16908, %r16820, %r16876, 0xD2; + lop3.b32 %r30273, %r16912, %r16824, %r16880, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30272, %r30273}; + // begin inline asm + // chi + lop3.b32 %r30266, %r16820, 
%r16876, %r16900, 0xD2; + lop3.b32 %r30267, %r16824, %r16880, %r16904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30266, %r30267}; + // begin inline asm + // chi + lop3.b32 %r30258, %r16876, %r16900, %r16836, 0xD2; + lop3.b32 %r30259, %r16880, %r16904, %r16840, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30258, %r30259}; + // begin inline asm + // chi + lop3.b32 %r30250, %r16900, %r16836, %r16908, 0xD2; + lop3.b32 %r30251, %r16904, %r16840, %r16912, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30250, %r30251}; + mul.wide.s32 %rd804, %r30300, 8; + mov.u64 %rd805, keccak_round_constants; + cvta.const.u64 %rd806, %rd805; + add.s64 %rd803, %rd806, %rd804; + // begin inline asm + ld.global.nc.v2.u32 {%r17188,%r17189}, [%rd803]; + // end inline asm + xor.b32 %r30286, %r16988, %r17188; + xor.b32 %r30287, %r16989, %r17189; + add.s32 %r30300, %r30300, 1; + setp.lt.u32 %p35, %r30300, 23; + @%p35 bra $L__BB2_59; + + add.u64 %rd178, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30298, %r30299}; + st.local.v2.u32 [%rd3+72], {%r30296, %r30297}; + st.local.v2.u32 [%rd3+40], {%r30294, %r30295}; + st.local.v2.u32 [%rd3+80], {%r30292, %r30293}; + st.local.v2.u32 [%rd3+48], {%r30290, %r30291}; + st.local.v2.u32 [%rd3+56], {%r30288, %r30289}; + st.local.v2.u32 [%rd3+24], {%r30286, %r30287}; + // begin inline asm + // xor5 + lop3.b32 %r17200, %r30286, %r30284, %r30282, 0x96; + lop3.b32 %r17200, %r17200, %r30280, %r30278, 0x96; + lop3.b32 %r17201, %r30287, %r30285, %r30283, 0x96; + lop3.b32 %r17201, %r17201, %r30281, %r30279, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17212, %r30298, %r30296, %r30276, 0x96; + lop3.b32 %r17212, %r17212, %r30274, %r30272, 0x96; + lop3.b32 %r17213, %r30299, %r30297, %r30277, 0x96; + lop3.b32 %r17213, %r17213, %r30275, %r30273, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17224, %r30294, %r30292, %r30270, 0x96; + lop3.b32 %r17224, %r17224, %r30268, %r30266, 0x96; + lop3.b32 %r17225, %r30295, %r30293, %r30271, 0x96; + lop3.b32 %r17225, %r17225, %r30269, %r30267, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17236, %r30290, %r30264, %r30262, 0x96; + lop3.b32 %r17236, %r17236, %r30260, %r30258, 0x96; + lop3.b32 %r17237, %r30291, %r30265, %r30263, 0x96; + lop3.b32 %r17237, %r17237, %r30261, %r30259, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17248, %r30288, %r30256, %r30254, 0x96; + lop3.b32 %r17248, %r17248, %r30252, %r30250, 0x96; + lop3.b32 %r17249, %r30289, %r30257, %r30255, 0x96; + lop3.b32 %r17249, %r17249, %r30253, %r30251, 0x96; + // end inline asm + mov.u32 %r17452, 1; + // begin inline asm + shf.l.wrap.b32 %r17260, %r17213, %r17212, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17264, %r17212, %r17213, %r17452; + // end inline asm + xor.b32 %r17479, %r17260, %r17248; + xor.b32 %r17480, %r17264, %r17249; + xor.b32 %r17407, %r30286, %r17479; + xor.b32 %r17410, %r30287, %r17480; + xor.b32 %r17370, %r30283, %r17480; + xor.b32 %r17369, %r30282, %r17479; + st.local.v2.u32 [%rd3+104], {%r17369, %r17370}; + // begin inline asm + shf.l.wrap.b32 %r17268, %r17225, %r17224, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17272, %r17224, %r17225, %r17452; + // end inline asm + xor.b32 %r17481, %r17268, %r17200; + xor.b32 %r17482, %r17272, %r17201; + xor.b32 %r17306, %r30296, %r17481; + xor.b32 %r17305, %r30297, %r17482; + xor.b32 %r17345, %r30275, %r17482; + xor.b32 %r17346, %r30274, %r17481; + 
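+ // note: the @%p35 branch above closes the round loop once the counter reaches 23; what + // follows is the final round unrolled straight-line, with its iota constant loaded below + // from keccak_round_constants+184, i.e. 8 bytes * round index 23.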
st.local.v2.u32 [%rd3+152], {%r17346, %r17345}; + // begin inline asm + shf.l.wrap.b32 %r17276, %r17237, %r17236, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17280, %r17236, %r17237, %r17452; + // end inline asm + xor.b32 %r17483, %r17276, %r17212; + xor.b32 %r17484, %r17280, %r17213; + xor.b32 %r17329, %r30271, %r17484; + xor.b32 %r17330, %r30270, %r17483; + st.local.v2.u32 [%rd3+120], {%r17330, %r17329}; + xor.b32 %r17321, %r30267, %r17484; + xor.b32 %r17322, %r30266, %r17483; + st.local.v2.u32 [%rd3+200], {%r17322, %r17321}; + // begin inline asm + shf.l.wrap.b32 %r17284, %r17249, %r17248, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17288, %r17248, %r17249, %r17452; + // end inline asm + xor.b32 %r17485, %r17284, %r17224; + xor.b32 %r17486, %r17288, %r17225; + xor.b32 %r17353, %r30290, %r17485; + xor.b32 %r17354, %r30291, %r17486; + xor.b32 %r17362, %r30261, %r17486; + xor.b32 %r17361, %r30260, %r17485; + st.local.v2.u32 [%rd3+168], {%r17361, %r17362}; + // begin inline asm + shf.l.wrap.b32 %r17292, %r17201, %r17200, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17296, %r17200, %r17201, %r17452; + // end inline asm + xor.b32 %r17487, %r17292, %r17236; + xor.b32 %r17488, %r17296, %r17237; + xor.b32 %r17313, %r30256, %r17487; + xor.b32 %r17314, %r30257, %r17488; + xor.b32 %r17338, %r30251, %r17488; + xor.b32 %r17337, %r30250, %r17487; + st.local.v2.u32 [%rd3+216], {%r17337, %r17338}; + // begin inline asm + shf.l.wrap.b32 %r17300, %r17306, %r17305, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17304, %r17305, %r17306, %r16803; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17308, %r17314, %r17313, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17312, %r17313, %r17314, %r16811; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17320, %r17321, %r17322, %r16819; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17316, %r17322, %r17321, %r16819; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r17316, %r17320}; + // begin inline asm + shf.l.wrap.b32 %r17324, %r17330, %r17329, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17328, %r17329, %r17330, %r16851; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17332, %r17338, %r17337, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17336, %r17337, %r17338, %r16899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17344, %r17345, %r17346, %r16923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17340, %r17346, %r17345, %r16923; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r17340, %r17344}; + // begin inline asm + shf.l.wrap.b32 %r17348, %r17354, %r17353, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17352, %r17353, %r17354, %r16939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17356, %r17362, %r17361, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17360, %r17361, %r17362, %r16947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17364, %r17370, %r17369, %r16979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17368, %r17369, %r17370, %r16979; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17372, %r17407, %r17300, %r17324, 0xD2; + lop3.b32 %r17373, %r17410, %r17304, %r17328, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r17300, %r17324, %r17356, 0xD2; + lop3.b32 %r30434, 
%r17304, %r17328, %r17360, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + // begin inline asm + // chi + lop3.b32 %r30429, %r17324, %r17356, %r17332, 0xD2; + lop3.b32 %r30430, %r17328, %r17360, %r17336, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + // begin inline asm + // chi + lop3.b32 %r30425, %r17356, %r17332, %r17407, 0xD2; + lop3.b32 %r30426, %r17360, %r17336, %r17410, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + // begin inline asm + // chi + lop3.b32 %r30423, %r17332, %r17407, %r17300, 0xD2; + lop3.b32 %r30424, %r17336, %r17410, %r17304, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + // begin inline asm + // chi + lop3.b32 %r30419, %r17348, %r17308, %r17364, 0xD2; + lop3.b32 %r30420, %r17352, %r17312, %r17368, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + // begin inline asm + // chi + lop3.b32 %r30431, %r17308, %r17364, %r17340, 0xD2; + lop3.b32 %r30432, %r17312, %r17368, %r17344, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + // begin inline asm + // chi + lop3.b32 %r30427, %r17364, %r17340, %r17316, 0xD2; + lop3.b32 %r30428, %r17368, %r17344, %r17320, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + add.s64 %rd807, %rd806, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r17436,%r17437}, [%rd807]; + // end inline asm + xor.b32 %r30421, %r17372, %r17436; + xor.b32 %r30422, %r17373, %r17437; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.u64 [%rd178], %rd354; + mov.u64 %rd811, 1179641; + st.local.u64 [%rd178+8], %rd811; + st.local.u32 [%rd178+16], %r1679; + ld.global.u64 %rd812, [%rd129]; + ld.global.u64 %rd813, [%rd129+8]; + ld.global.u64 %rd814, [%rd129+16]; + ld.global.u64 %rd815, [%rd129+24]; + ld.global.u64 %rd816, [%rd129+32]; + ld.global.u64 %rd817, [%rd129+40]; + ld.global.u64 %rd818, [%rd129+48]; + ld.global.u64 %rd819, [%rd129+56]; + st.local.u64 [%rd178+32], %rd813; + st.local.u64 [%rd178+40], %rd814; + st.local.u64 [%rd178+48], %rd815; + st.local.u64 [%rd178+56], %rd816; + st.local.u64 [%rd178+64], %rd817; + st.local.u64 [%rd178+72], %rd818; + st.local.u64 [%rd178+80], %rd819; + cvt.u32.u64 %r17489, %rd812; + xor.b32 %r17490, %r1679, %r17489; + st.local.u64 [%rd178+24], %rd812; + st.local.u32 [%rd178+24], %r17490; + mov.u32 %r30301, 0; + st.local.v2.u32 [%rd178+96], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+104], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+112], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+120], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+128], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+136], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+144], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+152], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+160], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+168], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+176], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+184], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+192], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+200], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+208], {%r30301, %r30301}; + st.local.v2.u32 [%rd178+216], {%r30301, %r30301}; + mov.u32 %r30316, -2147483648; + st.local.v2.u32 [%rd178+88], {%r17452, %r30316}; + ld.local.v2.u32 {%r30337, %r30338}, [%rd178+24]; + mov.b64 {%r30335, %r30336}, %rd817; + shr.u64 %rd820, %rd813, 32; + cvt.u32.u64 %r30349, %rd813; + cvt.u32.u64 %r30350, %rd820; + shr.u64 %rd821, %rd818, 32; + 
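+ // note: the 64-bit lanes absorbed from [%rd129] are being split into 32-bit halves here + // (the shr/cvt pairs), since the round function is built from 32-bit funnel shifts and lop3; + // the earlier {1, 0x80000000} store at +88 appears to be the combined Keccak pad10*1 bytes + // (0x01 first, 0x80 last) dropped into the closing lane of the rate.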
cvt.u32.u64 %r30347, %rd818; + cvt.u32.u64 %r30348, %rd821; + shr.u64 %rd822, %rd814, 32; + cvt.u32.u64 %r30345, %rd814; + cvt.u32.u64 %r30346, %rd822; + shr.u64 %rd823, %rd819, 32; + cvt.u32.u64 %r30343, %rd819; + cvt.u32.u64 %r30344, %rd823; + shr.u64 %rd824, %rd815, 32; + cvt.u32.u64 %r30341, %rd815; + cvt.u32.u64 %r30342, %rd824; + shr.u64 %rd825, %rd816, 32; + cvt.u32.u64 %r30339, %rd816; + cvt.u32.u64 %r30340, %rd825; + mov.u32 %r30302, %r30301; + mov.u32 %r30303, %r30301; + mov.u32 %r30304, %r30301; + mov.u32 %r30305, %r30301; + mov.u32 %r30306, %r30301; + mov.u32 %r30307, %r30301; + mov.u32 %r30308, %r30301; + mov.u32 %r30309, %r30301; + mov.u32 %r30310, %r30301; + mov.u32 %r30311, %r30301; + mov.u32 %r30312, %r30301; + mov.u32 %r30313, %r30301; + mov.u32 %r30314, %r30301; + mov.u32 %r30315, %r17452; + mov.u32 %r30317, %r30301; + mov.u32 %r30318, %r30301; + mov.u32 %r30319, %r30301; + mov.u32 %r30320, %r30301; + mov.u32 %r30321, %r30301; + mov.u32 %r30322, %r30301; + mov.u32 %r30323, %r30301; + mov.u32 %r30324, %r30301; + mov.u32 %r30325, %r30301; + mov.u32 %r30326, %r30301; + mov.u32 %r30327, %r30301; + mov.u32 %r30328, %r30301; + mov.u32 %r30329, %r30301; + mov.u32 %r30330, %r30301; + mov.u32 %r30331, %r30301; + mov.u32 %r30332, %r30301; + mov.u32 %r30333, %r30301; + mov.u32 %r30334, %r30301; + mov.u32 %r30351, %r30301; + +$L__BB2_61: + // begin inline asm + // xor5 + lop3.b32 %r17493, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17493, %r17493, %r30331, %r30329, 0x96; + lop3.b32 %r17494, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17494, %r17494, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17505, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r17505, %r17505, %r30325, %r30323, 0x96; + lop3.b32 %r17506, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r17506, %r17506, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17517, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r17517, %r17517, %r30319, %r30317, 0x96; + lop3.b32 %r17518, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r17518, %r17518, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17529, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r17529, %r17529, %r30311, %r30309, 0x96; + lop3.b32 %r17530, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r17530, %r17530, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r17541, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r17541, %r17541, %r30303, %r30301, 0x96; + lop3.b32 %r17542, %r30340, %r30308, %r30306, 0x96; + lop3.b32 %r17542, %r17542, %r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17553, %r17506, %r17505, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17557, %r17505, %r17506, %r17452; + // end inline asm + xor.b32 %r17987, %r17553, %r17541; + xor.b32 %r17988, %r17557, %r17542; + xor.b32 %r17820, %r30337, %r17987; + xor.b32 %r17823, %r30338, %r17988; + xor.b32 %r17727, %r30335, %r17987; + xor.b32 %r17726, %r30336, %r17988; + xor.b32 %r17774, %r30333, %r17987; + xor.b32 %r17775, %r30334, %r17988; + xor.b32 %r17679, %r30331, %r17987; + xor.b32 %r17678, %r30332, %r17988; + xor.b32 %r17630, %r30329, %r17987; + xor.b32 %r17631, %r30330, %r17988; + // begin inline asm + shf.l.wrap.b32 %r17561, %r17518, %r17517, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17565, %r17517, %r17518, %r17452; + // end inline asm + xor.b32 %r17989, %r17561, %r17493; + 
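+ // note: $L__BB2_61 repeats the same keccak-f[1600] permutation on the second state at + // %rd178; the chained lop3 0x96 above are five-input XORs (theta column parities), and each + // shf.l.wrap pair below rotates one 64-bit lane through its two 32-bit halves.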
xor.b32 %r17990, %r17565, %r17494; + xor.b32 %r17782, %r30349, %r17989; + xor.b32 %r17783, %r30350, %r17990; + xor.b32 %r17599, %r30347, %r17989; + xor.b32 %r17598, %r30348, %r17990; + xor.b32 %r17758, %r30327, %r17989; + xor.b32 %r17759, %r30328, %r17990; + xor.b32 %r17719, %r30325, %r17989; + xor.b32 %r17718, %r30326, %r17990; + xor.b32 %r17702, %r30323, %r17989; + xor.b32 %r17703, %r30324, %r17990; + // begin inline asm + shf.l.wrap.b32 %r17569, %r17530, %r17529, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17573, %r17529, %r17530, %r17452; + // end inline asm + xor.b32 %r17991, %r17569, %r17505; + xor.b32 %r17992, %r17573, %r17506; + xor.b32 %r17639, %r30345, %r17991; + xor.b32 %r17638, %r30346, %r17992; + xor.b32 %r17766, %r30343, %r17991; + xor.b32 %r17767, %r30344, %r17992; + xor.b32 %r17647, %r30321, %r17991; + xor.b32 %r17646, %r30322, %r17992; + xor.b32 %r17750, %r30319, %r17991; + xor.b32 %r17751, %r30320, %r17992; + xor.b32 %r17615, %r30317, %r17991; + xor.b32 %r17614, %r30318, %r17992; + // begin inline asm + shf.l.wrap.b32 %r17577, %r17542, %r17541, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17581, %r17541, %r17542, %r17452; + // end inline asm + xor.b32 %r17993, %r17577, %r17517; + xor.b32 %r17994, %r17581, %r17518; + xor.b32 %r17734, %r30341, %r17993; + xor.b32 %r17735, %r30342, %r17994; + xor.b32 %r17711, %r30315, %r17993; + xor.b32 %r17710, %r30316, %r17994; + xor.b32 %r17654, %r30313, %r17993; + xor.b32 %r17655, %r30314, %r17994; + xor.b32 %r17742, %r30311, %r17993; + xor.b32 %r17743, %r30312, %r17994; + xor.b32 %r17671, %r30309, %r17993; + xor.b32 %r17670, %r30310, %r17994; + // begin inline asm + shf.l.wrap.b32 %r17585, %r17494, %r17493, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17589, %r17493, %r17494, %r17452; + // end inline asm + xor.b32 %r17995, %r17585, %r17529; + xor.b32 %r17996, %r17589, %r17530; + xor.b32 %r17686, %r30339, %r17995; + xor.b32 %r17687, %r30340, %r17996; + xor.b32 %r17606, %r30307, %r17995; + xor.b32 %r17607, %r30308, %r17996; + xor.b32 %r17623, %r30305, %r17995; + xor.b32 %r17622, %r30306, %r17996; + xor.b32 %r17662, %r30303, %r17995; + xor.b32 %r17663, %r30304, %r17996; + xor.b32 %r17694, %r30301, %r17995; + xor.b32 %r17695, %r30302, %r17996; + mov.u32 %r17600, 44; + // begin inline asm + shf.l.wrap.b32 %r17593, %r17599, %r17598, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17597, %r17598, %r17599, %r17600; + // end inline asm + mov.u32 %r17608, 20; + // begin inline asm + shf.l.wrap.b32 %r17601, %r17607, %r17606, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17605, %r17606, %r17607, %r17608; + // end inline asm + mov.u32 %r17616, 61; + // begin inline asm + shf.l.wrap.b32 %r17609, %r17615, %r17614, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17613, %r17614, %r17615, %r17616; + // end inline asm + mov.u32 %r17624, 39; + // begin inline asm + shf.l.wrap.b32 %r17617, %r17623, %r17622, %r17624; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17621, %r17622, %r17623, %r17624; + // end inline asm + mov.u32 %r17632, 18; + // begin inline asm + shf.l.wrap.b32 %r17625, %r17631, %r17630, %r17632; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17629, %r17630, %r17631, %r17632; + // end inline asm + mov.u32 %r17640, 62; + // begin inline asm + shf.l.wrap.b32 %r17633, %r17639, %r17638, %r17640; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17637, %r17638, 
%r17639, %r17640; + // end inline asm + mov.u32 %r17648, 43; + // begin inline asm + shf.l.wrap.b32 %r17641, %r17647, %r17646, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17645, %r17646, %r17647, %r17648; + // end inline asm + mov.u32 %r17656, 25; + // begin inline asm + shf.l.wrap.b32 %r17649, %r17655, %r17654, %r17656; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17653, %r17654, %r17655, %r17656; + // end inline asm + mov.u32 %r17664, 8; + // begin inline asm + shf.l.wrap.b32 %r17657, %r17663, %r17662, %r17664; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17661, %r17662, %r17663, %r17664; + // end inline asm + mov.u32 %r17672, 56; + // begin inline asm + shf.l.wrap.b32 %r17665, %r17671, %r17670, %r17672; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17669, %r17670, %r17671, %r17672; + // end inline asm + mov.u32 %r17680, 41; + // begin inline asm + shf.l.wrap.b32 %r17673, %r17679, %r17678, %r17680; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17677, %r17678, %r17679, %r17680; + // end inline asm + mov.u32 %r17688, 27; + // begin inline asm + shf.l.wrap.b32 %r17681, %r17687, %r17686, %r17688; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17685, %r17686, %r17687, %r17688; + // end inline asm + mov.u32 %r17696, 14; + // begin inline asm + shf.l.wrap.b32 %r17689, %r17695, %r17694, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17693, %r17694, %r17695, %r17696; + // end inline asm + mov.u32 %r17704, 2; + // begin inline asm + shf.l.wrap.b32 %r17697, %r17703, %r17702, %r17704; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17701, %r17702, %r17703, %r17704; + // end inline asm + mov.u32 %r17712, 55; + // begin inline asm + shf.l.wrap.b32 %r17705, %r17711, %r17710, %r17712; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17709, %r17710, %r17711, %r17712; + // end inline asm + mov.u32 %r17720, 45; + // begin inline asm + shf.l.wrap.b32 %r17713, %r17719, %r17718, %r17720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17717, %r17718, %r17719, %r17720; + // end inline asm + mov.u32 %r17728, 36; + // begin inline asm + shf.l.wrap.b32 %r17721, %r17727, %r17726, %r17728; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17725, %r17726, %r17727, %r17728; + // end inline asm + mov.u32 %r17736, 28; + // begin inline asm + shf.l.wrap.b32 %r17729, %r17735, %r17734, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17733, %r17734, %r17735, %r17736; + // end inline asm + mov.u32 %r17744, 21; + // begin inline asm + shf.l.wrap.b32 %r17737, %r17743, %r17742, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17741, %r17742, %r17743, %r17744; + // end inline asm + mov.u32 %r17752, 15; + // begin inline asm + shf.l.wrap.b32 %r17745, %r17751, %r17750, %r17752; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17749, %r17750, %r17751, %r17752; + // end inline asm + mov.u32 %r17760, 10; + // begin inline asm + shf.l.wrap.b32 %r17753, %r17759, %r17758, %r17760; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17757, %r17758, %r17759, %r17760; + // end inline asm + mov.u32 %r17768, 6; + // begin inline asm + shf.l.wrap.b32 %r17761, %r17767, %r17766, %r17768; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17765, %r17766, %r17767, %r17768; + // end inline asm + mov.u32 %r17776, 3; + // begin inline asm + shf.l.wrap.b32 %r17769, %r17775, %r17774, %r17776; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r17773, %r17774, %r17775, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17777, %r17783, %r17782, %r17452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r17781, %r17782, %r17783, %r17452; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r17785, %r17820, %r17593, %r17641, 0xD2; + lop3.b32 %r17786, %r17823, %r17597, %r17645, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30349, %r17593, %r17641, %r17737, 0xD2; + lop3.b32 %r30350, %r17597, %r17645, %r17741, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30345, %r17641, %r17737, %r17689, 0xD2; + lop3.b32 %r30346, %r17645, %r17741, %r17693, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30341, %r17737, %r17689, %r17820, 0xD2; + lop3.b32 %r30342, %r17741, %r17693, %r17823, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30339, %r17689, %r17820, %r17593, 0xD2; + lop3.b32 %r30340, %r17693, %r17823, %r17597, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30335, %r17729, %r17601, %r17769, 0xD2; + lop3.b32 %r30336, %r17733, %r17605, %r17773, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30347, %r17601, %r17769, %r17713, 0xD2; + lop3.b32 %r30348, %r17605, %r17773, %r17717, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30343, %r17769, %r17713, %r17609, 0xD2; + lop3.b32 %r30344, %r17773, %r17717, %r17613, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30315, %r17713, %r17609, %r17729, 0xD2; + lop3.b32 %r30316, %r17717, %r17613, %r17733, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30315, %r30316}; + // begin inline asm + // chi + lop3.b32 %r30307, %r17609, %r17729, %r17601, 0xD2; + lop3.b32 %r30308, %r17613, %r17733, %r17605, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30307, %r30308}; + // begin inline asm + // chi + lop3.b32 %r30333, %r17777, %r17761, %r17649, 0xD2; + lop3.b32 %r30334, %r17781, %r17765, %r17653, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30333, %r30334}; + // begin inline asm + // chi + lop3.b32 %r30327, %r17761, %r17649, %r17657, 0xD2; + lop3.b32 %r30328, %r17765, %r17653, %r17661, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30327, %r30328}; + // begin inline asm + // chi + lop3.b32 %r30321, %r17649, %r17657, %r17625, 0xD2; + lop3.b32 %r30322, %r17653, %r17661, %r17629, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30321, %r30322}; + // begin inline asm + // chi + lop3.b32 %r30313, %r17657, %r17625, %r17777, 0xD2; + lop3.b32 %r30314, %r17661, %r17629, %r17781, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30313, %r30314}; + // begin inline asm + // chi + lop3.b32 %r30305, %r17625, %r17777, %r17761, 0xD2; + lop3.b32 %r30306, %r17629, %r17781, %r17765, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30305, %r30306}; + // begin inline asm + // chi + lop3.b32 %r30331, %r17681, %r17721, %r17753, 0xD2; + lop3.b32 %r30332, %r17685, %r17725, %r17757, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30331, %r30332}; + // begin inline asm + // chi + lop3.b32 %r30325, %r17721, %r17753, %r17745, 0xD2; + lop3.b32 %r30326, %r17725, %r17757, %r17749, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30325, %r30326}; + // begin inline asm + // chi + lop3.b32 %r30319, %r17753, %r17745, %r17665, 0xD2; + lop3.b32 %r30320, %r17757, %r17749, 
%r17669, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30319, %r30320}; + // begin inline asm + // chi + lop3.b32 %r30311, %r17745, %r17665, %r17681, 0xD2; + lop3.b32 %r30312, %r17749, %r17669, %r17685, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30311, %r30312}; + // begin inline asm + // chi + lop3.b32 %r30303, %r17665, %r17681, %r17721, 0xD2; + lop3.b32 %r30304, %r17669, %r17685, %r17725, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30303, %r30304}; + // begin inline asm + // chi + lop3.b32 %r30329, %r17633, %r17705, %r17617, 0xD2; + lop3.b32 %r30330, %r17637, %r17709, %r17621, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30329, %r30330}; + // begin inline asm + // chi + lop3.b32 %r30323, %r17705, %r17617, %r17673, 0xD2; + lop3.b32 %r30324, %r17709, %r17621, %r17677, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30323, %r30324}; + // begin inline asm + // chi + lop3.b32 %r30317, %r17617, %r17673, %r17697, 0xD2; + lop3.b32 %r30318, %r17621, %r17677, %r17701, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30317, %r30318}; + // begin inline asm + // chi + lop3.b32 %r30309, %r17673, %r17697, %r17633, 0xD2; + lop3.b32 %r30310, %r17677, %r17701, %r17637, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30309, %r30310}; + // begin inline asm + // chi + lop3.b32 %r30301, %r17697, %r17633, %r17705, 0xD2; + lop3.b32 %r30302, %r17701, %r17637, %r17709, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30301, %r30302}; + mul.wide.s32 %rd827, %r30351, 8; + add.s64 %rd826, %rd806, %rd827; + // begin inline asm + ld.global.nc.v2.u32 {%r17985,%r17986}, [%rd826]; + // end inline asm + xor.b32 %r30337, %r17785, %r17985; + xor.b32 %r30338, %r17786, %r17986; + add.s32 %r30351, %r30351, 1; + setp.lt.u32 %p36, %r30351, 23; + @%p36 bra $L__BB2_61; + + mov.u32 %r30384, 0; + mov.u32 %r18096, 1; + st.local.v2.u32 [%rd178+32], {%r30349, %r30350}; + st.local.v2.u32 [%rd178+72], {%r30347, %r30348}; + st.local.v2.u32 [%rd178+40], {%r30345, %r30346}; + st.local.v2.u32 [%rd178+80], {%r30343, %r30344}; + st.local.v2.u32 [%rd178+48], {%r30341, %r30342}; + st.local.v2.u32 [%rd178+56], {%r30339, %r30340}; + st.local.v2.u32 [%rd178+24], {%r30337, %r30338}; + // begin inline asm + // xor5 + lop3.b32 %r17997, %r30337, %r30335, %r30333, 0x96; + lop3.b32 %r17997, %r17997, %r30331, %r30329, 0x96; + lop3.b32 %r17998, %r30338, %r30336, %r30334, 0x96; + lop3.b32 %r17998, %r17998, %r30332, %r30330, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18009, %r30349, %r30347, %r30327, 0x96; + lop3.b32 %r18009, %r18009, %r30325, %r30323, 0x96; + lop3.b32 %r18010, %r30350, %r30348, %r30328, 0x96; + lop3.b32 %r18010, %r18010, %r30326, %r30324, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18021, %r30345, %r30343, %r30321, 0x96; + lop3.b32 %r18021, %r18021, %r30319, %r30317, 0x96; + lop3.b32 %r18022, %r30346, %r30344, %r30322, 0x96; + lop3.b32 %r18022, %r18022, %r30320, %r30318, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18033, %r30341, %r30315, %r30313, 0x96; + lop3.b32 %r18033, %r18033, %r30311, %r30309, 0x96; + lop3.b32 %r18034, %r30342, %r30316, %r30314, 0x96; + lop3.b32 %r18034, %r18034, %r30312, %r30310, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18045, %r30339, %r30307, %r30305, 0x96; + lop3.b32 %r18045, %r18045, %r30303, %r30301, 0x96; + lop3.b32 %r18046, %r30340, %r30308, %r30306, 0x96; + lop3.b32 %r18046, %r18046, 
%r30304, %r30302, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18057, %r18010, %r18009, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18061, %r18009, %r18010, %r18096; + // end inline asm + xor.b32 %r18236, %r18057, %r18045; + xor.b32 %r18237, %r18061, %r18046; + xor.b32 %r18204, %r30337, %r18236; + xor.b32 %r18207, %r30338, %r18237; + xor.b32 %r18167, %r30334, %r18237; + xor.b32 %r18166, %r30333, %r18236; + st.local.v2.u32 [%rd178+104], {%r18166, %r18167}; + // begin inline asm + shf.l.wrap.b32 %r18065, %r18022, %r18021, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18069, %r18021, %r18022, %r18096; + // end inline asm + xor.b32 %r18238, %r18065, %r17997; + xor.b32 %r18239, %r18069, %r17998; + xor.b32 %r18103, %r30347, %r18238; + xor.b32 %r18102, %r30348, %r18239; + xor.b32 %r18142, %r30326, %r18239; + xor.b32 %r18143, %r30325, %r18238; + st.local.v2.u32 [%rd178+152], {%r18143, %r18142}; + // begin inline asm + shf.l.wrap.b32 %r18073, %r18034, %r18033, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18077, %r18033, %r18034, %r18096; + // end inline asm + xor.b32 %r18240, %r18073, %r18009; + xor.b32 %r18241, %r18077, %r18010; + xor.b32 %r18126, %r30322, %r18241; + xor.b32 %r18127, %r30321, %r18240; + st.local.v2.u32 [%rd178+120], {%r18127, %r18126}; + xor.b32 %r18118, %r30318, %r18241; + xor.b32 %r18119, %r30317, %r18240; + st.local.v2.u32 [%rd178+200], {%r18119, %r18118}; + // begin inline asm + shf.l.wrap.b32 %r18081, %r18046, %r18045, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18085, %r18045, %r18046, %r18096; + // end inline asm + xor.b32 %r18242, %r18081, %r18021; + xor.b32 %r18243, %r18085, %r18022; + xor.b32 %r18150, %r30341, %r18242; + xor.b32 %r18151, %r30342, %r18243; + xor.b32 %r18159, %r30312, %r18243; + xor.b32 %r18158, %r30311, %r18242; + st.local.v2.u32 [%rd178+168], {%r18158, %r18159}; + // begin inline asm + shf.l.wrap.b32 %r18089, %r17998, %r17997, %r18096; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18093, %r17997, %r17998, %r18096; + // end inline asm + xor.b32 %r18244, %r18089, %r18033; + xor.b32 %r18245, %r18093, %r18034; + xor.b32 %r18110, %r30307, %r18244; + xor.b32 %r18111, %r30308, %r18245; + xor.b32 %r18135, %r30302, %r18245; + xor.b32 %r18134, %r30301, %r18244; + st.local.v2.u32 [%rd178+216], {%r18134, %r18135}; + // begin inline asm + shf.l.wrap.b32 %r18097, %r18103, %r18102, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18101, %r18102, %r18103, %r17600; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18105, %r18111, %r18110, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18109, %r18110, %r18111, %r17608; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18117, %r18118, %r18119, %r17616; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18113, %r18119, %r18118, %r17616; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r18113, %r18117}; + // begin inline asm + shf.l.wrap.b32 %r18121, %r18127, %r18126, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18125, %r18126, %r18127, %r17648; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18129, %r18135, %r18134, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18133, %r18134, %r18135, %r17696; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18141, %r18142, %r18143, %r17720; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r18137, %r18143, %r18142, %r17720; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r18137, %r18141}; + // begin inline asm + shf.l.wrap.b32 %r18145, %r18151, %r18150, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18149, %r18150, %r18151, %r17736; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18153, %r18159, %r18158, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18157, %r18158, %r18159, %r17744; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18161, %r18167, %r18166, %r17776; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18165, %r18166, %r18167, %r17776; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18169, %r18204, %r18097, %r18121, 0xD2; + lop3.b32 %r18170, %r18207, %r18101, %r18125, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r18097, %r18121, %r18153, 0xD2; + lop3.b32 %r30485, %r18101, %r18125, %r18157, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + // begin inline asm + // chi + lop3.b32 %r30480, %r18121, %r18153, %r18129, 0xD2; + lop3.b32 %r30481, %r18125, %r18157, %r18133, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + // begin inline asm + // chi + lop3.b32 %r30476, %r18153, %r18129, %r18204, 0xD2; + lop3.b32 %r30477, %r18157, %r18133, %r18207, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + // begin inline asm + // chi + lop3.b32 %r30474, %r18129, %r18204, %r18097, 0xD2; + lop3.b32 %r30475, %r18133, %r18207, %r18101, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + // begin inline asm + // chi + lop3.b32 %r30470, %r18145, %r18105, %r18161, 0xD2; + lop3.b32 %r30471, %r18149, %r18109, %r18165, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + // begin inline asm + // chi + lop3.b32 %r30482, %r18105, %r18161, %r18137, 0xD2; + lop3.b32 %r30483, %r18109, %r18165, %r18141, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + // begin inline asm + // chi + lop3.b32 %r30478, %r18161, %r18137, %r18113, 0xD2; + lop3.b32 %r30479, %r18165, %r18141, %r18117, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + // begin inline asm + ld.global.nc.v2.u32 {%r18233,%r18234}, [%rd807]; + // end inline asm + xor.b32 %r30472, %r18169, %r18233; + xor.b32 %r30473, %r18170, %r18234; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + add.s64 %rd180, %rd178, 24; + add.s64 %rd181, %rd3, 24; + +$L__BB2_63: + shl.b32 %r18246, %r30384, 2; + cvt.u64.u32 %rd837, %r18246; + and.b64 %rd838, %rd837, 60; + add.s64 %rd839, %rd181, %rd838; + xor.b32 %r18247, %r1678, %r30384; + mul.lo.s32 %r18248, %r18247, 16777619; + ld.local.u32 %r18249, [%rd839]; + xor.b32 %r18250, %r18248, %r18249; + mul.wide.u32 %rd840, %r18250, -954391867; + shr.u64 %rd841, %rd840, 32; + cvt.u32.u64 %r18251, %rd841; + sub.s32 %r18252, %r18250, %r18251; + shr.u32 %r18253, %r18252, 1; + add.s32 %r18254, %r18253, %r18251; + shr.u32 %r18255, %r18254, 20; + mul.lo.s32 %r18256, %r18255, 1179641; + sub.s32 %r18257, %r18250, %r18256; + mul.wide.u32 %rd842, %r18257, 64; + add.s64 %rd843, %rd471, %rd842; + mul.lo.s32 %r18258, %r30421, 16777619; + ld.global.u32 %r18259, [%rd843]; + xor.b32 %r30421, %r18258, %r18259; + mul.lo.s32 %r18260, %r30422, 16777619; + ld.global.u32 %r18261, [%rd843+4]; + xor.b32 %r30422, %r18260, %r18261; + mul.lo.s32 %r18262, %r30433, 16777619; + ld.global.u32 %r18263, [%rd843+8]; + 
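+ // note: $L__BB2_63 above opens the dataset-mixing loop (512 passes, tested via %p37 below). + // The index hash is reduced modulo 1179641 -- apparently the item count stored earlier -- + // with a multiply-high reciprocal (magic -954391867) instead of a divide; a 64-byte item is + // then read from the table at %rd471 and each 32-bit word folded in as s = (s * 16777619) ^ w, + // 16777619 being the 32-bit FNV-1 prime, the same fold ethash-style hashes call fnv().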
mul.lo.s32 %r18264, %r30434, 16777619; + ld.global.u32 %r18265, [%rd843+12]; + xor.b32 %r18266, %r18264, %r18265; + xor.b32 %r30433, %r18262, %r18263; + mov.b64 %rd844, {%r30433, %r18266}; + mul.lo.s32 %r18267, %r30429, 16777619; + ld.global.u32 %r18268, [%rd843+16]; + mul.lo.s32 %r18269, %r30430, 16777619; + ld.global.u32 %r18270, [%rd843+20]; + xor.b32 %r18271, %r18269, %r18270; + xor.b32 %r30429, %r18267, %r18268; + mov.b64 %rd845, {%r30429, %r18271}; + mul.lo.s32 %r18272, %r30425, 16777619; + ld.global.u32 %r18273, [%rd843+24]; + mul.lo.s32 %r18274, %r30426, 16777619; + ld.global.u32 %r18275, [%rd843+28]; + xor.b32 %r18276, %r18274, %r18275; + xor.b32 %r30425, %r18272, %r18273; + mov.b64 %rd846, {%r30425, %r18276}; + mul.lo.s32 %r18277, %r30423, 16777619; + ld.global.u32 %r18278, [%rd843+32]; + mul.lo.s32 %r18279, %r30424, 16777619; + ld.global.u32 %r18280, [%rd843+36]; + xor.b32 %r18281, %r18279, %r18280; + xor.b32 %r30423, %r18277, %r18278; + mov.b64 %rd847, {%r30423, %r18281}; + mul.lo.s32 %r18282, %r30419, 16777619; + ld.global.u32 %r18283, [%rd843+40]; + xor.b32 %r30419, %r18282, %r18283; + mul.lo.s32 %r18284, %r30420, 16777619; + ld.global.u32 %r18285, [%rd843+44]; + xor.b32 %r30420, %r18284, %r18285; + mul.lo.s32 %r18286, %r30431, 16777619; + ld.global.u32 %r18287, [%rd843+48]; + mul.lo.s32 %r18288, %r30432, 16777619; + ld.global.u32 %r18289, [%rd843+52]; + xor.b32 %r18290, %r18288, %r18289; + xor.b32 %r30431, %r18286, %r18287; + mov.b64 %rd848, {%r30431, %r18290}; + mul.lo.s32 %r18291, %r30427, 16777619; + ld.global.u32 %r18292, [%rd843+56]; + mul.lo.s32 %r18293, %r30428, 16777619; + ld.global.u32 %r18294, [%rd843+60]; + xor.b32 %r18295, %r18293, %r18294; + xor.b32 %r30427, %r18291, %r18292; + mov.b64 %rd849, {%r30427, %r18295}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + st.local.v2.u32 [%rd3+32], {%r30433, %r18266}; + st.local.v2.u32 [%rd3+40], {%r30429, %r18271}; + st.local.v2.u32 [%rd3+48], {%r30425, %r18276}; + st.local.v2.u32 [%rd3+56], {%r30423, %r18281}; + st.local.v2.u32 [%rd3+64], {%r30419, %r30420}; + st.local.v2.u32 [%rd3+72], {%r30431, %r18290}; + st.local.v2.u32 [%rd3+80], {%r30427, %r18295}; + add.s64 %rd850, %rd180, %rd838; + xor.b32 %r18296, %r1679, %r30384; + mul.lo.s32 %r18297, %r18296, 16777619; + ld.local.u32 %r18298, [%rd850]; + xor.b32 %r18299, %r18297, %r18298; + mul.wide.u32 %rd851, %r18299, -954391867; + shr.u64 %rd852, %rd851, 32; + cvt.u32.u64 %r18300, %rd852; + sub.s32 %r18301, %r18299, %r18300; + shr.u32 %r18302, %r18301, 1; + add.s32 %r18303, %r18302, %r18300; + shr.u32 %r18304, %r18303, 20; + mul.lo.s32 %r18305, %r18304, 1179641; + sub.s32 %r18306, %r18299, %r18305; + mul.wide.u32 %rd853, %r18306, 64; + add.s64 %rd854, %rd471, %rd853; + mul.lo.s32 %r18307, %r30472, 16777619; + ld.global.u32 %r18308, [%rd854]; + xor.b32 %r30472, %r18307, %r18308; + mul.lo.s32 %r18309, %r30473, 16777619; + ld.global.u32 %r18310, [%rd854+4]; + xor.b32 %r30473, %r18309, %r18310; + mul.lo.s32 %r18311, %r30484, 16777619; + ld.global.u32 %r18312, [%rd854+8]; + mul.lo.s32 %r18313, %r30485, 16777619; + ld.global.u32 %r18314, [%rd854+12]; + xor.b32 %r18315, %r18313, %r18314; + xor.b32 %r30484, %r18311, %r18312; + mov.b64 %rd855, {%r30484, %r18315}; + mul.lo.s32 %r18316, %r30480, 16777619; + ld.global.u32 %r18317, [%rd854+16]; + mul.lo.s32 %r18318, %r30481, 16777619; + ld.global.u32 %r18319, [%rd854+20]; + xor.b32 %r18320, %r18318, %r18319; + xor.b32 %r30480, %r18316, %r18317; + mov.b64 %rd856, {%r30480, %r18320}; + mul.lo.s32 %r18321, %r30476, 16777619; + 
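+ // note: the second half of each pass, continuing below, repeats the lookup for the sponge + // state at %rd178; both lookups share the counter %r30384 but seed the FNV index hash with + // different values (%r1678 vs %r1679), so the two states walk independent dataset items.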
ld.global.u32 %r18322, [%rd854+24]; + mul.lo.s32 %r18323, %r30477, 16777619; + ld.global.u32 %r18324, [%rd854+28]; + xor.b32 %r18325, %r18323, %r18324; + xor.b32 %r30476, %r18321, %r18322; + mov.b64 %rd857, {%r30476, %r18325}; + mul.lo.s32 %r18326, %r30474, 16777619; + ld.global.u32 %r18327, [%rd854+32]; + mul.lo.s32 %r18328, %r30475, 16777619; + ld.global.u32 %r18329, [%rd854+36]; + xor.b32 %r18330, %r18328, %r18329; + xor.b32 %r30474, %r18326, %r18327; + mov.b64 %rd858, {%r30474, %r18330}; + mul.lo.s32 %r18331, %r30470, 16777619; + ld.global.u32 %r18332, [%rd854+40]; + xor.b32 %r30470, %r18331, %r18332; + mul.lo.s32 %r18333, %r30471, 16777619; + ld.global.u32 %r18334, [%rd854+44]; + xor.b32 %r30471, %r18333, %r18334; + mul.lo.s32 %r18335, %r30482, 16777619; + ld.global.u32 %r18336, [%rd854+48]; + mul.lo.s32 %r18337, %r30483, 16777619; + ld.global.u32 %r18338, [%rd854+52]; + xor.b32 %r18339, %r18337, %r18338; + xor.b32 %r30482, %r18335, %r18336; + mov.b64 %rd859, {%r30482, %r18339}; + mul.lo.s32 %r18340, %r30478, 16777619; + ld.global.u32 %r18341, [%rd854+56]; + mul.lo.s32 %r18342, %r30479, 16777619; + ld.global.u32 %r18343, [%rd854+60]; + xor.b32 %r18344, %r18342, %r18343; + xor.b32 %r30478, %r18340, %r18341; + mov.b64 %rd860, {%r30478, %r18344}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + st.local.v2.u32 [%rd178+32], {%r30484, %r18315}; + st.local.v2.u32 [%rd178+40], {%r30480, %r18320}; + st.local.v2.u32 [%rd178+48], {%r30476, %r18325}; + st.local.v2.u32 [%rd178+56], {%r30474, %r18330}; + st.local.v2.u32 [%rd178+64], {%r30470, %r30471}; + st.local.v2.u32 [%rd178+72], {%r30482, %r18339}; + st.local.v2.u32 [%rd178+80], {%r30478, %r18344}; + add.s32 %r30384, %r30384, 1; + setp.lt.u32 %p37, %r30384, 512; + shr.u64 %rd861, %rd844, 32; + cvt.u32.u64 %r30434, %rd861; + shr.u64 %rd862, %rd845, 32; + cvt.u32.u64 %r30430, %rd862; + shr.u64 %rd863, %rd846, 32; + cvt.u32.u64 %r30426, %rd863; + shr.u64 %rd864, %rd847, 32; + cvt.u32.u64 %r30424, %rd864; + shr.u64 %rd865, %rd848, 32; + cvt.u32.u64 %r30432, %rd865; + shr.u64 %rd866, %rd849, 32; + cvt.u32.u64 %r30428, %rd866; + shr.u64 %rd867, %rd855, 32; + cvt.u32.u64 %r30485, %rd867; + shr.u64 %rd868, %rd856, 32; + cvt.u32.u64 %r30481, %rd868; + shr.u64 %rd869, %rd857, 32; + cvt.u32.u64 %r30477, %rd869; + shr.u64 %rd870, %rd858, 32; + cvt.u32.u64 %r30475, %rd870; + shr.u64 %rd871, %rd859, 32; + cvt.u32.u64 %r30483, %rd871; + shr.u64 %rd872, %rd860, 32; + cvt.u32.u64 %r30479, %rd872; + @%p37 bra $L__BB2_63; + + mov.u32 %r30385, 0; + st.local.v2.u32 [%rd3+96], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+104], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+112], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+120], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+128], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+136], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+144], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+152], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+160], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+168], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+176], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+184], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+192], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+200], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+208], {%r30385, %r30385}; + st.local.v2.u32 [%rd3+216], {%r30385, %r30385}; + mov.u32 %r30400, -2147483648; + mov.u32 %r18359, 1; + st.local.v2.u32 [%rd3+88], {%r18359, %r30400}; + mov.u32 %r30386, %r30385; + mov.u32 %r30387, %r30385; + mov.u32 %r30388, %r30385; + mov.u32 %r30389, %r30385; + mov.u32 %r30390, 
%r30385; + mov.u32 %r30391, %r30385; + mov.u32 %r30392, %r30385; + mov.u32 %r30393, %r30385; + mov.u32 %r30394, %r30385; + mov.u32 %r30395, %r30385; + mov.u32 %r30396, %r30385; + mov.u32 %r30397, %r30385; + mov.u32 %r30398, %r30385; + mov.u32 %r30399, %r18359; + mov.u32 %r30401, %r30385; + mov.u32 %r30402, %r30385; + mov.u32 %r30403, %r30385; + mov.u32 %r30404, %r30385; + mov.u32 %r30405, %r30385; + mov.u32 %r30406, %r30385; + mov.u32 %r30407, %r30385; + mov.u32 %r30408, %r30385; + mov.u32 %r30409, %r30385; + mov.u32 %r30410, %r30385; + mov.u32 %r30411, %r30385; + mov.u32 %r30412, %r30385; + mov.u32 %r30413, %r30385; + mov.u32 %r30414, %r30385; + mov.u32 %r30415, %r30385; + mov.u32 %r30416, %r30385; + mov.u32 %r30417, %r30385; + mov.u32 %r30418, %r30385; + mov.u32 %r30435, %r30385; + +$L__BB2_65: + // begin inline asm + // xor5 + lop3.b32 %r18386, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18386, %r18386, %r30415, %r30413, 0x96; + lop3.b32 %r18387, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18387, %r18387, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18398, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18398, %r18398, %r30409, %r30407, 0x96; + lop3.b32 %r18399, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18399, %r18399, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18410, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18410, %r18410, %r30403, %r30401, 0x96; + lop3.b32 %r18411, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18411, %r18411, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18422, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18422, %r18422, %r30395, %r30393, 0x96; + lop3.b32 %r18423, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18423, %r18423, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18434, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18434, %r18434, %r30387, %r30385, 0x96; + lop3.b32 %r18435, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18435, %r18435, %r30388, %r30386, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18446, %r18399, %r18398, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18450, %r18398, %r18399, %r18359; + // end inline asm + xor.b32 %r18880, %r18446, %r18434; + xor.b32 %r18881, %r18450, %r18435; + xor.b32 %r18713, %r30421, %r18880; + xor.b32 %r18716, %r30422, %r18881; + xor.b32 %r18620, %r30419, %r18880; + xor.b32 %r18619, %r30420, %r18881; + xor.b32 %r18667, %r30417, %r18880; + xor.b32 %r18668, %r30418, %r18881; + xor.b32 %r18572, %r30415, %r18880; + xor.b32 %r18571, %r30416, %r18881; + xor.b32 %r18523, %r30413, %r18880; + xor.b32 %r18524, %r30414, %r18881; + // begin inline asm + shf.l.wrap.b32 %r18454, %r18411, %r18410, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18458, %r18410, %r18411, %r18359; + // end inline asm + xor.b32 %r18882, %r18454, %r18386; + xor.b32 %r18883, %r18458, %r18387; + xor.b32 %r18675, %r30433, %r18882; + xor.b32 %r18676, %r30434, %r18883; + xor.b32 %r18492, %r30431, %r18882; + xor.b32 %r18491, %r30432, %r18883; + xor.b32 %r18651, %r30411, %r18882; + xor.b32 %r18652, %r30412, %r18883; + xor.b32 %r18612, %r30409, %r18882; + xor.b32 %r18611, %r30410, %r18883; + xor.b32 %r18595, %r30407, %r18882; + xor.b32 %r18596, %r30408, %r18883; + // begin inline asm + shf.l.wrap.b32 %r18462, %r18423, %r18422, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18466, %r18422, 
%r18423, %r18359; + // end inline asm + xor.b32 %r18884, %r18462, %r18398; + xor.b32 %r18885, %r18466, %r18399; + xor.b32 %r18532, %r30429, %r18884; + xor.b32 %r18531, %r30430, %r18885; + xor.b32 %r18659, %r30427, %r18884; + xor.b32 %r18660, %r30428, %r18885; + xor.b32 %r18540, %r30405, %r18884; + xor.b32 %r18539, %r30406, %r18885; + xor.b32 %r18643, %r30403, %r18884; + xor.b32 %r18644, %r30404, %r18885; + xor.b32 %r18508, %r30401, %r18884; + xor.b32 %r18507, %r30402, %r18885; + // begin inline asm + shf.l.wrap.b32 %r18470, %r18435, %r18434, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18474, %r18434, %r18435, %r18359; + // end inline asm + xor.b32 %r18886, %r18470, %r18410; + xor.b32 %r18887, %r18474, %r18411; + xor.b32 %r18627, %r30425, %r18886; + xor.b32 %r18628, %r30426, %r18887; + xor.b32 %r18604, %r30399, %r18886; + xor.b32 %r18603, %r30400, %r18887; + xor.b32 %r18547, %r30397, %r18886; + xor.b32 %r18548, %r30398, %r18887; + xor.b32 %r18635, %r30395, %r18886; + xor.b32 %r18636, %r30396, %r18887; + xor.b32 %r18564, %r30393, %r18886; + xor.b32 %r18563, %r30394, %r18887; + // begin inline asm + shf.l.wrap.b32 %r18478, %r18387, %r18386, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18482, %r18386, %r18387, %r18359; + // end inline asm + xor.b32 %r18888, %r18478, %r18422; + xor.b32 %r18889, %r18482, %r18423; + xor.b32 %r18579, %r30423, %r18888; + xor.b32 %r18580, %r30424, %r18889; + xor.b32 %r18499, %r30391, %r18888; + xor.b32 %r18500, %r30392, %r18889; + xor.b32 %r18516, %r30389, %r18888; + xor.b32 %r18515, %r30390, %r18889; + xor.b32 %r18555, %r30387, %r18888; + xor.b32 %r18556, %r30388, %r18889; + xor.b32 %r18587, %r30385, %r18888; + xor.b32 %r18588, %r30386, %r18889; + mov.u32 %r18493, 44; + // begin inline asm + shf.l.wrap.b32 %r18486, %r18492, %r18491, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18490, %r18491, %r18492, %r18493; + // end inline asm + mov.u32 %r18501, 20; + // begin inline asm + shf.l.wrap.b32 %r18494, %r18500, %r18499, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18498, %r18499, %r18500, %r18501; + // end inline asm + mov.u32 %r18509, 61; + // begin inline asm + shf.l.wrap.b32 %r18502, %r18508, %r18507, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18506, %r18507, %r18508, %r18509; + // end inline asm + mov.u32 %r18517, 39; + // begin inline asm + shf.l.wrap.b32 %r18510, %r18516, %r18515, %r18517; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18514, %r18515, %r18516, %r18517; + // end inline asm + mov.u32 %r18525, 18; + // begin inline asm + shf.l.wrap.b32 %r18518, %r18524, %r18523, %r18525; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18522, %r18523, %r18524, %r18525; + // end inline asm + mov.u32 %r18533, 62; + // begin inline asm + shf.l.wrap.b32 %r18526, %r18532, %r18531, %r18533; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18530, %r18531, %r18532, %r18533; + // end inline asm + mov.u32 %r18541, 43; + // begin inline asm + shf.l.wrap.b32 %r18534, %r18540, %r18539, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18538, %r18539, %r18540, %r18541; + // end inline asm + mov.u32 %r18549, 25; + // begin inline asm + shf.l.wrap.b32 %r18542, %r18548, %r18547, %r18549; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18546, %r18547, %r18548, %r18549; + // end inline asm + mov.u32 %r18557, 8; + // begin inline asm + shf.l.wrap.b32 %r18550, %r18556, %r18555, %r18557; + 
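+ // note: the rotation amounts materialised through this round body (44, 20, 61, 39, 18, 62, + // 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, plus the wrap-by-1 in + // %r18359) are the standard Keccak rho offsets, applied lane by lane ahead of the pi reorder.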
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r18554, %r18555, %r18556, %r18557; + // end inline asm + mov.u32 %r18565, 56; + // begin inline asm + shf.l.wrap.b32 %r18558, %r18564, %r18563, %r18565; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18562, %r18563, %r18564, %r18565; + // end inline asm + mov.u32 %r18573, 41; + // begin inline asm + shf.l.wrap.b32 %r18566, %r18572, %r18571, %r18573; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18570, %r18571, %r18572, %r18573; + // end inline asm + mov.u32 %r18581, 27; + // begin inline asm + shf.l.wrap.b32 %r18574, %r18580, %r18579, %r18581; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18578, %r18579, %r18580, %r18581; + // end inline asm + mov.u32 %r18589, 14; + // begin inline asm + shf.l.wrap.b32 %r18582, %r18588, %r18587, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18586, %r18587, %r18588, %r18589; + // end inline asm + mov.u32 %r18597, 2; + // begin inline asm + shf.l.wrap.b32 %r18590, %r18596, %r18595, %r18597; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18594, %r18595, %r18596, %r18597; + // end inline asm + mov.u32 %r18605, 55; + // begin inline asm + shf.l.wrap.b32 %r18598, %r18604, %r18603, %r18605; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18602, %r18603, %r18604, %r18605; + // end inline asm + mov.u32 %r18613, 45; + // begin inline asm + shf.l.wrap.b32 %r18606, %r18612, %r18611, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18610, %r18611, %r18612, %r18613; + // end inline asm + mov.u32 %r18621, 36; + // begin inline asm + shf.l.wrap.b32 %r18614, %r18620, %r18619, %r18621; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18618, %r18619, %r18620, %r18621; + // end inline asm + mov.u32 %r18629, 28; + // begin inline asm + shf.l.wrap.b32 %r18622, %r18628, %r18627, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18626, %r18627, %r18628, %r18629; + // end inline asm + mov.u32 %r18637, 21; + // begin inline asm + shf.l.wrap.b32 %r18630, %r18636, %r18635, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18634, %r18635, %r18636, %r18637; + // end inline asm + mov.u32 %r18645, 15; + // begin inline asm + shf.l.wrap.b32 %r18638, %r18644, %r18643, %r18645; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18642, %r18643, %r18644, %r18645; + // end inline asm + mov.u32 %r18653, 10; + // begin inline asm + shf.l.wrap.b32 %r18646, %r18652, %r18651, %r18653; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18650, %r18651, %r18652, %r18653; + // end inline asm + mov.u32 %r18661, 6; + // begin inline asm + shf.l.wrap.b32 %r18654, %r18660, %r18659, %r18661; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18658, %r18659, %r18660, %r18661; + // end inline asm + mov.u32 %r18669, 3; + // begin inline asm + shf.l.wrap.b32 %r18662, %r18668, %r18667, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18666, %r18667, %r18668, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18670, %r18676, %r18675, %r18359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18674, %r18675, %r18676, %r18359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r18678, %r18713, %r18486, %r18534, 0xD2; + lop3.b32 %r18679, %r18716, %r18490, %r18538, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30433, %r18486, %r18534, %r18630, 0xD2; + lop3.b32 %r30434, %r18490, 
%r18538, %r18634, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30429, %r18534, %r18630, %r18582, 0xD2; + lop3.b32 %r30430, %r18538, %r18634, %r18586, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30425, %r18630, %r18582, %r18713, 0xD2; + lop3.b32 %r30426, %r18634, %r18586, %r18716, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30423, %r18582, %r18713, %r18486, 0xD2; + lop3.b32 %r30424, %r18586, %r18716, %r18490, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30419, %r18622, %r18494, %r18662, 0xD2; + lop3.b32 %r30420, %r18626, %r18498, %r18666, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30431, %r18494, %r18662, %r18606, 0xD2; + lop3.b32 %r30432, %r18498, %r18666, %r18610, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30427, %r18662, %r18606, %r18502, 0xD2; + lop3.b32 %r30428, %r18666, %r18610, %r18506, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30399, %r18606, %r18502, %r18622, 0xD2; + lop3.b32 %r30400, %r18610, %r18506, %r18626, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30399, %r30400}; + // begin inline asm + // chi + lop3.b32 %r30391, %r18502, %r18622, %r18494, 0xD2; + lop3.b32 %r30392, %r18506, %r18626, %r18498, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30391, %r30392}; + // begin inline asm + // chi + lop3.b32 %r30417, %r18670, %r18654, %r18542, 0xD2; + lop3.b32 %r30418, %r18674, %r18658, %r18546, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30417, %r30418}; + // begin inline asm + // chi + lop3.b32 %r30411, %r18654, %r18542, %r18550, 0xD2; + lop3.b32 %r30412, %r18658, %r18546, %r18554, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30411, %r30412}; + // begin inline asm + // chi + lop3.b32 %r30405, %r18542, %r18550, %r18518, 0xD2; + lop3.b32 %r30406, %r18546, %r18554, %r18522, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30405, %r30406}; + // begin inline asm + // chi + lop3.b32 %r30397, %r18550, %r18518, %r18670, 0xD2; + lop3.b32 %r30398, %r18554, %r18522, %r18674, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30397, %r30398}; + // begin inline asm + // chi + lop3.b32 %r30389, %r18518, %r18670, %r18654, 0xD2; + lop3.b32 %r30390, %r18522, %r18674, %r18658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30389, %r30390}; + // begin inline asm + // chi + lop3.b32 %r30415, %r18574, %r18614, %r18646, 0xD2; + lop3.b32 %r30416, %r18578, %r18618, %r18650, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30415, %r30416}; + // begin inline asm + // chi + lop3.b32 %r30409, %r18614, %r18646, %r18638, 0xD2; + lop3.b32 %r30410, %r18618, %r18650, %r18642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30409, %r30410}; + // begin inline asm + // chi + lop3.b32 %r30403, %r18646, %r18638, %r18558, 0xD2; + lop3.b32 %r30404, %r18650, %r18642, %r18562, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30403, %r30404}; + // begin inline asm + // chi + lop3.b32 %r30395, %r18638, %r18558, %r18574, 0xD2; + lop3.b32 %r30396, %r18642, %r18562, %r18578, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30395, %r30396}; + // begin inline asm + // chi + lop3.b32 %r30387, %r18558, %r18574, %r18614, 0xD2; + lop3.b32 %r30388, %r18562, %r18578, %r18618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30387, %r30388}; + // begin inline asm + // chi + lop3.b32 %r30413, %r18526, %r18598, %r18510, 0xD2; + 
lop3.b32 %r30414, %r18530, %r18602, %r18514, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30413, %r30414}; + // begin inline asm + // chi + lop3.b32 %r30407, %r18598, %r18510, %r18566, 0xD2; + lop3.b32 %r30408, %r18602, %r18514, %r18570, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30407, %r30408}; + // begin inline asm + // chi + lop3.b32 %r30401, %r18510, %r18566, %r18590, 0xD2; + lop3.b32 %r30402, %r18514, %r18570, %r18594, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30401, %r30402}; + // begin inline asm + // chi + lop3.b32 %r30393, %r18566, %r18590, %r18526, 0xD2; + lop3.b32 %r30394, %r18570, %r18594, %r18530, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30393, %r30394}; + // begin inline asm + // chi + lop3.b32 %r30385, %r18590, %r18526, %r18598, 0xD2; + lop3.b32 %r30386, %r18594, %r18530, %r18602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30385, %r30386}; + mul.wide.s32 %rd874, %r30435, 8; + add.s64 %rd873, %rd806, %rd874; + // begin inline asm + ld.global.nc.v2.u32 {%r18878,%r18879}, [%rd873]; + // end inline asm + xor.b32 %r30421, %r18678, %r18878; + xor.b32 %r30422, %r18679, %r18879; + add.s32 %r30435, %r30435, 1; + setp.lt.u32 %p38, %r30435, 23; + @%p38 bra $L__BB2_65; + + st.local.v2.u32 [%rd3+32], {%r30433, %r30434}; + st.local.v2.u32 [%rd3+72], {%r30431, %r30432}; + st.local.v2.u32 [%rd3+40], {%r30429, %r30430}; + st.local.v2.u32 [%rd3+80], {%r30427, %r30428}; + st.local.v2.u32 [%rd3+48], {%r30425, %r30426}; + st.local.v2.u32 [%rd3+56], {%r30423, %r30424}; + st.local.v2.u32 [%rd3+24], {%r30421, %r30422}; + // begin inline asm + // xor5 + lop3.b32 %r18890, %r30421, %r30419, %r30417, 0x96; + lop3.b32 %r18890, %r18890, %r30415, %r30413, 0x96; + lop3.b32 %r18891, %r30422, %r30420, %r30418, 0x96; + lop3.b32 %r18891, %r18891, %r30416, %r30414, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18902, %r30433, %r30431, %r30411, 0x96; + lop3.b32 %r18902, %r18902, %r30409, %r30407, 0x96; + lop3.b32 %r18903, %r30434, %r30432, %r30412, 0x96; + lop3.b32 %r18903, %r18903, %r30410, %r30408, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18914, %r30429, %r30427, %r30405, 0x96; + lop3.b32 %r18914, %r18914, %r30403, %r30401, 0x96; + lop3.b32 %r18915, %r30430, %r30428, %r30406, 0x96; + lop3.b32 %r18915, %r18915, %r30404, %r30402, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18926, %r30425, %r30399, %r30397, 0x96; + lop3.b32 %r18926, %r18926, %r30395, %r30393, 0x96; + lop3.b32 %r18927, %r30426, %r30400, %r30398, 0x96; + lop3.b32 %r18927, %r18927, %r30396, %r30394, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r18938, %r30423, %r30391, %r30389, 0x96; + lop3.b32 %r18938, %r18938, %r30387, %r30385, 0x96; + lop3.b32 %r18939, %r30424, %r30392, %r30390, 0x96; + lop3.b32 %r18939, %r18939, %r30388, %r30386, 0x96; + // end inline asm + mov.u32 %r19142, 1; + // begin inline asm + shf.l.wrap.b32 %r18950, %r18903, %r18902, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18954, %r18902, %r18903, %r19142; + // end inline asm + xor.b32 %r19169, %r18950, %r18938; + xor.b32 %r19170, %r18954, %r18939; + xor.b32 %r19097, %r30421, %r19169; + xor.b32 %r19100, %r30422, %r19170; + xor.b32 %r19060, %r30418, %r19170; + xor.b32 %r19059, %r30417, %r19169; + st.local.v2.u32 [%rd3+104], {%r19059, %r19060}; + // begin inline asm + shf.l.wrap.b32 %r18958, %r18915, %r18914, %r19142; + // end inline asm + // begin inline asm + 
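+ // note: with the %p38 loop done, this is the 24th round unrolled once more; after its chi + // and the closing iota XOR from [%rd807], four 64-bit words of the state are extracted into + // %rd1265-%rd1268 (256 bits) and the next sponge state below is zeroed and re-padded.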
shf.l.wrap.b32 %r18962, %r18914, %r18915, %r19142; + // end inline asm + xor.b32 %r19171, %r18958, %r18890; + xor.b32 %r19172, %r18962, %r18891; + xor.b32 %r18996, %r30431, %r19171; + xor.b32 %r18995, %r30432, %r19172; + xor.b32 %r19035, %r30410, %r19172; + xor.b32 %r19036, %r30409, %r19171; + st.local.v2.u32 [%rd3+152], {%r19036, %r19035}; + // begin inline asm + shf.l.wrap.b32 %r18966, %r18927, %r18926, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18970, %r18926, %r18927, %r19142; + // end inline asm + xor.b32 %r19173, %r18966, %r18902; + xor.b32 %r19174, %r18970, %r18903; + xor.b32 %r19019, %r30406, %r19174; + xor.b32 %r19020, %r30405, %r19173; + st.local.v2.u32 [%rd3+120], {%r19020, %r19019}; + xor.b32 %r19011, %r30402, %r19174; + xor.b32 %r19012, %r30401, %r19173; + st.local.v2.u32 [%rd3+200], {%r19012, %r19011}; + // begin inline asm + shf.l.wrap.b32 %r18974, %r18939, %r18938, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18978, %r18938, %r18939, %r19142; + // end inline asm + xor.b32 %r19175, %r18974, %r18914; + xor.b32 %r19176, %r18978, %r18915; + xor.b32 %r19043, %r30425, %r19175; + xor.b32 %r19044, %r30426, %r19176; + xor.b32 %r19052, %r30396, %r19176; + xor.b32 %r19051, %r30395, %r19175; + st.local.v2.u32 [%rd3+168], {%r19051, %r19052}; + // begin inline asm + shf.l.wrap.b32 %r18982, %r18891, %r18890, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18986, %r18890, %r18891, %r19142; + // end inline asm + xor.b32 %r19177, %r18982, %r18926; + xor.b32 %r19178, %r18986, %r18927; + xor.b32 %r19003, %r30391, %r19177; + xor.b32 %r19004, %r30392, %r19178; + xor.b32 %r19028, %r30386, %r19178; + xor.b32 %r19027, %r30385, %r19177; + st.local.v2.u32 [%rd3+216], {%r19027, %r19028}; + // begin inline asm + shf.l.wrap.b32 %r18990, %r18996, %r18995, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18994, %r18995, %r18996, %r18493; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r18998, %r19004, %r19003, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19002, %r19003, %r19004, %r18501; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19010, %r19011, %r19012, %r18509; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19006, %r19012, %r19011, %r18509; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r19006, %r19010}; + // begin inline asm + shf.l.wrap.b32 %r19014, %r19020, %r19019, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19018, %r19019, %r19020, %r18541; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19022, %r19028, %r19027, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19026, %r19027, %r19028, %r18589; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19034, %r19035, %r19036, %r18613; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19030, %r19036, %r19035, %r18613; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r19030, %r19034}; + // begin inline asm + shf.l.wrap.b32 %r19038, %r19044, %r19043, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19042, %r19043, %r19044, %r18629; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19046, %r19052, %r19051, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19050, %r19051, %r19052, %r18637; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19054, %r19060, %r19059, %r18669; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19058, %r19059, 
%r19060, %r18669; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19062, %r19097, %r18990, %r19014, 0xD2; + lop3.b32 %r19063, %r19100, %r18994, %r19018, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19070, %r18990, %r19014, %r19046, 0xD2; + lop3.b32 %r19071, %r18994, %r19018, %r19050, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r19070, %r19071}; + // begin inline asm + // chi + lop3.b32 %r19078, %r19014, %r19046, %r19022, 0xD2; + lop3.b32 %r19079, %r19018, %r19050, %r19026, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r19078, %r19079}; + // begin inline asm + // chi + lop3.b32 %r19086, %r19046, %r19022, %r19097, 0xD2; + lop3.b32 %r19087, %r19050, %r19026, %r19100, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r19086, %r19087}; + // begin inline asm + // chi + lop3.b32 %r19094, %r19022, %r19097, %r18990, 0xD2; + lop3.b32 %r19095, %r19026, %r19100, %r18994, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r19094, %r19095}; + // begin inline asm + // chi + lop3.b32 %r19102, %r19038, %r18998, %r19054, 0xD2; + lop3.b32 %r19103, %r19042, %r19002, %r19058, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r19102, %r19103}; + // begin inline asm + // chi + lop3.b32 %r19110, %r18998, %r19054, %r19030, 0xD2; + lop3.b32 %r19111, %r19002, %r19058, %r19034, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r19110, %r19111}; + // begin inline asm + // chi + lop3.b32 %r19118, %r19054, %r19030, %r19006, 0xD2; + lop3.b32 %r19119, %r19058, %r19034, %r19010, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r19118, %r19119}; + // begin inline asm + ld.global.nc.v2.u32 {%r19126,%r19127}, [%rd807]; + // end inline asm + xor.b32 %r19179, %r19063, %r19127; + xor.b32 %r19180, %r19062, %r19126; + mov.b64 %rd1265, {%r19180, %r19179}; + mov.b64 %rd1266, {%r19070, %r19071}; + mov.b64 %rd1267, {%r19078, %r19079}; + mov.b64 %rd1268, {%r19094, %r19095}; + mov.u32 %r30436, 0; + st.local.v2.u32 [%rd3+24], {%r19180, %r19179}; + st.local.v2.u32 [%rd178+96], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+104], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+112], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+120], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+128], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+136], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+144], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+152], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+160], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+168], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+176], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+184], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+192], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+200], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+208], {%r30436, %r30436}; + st.local.v2.u32 [%rd178+216], {%r30436, %r30436}; + mov.u32 %r30451, -2147483648; + st.local.v2.u32 [%rd178+88], {%r19142, %r30451}; + mov.u32 %r30437, %r30436; + mov.u32 %r30438, %r30436; + mov.u32 %r30439, %r30436; + mov.u32 %r30440, %r30436; + mov.u32 %r30441, %r30436; + mov.u32 %r30442, %r30436; + mov.u32 %r30443, %r30436; + mov.u32 %r30444, %r30436; + mov.u32 %r30445, %r30436; + mov.u32 %r30446, %r30436; + mov.u32 %r30447, %r30436; + mov.u32 %r30448, %r30436; + mov.u32 %r30449, %r30436; + mov.u32 %r30450, %r19142; + mov.u32 %r30452, %r30436; + mov.u32 %r30453, %r30436; + mov.u32 %r30454, %r30436; + mov.u32 %r30455, %r30436; + mov.u32 %r30456, %r30436; + mov.u32 %r30457, %r30436; + mov.u32 %r30458, %r30436; + mov.u32 %r30459, 
%r30436; + mov.u32 %r30460, %r30436; + mov.u32 %r30461, %r30436; + mov.u32 %r30462, %r30436; + mov.u32 %r30463, %r30436; + mov.u32 %r30464, %r30436; + mov.u32 %r30465, %r30436; + mov.u32 %r30466, %r30436; + mov.u32 %r30467, %r30436; + mov.u32 %r30468, %r30436; + mov.u32 %r30469, %r30436; + mov.u32 %r30486, %r30436; + +$L__BB2_67: + // begin inline asm + // xor5 + lop3.b32 %r19181, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19181, %r19181, %r30466, %r30464, 0x96; + lop3.b32 %r19182, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19182, %r19182, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19193, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19193, %r19193, %r30460, %r30458, 0x96; + lop3.b32 %r19194, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19194, %r19194, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19205, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19205, %r19205, %r30454, %r30452, 0x96; + lop3.b32 %r19206, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19206, %r19206, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19217, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19217, %r19217, %r30446, %r30444, 0x96; + lop3.b32 %r19218, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19218, %r19218, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19229, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19229, %r19229, %r30438, %r30436, 0x96; + lop3.b32 %r19230, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19230, %r19230, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19241, %r19194, %r19193, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19245, %r19193, %r19194, %r19142; + // end inline asm + xor.b32 %r19675, %r19241, %r19229; + xor.b32 %r19676, %r19245, %r19230; + xor.b32 %r19508, %r30472, %r19675; + xor.b32 %r19511, %r30473, %r19676; + xor.b32 %r19415, %r30470, %r19675; + xor.b32 %r19414, %r30471, %r19676; + xor.b32 %r19462, %r30468, %r19675; + xor.b32 %r19463, %r30469, %r19676; + xor.b32 %r19367, %r30466, %r19675; + xor.b32 %r19366, %r30467, %r19676; + xor.b32 %r19318, %r30464, %r19675; + xor.b32 %r19319, %r30465, %r19676; + // begin inline asm + shf.l.wrap.b32 %r19249, %r19206, %r19205, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19253, %r19205, %r19206, %r19142; + // end inline asm + xor.b32 %r19677, %r19249, %r19181; + xor.b32 %r19678, %r19253, %r19182; + xor.b32 %r19470, %r30484, %r19677; + xor.b32 %r19471, %r30485, %r19678; + xor.b32 %r19287, %r30482, %r19677; + xor.b32 %r19286, %r30483, %r19678; + xor.b32 %r19446, %r30462, %r19677; + xor.b32 %r19447, %r30463, %r19678; + xor.b32 %r19407, %r30460, %r19677; + xor.b32 %r19406, %r30461, %r19678; + xor.b32 %r19390, %r30458, %r19677; + xor.b32 %r19391, %r30459, %r19678; + // begin inline asm + shf.l.wrap.b32 %r19257, %r19218, %r19217, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19261, %r19217, %r19218, %r19142; + // end inline asm + xor.b32 %r19679, %r19257, %r19193; + xor.b32 %r19680, %r19261, %r19194; + xor.b32 %r19327, %r30480, %r19679; + xor.b32 %r19326, %r30481, %r19680; + xor.b32 %r19454, %r30478, %r19679; + xor.b32 %r19455, %r30479, %r19680; + xor.b32 %r19335, %r30456, %r19679; + xor.b32 %r19334, %r30457, %r19680; + xor.b32 %r19438, %r30454, %r19679; + xor.b32 %r19439, %r30455, %r19680; + xor.b32 %r19303, %r30452, %r19679; + xor.b32 %r19302, %r30453, %r19680; 
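// NOTE (editor annotation, not part of the generated PTX): $L__BB2_67 above opens the 24-round Keccak-f[1600] permutation loop. The five "xor5" groups are the theta step (lop3 immediate 0x96 is a three-input XOR, so each group folds one 5-lane column to its parity), and each shf.l.wrap.b32 pair that follows rotates a column parity left by 1 across a 64-bit lane, using two 32-bit funnel shifts, before xor-ing it back into the lanes of the adjacent column.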
+ // begin inline asm + shf.l.wrap.b32 %r19265, %r19230, %r19229, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19269, %r19229, %r19230, %r19142; + // end inline asm + xor.b32 %r19681, %r19265, %r19205; + xor.b32 %r19682, %r19269, %r19206; + xor.b32 %r19422, %r30476, %r19681; + xor.b32 %r19423, %r30477, %r19682; + xor.b32 %r19399, %r30450, %r19681; + xor.b32 %r19398, %r30451, %r19682; + xor.b32 %r19342, %r30448, %r19681; + xor.b32 %r19343, %r30449, %r19682; + xor.b32 %r19430, %r30446, %r19681; + xor.b32 %r19431, %r30447, %r19682; + xor.b32 %r19359, %r30444, %r19681; + xor.b32 %r19358, %r30445, %r19682; + // begin inline asm + shf.l.wrap.b32 %r19273, %r19182, %r19181, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19277, %r19181, %r19182, %r19142; + // end inline asm + xor.b32 %r19683, %r19273, %r19217; + xor.b32 %r19684, %r19277, %r19218; + xor.b32 %r19374, %r30474, %r19683; + xor.b32 %r19375, %r30475, %r19684; + xor.b32 %r19294, %r30442, %r19683; + xor.b32 %r19295, %r30443, %r19684; + xor.b32 %r19311, %r30440, %r19683; + xor.b32 %r19310, %r30441, %r19684; + xor.b32 %r19350, %r30438, %r19683; + xor.b32 %r19351, %r30439, %r19684; + xor.b32 %r19382, %r30436, %r19683; + xor.b32 %r19383, %r30437, %r19684; + mov.u32 %r19288, 44; + // begin inline asm + shf.l.wrap.b32 %r19281, %r19287, %r19286, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19285, %r19286, %r19287, %r19288; + // end inline asm + mov.u32 %r19296, 20; + // begin inline asm + shf.l.wrap.b32 %r19289, %r19295, %r19294, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19293, %r19294, %r19295, %r19296; + // end inline asm + mov.u32 %r19304, 61; + // begin inline asm + shf.l.wrap.b32 %r19297, %r19303, %r19302, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19301, %r19302, %r19303, %r19304; + // end inline asm + mov.u32 %r19312, 39; + // begin inline asm + shf.l.wrap.b32 %r19305, %r19311, %r19310, %r19312; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19309, %r19310, %r19311, %r19312; + // end inline asm + mov.u32 %r19320, 18; + // begin inline asm + shf.l.wrap.b32 %r19313, %r19319, %r19318, %r19320; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19317, %r19318, %r19319, %r19320; + // end inline asm + mov.u32 %r19328, 62; + // begin inline asm + shf.l.wrap.b32 %r19321, %r19327, %r19326, %r19328; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19325, %r19326, %r19327, %r19328; + // end inline asm + mov.u32 %r19336, 43; + // begin inline asm + shf.l.wrap.b32 %r19329, %r19335, %r19334, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19333, %r19334, %r19335, %r19336; + // end inline asm + mov.u32 %r19344, 25; + // begin inline asm + shf.l.wrap.b32 %r19337, %r19343, %r19342, %r19344; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19341, %r19342, %r19343, %r19344; + // end inline asm + mov.u32 %r19352, 8; + // begin inline asm + shf.l.wrap.b32 %r19345, %r19351, %r19350, %r19352; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19349, %r19350, %r19351, %r19352; + // end inline asm + mov.u32 %r19360, 56; + // begin inline asm + shf.l.wrap.b32 %r19353, %r19359, %r19358, %r19360; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19357, %r19358, %r19359, %r19360; + // end inline asm + mov.u32 %r19368, 41; + // begin inline asm + shf.l.wrap.b32 %r19361, %r19367, %r19366, %r19368; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r19365, %r19366, %r19367, %r19368; + // end inline asm + mov.u32 %r19376, 27; + // begin inline asm + shf.l.wrap.b32 %r19369, %r19375, %r19374, %r19376; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19373, %r19374, %r19375, %r19376; + // end inline asm + mov.u32 %r19384, 14; + // begin inline asm + shf.l.wrap.b32 %r19377, %r19383, %r19382, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19381, %r19382, %r19383, %r19384; + // end inline asm + mov.u32 %r19392, 2; + // begin inline asm + shf.l.wrap.b32 %r19385, %r19391, %r19390, %r19392; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19389, %r19390, %r19391, %r19392; + // end inline asm + mov.u32 %r19400, 55; + // begin inline asm + shf.l.wrap.b32 %r19393, %r19399, %r19398, %r19400; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19397, %r19398, %r19399, %r19400; + // end inline asm + mov.u32 %r19408, 45; + // begin inline asm + shf.l.wrap.b32 %r19401, %r19407, %r19406, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19405, %r19406, %r19407, %r19408; + // end inline asm + mov.u32 %r19416, 36; + // begin inline asm + shf.l.wrap.b32 %r19409, %r19415, %r19414, %r19416; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19413, %r19414, %r19415, %r19416; + // end inline asm + mov.u32 %r19424, 28; + // begin inline asm + shf.l.wrap.b32 %r19417, %r19423, %r19422, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19421, %r19422, %r19423, %r19424; + // end inline asm + mov.u32 %r19432, 21; + // begin inline asm + shf.l.wrap.b32 %r19425, %r19431, %r19430, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19429, %r19430, %r19431, %r19432; + // end inline asm + mov.u32 %r19440, 15; + // begin inline asm + shf.l.wrap.b32 %r19433, %r19439, %r19438, %r19440; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19437, %r19438, %r19439, %r19440; + // end inline asm + mov.u32 %r19448, 10; + // begin inline asm + shf.l.wrap.b32 %r19441, %r19447, %r19446, %r19448; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19445, %r19446, %r19447, %r19448; + // end inline asm + mov.u32 %r19456, 6; + // begin inline asm + shf.l.wrap.b32 %r19449, %r19455, %r19454, %r19456; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19453, %r19454, %r19455, %r19456; + // end inline asm + mov.u32 %r19464, 3; + // begin inline asm + shf.l.wrap.b32 %r19457, %r19463, %r19462, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19461, %r19462, %r19463, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19465, %r19471, %r19470, %r19142; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19469, %r19470, %r19471, %r19142; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19473, %r19508, %r19281, %r19329, 0xD2; + lop3.b32 %r19474, %r19511, %r19285, %r19333, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30484, %r19281, %r19329, %r19425, 0xD2; + lop3.b32 %r30485, %r19285, %r19333, %r19429, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30480, %r19329, %r19425, %r19377, 0xD2; + lop3.b32 %r30481, %r19333, %r19429, %r19381, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30476, %r19425, %r19377, %r19508, 0xD2; + lop3.b32 %r30477, %r19429, %r19381, %r19511, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30474, %r19377, %r19508, %r19281, 0xD2; + lop3.b32 %r30475, %r19381, %r19511, 
%r19285, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30470, %r19417, %r19289, %r19457, 0xD2; + lop3.b32 %r30471, %r19421, %r19293, %r19461, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30482, %r19289, %r19457, %r19401, 0xD2; + lop3.b32 %r30483, %r19293, %r19461, %r19405, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30478, %r19457, %r19401, %r19297, 0xD2; + lop3.b32 %r30479, %r19461, %r19405, %r19301, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30450, %r19401, %r19297, %r19417, 0xD2; + lop3.b32 %r30451, %r19405, %r19301, %r19421, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r30450, %r30451}; + // begin inline asm + // chi + lop3.b32 %r30442, %r19297, %r19417, %r19289, 0xD2; + lop3.b32 %r30443, %r19301, %r19421, %r19293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r30442, %r30443}; + // begin inline asm + // chi + lop3.b32 %r30468, %r19465, %r19449, %r19337, 0xD2; + lop3.b32 %r30469, %r19469, %r19453, %r19341, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+104], {%r30468, %r30469}; + // begin inline asm + // chi + lop3.b32 %r30462, %r19449, %r19337, %r19345, 0xD2; + lop3.b32 %r30463, %r19453, %r19341, %r19349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+112], {%r30462, %r30463}; + // begin inline asm + // chi + lop3.b32 %r30456, %r19337, %r19345, %r19313, 0xD2; + lop3.b32 %r30457, %r19341, %r19349, %r19317, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+120], {%r30456, %r30457}; + // begin inline asm + // chi + lop3.b32 %r30448, %r19345, %r19313, %r19465, 0xD2; + lop3.b32 %r30449, %r19349, %r19317, %r19469, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+128], {%r30448, %r30449}; + // begin inline asm + // chi + lop3.b32 %r30440, %r19313, %r19465, %r19449, 0xD2; + lop3.b32 %r30441, %r19317, %r19469, %r19453, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+136], {%r30440, %r30441}; + // begin inline asm + // chi + lop3.b32 %r30466, %r19369, %r19409, %r19441, 0xD2; + lop3.b32 %r30467, %r19373, %r19413, %r19445, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+144], {%r30466, %r30467}; + // begin inline asm + // chi + lop3.b32 %r30460, %r19409, %r19441, %r19433, 0xD2; + lop3.b32 %r30461, %r19413, %r19445, %r19437, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+152], {%r30460, %r30461}; + // begin inline asm + // chi + lop3.b32 %r30454, %r19441, %r19433, %r19353, 0xD2; + lop3.b32 %r30455, %r19445, %r19437, %r19357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+160], {%r30454, %r30455}; + // begin inline asm + // chi + lop3.b32 %r30446, %r19433, %r19353, %r19369, 0xD2; + lop3.b32 %r30447, %r19437, %r19357, %r19373, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+168], {%r30446, %r30447}; + // begin inline asm + // chi + lop3.b32 %r30438, %r19353, %r19369, %r19409, 0xD2; + lop3.b32 %r30439, %r19357, %r19373, %r19413, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+176], {%r30438, %r30439}; + // begin inline asm + // chi + lop3.b32 %r30464, %r19321, %r19393, %r19305, 0xD2; + lop3.b32 %r30465, %r19325, %r19397, %r19309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+184], {%r30464, %r30465}; + // begin inline asm + // chi + lop3.b32 %r30458, %r19393, %r19305, %r19361, 0xD2; + lop3.b32 %r30459, %r19397, %r19309, %r19365, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+192], {%r30458, %r30459}; + // begin inline asm + // chi + lop3.b32 %r30452, %r19305, %r19361, %r19385, 0xD2; + lop3.b32 %r30453, %r19309, %r19365, 
%r19389, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+200], {%r30452, %r30453}; + // begin inline asm + // chi + lop3.b32 %r30444, %r19361, %r19385, %r19321, 0xD2; + lop3.b32 %r30445, %r19365, %r19389, %r19325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+208], {%r30444, %r30445}; + // begin inline asm + // chi + lop3.b32 %r30436, %r19385, %r19321, %r19393, 0xD2; + lop3.b32 %r30437, %r19389, %r19325, %r19397, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+216], {%r30436, %r30437}; + mul.wide.s32 %rd881, %r30486, 8; + add.s64 %rd880, %rd806, %rd881; + // begin inline asm + ld.global.nc.v2.u32 {%r19673,%r19674}, [%rd880]; + // end inline asm + xor.b32 %r30472, %r19473, %r19673; + xor.b32 %r30473, %r19474, %r19674; + add.s32 %r30486, %r30486, 1; + setp.lt.u32 %p39, %r30486, 23; + @%p39 bra $L__BB2_67; + + mov.u32 %r19784, 1; + st.local.v2.u32 [%rd178+32], {%r30484, %r30485}; + st.local.v2.u32 [%rd178+72], {%r30482, %r30483}; + st.local.v2.u32 [%rd178+40], {%r30480, %r30481}; + st.local.v2.u32 [%rd178+80], {%r30478, %r30479}; + st.local.v2.u32 [%rd178+48], {%r30476, %r30477}; + st.local.v2.u32 [%rd178+56], {%r30474, %r30475}; + st.local.v2.u32 [%rd178+24], {%r30472, %r30473}; + // begin inline asm + // xor5 + lop3.b32 %r19685, %r30472, %r30470, %r30468, 0x96; + lop3.b32 %r19685, %r19685, %r30466, %r30464, 0x96; + lop3.b32 %r19686, %r30473, %r30471, %r30469, 0x96; + lop3.b32 %r19686, %r19686, %r30467, %r30465, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19697, %r30484, %r30482, %r30462, 0x96; + lop3.b32 %r19697, %r19697, %r30460, %r30458, 0x96; + lop3.b32 %r19698, %r30485, %r30483, %r30463, 0x96; + lop3.b32 %r19698, %r19698, %r30461, %r30459, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19709, %r30480, %r30478, %r30456, 0x96; + lop3.b32 %r19709, %r19709, %r30454, %r30452, 0x96; + lop3.b32 %r19710, %r30481, %r30479, %r30457, 0x96; + lop3.b32 %r19710, %r19710, %r30455, %r30453, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19721, %r30476, %r30450, %r30448, 0x96; + lop3.b32 %r19721, %r19721, %r30446, %r30444, 0x96; + lop3.b32 %r19722, %r30477, %r30451, %r30449, 0x96; + lop3.b32 %r19722, %r19722, %r30447, %r30445, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r19733, %r30474, %r30442, %r30440, 0x96; + lop3.b32 %r19733, %r19733, %r30438, %r30436, 0x96; + lop3.b32 %r19734, %r30475, %r30443, %r30441, 0x96; + lop3.b32 %r19734, %r19734, %r30439, %r30437, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19745, %r19698, %r19697, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19749, %r19697, %r19698, %r19784; + // end inline asm + xor.b32 %r19923, %r19745, %r19733; + xor.b32 %r19924, %r19749, %r19734; + xor.b32 %r19892, %r30472, %r19923; + xor.b32 %r19895, %r30473, %r19924; + xor.b32 %r19855, %r30469, %r19924; + xor.b32 %r19854, %r30468, %r19923; + st.local.v2.u32 [%rd178+104], {%r19854, %r19855}; + // begin inline asm + shf.l.wrap.b32 %r19753, %r19710, %r19709, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19757, %r19709, %r19710, %r19784; + // end inline asm + xor.b32 %r19925, %r19753, %r19685; + xor.b32 %r19926, %r19757, %r19686; + xor.b32 %r19791, %r30482, %r19925; + xor.b32 %r19790, %r30483, %r19926; + xor.b32 %r19830, %r30461, %r19926; + xor.b32 %r19831, %r30460, %r19925; + st.local.v2.u32 [%rd178+152], {%r19831, %r19830}; + // begin inline asm + shf.l.wrap.b32 %r19761, %r19722, %r19721, %r19784; + // end inline asm + 
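// NOTE (editor annotation, not part of the generated PTX): the loop above covers rounds 0..22; chi is the lop3 immediate 0xD2 (computing a ^ (~b & c), as in Keccak's chi) and iota is the trailing xor with a 64-bit round constant loaded through %rd880 from the round-constant table based at %rd806. Round 23 is unrolled below so its chi/iota results can be stored straight into the state at %rd178.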
// begin inline asm + shf.l.wrap.b32 %r19765, %r19721, %r19722, %r19784; + // end inline asm + xor.b32 %r19927, %r19761, %r19697; + xor.b32 %r19928, %r19765, %r19698; + xor.b32 %r19814, %r30457, %r19928; + xor.b32 %r19815, %r30456, %r19927; + st.local.v2.u32 [%rd178+120], {%r19815, %r19814}; + xor.b32 %r19806, %r30453, %r19928; + xor.b32 %r19807, %r30452, %r19927; + st.local.v2.u32 [%rd178+200], {%r19807, %r19806}; + // begin inline asm + shf.l.wrap.b32 %r19769, %r19734, %r19733, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19773, %r19733, %r19734, %r19784; + // end inline asm + xor.b32 %r19929, %r19769, %r19709; + xor.b32 %r19930, %r19773, %r19710; + xor.b32 %r19838, %r30476, %r19929; + xor.b32 %r19839, %r30477, %r19930; + xor.b32 %r19847, %r30447, %r19930; + xor.b32 %r19846, %r30446, %r19929; + st.local.v2.u32 [%rd178+168], {%r19846, %r19847}; + // begin inline asm + shf.l.wrap.b32 %r19777, %r19686, %r19685, %r19784; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19781, %r19685, %r19686, %r19784; + // end inline asm + xor.b32 %r19931, %r19777, %r19721; + xor.b32 %r19932, %r19781, %r19722; + xor.b32 %r19798, %r30442, %r19931; + xor.b32 %r19799, %r30443, %r19932; + xor.b32 %r19823, %r30437, %r19932; + xor.b32 %r19822, %r30436, %r19931; + st.local.v2.u32 [%rd178+216], {%r19822, %r19823}; + // begin inline asm + shf.l.wrap.b32 %r19785, %r19791, %r19790, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19789, %r19790, %r19791, %r19288; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19793, %r19799, %r19798, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19797, %r19798, %r19799, %r19296; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19805, %r19806, %r19807, %r19304; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19801, %r19807, %r19806, %r19304; + // end inline asm + st.local.v2.u32 [%rd178+96], {%r19801, %r19805}; + // begin inline asm + shf.l.wrap.b32 %r19809, %r19815, %r19814, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19813, %r19814, %r19815, %r19336; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19817, %r19823, %r19822, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19821, %r19822, %r19823, %r19384; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19829, %r19830, %r19831, %r19408; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19825, %r19831, %r19830, %r19408; + // end inline asm + st.local.v2.u32 [%rd178+88], {%r19825, %r19829}; + // begin inline asm + shf.l.wrap.b32 %r19833, %r19839, %r19838, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19837, %r19838, %r19839, %r19424; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19841, %r19847, %r19846, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19845, %r19846, %r19847, %r19432; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19849, %r19855, %r19854, %r19464; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r19853, %r19854, %r19855, %r19464; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19857, %r19892, %r19785, %r19809, 0xD2; + lop3.b32 %r19858, %r19895, %r19789, %r19813, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r19865, %r19785, %r19809, %r19841, 0xD2; + lop3.b32 %r19866, %r19789, %r19813, %r19845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+32], {%r19865, %r19866}; + // begin inline asm + // chi 
+ lop3.b32 %r19873, %r19809, %r19841, %r19817, 0xD2; + lop3.b32 %r19874, %r19813, %r19845, %r19821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+40], {%r19873, %r19874}; + // begin inline asm + // chi + lop3.b32 %r19881, %r19841, %r19817, %r19892, 0xD2; + lop3.b32 %r19882, %r19845, %r19821, %r19895, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+48], {%r19881, %r19882}; + // begin inline asm + // chi + lop3.b32 %r19889, %r19817, %r19892, %r19785, 0xD2; + lop3.b32 %r19890, %r19821, %r19895, %r19789, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+56], {%r19889, %r19890}; + // begin inline asm + // chi + lop3.b32 %r19897, %r19833, %r19793, %r19849, 0xD2; + lop3.b32 %r19898, %r19837, %r19797, %r19853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+64], {%r19897, %r19898}; + // begin inline asm + // chi + lop3.b32 %r19905, %r19793, %r19849, %r19825, 0xD2; + lop3.b32 %r19906, %r19797, %r19853, %r19829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+72], {%r19905, %r19906}; + // begin inline asm + // chi + lop3.b32 %r19913, %r19849, %r19825, %r19801, 0xD2; + lop3.b32 %r19914, %r19853, %r19829, %r19805, 0xD2; + // end inline asm + st.local.v2.u32 [%rd178+80], {%r19913, %r19914}; + // begin inline asm + ld.global.nc.v2.u32 {%r19921,%r19922}, [%rd807]; + // end inline asm + xor.b32 %r19933, %r19858, %r19922; + xor.b32 %r19934, %r19857, %r19921; + st.local.v2.u32 [%rd178+24], {%r19934, %r19933}; + bra.uni $L__BB2_69; + +$L__BB2_47: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd688, 1179641; + st.local.u64 [%rd3+8], %rd688; + st.local.u32 [%rd3+16], %r1678; + ld.global.u64 %rd689, [%rd128]; + ld.global.u64 %rd690, [%rd128+8]; + ld.global.u64 %rd691, [%rd128+16]; + ld.global.u64 %rd692, [%rd128+24]; + ld.global.u64 %rd693, [%rd128+32]; + ld.global.u64 %rd694, [%rd128+40]; + ld.global.u64 %rd695, [%rd128+48]; + ld.global.u64 %rd696, [%rd128+56]; + st.local.u64 [%rd3+24], %rd689; + st.local.u64 [%rd3+32], %rd690; + st.local.u64 [%rd3+40], %rd691; + st.local.u64 [%rd3+48], %rd692; + st.local.u64 [%rd3+56], %rd693; + st.local.u64 [%rd3+64], %rd694; + st.local.u64 [%rd3+72], %rd695; + st.local.u64 [%rd3+80], %rd696; + cvt.u32.u64 %r13408, %rd689; + xor.b32 %r13409, %r1678, %r13408; + st.local.u32 [%rd3+24], %r13409; + mov.u32 %r30013, 0; + st.local.v2.u32 [%rd3+96], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+104], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+112], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+120], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+128], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+136], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+144], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+152], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+160], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+168], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+176], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+184], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+192], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+200], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+208], {%r30013, %r30013}; + st.local.v2.u32 [%rd3+216], {%r30013, %r30013}; + mov.u32 %r30028, -2147483648; + mov.u32 %r13381, 1; + st.local.v2.u32 [%rd3+88], {%r13381, %r30028}; + ld.local.v2.u32 {%r30049, %r30050}, [%rd3+24]; + mov.b64 {%r30047, %r30048}, %rd694; + shr.u64 %rd697, %rd690, 32; + cvt.u32.u64 %r30061, %rd690; + cvt.u32.u64 %r30062, %rd697; + shr.u64 %rd698, %rd695, 32; + cvt.u32.u64 %r30059, %rd695; + cvt.u32.u64 %r30060, %rd698; + shr.u64 %rd699, %rd691, 32; + cvt.u32.u64 %r30057, %rd691; + cvt.u32.u64 %r30058, %rd699; + 
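// NOTE (editor annotation, not part of the generated PTX): $L__BB2_47 above sets up a fresh sponge at %rd3: eight 64-bit words are loaded from global memory into lanes 0..7, the low word of lane 0 is xor-ed with the 32-bit value in %r1678, lanes 9..24 are cleared, and lane 8 receives the combined pad word 0x8000000000000001 (a 64-byte message inside a 72-byte rate, so both padding bits land in the same lane). The shr.u64/cvt.u32.u64 sequences here split every 64-bit lane into a lo/hi 32-bit register pair so the rounds can run entirely on 32-bit lop3/shf instructions.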
shr.u64 %rd700, %rd696, 32; + cvt.u32.u64 %r30055, %rd696; + cvt.u32.u64 %r30056, %rd700; + shr.u64 %rd701, %rd692, 32; + cvt.u32.u64 %r30053, %rd692; + cvt.u32.u64 %r30054, %rd701; + shr.u64 %rd702, %rd693, 32; + cvt.u32.u64 %r30051, %rd693; + cvt.u32.u64 %r30052, %rd702; + mov.u32 %r30014, %r30013; + mov.u32 %r30015, %r30013; + mov.u32 %r30016, %r30013; + mov.u32 %r30017, %r30013; + mov.u32 %r30018, %r30013; + mov.u32 %r30019, %r30013; + mov.u32 %r30020, %r30013; + mov.u32 %r30021, %r30013; + mov.u32 %r30022, %r30013; + mov.u32 %r30023, %r30013; + mov.u32 %r30024, %r30013; + mov.u32 %r30025, %r30013; + mov.u32 %r30026, %r30013; + mov.u32 %r30027, %r13381; + mov.u32 %r30029, %r30013; + mov.u32 %r30030, %r30013; + mov.u32 %r30031, %r30013; + mov.u32 %r30032, %r30013; + mov.u32 %r30033, %r30013; + mov.u32 %r30034, %r30013; + mov.u32 %r30035, %r30013; + mov.u32 %r30036, %r30013; + mov.u32 %r30037, %r30013; + mov.u32 %r30038, %r30013; + mov.u32 %r30039, %r30013; + mov.u32 %r30040, %r30013; + mov.u32 %r30041, %r30013; + mov.u32 %r30042, %r30013; + mov.u32 %r30043, %r30013; + mov.u32 %r30044, %r30013; + mov.u32 %r30045, %r30013; + mov.u32 %r30046, %r30013; + mov.u32 %r30063, %r30013; + +$L__BB2_48: + // begin inline asm + // xor5 + lop3.b32 %r13412, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13412, %r13412, %r30043, %r30041, 0x96; + lop3.b32 %r13413, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13413, %r13413, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13424, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13424, %r13424, %r30037, %r30035, 0x96; + lop3.b32 %r13425, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13425, %r13425, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13436, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13436, %r13436, %r30031, %r30029, 0x96; + lop3.b32 %r13437, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13437, %r13437, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13448, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13448, %r13448, %r30023, %r30021, 0x96; + lop3.b32 %r13449, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13449, %r13449, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13460, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13460, %r13460, %r30015, %r30013, 0x96; + lop3.b32 %r13461, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13461, %r13461, %r30016, %r30014, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13472, %r13425, %r13424, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13476, %r13424, %r13425, %r13381; + // end inline asm + xor.b32 %r13906, %r13472, %r13460; + xor.b32 %r13907, %r13476, %r13461; + xor.b32 %r13739, %r30049, %r13906; + xor.b32 %r13742, %r30050, %r13907; + xor.b32 %r13646, %r30047, %r13906; + xor.b32 %r13645, %r30048, %r13907; + xor.b32 %r13693, %r30045, %r13906; + xor.b32 %r13694, %r30046, %r13907; + xor.b32 %r13598, %r30043, %r13906; + xor.b32 %r13597, %r30044, %r13907; + xor.b32 %r13549, %r30041, %r13906; + xor.b32 %r13550, %r30042, %r13907; + // begin inline asm + shf.l.wrap.b32 %r13480, %r13437, %r13436, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13484, %r13436, %r13437, %r13381; + // end inline asm + xor.b32 %r13908, %r13480, %r13412; + xor.b32 %r13909, %r13484, %r13413; + xor.b32 %r13701, %r30061, %r13908; + xor.b32 %r13702, %r30062, %r13909; + xor.b32 %r13518, %r30059, %r13908; + xor.b32 
%r13517, %r30060, %r13909; + xor.b32 %r13677, %r30039, %r13908; + xor.b32 %r13678, %r30040, %r13909; + xor.b32 %r13638, %r30037, %r13908; + xor.b32 %r13637, %r30038, %r13909; + xor.b32 %r13621, %r30035, %r13908; + xor.b32 %r13622, %r30036, %r13909; + // begin inline asm + shf.l.wrap.b32 %r13488, %r13449, %r13448, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13492, %r13448, %r13449, %r13381; + // end inline asm + xor.b32 %r13910, %r13488, %r13424; + xor.b32 %r13911, %r13492, %r13425; + xor.b32 %r13558, %r30057, %r13910; + xor.b32 %r13557, %r30058, %r13911; + xor.b32 %r13685, %r30055, %r13910; + xor.b32 %r13686, %r30056, %r13911; + xor.b32 %r13566, %r30033, %r13910; + xor.b32 %r13565, %r30034, %r13911; + xor.b32 %r13669, %r30031, %r13910; + xor.b32 %r13670, %r30032, %r13911; + xor.b32 %r13534, %r30029, %r13910; + xor.b32 %r13533, %r30030, %r13911; + // begin inline asm + shf.l.wrap.b32 %r13496, %r13461, %r13460, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13500, %r13460, %r13461, %r13381; + // end inline asm + xor.b32 %r13912, %r13496, %r13436; + xor.b32 %r13913, %r13500, %r13437; + xor.b32 %r13653, %r30053, %r13912; + xor.b32 %r13654, %r30054, %r13913; + xor.b32 %r13630, %r30027, %r13912; + xor.b32 %r13629, %r30028, %r13913; + xor.b32 %r13573, %r30025, %r13912; + xor.b32 %r13574, %r30026, %r13913; + xor.b32 %r13661, %r30023, %r13912; + xor.b32 %r13662, %r30024, %r13913; + xor.b32 %r13590, %r30021, %r13912; + xor.b32 %r13589, %r30022, %r13913; + // begin inline asm + shf.l.wrap.b32 %r13504, %r13413, %r13412, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13508, %r13412, %r13413, %r13381; + // end inline asm + xor.b32 %r13914, %r13504, %r13448; + xor.b32 %r13915, %r13508, %r13449; + xor.b32 %r13605, %r30051, %r13914; + xor.b32 %r13606, %r30052, %r13915; + xor.b32 %r13525, %r30019, %r13914; + xor.b32 %r13526, %r30020, %r13915; + xor.b32 %r13542, %r30017, %r13914; + xor.b32 %r13541, %r30018, %r13915; + xor.b32 %r13581, %r30015, %r13914; + xor.b32 %r13582, %r30016, %r13915; + xor.b32 %r13613, %r30013, %r13914; + xor.b32 %r13614, %r30014, %r13915; + mov.u32 %r13519, 44; + // begin inline asm + shf.l.wrap.b32 %r13512, %r13518, %r13517, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13516, %r13517, %r13518, %r13519; + // end inline asm + mov.u32 %r13527, 20; + // begin inline asm + shf.l.wrap.b32 %r13520, %r13526, %r13525, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13524, %r13525, %r13526, %r13527; + // end inline asm + mov.u32 %r13535, 61; + // begin inline asm + shf.l.wrap.b32 %r13528, %r13534, %r13533, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13532, %r13533, %r13534, %r13535; + // end inline asm + mov.u32 %r13543, 39; + // begin inline asm + shf.l.wrap.b32 %r13536, %r13542, %r13541, %r13543; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13540, %r13541, %r13542, %r13543; + // end inline asm + mov.u32 %r13551, 18; + // begin inline asm + shf.l.wrap.b32 %r13544, %r13550, %r13549, %r13551; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13548, %r13549, %r13550, %r13551; + // end inline asm + mov.u32 %r13559, 62; + // begin inline asm + shf.l.wrap.b32 %r13552, %r13558, %r13557, %r13559; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13556, %r13557, %r13558, %r13559; + // end inline asm + mov.u32 %r13567, 43; + // begin inline asm + shf.l.wrap.b32 %r13560, %r13566, %r13565, %r13567; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r13564, %r13565, %r13566, %r13567; + // end inline asm + mov.u32 %r13575, 25; + // begin inline asm + shf.l.wrap.b32 %r13568, %r13574, %r13573, %r13575; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13572, %r13573, %r13574, %r13575; + // end inline asm + mov.u32 %r13583, 8; + // begin inline asm + shf.l.wrap.b32 %r13576, %r13582, %r13581, %r13583; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13580, %r13581, %r13582, %r13583; + // end inline asm + mov.u32 %r13591, 56; + // begin inline asm + shf.l.wrap.b32 %r13584, %r13590, %r13589, %r13591; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13588, %r13589, %r13590, %r13591; + // end inline asm + mov.u32 %r13599, 41; + // begin inline asm + shf.l.wrap.b32 %r13592, %r13598, %r13597, %r13599; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13596, %r13597, %r13598, %r13599; + // end inline asm + mov.u32 %r13607, 27; + // begin inline asm + shf.l.wrap.b32 %r13600, %r13606, %r13605, %r13607; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13604, %r13605, %r13606, %r13607; + // end inline asm + mov.u32 %r13615, 14; + // begin inline asm + shf.l.wrap.b32 %r13608, %r13614, %r13613, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13612, %r13613, %r13614, %r13615; + // end inline asm + mov.u32 %r13623, 2; + // begin inline asm + shf.l.wrap.b32 %r13616, %r13622, %r13621, %r13623; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13620, %r13621, %r13622, %r13623; + // end inline asm + mov.u32 %r13631, 55; + // begin inline asm + shf.l.wrap.b32 %r13624, %r13630, %r13629, %r13631; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13628, %r13629, %r13630, %r13631; + // end inline asm + mov.u32 %r13639, 45; + // begin inline asm + shf.l.wrap.b32 %r13632, %r13638, %r13637, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13636, %r13637, %r13638, %r13639; + // end inline asm + mov.u32 %r13647, 36; + // begin inline asm + shf.l.wrap.b32 %r13640, %r13646, %r13645, %r13647; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13644, %r13645, %r13646, %r13647; + // end inline asm + mov.u32 %r13655, 28; + // begin inline asm + shf.l.wrap.b32 %r13648, %r13654, %r13653, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13652, %r13653, %r13654, %r13655; + // end inline asm + mov.u32 %r13663, 21; + // begin inline asm + shf.l.wrap.b32 %r13656, %r13662, %r13661, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13660, %r13661, %r13662, %r13663; + // end inline asm + mov.u32 %r13671, 15; + // begin inline asm + shf.l.wrap.b32 %r13664, %r13670, %r13669, %r13671; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13668, %r13669, %r13670, %r13671; + // end inline asm + mov.u32 %r13679, 10; + // begin inline asm + shf.l.wrap.b32 %r13672, %r13678, %r13677, %r13679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13676, %r13677, %r13678, %r13679; + // end inline asm + mov.u32 %r13687, 6; + // begin inline asm + shf.l.wrap.b32 %r13680, %r13686, %r13685, %r13687; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13684, %r13685, %r13686, %r13687; + // end inline asm + mov.u32 %r13695, 3; + // begin inline asm + shf.l.wrap.b32 %r13688, %r13694, %r13693, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13692, %r13693, %r13694, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13696, %r13702, 
%r13701, %r13381; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13700, %r13701, %r13702, %r13381; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r13704, %r13739, %r13512, %r13560, 0xD2; + lop3.b32 %r13705, %r13742, %r13516, %r13564, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30061, %r13512, %r13560, %r13656, 0xD2; + lop3.b32 %r30062, %r13516, %r13564, %r13660, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30057, %r13560, %r13656, %r13608, 0xD2; + lop3.b32 %r30058, %r13564, %r13660, %r13612, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30053, %r13656, %r13608, %r13739, 0xD2; + lop3.b32 %r30054, %r13660, %r13612, %r13742, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30051, %r13608, %r13739, %r13512, 0xD2; + lop3.b32 %r30052, %r13612, %r13742, %r13516, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30047, %r13648, %r13520, %r13688, 0xD2; + lop3.b32 %r30048, %r13652, %r13524, %r13692, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30059, %r13520, %r13688, %r13632, 0xD2; + lop3.b32 %r30060, %r13524, %r13692, %r13636, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30055, %r13688, %r13632, %r13528, 0xD2; + lop3.b32 %r30056, %r13692, %r13636, %r13532, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30027, %r13632, %r13528, %r13648, 0xD2; + lop3.b32 %r30028, %r13636, %r13532, %r13652, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30027, %r30028}; + // begin inline asm + // chi + lop3.b32 %r30019, %r13528, %r13648, %r13520, 0xD2; + lop3.b32 %r30020, %r13532, %r13652, %r13524, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30019, %r30020}; + // begin inline asm + // chi + lop3.b32 %r30045, %r13696, %r13680, %r13568, 0xD2; + lop3.b32 %r30046, %r13700, %r13684, %r13572, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30045, %r30046}; + // begin inline asm + // chi + lop3.b32 %r30039, %r13680, %r13568, %r13576, 0xD2; + lop3.b32 %r30040, %r13684, %r13572, %r13580, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30039, %r30040}; + // begin inline asm + // chi + lop3.b32 %r30033, %r13568, %r13576, %r13544, 0xD2; + lop3.b32 %r30034, %r13572, %r13580, %r13548, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30033, %r30034}; + // begin inline asm + // chi + lop3.b32 %r30025, %r13576, %r13544, %r13696, 0xD2; + lop3.b32 %r30026, %r13580, %r13548, %r13700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30025, %r30026}; + // begin inline asm + // chi + lop3.b32 %r30017, %r13544, %r13696, %r13680, 0xD2; + lop3.b32 %r30018, %r13548, %r13700, %r13684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30017, %r30018}; + // begin inline asm + // chi + lop3.b32 %r30043, %r13600, %r13640, %r13672, 0xD2; + lop3.b32 %r30044, %r13604, %r13644, %r13676, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30043, %r30044}; + // begin inline asm + // chi + lop3.b32 %r30037, %r13640, %r13672, %r13664, 0xD2; + lop3.b32 %r30038, %r13644, %r13676, %r13668, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30037, %r30038}; + // begin inline asm + // chi + lop3.b32 %r30031, %r13672, %r13664, %r13584, 0xD2; + lop3.b32 %r30032, %r13676, %r13668, %r13588, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30031, %r30032}; + // begin inline asm + // chi + lop3.b32 %r30023, %r13664, %r13584, %r13600, 0xD2; + lop3.b32 
%r30024, %r13668, %r13588, %r13604, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30023, %r30024}; + // begin inline asm + // chi + lop3.b32 %r30015, %r13584, %r13600, %r13640, 0xD2; + lop3.b32 %r30016, %r13588, %r13604, %r13644, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30015, %r30016}; + // begin inline asm + // chi + lop3.b32 %r30041, %r13552, %r13624, %r13536, 0xD2; + lop3.b32 %r30042, %r13556, %r13628, %r13540, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30041, %r30042}; + // begin inline asm + // chi + lop3.b32 %r30035, %r13624, %r13536, %r13592, 0xD2; + lop3.b32 %r30036, %r13628, %r13540, %r13596, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30035, %r30036}; + // begin inline asm + // chi + lop3.b32 %r30029, %r13536, %r13592, %r13616, 0xD2; + lop3.b32 %r30030, %r13540, %r13596, %r13620, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30029, %r30030}; + // begin inline asm + // chi + lop3.b32 %r30021, %r13592, %r13616, %r13552, 0xD2; + lop3.b32 %r30022, %r13596, %r13620, %r13556, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30021, %r30022}; + // begin inline asm + // chi + lop3.b32 %r30013, %r13616, %r13552, %r13624, 0xD2; + lop3.b32 %r30014, %r13620, %r13556, %r13628, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30013, %r30014}; + mul.wide.s32 %rd704, %r30063, 8; + mov.u64 %rd705, keccak_round_constants; + cvta.const.u64 %rd706, %rd705; + add.s64 %rd703, %rd706, %rd704; + // begin inline asm + ld.global.nc.v2.u32 {%r13904,%r13905}, [%rd703]; + // end inline asm + xor.b32 %r30049, %r13704, %r13904; + xor.b32 %r30050, %r13705, %r13905; + add.s32 %r30063, %r30063, 1; + setp.lt.u32 %p30, %r30063, 23; + @%p30 bra $L__BB2_48; + + add.u64 %rd149, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30061, %r30062}; + st.local.v2.u32 [%rd3+72], {%r30059, %r30060}; + st.local.v2.u32 [%rd3+40], {%r30057, %r30058}; + st.local.v2.u32 [%rd3+80], {%r30055, %r30056}; + st.local.v2.u32 [%rd3+48], {%r30053, %r30054}; + st.local.v2.u32 [%rd3+56], {%r30051, %r30052}; + st.local.v2.u32 [%rd3+24], {%r30049, %r30050}; + // begin inline asm + // xor5 + lop3.b32 %r13916, %r30049, %r30047, %r30045, 0x96; + lop3.b32 %r13916, %r13916, %r30043, %r30041, 0x96; + lop3.b32 %r13917, %r30050, %r30048, %r30046, 0x96; + lop3.b32 %r13917, %r13917, %r30044, %r30042, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13928, %r30061, %r30059, %r30039, 0x96; + lop3.b32 %r13928, %r13928, %r30037, %r30035, 0x96; + lop3.b32 %r13929, %r30062, %r30060, %r30040, 0x96; + lop3.b32 %r13929, %r13929, %r30038, %r30036, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13940, %r30057, %r30055, %r30033, 0x96; + lop3.b32 %r13940, %r13940, %r30031, %r30029, 0x96; + lop3.b32 %r13941, %r30058, %r30056, %r30034, 0x96; + lop3.b32 %r13941, %r13941, %r30032, %r30030, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13952, %r30053, %r30027, %r30025, 0x96; + lop3.b32 %r13952, %r13952, %r30023, %r30021, 0x96; + lop3.b32 %r13953, %r30054, %r30028, %r30026, 0x96; + lop3.b32 %r13953, %r13953, %r30024, %r30022, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r13964, %r30051, %r30019, %r30017, 0x96; + lop3.b32 %r13964, %r13964, %r30015, %r30013, 0x96; + lop3.b32 %r13965, %r30052, %r30020, %r30018, 0x96; + lop3.b32 %r13965, %r13965, %r30016, %r30014, 0x96; + // end inline asm + mov.u32 %r14168, 1; + // begin inline asm + shf.l.wrap.b32 %r13976, %r13929, %r13928, %r14168; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r13980, %r13928, %r13929, %r14168; + // end inline asm + xor.b32 %r14195, %r13976, %r13964; + xor.b32 %r14196, %r13980, %r13965; + xor.b32 %r14123, %r30049, %r14195; + xor.b32 %r14126, %r30050, %r14196; + xor.b32 %r14086, %r30046, %r14196; + xor.b32 %r14085, %r30045, %r14195; + st.local.v2.u32 [%rd3+104], {%r14085, %r14086}; + // begin inline asm + shf.l.wrap.b32 %r13984, %r13941, %r13940, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13988, %r13940, %r13941, %r14168; + // end inline asm + xor.b32 %r14197, %r13984, %r13916; + xor.b32 %r14198, %r13988, %r13917; + xor.b32 %r14022, %r30059, %r14197; + xor.b32 %r14021, %r30060, %r14198; + xor.b32 %r14061, %r30038, %r14198; + xor.b32 %r14062, %r30037, %r14197; + st.local.v2.u32 [%rd3+152], {%r14062, %r14061}; + // begin inline asm + shf.l.wrap.b32 %r13992, %r13953, %r13952, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r13996, %r13952, %r13953, %r14168; + // end inline asm + xor.b32 %r14199, %r13992, %r13928; + xor.b32 %r14200, %r13996, %r13929; + xor.b32 %r14045, %r30034, %r14200; + xor.b32 %r14046, %r30033, %r14199; + st.local.v2.u32 [%rd3+120], {%r14046, %r14045}; + xor.b32 %r14037, %r30030, %r14200; + xor.b32 %r14038, %r30029, %r14199; + st.local.v2.u32 [%rd3+200], {%r14038, %r14037}; + // begin inline asm + shf.l.wrap.b32 %r14000, %r13965, %r13964, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14004, %r13964, %r13965, %r14168; + // end inline asm + xor.b32 %r14201, %r14000, %r13940; + xor.b32 %r14202, %r14004, %r13941; + xor.b32 %r14069, %r30053, %r14201; + xor.b32 %r14070, %r30054, %r14202; + xor.b32 %r14078, %r30024, %r14202; + xor.b32 %r14077, %r30023, %r14201; + st.local.v2.u32 [%rd3+168], {%r14077, %r14078}; + // begin inline asm + shf.l.wrap.b32 %r14008, %r13917, %r13916, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14012, %r13916, %r13917, %r14168; + // end inline asm + xor.b32 %r14203, %r14008, %r13952; + xor.b32 %r14204, %r14012, %r13953; + xor.b32 %r14029, %r30019, %r14203; + xor.b32 %r14030, %r30020, %r14204; + xor.b32 %r14054, %r30014, %r14204; + xor.b32 %r14053, %r30013, %r14203; + st.local.v2.u32 [%rd3+216], {%r14053, %r14054}; + // begin inline asm + shf.l.wrap.b32 %r14016, %r14022, %r14021, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14020, %r14021, %r14022, %r13519; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14024, %r14030, %r14029, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14028, %r14029, %r14030, %r13527; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14036, %r14037, %r14038, %r13535; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14032, %r14038, %r14037, %r13535; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r14032, %r14036}; + // begin inline asm + shf.l.wrap.b32 %r14040, %r14046, %r14045, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14044, %r14045, %r14046, %r13567; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14048, %r14054, %r14053, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14052, %r14053, %r14054, %r13615; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14060, %r14061, %r14062, %r13639; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14056, %r14062, %r14061, %r13639; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r14056, %r14060}; + // begin inline asm + shf.l.wrap.b32 
%r14064, %r14070, %r14069, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14068, %r14069, %r14070, %r13655; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14072, %r14078, %r14077, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14076, %r14077, %r14078, %r13663; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14080, %r14086, %r14085, %r13695; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14084, %r14085, %r14086, %r13695; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14088, %r14123, %r14016, %r14040, 0xD2; + lop3.b32 %r14089, %r14126, %r14020, %r14044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r14016, %r14040, %r14072, 0xD2; + lop3.b32 %r30197, %r14020, %r14044, %r14076, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + // begin inline asm + // chi + lop3.b32 %r30192, %r14040, %r14072, %r14048, 0xD2; + lop3.b32 %r30193, %r14044, %r14076, %r14052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + // begin inline asm + // chi + lop3.b32 %r30188, %r14072, %r14048, %r14123, 0xD2; + lop3.b32 %r30189, %r14076, %r14052, %r14126, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + // begin inline asm + // chi + lop3.b32 %r30186, %r14048, %r14123, %r14016, 0xD2; + lop3.b32 %r30187, %r14052, %r14126, %r14020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + // begin inline asm + // chi + lop3.b32 %r30182, %r14064, %r14024, %r14080, 0xD2; + lop3.b32 %r30183, %r14068, %r14028, %r14084, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + // begin inline asm + // chi + lop3.b32 %r30194, %r14024, %r14080, %r14056, 0xD2; + lop3.b32 %r30195, %r14028, %r14084, %r14060, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + // begin inline asm + // chi + lop3.b32 %r30190, %r14080, %r14056, %r14032, 0xD2; + lop3.b32 %r30191, %r14084, %r14060, %r14036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + add.s64 %rd707, %rd706, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r14152,%r14153}, [%rd707]; + // end inline asm + xor.b32 %r30184, %r14088, %r14152; + xor.b32 %r30185, %r14089, %r14153; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.u64 [%rd149], %rd354; + mov.u64 %rd711, 1179641; + st.local.u64 [%rd149+8], %rd711; + add.s32 %r1874, %r1678, 1; + st.local.u32 [%rd149+16], %r1874; + ld.global.u64 %rd712, [%rd129]; + ld.global.u64 %rd713, [%rd129+8]; + ld.global.u64 %rd714, [%rd129+16]; + ld.global.u64 %rd715, [%rd129+24]; + ld.global.u64 %rd716, [%rd129+32]; + ld.global.u64 %rd717, [%rd129+40]; + ld.global.u64 %rd718, [%rd129+48]; + ld.global.u64 %rd719, [%rd129+56]; + st.local.u64 [%rd149+32], %rd713; + st.local.u64 [%rd149+40], %rd714; + st.local.u64 [%rd149+48], %rd715; + st.local.u64 [%rd149+56], %rd716; + st.local.u64 [%rd149+64], %rd717; + st.local.u64 [%rd149+72], %rd718; + st.local.u64 [%rd149+80], %rd719; + cvt.u32.u64 %r14205, %rd712; + xor.b32 %r14206, %r1874, %r14205; + st.local.u64 [%rd149+24], %rd712; + st.local.u32 [%rd149+24], %r14206; + mov.u32 %r30064, 0; + st.local.v2.u32 [%rd149+96], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+104], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+112], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+120], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+128], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+136], {%r30064, %r30064}; 
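// NOTE (editor annotation, not part of the generated PTX): this repeats the $L__BB2_47 absorb setup for a second sponge state at %rd149, with the mixed-in index bumped to %r1678 + 1; the stores below finish clearing lanes 9..24 before lane 8 gets the same 0x8000000000000001 pad word.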
+ st.local.v2.u32 [%rd149+144], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+152], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+160], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+168], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+176], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+184], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+192], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+200], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+208], {%r30064, %r30064}; + st.local.v2.u32 [%rd149+216], {%r30064, %r30064}; + mov.u32 %r30079, -2147483648; + st.local.v2.u32 [%rd149+88], {%r14168, %r30079}; + ld.local.v2.u32 {%r30100, %r30101}, [%rd149+24]; + mov.b64 {%r30098, %r30099}, %rd717; + shr.u64 %rd720, %rd713, 32; + cvt.u32.u64 %r30112, %rd713; + cvt.u32.u64 %r30113, %rd720; + shr.u64 %rd721, %rd718, 32; + cvt.u32.u64 %r30110, %rd718; + cvt.u32.u64 %r30111, %rd721; + shr.u64 %rd722, %rd714, 32; + cvt.u32.u64 %r30108, %rd714; + cvt.u32.u64 %r30109, %rd722; + shr.u64 %rd723, %rd719, 32; + cvt.u32.u64 %r30106, %rd719; + cvt.u32.u64 %r30107, %rd723; + shr.u64 %rd724, %rd715, 32; + cvt.u32.u64 %r30104, %rd715; + cvt.u32.u64 %r30105, %rd724; + shr.u64 %rd725, %rd716, 32; + cvt.u32.u64 %r30102, %rd716; + cvt.u32.u64 %r30103, %rd725; + mov.u32 %r30065, %r30064; + mov.u32 %r30066, %r30064; + mov.u32 %r30067, %r30064; + mov.u32 %r30068, %r30064; + mov.u32 %r30069, %r30064; + mov.u32 %r30070, %r30064; + mov.u32 %r30071, %r30064; + mov.u32 %r30072, %r30064; + mov.u32 %r30073, %r30064; + mov.u32 %r30074, %r30064; + mov.u32 %r30075, %r30064; + mov.u32 %r30076, %r30064; + mov.u32 %r30077, %r30064; + mov.u32 %r30078, %r14168; + mov.u32 %r30080, %r30064; + mov.u32 %r30081, %r30064; + mov.u32 %r30082, %r30064; + mov.u32 %r30083, %r30064; + mov.u32 %r30084, %r30064; + mov.u32 %r30085, %r30064; + mov.u32 %r30086, %r30064; + mov.u32 %r30087, %r30064; + mov.u32 %r30088, %r30064; + mov.u32 %r30089, %r30064; + mov.u32 %r30090, %r30064; + mov.u32 %r30091, %r30064; + mov.u32 %r30092, %r30064; + mov.u32 %r30093, %r30064; + mov.u32 %r30094, %r30064; + mov.u32 %r30095, %r30064; + mov.u32 %r30096, %r30064; + mov.u32 %r30097, %r30064; + mov.u32 %r30114, %r30064; + +$L__BB2_50: + // begin inline asm + // xor5 + lop3.b32 %r14209, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14209, %r14209, %r30094, %r30092, 0x96; + lop3.b32 %r14210, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14210, %r14210, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14221, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14221, %r14221, %r30088, %r30086, 0x96; + lop3.b32 %r14222, %r30113, %r30111, %r30091, 0x96; + lop3.b32 %r14222, %r14222, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14233, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14233, %r14233, %r30082, %r30080, 0x96; + lop3.b32 %r14234, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14234, %r14234, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14245, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14245, %r14245, %r30074, %r30072, 0x96; + lop3.b32 %r14246, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14246, %r14246, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14257, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14257, %r14257, %r30066, %r30064, 0x96; + lop3.b32 %r14258, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14258, %r14258, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14269, 
%r14222, %r14221, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14273, %r14221, %r14222, %r14168; + // end inline asm + xor.b32 %r14703, %r14269, %r14257; + xor.b32 %r14704, %r14273, %r14258; + xor.b32 %r14536, %r30100, %r14703; + xor.b32 %r14539, %r30101, %r14704; + xor.b32 %r14443, %r30098, %r14703; + xor.b32 %r14442, %r30099, %r14704; + xor.b32 %r14490, %r30096, %r14703; + xor.b32 %r14491, %r30097, %r14704; + xor.b32 %r14395, %r30094, %r14703; + xor.b32 %r14394, %r30095, %r14704; + xor.b32 %r14346, %r30092, %r14703; + xor.b32 %r14347, %r30093, %r14704; + // begin inline asm + shf.l.wrap.b32 %r14277, %r14234, %r14233, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14281, %r14233, %r14234, %r14168; + // end inline asm + xor.b32 %r14705, %r14277, %r14209; + xor.b32 %r14706, %r14281, %r14210; + xor.b32 %r14498, %r30112, %r14705; + xor.b32 %r14499, %r30113, %r14706; + xor.b32 %r14315, %r30110, %r14705; + xor.b32 %r14314, %r30111, %r14706; + xor.b32 %r14474, %r30090, %r14705; + xor.b32 %r14475, %r30091, %r14706; + xor.b32 %r14435, %r30088, %r14705; + xor.b32 %r14434, %r30089, %r14706; + xor.b32 %r14418, %r30086, %r14705; + xor.b32 %r14419, %r30087, %r14706; + // begin inline asm + shf.l.wrap.b32 %r14285, %r14246, %r14245, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14289, %r14245, %r14246, %r14168; + // end inline asm + xor.b32 %r14707, %r14285, %r14221; + xor.b32 %r14708, %r14289, %r14222; + xor.b32 %r14355, %r30108, %r14707; + xor.b32 %r14354, %r30109, %r14708; + xor.b32 %r14482, %r30106, %r14707; + xor.b32 %r14483, %r30107, %r14708; + xor.b32 %r14363, %r30084, %r14707; + xor.b32 %r14362, %r30085, %r14708; + xor.b32 %r14466, %r30082, %r14707; + xor.b32 %r14467, %r30083, %r14708; + xor.b32 %r14331, %r30080, %r14707; + xor.b32 %r14330, %r30081, %r14708; + // begin inline asm + shf.l.wrap.b32 %r14293, %r14258, %r14257, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14297, %r14257, %r14258, %r14168; + // end inline asm + xor.b32 %r14709, %r14293, %r14233; + xor.b32 %r14710, %r14297, %r14234; + xor.b32 %r14450, %r30104, %r14709; + xor.b32 %r14451, %r30105, %r14710; + xor.b32 %r14427, %r30078, %r14709; + xor.b32 %r14426, %r30079, %r14710; + xor.b32 %r14370, %r30076, %r14709; + xor.b32 %r14371, %r30077, %r14710; + xor.b32 %r14458, %r30074, %r14709; + xor.b32 %r14459, %r30075, %r14710; + xor.b32 %r14387, %r30072, %r14709; + xor.b32 %r14386, %r30073, %r14710; + // begin inline asm + shf.l.wrap.b32 %r14301, %r14210, %r14209, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14305, %r14209, %r14210, %r14168; + // end inline asm + xor.b32 %r14711, %r14301, %r14245; + xor.b32 %r14712, %r14305, %r14246; + xor.b32 %r14402, %r30102, %r14711; + xor.b32 %r14403, %r30103, %r14712; + xor.b32 %r14322, %r30070, %r14711; + xor.b32 %r14323, %r30071, %r14712; + xor.b32 %r14339, %r30068, %r14711; + xor.b32 %r14338, %r30069, %r14712; + xor.b32 %r14378, %r30066, %r14711; + xor.b32 %r14379, %r30067, %r14712; + xor.b32 %r14410, %r30064, %r14711; + xor.b32 %r14411, %r30065, %r14712; + mov.u32 %r14316, 44; + // begin inline asm + shf.l.wrap.b32 %r14309, %r14315, %r14314, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14313, %r14314, %r14315, %r14316; + // end inline asm + mov.u32 %r14324, 20; + // begin inline asm + shf.l.wrap.b32 %r14317, %r14323, %r14322, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14321, %r14322, %r14323, %r14324; + // end inline asm + 
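// note: rho/pi step: each remaining 64-bit lane is rotated by its fixed Keccak offset (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27, 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1), each rotate built from a pair of shf.l.wrap.b32 funnel shifts over the lane's two 32-bit halves + 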
mov.u32 %r14332, 61; + // begin inline asm + shf.l.wrap.b32 %r14325, %r14331, %r14330, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14329, %r14330, %r14331, %r14332; + // end inline asm + mov.u32 %r14340, 39; + // begin inline asm + shf.l.wrap.b32 %r14333, %r14339, %r14338, %r14340; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14337, %r14338, %r14339, %r14340; + // end inline asm + mov.u32 %r14348, 18; + // begin inline asm + shf.l.wrap.b32 %r14341, %r14347, %r14346, %r14348; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14345, %r14346, %r14347, %r14348; + // end inline asm + mov.u32 %r14356, 62; + // begin inline asm + shf.l.wrap.b32 %r14349, %r14355, %r14354, %r14356; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14353, %r14354, %r14355, %r14356; + // end inline asm + mov.u32 %r14364, 43; + // begin inline asm + shf.l.wrap.b32 %r14357, %r14363, %r14362, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14361, %r14362, %r14363, %r14364; + // end inline asm + mov.u32 %r14372, 25; + // begin inline asm + shf.l.wrap.b32 %r14365, %r14371, %r14370, %r14372; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14369, %r14370, %r14371, %r14372; + // end inline asm + mov.u32 %r14380, 8; + // begin inline asm + shf.l.wrap.b32 %r14373, %r14379, %r14378, %r14380; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14377, %r14378, %r14379, %r14380; + // end inline asm + mov.u32 %r14388, 56; + // begin inline asm + shf.l.wrap.b32 %r14381, %r14387, %r14386, %r14388; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14385, %r14386, %r14387, %r14388; + // end inline asm + mov.u32 %r14396, 41; + // begin inline asm + shf.l.wrap.b32 %r14389, %r14395, %r14394, %r14396; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14393, %r14394, %r14395, %r14396; + // end inline asm + mov.u32 %r14404, 27; + // begin inline asm + shf.l.wrap.b32 %r14397, %r14403, %r14402, %r14404; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14401, %r14402, %r14403, %r14404; + // end inline asm + mov.u32 %r14412, 14; + // begin inline asm + shf.l.wrap.b32 %r14405, %r14411, %r14410, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14409, %r14410, %r14411, %r14412; + // end inline asm + mov.u32 %r14420, 2; + // begin inline asm + shf.l.wrap.b32 %r14413, %r14419, %r14418, %r14420; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14417, %r14418, %r14419, %r14420; + // end inline asm + mov.u32 %r14428, 55; + // begin inline asm + shf.l.wrap.b32 %r14421, %r14427, %r14426, %r14428; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14425, %r14426, %r14427, %r14428; + // end inline asm + mov.u32 %r14436, 45; + // begin inline asm + shf.l.wrap.b32 %r14429, %r14435, %r14434, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14433, %r14434, %r14435, %r14436; + // end inline asm + mov.u32 %r14444, 36; + // begin inline asm + shf.l.wrap.b32 %r14437, %r14443, %r14442, %r14444; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14441, %r14442, %r14443, %r14444; + // end inline asm + mov.u32 %r14452, 28; + // begin inline asm + shf.l.wrap.b32 %r14445, %r14451, %r14450, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14449, %r14450, %r14451, %r14452; + // end inline asm + mov.u32 %r14460, 21; + // begin inline asm + shf.l.wrap.b32 %r14453, %r14459, %r14458, %r14460; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14457, %r14458, %r14459, %r14460; + // end inline asm + mov.u32 %r14468, 15; + // begin inline asm + shf.l.wrap.b32 %r14461, %r14467, %r14466, %r14468; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14465, %r14466, %r14467, %r14468; + // end inline asm + mov.u32 %r14476, 10; + // begin inline asm + shf.l.wrap.b32 %r14469, %r14475, %r14474, %r14476; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14473, %r14474, %r14475, %r14476; + // end inline asm + mov.u32 %r14484, 6; + // begin inline asm + shf.l.wrap.b32 %r14477, %r14483, %r14482, %r14484; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14481, %r14482, %r14483, %r14484; + // end inline asm + mov.u32 %r14492, 3; + // begin inline asm + shf.l.wrap.b32 %r14485, %r14491, %r14490, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14489, %r14490, %r14491, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14493, %r14499, %r14498, %r14168; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14497, %r14498, %r14499, %r14168; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14501, %r14536, %r14309, %r14357, 0xD2; + lop3.b32 %r14502, %r14539, %r14313, %r14361, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30112, %r14309, %r14357, %r14453, 0xD2; + lop3.b32 %r30113, %r14313, %r14361, %r14457, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30108, %r14357, %r14453, %r14405, 0xD2; + lop3.b32 %r30109, %r14361, %r14457, %r14409, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30104, %r14453, %r14405, %r14536, 0xD2; + lop3.b32 %r30105, %r14457, %r14409, %r14539, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30102, %r14405, %r14536, %r14309, 0xD2; + lop3.b32 %r30103, %r14409, %r14539, %r14313, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30098, %r14445, %r14317, %r14485, 0xD2; + lop3.b32 %r30099, %r14449, %r14321, %r14489, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30110, %r14317, %r14485, %r14429, 0xD2; + lop3.b32 %r30111, %r14321, %r14489, %r14433, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30106, %r14485, %r14429, %r14325, 0xD2; + lop3.b32 %r30107, %r14489, %r14433, %r14329, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30078, %r14429, %r14325, %r14445, 0xD2; + lop3.b32 %r30079, %r14433, %r14329, %r14449, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30078, %r30079}; + // begin inline asm + // chi + lop3.b32 %r30070, %r14325, %r14445, %r14317, 0xD2; + lop3.b32 %r30071, %r14329, %r14449, %r14321, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30070, %r30071}; + // begin inline asm + // chi + lop3.b32 %r30096, %r14493, %r14477, %r14365, 0xD2; + lop3.b32 %r30097, %r14497, %r14481, %r14369, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30096, %r30097}; + // begin inline asm + // chi + lop3.b32 %r30090, %r14477, %r14365, %r14373, 0xD2; + lop3.b32 %r30091, %r14481, %r14369, %r14377, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30090, %r30091}; + // begin inline asm + // chi + lop3.b32 %r30084, %r14365, %r14373, %r14341, 0xD2; + lop3.b32 %r30085, %r14369, %r14377, %r14345, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30084, %r30085}; + // begin inline asm + // chi + lop3.b32 %r30076, %r14373, %r14341, %r14493, 0xD2; + lop3.b32 %r30077, %r14377, %r14345, %r14497, 0xD2; + // 
end inline asm + st.local.v2.u32 [%rd149+128], {%r30076, %r30077}; + // begin inline asm + // chi + lop3.b32 %r30068, %r14341, %r14493, %r14477, 0xD2; + lop3.b32 %r30069, %r14345, %r14497, %r14481, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30068, %r30069}; + // begin inline asm + // chi + lop3.b32 %r30094, %r14397, %r14437, %r14469, 0xD2; + lop3.b32 %r30095, %r14401, %r14441, %r14473, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30094, %r30095}; + // begin inline asm + // chi + lop3.b32 %r30088, %r14437, %r14469, %r14461, 0xD2; + lop3.b32 %r30089, %r14441, %r14473, %r14465, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30088, %r30089}; + // begin inline asm + // chi + lop3.b32 %r30082, %r14469, %r14461, %r14381, 0xD2; + lop3.b32 %r30083, %r14473, %r14465, %r14385, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+160], {%r30082, %r30083}; + // begin inline asm + // chi + lop3.b32 %r30074, %r14461, %r14381, %r14397, 0xD2; + lop3.b32 %r30075, %r14465, %r14385, %r14401, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30074, %r30075}; + // begin inline asm + // chi + lop3.b32 %r30066, %r14381, %r14397, %r14437, 0xD2; + lop3.b32 %r30067, %r14385, %r14401, %r14441, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30066, %r30067}; + // begin inline asm + // chi + lop3.b32 %r30092, %r14349, %r14421, %r14333, 0xD2; + lop3.b32 %r30093, %r14353, %r14425, %r14337, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30092, %r30093}; + // begin inline asm + // chi + lop3.b32 %r30086, %r14421, %r14333, %r14389, 0xD2; + lop3.b32 %r30087, %r14425, %r14337, %r14393, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30086, %r30087}; + // begin inline asm + // chi + lop3.b32 %r30080, %r14333, %r14389, %r14413, 0xD2; + lop3.b32 %r30081, %r14337, %r14393, %r14417, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30080, %r30081}; + // begin inline asm + // chi + lop3.b32 %r30072, %r14389, %r14413, %r14349, 0xD2; + lop3.b32 %r30073, %r14393, %r14417, %r14353, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30072, %r30073}; + // begin inline asm + // chi + lop3.b32 %r30064, %r14413, %r14349, %r14421, 0xD2; + lop3.b32 %r30065, %r14417, %r14353, %r14425, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30064, %r30065}; + mul.wide.s32 %rd727, %r30114, 8; + add.s64 %rd726, %rd706, %rd727; + // begin inline asm + ld.global.nc.v2.u32 {%r14701,%r14702}, [%rd726]; + // end inline asm + xor.b32 %r30100, %r14501, %r14701; + xor.b32 %r30101, %r14502, %r14702; + add.s32 %r30114, %r30114, 1; + setp.lt.u32 %p31, %r30114, 23; + @%p31 bra $L__BB2_50; + + mov.u32 %r30147, 0; + mov.u32 %r14812, 1; + st.local.v2.u32 [%rd149+32], {%r30112, %r30113}; + st.local.v2.u32 [%rd149+72], {%r30110, %r30111}; + st.local.v2.u32 [%rd149+40], {%r30108, %r30109}; + st.local.v2.u32 [%rd149+80], {%r30106, %r30107}; + st.local.v2.u32 [%rd149+48], {%r30104, %r30105}; + st.local.v2.u32 [%rd149+56], {%r30102, %r30103}; + st.local.v2.u32 [%rd149+24], {%r30100, %r30101}; + // begin inline asm + // xor5 + lop3.b32 %r14713, %r30100, %r30098, %r30096, 0x96; + lop3.b32 %r14713, %r14713, %r30094, %r30092, 0x96; + lop3.b32 %r14714, %r30101, %r30099, %r30097, 0x96; + lop3.b32 %r14714, %r14714, %r30095, %r30093, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14725, %r30112, %r30110, %r30090, 0x96; + lop3.b32 %r14725, %r14725, %r30088, %r30086, 0x96; + lop3.b32 %r14726, %r30113, %r30111, %r30091, 0x96; + 
lop3.b32 %r14726, %r14726, %r30089, %r30087, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14737, %r30108, %r30106, %r30084, 0x96; + lop3.b32 %r14737, %r14737, %r30082, %r30080, 0x96; + lop3.b32 %r14738, %r30109, %r30107, %r30085, 0x96; + lop3.b32 %r14738, %r14738, %r30083, %r30081, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14749, %r30104, %r30078, %r30076, 0x96; + lop3.b32 %r14749, %r14749, %r30074, %r30072, 0x96; + lop3.b32 %r14750, %r30105, %r30079, %r30077, 0x96; + lop3.b32 %r14750, %r14750, %r30075, %r30073, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r14761, %r30102, %r30070, %r30068, 0x96; + lop3.b32 %r14761, %r14761, %r30066, %r30064, 0x96; + lop3.b32 %r14762, %r30103, %r30071, %r30069, 0x96; + lop3.b32 %r14762, %r14762, %r30067, %r30065, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14773, %r14726, %r14725, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14777, %r14725, %r14726, %r14812; + // end inline asm + xor.b32 %r14952, %r14773, %r14761; + xor.b32 %r14953, %r14777, %r14762; + xor.b32 %r14920, %r30100, %r14952; + xor.b32 %r14923, %r30101, %r14953; + xor.b32 %r14883, %r30097, %r14953; + xor.b32 %r14882, %r30096, %r14952; + st.local.v2.u32 [%rd149+104], {%r14882, %r14883}; + // begin inline asm + shf.l.wrap.b32 %r14781, %r14738, %r14737, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14785, %r14737, %r14738, %r14812; + // end inline asm + xor.b32 %r14954, %r14781, %r14713; + xor.b32 %r14955, %r14785, %r14714; + xor.b32 %r14819, %r30110, %r14954; + xor.b32 %r14818, %r30111, %r14955; + xor.b32 %r14858, %r30089, %r14955; + xor.b32 %r14859, %r30088, %r14954; + st.local.v2.u32 [%rd149+152], {%r14859, %r14858}; + // begin inline asm + shf.l.wrap.b32 %r14789, %r14750, %r14749, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14793, %r14749, %r14750, %r14812; + // end inline asm + xor.b32 %r14956, %r14789, %r14725; + xor.b32 %r14957, %r14793, %r14726; + xor.b32 %r14842, %r30085, %r14957; + xor.b32 %r14843, %r30084, %r14956; + st.local.v2.u32 [%rd149+120], {%r14843, %r14842}; + xor.b32 %r14834, %r30081, %r14957; + xor.b32 %r14835, %r30080, %r14956; + st.local.v2.u32 [%rd149+200], {%r14835, %r14834}; + // begin inline asm + shf.l.wrap.b32 %r14797, %r14762, %r14761, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14801, %r14761, %r14762, %r14812; + // end inline asm + xor.b32 %r14958, %r14797, %r14737; + xor.b32 %r14959, %r14801, %r14738; + xor.b32 %r14866, %r30104, %r14958; + xor.b32 %r14867, %r30105, %r14959; + xor.b32 %r14875, %r30075, %r14959; + xor.b32 %r14874, %r30074, %r14958; + st.local.v2.u32 [%rd149+168], {%r14874, %r14875}; + // begin inline asm + shf.l.wrap.b32 %r14805, %r14714, %r14713, %r14812; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14809, %r14713, %r14714, %r14812; + // end inline asm + xor.b32 %r14960, %r14805, %r14749; + xor.b32 %r14961, %r14809, %r14750; + xor.b32 %r14826, %r30070, %r14960; + xor.b32 %r14827, %r30071, %r14961; + xor.b32 %r14851, %r30065, %r14961; + xor.b32 %r14850, %r30064, %r14960; + st.local.v2.u32 [%rd149+216], {%r14850, %r14851}; + // begin inline asm + shf.l.wrap.b32 %r14813, %r14819, %r14818, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14817, %r14818, %r14819, %r14316; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14821, %r14827, %r14826, %r14324; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r14825, %r14826, %r14827, %r14324; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14833, %r14834, %r14835, %r14332; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14829, %r14835, %r14834, %r14332; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r14829, %r14833}; + // begin inline asm + shf.l.wrap.b32 %r14837, %r14843, %r14842, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14841, %r14842, %r14843, %r14364; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14845, %r14851, %r14850, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14849, %r14850, %r14851, %r14412; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14857, %r14858, %r14859, %r14436; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14853, %r14859, %r14858, %r14436; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r14853, %r14857}; + // begin inline asm + shf.l.wrap.b32 %r14861, %r14867, %r14866, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14865, %r14866, %r14867, %r14452; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14869, %r14875, %r14874, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14873, %r14874, %r14875, %r14460; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14877, %r14883, %r14882, %r14492; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r14881, %r14882, %r14883, %r14492; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r14885, %r14920, %r14813, %r14837, 0xD2; + lop3.b32 %r14886, %r14923, %r14817, %r14841, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r14813, %r14837, %r14869, 0xD2; + lop3.b32 %r30248, %r14817, %r14841, %r14873, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + // begin inline asm + // chi + lop3.b32 %r30243, %r14837, %r14869, %r14845, 0xD2; + lop3.b32 %r30244, %r14841, %r14873, %r14849, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + // begin inline asm + // chi + lop3.b32 %r30239, %r14869, %r14845, %r14920, 0xD2; + lop3.b32 %r30240, %r14873, %r14849, %r14923, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + // begin inline asm + // chi + lop3.b32 %r30237, %r14845, %r14920, %r14813, 0xD2; + lop3.b32 %r30238, %r14849, %r14923, %r14817, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + // begin inline asm + // chi + lop3.b32 %r30233, %r14861, %r14821, %r14877, 0xD2; + lop3.b32 %r30234, %r14865, %r14825, %r14881, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + // begin inline asm + // chi + lop3.b32 %r30245, %r14821, %r14877, %r14853, 0xD2; + lop3.b32 %r30246, %r14825, %r14881, %r14857, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + // begin inline asm + // chi + lop3.b32 %r30241, %r14877, %r14853, %r14829, 0xD2; + lop3.b32 %r30242, %r14881, %r14857, %r14833, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + // begin inline asm + ld.global.nc.v2.u32 {%r14949,%r14950}, [%rd707]; + // end inline asm + xor.b32 %r30235, %r14885, %r14949; + xor.b32 %r30236, %r14886, %r14950; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + add.s64 %rd151, %rd149, 24; + add.s64 %rd152, %rd3, 24; + +$L__BB2_52: + shl.b32 %r14962, %r30147, 2; + cvt.u64.u32 %rd737, %r14962; + and.b64 %rd738, %rd737, 60; + add.s64 %rd739, %rd152, %rd738; + xor.b32 %r14963, 
%r1678, %r30147; + mul.lo.s32 %r14964, %r14963, 16777619; + ld.local.u32 %r14965, [%rd739]; + xor.b32 %r14966, %r14964, %r14965; + mul.wide.u32 %rd740, %r14966, -954391867; + shr.u64 %rd741, %rd740, 32; + cvt.u32.u64 %r14967, %rd741; + sub.s32 %r14968, %r14966, %r14967; + shr.u32 %r14969, %r14968, 1; + add.s32 %r14970, %r14969, %r14967; + shr.u32 %r14971, %r14970, 20; + mul.lo.s32 %r14972, %r14971, 1179641; + sub.s32 %r14973, %r14966, %r14972; + mul.wide.u32 %rd742, %r14973, 64; + add.s64 %rd743, %rd471, %rd742; + mul.lo.s32 %r14974, %r30184, 16777619; + ld.global.u32 %r14975, [%rd743]; + xor.b32 %r30184, %r14974, %r14975; + mul.lo.s32 %r14976, %r30185, 16777619; + ld.global.u32 %r14977, [%rd743+4]; + xor.b32 %r30185, %r14976, %r14977; + mul.lo.s32 %r14978, %r30196, 16777619; + ld.global.u32 %r14979, [%rd743+8]; + mul.lo.s32 %r14980, %r30197, 16777619; + ld.global.u32 %r14981, [%rd743+12]; + xor.b32 %r14982, %r14980, %r14981; + xor.b32 %r30196, %r14978, %r14979; + mov.b64 %rd744, {%r30196, %r14982}; + mul.lo.s32 %r14983, %r30192, 16777619; + ld.global.u32 %r14984, [%rd743+16]; + mul.lo.s32 %r14985, %r30193, 16777619; + ld.global.u32 %r14986, [%rd743+20]; + xor.b32 %r14987, %r14985, %r14986; + xor.b32 %r30192, %r14983, %r14984; + mov.b64 %rd745, {%r30192, %r14987}; + mul.lo.s32 %r14988, %r30188, 16777619; + ld.global.u32 %r14989, [%rd743+24]; + mul.lo.s32 %r14990, %r30189, 16777619; + ld.global.u32 %r14991, [%rd743+28]; + xor.b32 %r14992, %r14990, %r14991; + xor.b32 %r30188, %r14988, %r14989; + mov.b64 %rd746, {%r30188, %r14992}; + mul.lo.s32 %r14993, %r30186, 16777619; + ld.global.u32 %r14994, [%rd743+32]; + mul.lo.s32 %r14995, %r30187, 16777619; + ld.global.u32 %r14996, [%rd743+36]; + xor.b32 %r14997, %r14995, %r14996; + xor.b32 %r30186, %r14993, %r14994; + mov.b64 %rd747, {%r30186, %r14997}; + mul.lo.s32 %r14998, %r30182, 16777619; + ld.global.u32 %r14999, [%rd743+40]; + xor.b32 %r30182, %r14998, %r14999; + mul.lo.s32 %r15000, %r30183, 16777619; + ld.global.u32 %r15001, [%rd743+44]; + xor.b32 %r30183, %r15000, %r15001; + mul.lo.s32 %r15002, %r30194, 16777619; + ld.global.u32 %r15003, [%rd743+48]; + mul.lo.s32 %r15004, %r30195, 16777619; + ld.global.u32 %r15005, [%rd743+52]; + xor.b32 %r15006, %r15004, %r15005; + xor.b32 %r30194, %r15002, %r15003; + mov.b64 %rd748, {%r30194, %r15006}; + mul.lo.s32 %r15007, %r30190, 16777619; + ld.global.u32 %r15008, [%rd743+56]; + mul.lo.s32 %r15009, %r30191, 16777619; + ld.global.u32 %r15010, [%rd743+60]; + xor.b32 %r15011, %r15009, %r15010; + xor.b32 %r30190, %r15007, %r15008; + mov.b64 %rd749, {%r30190, %r15011}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + st.local.v2.u32 [%rd3+32], {%r30196, %r14982}; + st.local.v2.u32 [%rd3+40], {%r30192, %r14987}; + st.local.v2.u32 [%rd3+48], {%r30188, %r14992}; + st.local.v2.u32 [%rd3+56], {%r30186, %r14997}; + st.local.v2.u32 [%rd3+64], {%r30182, %r30183}; + st.local.v2.u32 [%rd3+72], {%r30194, %r15006}; + st.local.v2.u32 [%rd3+80], {%r30190, %r15011}; + add.s64 %rd750, %rd151, %rd738; + xor.b32 %r15012, %r1874, %r30147; + mul.lo.s32 %r15013, %r15012, 16777619; + ld.local.u32 %r15014, [%rd750]; + xor.b32 %r15015, %r15013, %r15014; + mul.wide.u32 %rd751, %r15015, -954391867; + shr.u64 %rd752, %rd751, 32; + cvt.u32.u64 %r15016, %rd752; + sub.s32 %r15017, %r15015, %r15016; + shr.u32 %r15018, %r15017, 1; + add.s32 %r15019, %r15018, %r15016; + shr.u32 %r15020, %r15019, 20; + mul.lo.s32 %r15021, %r15020, 1179641; + sub.s32 %r15022, %r15015, %r15021; + mul.wide.u32 %rd753, %r15022, 64; + add.s64 %rd754, %rd471, 
%rd753; + mul.lo.s32 %r15023, %r30235, 16777619; + ld.global.u32 %r15024, [%rd754]; + xor.b32 %r30235, %r15023, %r15024; + mul.lo.s32 %r15025, %r30236, 16777619; + ld.global.u32 %r15026, [%rd754+4]; + xor.b32 %r30236, %r15025, %r15026; + mul.lo.s32 %r15027, %r30247, 16777619; + ld.global.u32 %r15028, [%rd754+8]; + mul.lo.s32 %r15029, %r30248, 16777619; + ld.global.u32 %r15030, [%rd754+12]; + xor.b32 %r15031, %r15029, %r15030; + xor.b32 %r30247, %r15027, %r15028; + mov.b64 %rd755, {%r30247, %r15031}; + mul.lo.s32 %r15032, %r30243, 16777619; + ld.global.u32 %r15033, [%rd754+16]; + mul.lo.s32 %r15034, %r30244, 16777619; + ld.global.u32 %r15035, [%rd754+20]; + xor.b32 %r15036, %r15034, %r15035; + xor.b32 %r30243, %r15032, %r15033; + mov.b64 %rd756, {%r30243, %r15036}; + mul.lo.s32 %r15037, %r30239, 16777619; + ld.global.u32 %r15038, [%rd754+24]; + mul.lo.s32 %r15039, %r30240, 16777619; + ld.global.u32 %r15040, [%rd754+28]; + xor.b32 %r15041, %r15039, %r15040; + xor.b32 %r30239, %r15037, %r15038; + mov.b64 %rd757, {%r30239, %r15041}; + mul.lo.s32 %r15042, %r30237, 16777619; + ld.global.u32 %r15043, [%rd754+32]; + mul.lo.s32 %r15044, %r30238, 16777619; + ld.global.u32 %r15045, [%rd754+36]; + xor.b32 %r15046, %r15044, %r15045; + xor.b32 %r30237, %r15042, %r15043; + mov.b64 %rd758, {%r30237, %r15046}; + mul.lo.s32 %r15047, %r30233, 16777619; + ld.global.u32 %r15048, [%rd754+40]; + xor.b32 %r30233, %r15047, %r15048; + mul.lo.s32 %r15049, %r30234, 16777619; + ld.global.u32 %r15050, [%rd754+44]; + xor.b32 %r30234, %r15049, %r15050; + mul.lo.s32 %r15051, %r30245, 16777619; + ld.global.u32 %r15052, [%rd754+48]; + mul.lo.s32 %r15053, %r30246, 16777619; + ld.global.u32 %r15054, [%rd754+52]; + xor.b32 %r15055, %r15053, %r15054; + xor.b32 %r30245, %r15051, %r15052; + mov.b64 %rd759, {%r30245, %r15055}; + mul.lo.s32 %r15056, %r30241, 16777619; + ld.global.u32 %r15057, [%rd754+56]; + mul.lo.s32 %r15058, %r30242, 16777619; + ld.global.u32 %r15059, [%rd754+60]; + xor.b32 %r15060, %r15058, %r15059; + xor.b32 %r30241, %r15056, %r15057; + mov.b64 %rd760, {%r30241, %r15060}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + st.local.v2.u32 [%rd149+32], {%r30247, %r15031}; + st.local.v2.u32 [%rd149+40], {%r30243, %r15036}; + st.local.v2.u32 [%rd149+48], {%r30239, %r15041}; + st.local.v2.u32 [%rd149+56], {%r30237, %r15046}; + st.local.v2.u32 [%rd149+64], {%r30233, %r30234}; + st.local.v2.u32 [%rd149+72], {%r30245, %r15055}; + st.local.v2.u32 [%rd149+80], {%r30241, %r15060}; + add.s32 %r30147, %r30147, 1; + setp.lt.u32 %p32, %r30147, 512; + shr.u64 %rd761, %rd744, 32; + cvt.u32.u64 %r30197, %rd761; + shr.u64 %rd762, %rd745, 32; + cvt.u32.u64 %r30193, %rd762; + shr.u64 %rd763, %rd746, 32; + cvt.u32.u64 %r30189, %rd763; + shr.u64 %rd764, %rd747, 32; + cvt.u32.u64 %r30187, %rd764; + shr.u64 %rd765, %rd748, 32; + cvt.u32.u64 %r30195, %rd765; + shr.u64 %rd766, %rd749, 32; + cvt.u32.u64 %r30191, %rd766; + shr.u64 %rd767, %rd755, 32; + cvt.u32.u64 %r30248, %rd767; + shr.u64 %rd768, %rd756, 32; + cvt.u32.u64 %r30244, %rd768; + shr.u64 %rd769, %rd757, 32; + cvt.u32.u64 %r30240, %rd769; + shr.u64 %rd770, %rd758, 32; + cvt.u32.u64 %r30238, %rd770; + shr.u64 %rd771, %rd759, 32; + cvt.u32.u64 %r30246, %rd771; + shr.u64 %rd772, %rd760, 32; + cvt.u32.u64 %r30242, %rd772; + @%p32 bra $L__BB2_52; + + mov.u32 %r30148, 0; + st.local.v2.u32 [%rd3+96], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+104], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+112], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+120], {%r30148, %r30148}; + 
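// note: the loop above ($L__BB2_52, 512 iterations) FNV-1a-mixes both sponge states with 64-byte dataset items (prime 16777619, item index reduced mod 1179641 via a multiply-high reciprocal), consistent with FishHash-style dataset accesses + // note: below, the sponge state is re-zeroed and re-padded before the next 24-round permutation at $L__BB2_54 + 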
st.local.v2.u32 [%rd3+128], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+136], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+144], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+152], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+160], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+168], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+176], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+184], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+192], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+200], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+208], {%r30148, %r30148}; + st.local.v2.u32 [%rd3+216], {%r30148, %r30148}; + mov.u32 %r30163, -2147483648; + mov.u32 %r15075, 1; + st.local.v2.u32 [%rd3+88], {%r15075, %r30163}; + mov.u32 %r30149, %r30148; + mov.u32 %r30150, %r30148; + mov.u32 %r30151, %r30148; + mov.u32 %r30152, %r30148; + mov.u32 %r30153, %r30148; + mov.u32 %r30154, %r30148; + mov.u32 %r30155, %r30148; + mov.u32 %r30156, %r30148; + mov.u32 %r30157, %r30148; + mov.u32 %r30158, %r30148; + mov.u32 %r30159, %r30148; + mov.u32 %r30160, %r30148; + mov.u32 %r30161, %r30148; + mov.u32 %r30162, %r15075; + mov.u32 %r30164, %r30148; + mov.u32 %r30165, %r30148; + mov.u32 %r30166, %r30148; + mov.u32 %r30167, %r30148; + mov.u32 %r30168, %r30148; + mov.u32 %r30169, %r30148; + mov.u32 %r30170, %r30148; + mov.u32 %r30171, %r30148; + mov.u32 %r30172, %r30148; + mov.u32 %r30173, %r30148; + mov.u32 %r30174, %r30148; + mov.u32 %r30175, %r30148; + mov.u32 %r30176, %r30148; + mov.u32 %r30177, %r30148; + mov.u32 %r30178, %r30148; + mov.u32 %r30179, %r30148; + mov.u32 %r30180, %r30148; + mov.u32 %r30181, %r30148; + mov.u32 %r30198, %r30148; + +$L__BB2_54: + // begin inline asm + // xor5 + lop3.b32 %r15102, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15102, %r15102, %r30178, %r30176, 0x96; + lop3.b32 %r15103, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15103, %r15103, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15114, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15114, %r15114, %r30172, %r30170, 0x96; + lop3.b32 %r15115, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15115, %r15115, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15126, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15126, %r15126, %r30166, %r30164, 0x96; + lop3.b32 %r15127, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15127, %r15127, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15138, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15138, %r15138, %r30158, %r30156, 0x96; + lop3.b32 %r15139, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15139, %r15139, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15150, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15150, %r15150, %r30150, %r30148, 0x96; + lop3.b32 %r15151, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15151, %r15151, %r30151, %r30149, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15162, %r15115, %r15114, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15166, %r15114, %r15115, %r15075; + // end inline asm + xor.b32 %r15596, %r15162, %r15150; + xor.b32 %r15597, %r15166, %r15151; + xor.b32 %r15429, %r30184, %r15596; + xor.b32 %r15432, %r30185, %r15597; + xor.b32 %r15336, %r30182, %r15596; + xor.b32 %r15335, %r30183, %r15597; + xor.b32 %r15383, %r30180, %r15596; + xor.b32 %r15384, %r30181, %r15597; + xor.b32 %r15288, %r30178, %r15596; + xor.b32 %r15287, %r30179, %r15597; + xor.b32 %r15239, %r30176, 
%r15596; + xor.b32 %r15240, %r30177, %r15597; + // begin inline asm + shf.l.wrap.b32 %r15170, %r15127, %r15126, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15174, %r15126, %r15127, %r15075; + // end inline asm + xor.b32 %r15598, %r15170, %r15102; + xor.b32 %r15599, %r15174, %r15103; + xor.b32 %r15391, %r30196, %r15598; + xor.b32 %r15392, %r30197, %r15599; + xor.b32 %r15208, %r30194, %r15598; + xor.b32 %r15207, %r30195, %r15599; + xor.b32 %r15367, %r30174, %r15598; + xor.b32 %r15368, %r30175, %r15599; + xor.b32 %r15328, %r30172, %r15598; + xor.b32 %r15327, %r30173, %r15599; + xor.b32 %r15311, %r30170, %r15598; + xor.b32 %r15312, %r30171, %r15599; + // begin inline asm + shf.l.wrap.b32 %r15178, %r15139, %r15138, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15182, %r15138, %r15139, %r15075; + // end inline asm + xor.b32 %r15600, %r15178, %r15114; + xor.b32 %r15601, %r15182, %r15115; + xor.b32 %r15248, %r30192, %r15600; + xor.b32 %r15247, %r30193, %r15601; + xor.b32 %r15375, %r30190, %r15600; + xor.b32 %r15376, %r30191, %r15601; + xor.b32 %r15256, %r30168, %r15600; + xor.b32 %r15255, %r30169, %r15601; + xor.b32 %r15359, %r30166, %r15600; + xor.b32 %r15360, %r30167, %r15601; + xor.b32 %r15224, %r30164, %r15600; + xor.b32 %r15223, %r30165, %r15601; + // begin inline asm + shf.l.wrap.b32 %r15186, %r15151, %r15150, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15190, %r15150, %r15151, %r15075; + // end inline asm + xor.b32 %r15602, %r15186, %r15126; + xor.b32 %r15603, %r15190, %r15127; + xor.b32 %r15343, %r30188, %r15602; + xor.b32 %r15344, %r30189, %r15603; + xor.b32 %r15320, %r30162, %r15602; + xor.b32 %r15319, %r30163, %r15603; + xor.b32 %r15263, %r30160, %r15602; + xor.b32 %r15264, %r30161, %r15603; + xor.b32 %r15351, %r30158, %r15602; + xor.b32 %r15352, %r30159, %r15603; + xor.b32 %r15280, %r30156, %r15602; + xor.b32 %r15279, %r30157, %r15603; + // begin inline asm + shf.l.wrap.b32 %r15194, %r15103, %r15102, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15198, %r15102, %r15103, %r15075; + // end inline asm + xor.b32 %r15604, %r15194, %r15138; + xor.b32 %r15605, %r15198, %r15139; + xor.b32 %r15295, %r30186, %r15604; + xor.b32 %r15296, %r30187, %r15605; + xor.b32 %r15215, %r30154, %r15604; + xor.b32 %r15216, %r30155, %r15605; + xor.b32 %r15232, %r30152, %r15604; + xor.b32 %r15231, %r30153, %r15605; + xor.b32 %r15271, %r30150, %r15604; + xor.b32 %r15272, %r30151, %r15605; + xor.b32 %r15303, %r30148, %r15604; + xor.b32 %r15304, %r30149, %r15605; + mov.u32 %r15209, 44; + // begin inline asm + shf.l.wrap.b32 %r15202, %r15208, %r15207, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15206, %r15207, %r15208, %r15209; + // end inline asm + mov.u32 %r15217, 20; + // begin inline asm + shf.l.wrap.b32 %r15210, %r15216, %r15215, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15214, %r15215, %r15216, %r15217; + // end inline asm + mov.u32 %r15225, 61; + // begin inline asm + shf.l.wrap.b32 %r15218, %r15224, %r15223, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15222, %r15223, %r15224, %r15225; + // end inline asm + mov.u32 %r15233, 39; + // begin inline asm + shf.l.wrap.b32 %r15226, %r15232, %r15231, %r15233; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15230, %r15231, %r15232, %r15233; + // end inline asm + mov.u32 %r15241, 18; + // begin inline asm + shf.l.wrap.b32 %r15234, %r15240, %r15239, %r15241; + // end inline asm + 
// begin inline asm + shf.l.wrap.b32 %r15238, %r15239, %r15240, %r15241; + // end inline asm + mov.u32 %r15249, 62; + // begin inline asm + shf.l.wrap.b32 %r15242, %r15248, %r15247, %r15249; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15246, %r15247, %r15248, %r15249; + // end inline asm + mov.u32 %r15257, 43; + // begin inline asm + shf.l.wrap.b32 %r15250, %r15256, %r15255, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15254, %r15255, %r15256, %r15257; + // end inline asm + mov.u32 %r15265, 25; + // begin inline asm + shf.l.wrap.b32 %r15258, %r15264, %r15263, %r15265; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15262, %r15263, %r15264, %r15265; + // end inline asm + mov.u32 %r15273, 8; + // begin inline asm + shf.l.wrap.b32 %r15266, %r15272, %r15271, %r15273; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15270, %r15271, %r15272, %r15273; + // end inline asm + mov.u32 %r15281, 56; + // begin inline asm + shf.l.wrap.b32 %r15274, %r15280, %r15279, %r15281; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15278, %r15279, %r15280, %r15281; + // end inline asm + mov.u32 %r15289, 41; + // begin inline asm + shf.l.wrap.b32 %r15282, %r15288, %r15287, %r15289; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15286, %r15287, %r15288, %r15289; + // end inline asm + mov.u32 %r15297, 27; + // begin inline asm + shf.l.wrap.b32 %r15290, %r15296, %r15295, %r15297; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15294, %r15295, %r15296, %r15297; + // end inline asm + mov.u32 %r15305, 14; + // begin inline asm + shf.l.wrap.b32 %r15298, %r15304, %r15303, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15302, %r15303, %r15304, %r15305; + // end inline asm + mov.u32 %r15313, 2; + // begin inline asm + shf.l.wrap.b32 %r15306, %r15312, %r15311, %r15313; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15310, %r15311, %r15312, %r15313; + // end inline asm + mov.u32 %r15321, 55; + // begin inline asm + shf.l.wrap.b32 %r15314, %r15320, %r15319, %r15321; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15318, %r15319, %r15320, %r15321; + // end inline asm + mov.u32 %r15329, 45; + // begin inline asm + shf.l.wrap.b32 %r15322, %r15328, %r15327, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15326, %r15327, %r15328, %r15329; + // end inline asm + mov.u32 %r15337, 36; + // begin inline asm + shf.l.wrap.b32 %r15330, %r15336, %r15335, %r15337; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15334, %r15335, %r15336, %r15337; + // end inline asm + mov.u32 %r15345, 28; + // begin inline asm + shf.l.wrap.b32 %r15338, %r15344, %r15343, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15342, %r15343, %r15344, %r15345; + // end inline asm + mov.u32 %r15353, 21; + // begin inline asm + shf.l.wrap.b32 %r15346, %r15352, %r15351, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15350, %r15351, %r15352, %r15353; + // end inline asm + mov.u32 %r15361, 15; + // begin inline asm + shf.l.wrap.b32 %r15354, %r15360, %r15359, %r15361; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15358, %r15359, %r15360, %r15361; + // end inline asm + mov.u32 %r15369, 10; + // begin inline asm + shf.l.wrap.b32 %r15362, %r15368, %r15367, %r15369; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15366, %r15367, %r15368, %r15369; + // end inline asm + mov.u32 %r15377, 6; + // begin inline asm + 
shf.l.wrap.b32 %r15370, %r15376, %r15375, %r15377; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15374, %r15375, %r15376, %r15377; + // end inline asm + mov.u32 %r15385, 3; + // begin inline asm + shf.l.wrap.b32 %r15378, %r15384, %r15383, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15382, %r15383, %r15384, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15386, %r15392, %r15391, %r15075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15390, %r15391, %r15392, %r15075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15394, %r15429, %r15202, %r15250, 0xD2; + lop3.b32 %r15395, %r15432, %r15206, %r15254, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30196, %r15202, %r15250, %r15346, 0xD2; + lop3.b32 %r30197, %r15206, %r15254, %r15350, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30192, %r15250, %r15346, %r15298, 0xD2; + lop3.b32 %r30193, %r15254, %r15350, %r15302, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30188, %r15346, %r15298, %r15429, 0xD2; + lop3.b32 %r30189, %r15350, %r15302, %r15432, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30186, %r15298, %r15429, %r15202, 0xD2; + lop3.b32 %r30187, %r15302, %r15432, %r15206, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30182, %r15338, %r15210, %r15378, 0xD2; + lop3.b32 %r30183, %r15342, %r15214, %r15382, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30194, %r15210, %r15378, %r15322, 0xD2; + lop3.b32 %r30195, %r15214, %r15382, %r15326, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30190, %r15378, %r15322, %r15218, 0xD2; + lop3.b32 %r30191, %r15382, %r15326, %r15222, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30162, %r15322, %r15218, %r15338, 0xD2; + lop3.b32 %r30163, %r15326, %r15222, %r15342, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30162, %r30163}; + // begin inline asm + // chi + lop3.b32 %r30154, %r15218, %r15338, %r15210, 0xD2; + lop3.b32 %r30155, %r15222, %r15342, %r15214, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30154, %r30155}; + // begin inline asm + // chi + lop3.b32 %r30180, %r15386, %r15370, %r15258, 0xD2; + lop3.b32 %r30181, %r15390, %r15374, %r15262, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30180, %r30181}; + // begin inline asm + // chi + lop3.b32 %r30174, %r15370, %r15258, %r15266, 0xD2; + lop3.b32 %r30175, %r15374, %r15262, %r15270, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30174, %r30175}; + // begin inline asm + // chi + lop3.b32 %r30168, %r15258, %r15266, %r15234, 0xD2; + lop3.b32 %r30169, %r15262, %r15270, %r15238, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30168, %r30169}; + // begin inline asm + // chi + lop3.b32 %r30160, %r15266, %r15234, %r15386, 0xD2; + lop3.b32 %r30161, %r15270, %r15238, %r15390, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30160, %r30161}; + // begin inline asm + // chi + lop3.b32 %r30152, %r15234, %r15386, %r15370, 0xD2; + lop3.b32 %r30153, %r15238, %r15390, %r15374, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30152, %r30153}; + // begin inline asm + // chi + lop3.b32 %r30178, %r15290, %r15330, %r15362, 0xD2; + lop3.b32 %r30179, %r15294, %r15334, %r15366, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30178, %r30179}; + // begin inline asm + // chi + lop3.b32 %r30172, %r15330, %r15362, 
%r15354, 0xD2; + lop3.b32 %r30173, %r15334, %r15366, %r15358, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30172, %r30173}; + // begin inline asm + // chi + lop3.b32 %r30166, %r15362, %r15354, %r15274, 0xD2; + lop3.b32 %r30167, %r15366, %r15358, %r15278, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30166, %r30167}; + // begin inline asm + // chi + lop3.b32 %r30158, %r15354, %r15274, %r15290, 0xD2; + lop3.b32 %r30159, %r15358, %r15278, %r15294, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30158, %r30159}; + // begin inline asm + // chi + lop3.b32 %r30150, %r15274, %r15290, %r15330, 0xD2; + lop3.b32 %r30151, %r15278, %r15294, %r15334, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30150, %r30151}; + // begin inline asm + // chi + lop3.b32 %r30176, %r15242, %r15314, %r15226, 0xD2; + lop3.b32 %r30177, %r15246, %r15318, %r15230, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30176, %r30177}; + // begin inline asm + // chi + lop3.b32 %r30170, %r15314, %r15226, %r15282, 0xD2; + lop3.b32 %r30171, %r15318, %r15230, %r15286, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30170, %r30171}; + // begin inline asm + // chi + lop3.b32 %r30164, %r15226, %r15282, %r15306, 0xD2; + lop3.b32 %r30165, %r15230, %r15286, %r15310, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30164, %r30165}; + // begin inline asm + // chi + lop3.b32 %r30156, %r15282, %r15306, %r15242, 0xD2; + lop3.b32 %r30157, %r15286, %r15310, %r15246, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30156, %r30157}; + // begin inline asm + // chi + lop3.b32 %r30148, %r15306, %r15242, %r15314, 0xD2; + lop3.b32 %r30149, %r15310, %r15246, %r15318, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30148, %r30149}; + mul.wide.s32 %rd774, %r30198, 8; + add.s64 %rd773, %rd706, %rd774; + // begin inline asm + ld.global.nc.v2.u32 {%r15594,%r15595}, [%rd773]; + // end inline asm + xor.b32 %r30184, %r15394, %r15594; + xor.b32 %r30185, %r15395, %r15595; + add.s32 %r30198, %r30198, 1; + setp.lt.u32 %p33, %r30198, 23; + @%p33 bra $L__BB2_54; + + st.local.v2.u32 [%rd3+32], {%r30196, %r30197}; + st.local.v2.u32 [%rd3+72], {%r30194, %r30195}; + st.local.v2.u32 [%rd3+40], {%r30192, %r30193}; + st.local.v2.u32 [%rd3+80], {%r30190, %r30191}; + st.local.v2.u32 [%rd3+48], {%r30188, %r30189}; + st.local.v2.u32 [%rd3+56], {%r30186, %r30187}; + st.local.v2.u32 [%rd3+24], {%r30184, %r30185}; + // begin inline asm + // xor5 + lop3.b32 %r15606, %r30184, %r30182, %r30180, 0x96; + lop3.b32 %r15606, %r15606, %r30178, %r30176, 0x96; + lop3.b32 %r15607, %r30185, %r30183, %r30181, 0x96; + lop3.b32 %r15607, %r15607, %r30179, %r30177, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15618, %r30196, %r30194, %r30174, 0x96; + lop3.b32 %r15618, %r15618, %r30172, %r30170, 0x96; + lop3.b32 %r15619, %r30197, %r30195, %r30175, 0x96; + lop3.b32 %r15619, %r15619, %r30173, %r30171, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15630, %r30192, %r30190, %r30168, 0x96; + lop3.b32 %r15630, %r15630, %r30166, %r30164, 0x96; + lop3.b32 %r15631, %r30193, %r30191, %r30169, 0x96; + lop3.b32 %r15631, %r15631, %r30167, %r30165, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15642, %r30188, %r30162, %r30160, 0x96; + lop3.b32 %r15642, %r15642, %r30158, %r30156, 0x96; + lop3.b32 %r15643, %r30189, %r30163, %r30161, 0x96; + lop3.b32 %r15643, %r15643, %r30159, %r30157, 0x96; + // end inline asm + // begin inline asm + // xor5 + 
lop3.b32 %r15654, %r30186, %r30154, %r30152, 0x96; + lop3.b32 %r15654, %r15654, %r30150, %r30148, 0x96; + lop3.b32 %r15655, %r30187, %r30155, %r30153, 0x96; + lop3.b32 %r15655, %r15655, %r30151, %r30149, 0x96; + // end inline asm + mov.u32 %r15858, 1; + // begin inline asm + shf.l.wrap.b32 %r15666, %r15619, %r15618, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15670, %r15618, %r15619, %r15858; + // end inline asm + xor.b32 %r15885, %r15666, %r15654; + xor.b32 %r15886, %r15670, %r15655; + xor.b32 %r15813, %r30184, %r15885; + xor.b32 %r15816, %r30185, %r15886; + xor.b32 %r15776, %r30181, %r15886; + xor.b32 %r15775, %r30180, %r15885; + st.local.v2.u32 [%rd3+104], {%r15775, %r15776}; + // begin inline asm + shf.l.wrap.b32 %r15674, %r15631, %r15630, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15678, %r15630, %r15631, %r15858; + // end inline asm + xor.b32 %r15887, %r15674, %r15606; + xor.b32 %r15888, %r15678, %r15607; + xor.b32 %r15712, %r30194, %r15887; + xor.b32 %r15711, %r30195, %r15888; + xor.b32 %r15751, %r30173, %r15888; + xor.b32 %r15752, %r30172, %r15887; + st.local.v2.u32 [%rd3+152], {%r15752, %r15751}; + // begin inline asm + shf.l.wrap.b32 %r15682, %r15643, %r15642, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15686, %r15642, %r15643, %r15858; + // end inline asm + xor.b32 %r15889, %r15682, %r15618; + xor.b32 %r15890, %r15686, %r15619; + xor.b32 %r15735, %r30169, %r15890; + xor.b32 %r15736, %r30168, %r15889; + st.local.v2.u32 [%rd3+120], {%r15736, %r15735}; + xor.b32 %r15727, %r30165, %r15890; + xor.b32 %r15728, %r30164, %r15889; + st.local.v2.u32 [%rd3+200], {%r15728, %r15727}; + // begin inline asm + shf.l.wrap.b32 %r15690, %r15655, %r15654, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15694, %r15654, %r15655, %r15858; + // end inline asm + xor.b32 %r15891, %r15690, %r15630; + xor.b32 %r15892, %r15694, %r15631; + xor.b32 %r15759, %r30188, %r15891; + xor.b32 %r15760, %r30189, %r15892; + xor.b32 %r15768, %r30159, %r15892; + xor.b32 %r15767, %r30158, %r15891; + st.local.v2.u32 [%rd3+168], {%r15767, %r15768}; + // begin inline asm + shf.l.wrap.b32 %r15698, %r15607, %r15606, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15702, %r15606, %r15607, %r15858; + // end inline asm + xor.b32 %r15893, %r15698, %r15642; + xor.b32 %r15894, %r15702, %r15643; + xor.b32 %r15719, %r30154, %r15893; + xor.b32 %r15720, %r30155, %r15894; + xor.b32 %r15744, %r30149, %r15894; + xor.b32 %r15743, %r30148, %r15893; + st.local.v2.u32 [%rd3+216], {%r15743, %r15744}; + // begin inline asm + shf.l.wrap.b32 %r15706, %r15712, %r15711, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15710, %r15711, %r15712, %r15209; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15714, %r15720, %r15719, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15718, %r15719, %r15720, %r15217; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15726, %r15727, %r15728, %r15225; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15722, %r15728, %r15727, %r15225; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r15722, %r15726}; + // begin inline asm + shf.l.wrap.b32 %r15730, %r15736, %r15735, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15734, %r15735, %r15736, %r15257; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15738, %r15744, %r15743, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r15742, %r15743, %r15744, %r15305; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15750, %r15751, %r15752, %r15329; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15746, %r15752, %r15751, %r15329; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r15746, %r15750}; + // begin inline asm + shf.l.wrap.b32 %r15754, %r15760, %r15759, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15758, %r15759, %r15760, %r15345; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15762, %r15768, %r15767, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15766, %r15767, %r15768, %r15353; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15770, %r15776, %r15775, %r15385; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15774, %r15775, %r15776, %r15385; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15778, %r15813, %r15706, %r15730, 0xD2; + lop3.b32 %r15779, %r15816, %r15710, %r15734, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r15786, %r15706, %r15730, %r15762, 0xD2; + lop3.b32 %r15787, %r15710, %r15734, %r15766, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r15786, %r15787}; + // begin inline asm + // chi + lop3.b32 %r15794, %r15730, %r15762, %r15738, 0xD2; + lop3.b32 %r15795, %r15734, %r15766, %r15742, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r15794, %r15795}; + // begin inline asm + // chi + lop3.b32 %r15802, %r15762, %r15738, %r15813, 0xD2; + lop3.b32 %r15803, %r15766, %r15742, %r15816, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r15802, %r15803}; + // begin inline asm + // chi + lop3.b32 %r15810, %r15738, %r15813, %r15706, 0xD2; + lop3.b32 %r15811, %r15742, %r15816, %r15710, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r15810, %r15811}; + // begin inline asm + // chi + lop3.b32 %r15818, %r15754, %r15714, %r15770, 0xD2; + lop3.b32 %r15819, %r15758, %r15718, %r15774, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r15818, %r15819}; + // begin inline asm + // chi + lop3.b32 %r15826, %r15714, %r15770, %r15746, 0xD2; + lop3.b32 %r15827, %r15718, %r15774, %r15750, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r15826, %r15827}; + // begin inline asm + // chi + lop3.b32 %r15834, %r15770, %r15746, %r15722, 0xD2; + lop3.b32 %r15835, %r15774, %r15750, %r15726, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r15834, %r15835}; + // begin inline asm + ld.global.nc.v2.u32 {%r15842,%r15843}, [%rd707]; + // end inline asm + xor.b32 %r15895, %r15779, %r15843; + xor.b32 %r15896, %r15778, %r15842; + mov.b64 %rd1265, {%r15896, %r15895}; + mov.b64 %rd1266, {%r15786, %r15787}; + mov.b64 %rd1267, {%r15794, %r15795}; + mov.b64 %rd156, {%r15802, %r15803}; + mov.b64 %rd1268, {%r15810, %r15811}; + mov.b64 %rd158, {%r15818, %r15819}; + mov.b64 %rd159, {%r15826, %r15827}; + mov.b64 %rd160, {%r15834, %r15835}; + mov.u32 %r30199, 0; + st.local.v2.u32 [%rd3+24], {%r15896, %r15895}; + st.local.v2.u32 [%rd149+96], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+104], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+112], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+120], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+128], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+136], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+144], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+152], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+160], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+168], {%r30199, %r30199}; + st.local.v2.u32 
[%rd149+176], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+184], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+192], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+200], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+208], {%r30199, %r30199}; + st.local.v2.u32 [%rd149+216], {%r30199, %r30199}; + mov.u32 %r30214, -2147483648; + st.local.v2.u32 [%rd149+88], {%r15858, %r30214}; + mov.u32 %r30200, %r30199; + mov.u32 %r30201, %r30199; + mov.u32 %r30202, %r30199; + mov.u32 %r30203, %r30199; + mov.u32 %r30204, %r30199; + mov.u32 %r30205, %r30199; + mov.u32 %r30206, %r30199; + mov.u32 %r30207, %r30199; + mov.u32 %r30208, %r30199; + mov.u32 %r30209, %r30199; + mov.u32 %r30210, %r30199; + mov.u32 %r30211, %r30199; + mov.u32 %r30212, %r30199; + mov.u32 %r30213, %r15858; + mov.u32 %r30215, %r30199; + mov.u32 %r30216, %r30199; + mov.u32 %r30217, %r30199; + mov.u32 %r30218, %r30199; + mov.u32 %r30219, %r30199; + mov.u32 %r30220, %r30199; + mov.u32 %r30221, %r30199; + mov.u32 %r30222, %r30199; + mov.u32 %r30223, %r30199; + mov.u32 %r30224, %r30199; + mov.u32 %r30225, %r30199; + mov.u32 %r30226, %r30199; + mov.u32 %r30227, %r30199; + mov.u32 %r30228, %r30199; + mov.u32 %r30229, %r30199; + mov.u32 %r30230, %r30199; + mov.u32 %r30231, %r30199; + mov.u32 %r30232, %r30199; + mov.u32 %r30249, %r30199; + +$L__BB2_56: + // begin inline asm + // xor5 + lop3.b32 %r15897, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r15897, %r15897, %r30229, %r30227, 0x96; + lop3.b32 %r15898, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r15898, %r15898, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15909, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r15909, %r15909, %r30223, %r30221, 0x96; + lop3.b32 %r15910, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r15910, %r15910, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15921, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r15921, %r15921, %r30217, %r30215, 0x96; + lop3.b32 %r15922, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r15922, %r15922, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15933, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r15933, %r15933, %r30209, %r30207, 0x96; + lop3.b32 %r15934, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r15934, %r15934, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r15945, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r15945, %r15945, %r30201, %r30199, 0x96; + lop3.b32 %r15946, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r15946, %r15946, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15957, %r15910, %r15909, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15961, %r15909, %r15910, %r15858; + // end inline asm + xor.b32 %r16391, %r15957, %r15945; + xor.b32 %r16392, %r15961, %r15946; + xor.b32 %r16224, %r30235, %r16391; + xor.b32 %r16227, %r30236, %r16392; + xor.b32 %r16131, %r30233, %r16391; + xor.b32 %r16130, %r30234, %r16392; + xor.b32 %r16178, %r30231, %r16391; + xor.b32 %r16179, %r30232, %r16392; + xor.b32 %r16083, %r30229, %r16391; + xor.b32 %r16082, %r30230, %r16392; + xor.b32 %r16034, %r30227, %r16391; + xor.b32 %r16035, %r30228, %r16392; + // begin inline asm + shf.l.wrap.b32 %r15965, %r15922, %r15921, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15969, %r15921, %r15922, %r15858; + // end inline asm + xor.b32 %r16393, %r15965, %r15897; + xor.b32 %r16394, %r15969, %r15898; + xor.b32 %r16186, 
%r30247, %r16393; + xor.b32 %r16187, %r30248, %r16394; + xor.b32 %r16003, %r30245, %r16393; + xor.b32 %r16002, %r30246, %r16394; + xor.b32 %r16162, %r30225, %r16393; + xor.b32 %r16163, %r30226, %r16394; + xor.b32 %r16123, %r30223, %r16393; + xor.b32 %r16122, %r30224, %r16394; + xor.b32 %r16106, %r30221, %r16393; + xor.b32 %r16107, %r30222, %r16394; + // begin inline asm + shf.l.wrap.b32 %r15973, %r15934, %r15933, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15977, %r15933, %r15934, %r15858; + // end inline asm + xor.b32 %r16395, %r15973, %r15909; + xor.b32 %r16396, %r15977, %r15910; + xor.b32 %r16043, %r30243, %r16395; + xor.b32 %r16042, %r30244, %r16396; + xor.b32 %r16170, %r30241, %r16395; + xor.b32 %r16171, %r30242, %r16396; + xor.b32 %r16051, %r30219, %r16395; + xor.b32 %r16050, %r30220, %r16396; + xor.b32 %r16154, %r30217, %r16395; + xor.b32 %r16155, %r30218, %r16396; + xor.b32 %r16019, %r30215, %r16395; + xor.b32 %r16018, %r30216, %r16396; + // begin inline asm + shf.l.wrap.b32 %r15981, %r15946, %r15945, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15985, %r15945, %r15946, %r15858; + // end inline asm + xor.b32 %r16397, %r15981, %r15921; + xor.b32 %r16398, %r15985, %r15922; + xor.b32 %r16138, %r30239, %r16397; + xor.b32 %r16139, %r30240, %r16398; + xor.b32 %r16115, %r30213, %r16397; + xor.b32 %r16114, %r30214, %r16398; + xor.b32 %r16058, %r30211, %r16397; + xor.b32 %r16059, %r30212, %r16398; + xor.b32 %r16146, %r30209, %r16397; + xor.b32 %r16147, %r30210, %r16398; + xor.b32 %r16075, %r30207, %r16397; + xor.b32 %r16074, %r30208, %r16398; + // begin inline asm + shf.l.wrap.b32 %r15989, %r15898, %r15897, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r15993, %r15897, %r15898, %r15858; + // end inline asm + xor.b32 %r16399, %r15989, %r15933; + xor.b32 %r16400, %r15993, %r15934; + xor.b32 %r16090, %r30237, %r16399; + xor.b32 %r16091, %r30238, %r16400; + xor.b32 %r16010, %r30205, %r16399; + xor.b32 %r16011, %r30206, %r16400; + xor.b32 %r16027, %r30203, %r16399; + xor.b32 %r16026, %r30204, %r16400; + xor.b32 %r16066, %r30201, %r16399; + xor.b32 %r16067, %r30202, %r16400; + xor.b32 %r16098, %r30199, %r16399; + xor.b32 %r16099, %r30200, %r16400; + mov.u32 %r16004, 44; + // begin inline asm + shf.l.wrap.b32 %r15997, %r16003, %r16002, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16001, %r16002, %r16003, %r16004; + // end inline asm + mov.u32 %r16012, 20; + // begin inline asm + shf.l.wrap.b32 %r16005, %r16011, %r16010, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16009, %r16010, %r16011, %r16012; + // end inline asm + mov.u32 %r16020, 61; + // begin inline asm + shf.l.wrap.b32 %r16013, %r16019, %r16018, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16017, %r16018, %r16019, %r16020; + // end inline asm + mov.u32 %r16028, 39; + // begin inline asm + shf.l.wrap.b32 %r16021, %r16027, %r16026, %r16028; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16025, %r16026, %r16027, %r16028; + // end inline asm + mov.u32 %r16036, 18; + // begin inline asm + shf.l.wrap.b32 %r16029, %r16035, %r16034, %r16036; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16033, %r16034, %r16035, %r16036; + // end inline asm + mov.u32 %r16044, 62; + // begin inline asm + shf.l.wrap.b32 %r16037, %r16043, %r16042, %r16044; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16041, %r16042, %r16043, %r16044; + // end inline asm + mov.u32 %r16052, 43; 
+ // begin inline asm + shf.l.wrap.b32 %r16045, %r16051, %r16050, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16049, %r16050, %r16051, %r16052; + // end inline asm + mov.u32 %r16060, 25; + // begin inline asm + shf.l.wrap.b32 %r16053, %r16059, %r16058, %r16060; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16057, %r16058, %r16059, %r16060; + // end inline asm + mov.u32 %r16068, 8; + // begin inline asm + shf.l.wrap.b32 %r16061, %r16067, %r16066, %r16068; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16065, %r16066, %r16067, %r16068; + // end inline asm + mov.u32 %r16076, 56; + // begin inline asm + shf.l.wrap.b32 %r16069, %r16075, %r16074, %r16076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16073, %r16074, %r16075, %r16076; + // end inline asm + mov.u32 %r16084, 41; + // begin inline asm + shf.l.wrap.b32 %r16077, %r16083, %r16082, %r16084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16081, %r16082, %r16083, %r16084; + // end inline asm + mov.u32 %r16092, 27; + // begin inline asm + shf.l.wrap.b32 %r16085, %r16091, %r16090, %r16092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16089, %r16090, %r16091, %r16092; + // end inline asm + mov.u32 %r16100, 14; + // begin inline asm + shf.l.wrap.b32 %r16093, %r16099, %r16098, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16097, %r16098, %r16099, %r16100; + // end inline asm + mov.u32 %r16108, 2; + // begin inline asm + shf.l.wrap.b32 %r16101, %r16107, %r16106, %r16108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16105, %r16106, %r16107, %r16108; + // end inline asm + mov.u32 %r16116, 55; + // begin inline asm + shf.l.wrap.b32 %r16109, %r16115, %r16114, %r16116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16113, %r16114, %r16115, %r16116; + // end inline asm + mov.u32 %r16124, 45; + // begin inline asm + shf.l.wrap.b32 %r16117, %r16123, %r16122, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16121, %r16122, %r16123, %r16124; + // end inline asm + mov.u32 %r16132, 36; + // begin inline asm + shf.l.wrap.b32 %r16125, %r16131, %r16130, %r16132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16129, %r16130, %r16131, %r16132; + // end inline asm + mov.u32 %r16140, 28; + // begin inline asm + shf.l.wrap.b32 %r16133, %r16139, %r16138, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16137, %r16138, %r16139, %r16140; + // end inline asm + mov.u32 %r16148, 21; + // begin inline asm + shf.l.wrap.b32 %r16141, %r16147, %r16146, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16145, %r16146, %r16147, %r16148; + // end inline asm + mov.u32 %r16156, 15; + // begin inline asm + shf.l.wrap.b32 %r16149, %r16155, %r16154, %r16156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16153, %r16154, %r16155, %r16156; + // end inline asm + mov.u32 %r16164, 10; + // begin inline asm + shf.l.wrap.b32 %r16157, %r16163, %r16162, %r16164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16161, %r16162, %r16163, %r16164; + // end inline asm + mov.u32 %r16172, 6; + // begin inline asm + shf.l.wrap.b32 %r16165, %r16171, %r16170, %r16172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16169, %r16170, %r16171, %r16172; + // end inline asm + mov.u32 %r16180, 3; + // begin inline asm + shf.l.wrap.b32 %r16173, %r16179, %r16178, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16177, 
%r16178, %r16179, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16181, %r16187, %r16186, %r15858; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16185, %r16186, %r16187, %r15858; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16189, %r16224, %r15997, %r16045, 0xD2; + lop3.b32 %r16190, %r16227, %r16001, %r16049, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30247, %r15997, %r16045, %r16141, 0xD2; + lop3.b32 %r30248, %r16001, %r16049, %r16145, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30243, %r16045, %r16141, %r16093, 0xD2; + lop3.b32 %r30244, %r16049, %r16145, %r16097, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30239, %r16141, %r16093, %r16224, 0xD2; + lop3.b32 %r30240, %r16145, %r16097, %r16227, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30237, %r16093, %r16224, %r15997, 0xD2; + lop3.b32 %r30238, %r16097, %r16227, %r16001, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30233, %r16133, %r16005, %r16173, 0xD2; + lop3.b32 %r30234, %r16137, %r16009, %r16177, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30245, %r16005, %r16173, %r16117, 0xD2; + lop3.b32 %r30246, %r16009, %r16177, %r16121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30241, %r16173, %r16117, %r16013, 0xD2; + lop3.b32 %r30242, %r16177, %r16121, %r16017, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30213, %r16117, %r16013, %r16133, 0xD2; + lop3.b32 %r30214, %r16121, %r16017, %r16137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+88], {%r30213, %r30214}; + // begin inline asm + // chi + lop3.b32 %r30205, %r16013, %r16133, %r16005, 0xD2; + lop3.b32 %r30206, %r16017, %r16137, %r16009, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r30205, %r30206}; + // begin inline asm + // chi + lop3.b32 %r30231, %r16181, %r16165, %r16053, 0xD2; + lop3.b32 %r30232, %r16185, %r16169, %r16057, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+104], {%r30231, %r30232}; + // begin inline asm + // chi + lop3.b32 %r30225, %r16165, %r16053, %r16061, 0xD2; + lop3.b32 %r30226, %r16169, %r16057, %r16065, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+112], {%r30225, %r30226}; + // begin inline asm + // chi + lop3.b32 %r30219, %r16053, %r16061, %r16029, 0xD2; + lop3.b32 %r30220, %r16057, %r16065, %r16033, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+120], {%r30219, %r30220}; + // begin inline asm + // chi + lop3.b32 %r30211, %r16061, %r16029, %r16181, 0xD2; + lop3.b32 %r30212, %r16065, %r16033, %r16185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+128], {%r30211, %r30212}; + // begin inline asm + // chi + lop3.b32 %r30203, %r16029, %r16181, %r16165, 0xD2; + lop3.b32 %r30204, %r16033, %r16185, %r16169, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+136], {%r30203, %r30204}; + // begin inline asm + // chi + lop3.b32 %r30229, %r16085, %r16125, %r16157, 0xD2; + lop3.b32 %r30230, %r16089, %r16129, %r16161, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+144], {%r30229, %r30230}; + // begin inline asm + // chi + lop3.b32 %r30223, %r16125, %r16157, %r16149, 0xD2; + lop3.b32 %r30224, %r16129, %r16161, %r16153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+152], {%r30223, %r30224}; + // begin inline asm + // chi + lop3.b32 %r30217, %r16157, %r16149, %r16069, 0xD2; + lop3.b32 %r30218, %r16161, %r16153, %r16073, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd149+160], {%r30217, %r30218}; + // begin inline asm + // chi + lop3.b32 %r30209, %r16149, %r16069, %r16085, 0xD2; + lop3.b32 %r30210, %r16153, %r16073, %r16089, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+168], {%r30209, %r30210}; + // begin inline asm + // chi + lop3.b32 %r30201, %r16069, %r16085, %r16125, 0xD2; + lop3.b32 %r30202, %r16073, %r16089, %r16129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+176], {%r30201, %r30202}; + // begin inline asm + // chi + lop3.b32 %r30227, %r16037, %r16109, %r16021, 0xD2; + lop3.b32 %r30228, %r16041, %r16113, %r16025, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+184], {%r30227, %r30228}; + // begin inline asm + // chi + lop3.b32 %r30221, %r16109, %r16021, %r16077, 0xD2; + lop3.b32 %r30222, %r16113, %r16025, %r16081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+192], {%r30221, %r30222}; + // begin inline asm + // chi + lop3.b32 %r30215, %r16021, %r16077, %r16101, 0xD2; + lop3.b32 %r30216, %r16025, %r16081, %r16105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+200], {%r30215, %r30216}; + // begin inline asm + // chi + lop3.b32 %r30207, %r16077, %r16101, %r16037, 0xD2; + lop3.b32 %r30208, %r16081, %r16105, %r16041, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+208], {%r30207, %r30208}; + // begin inline asm + // chi + lop3.b32 %r30199, %r16101, %r16037, %r16109, 0xD2; + lop3.b32 %r30200, %r16105, %r16041, %r16113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+216], {%r30199, %r30200}; + mul.wide.s32 %rd781, %r30249, 8; + add.s64 %rd780, %rd706, %rd781; + // begin inline asm + ld.global.nc.v2.u32 {%r16389,%r16390}, [%rd780]; + // end inline asm + xor.b32 %r30235, %r16189, %r16389; + xor.b32 %r30236, %r16190, %r16390; + add.s32 %r30249, %r30249, 1; + setp.lt.u32 %p34, %r30249, 23; + @%p34 bra $L__BB2_56; + + mov.u32 %r16500, 1; + st.local.v2.u32 [%rd149+32], {%r30247, %r30248}; + st.local.v2.u32 [%rd149+72], {%r30245, %r30246}; + st.local.v2.u32 [%rd149+40], {%r30243, %r30244}; + st.local.v2.u32 [%rd149+80], {%r30241, %r30242}; + st.local.v2.u32 [%rd149+48], {%r30239, %r30240}; + st.local.v2.u32 [%rd149+56], {%r30237, %r30238}; + st.local.v2.u32 [%rd149+24], {%r30235, %r30236}; + // begin inline asm + // xor5 + lop3.b32 %r16401, %r30235, %r30233, %r30231, 0x96; + lop3.b32 %r16401, %r16401, %r30229, %r30227, 0x96; + lop3.b32 %r16402, %r30236, %r30234, %r30232, 0x96; + lop3.b32 %r16402, %r16402, %r30230, %r30228, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16413, %r30247, %r30245, %r30225, 0x96; + lop3.b32 %r16413, %r16413, %r30223, %r30221, 0x96; + lop3.b32 %r16414, %r30248, %r30246, %r30226, 0x96; + lop3.b32 %r16414, %r16414, %r30224, %r30222, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16425, %r30243, %r30241, %r30219, 0x96; + lop3.b32 %r16425, %r16425, %r30217, %r30215, 0x96; + lop3.b32 %r16426, %r30244, %r30242, %r30220, 0x96; + lop3.b32 %r16426, %r16426, %r30218, %r30216, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16437, %r30239, %r30213, %r30211, 0x96; + lop3.b32 %r16437, %r16437, %r30209, %r30207, 0x96; + lop3.b32 %r16438, %r30240, %r30214, %r30212, 0x96; + lop3.b32 %r16438, %r16438, %r30210, %r30208, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r16449, %r30237, %r30205, %r30203, 0x96; + lop3.b32 %r16449, %r16449, %r30201, %r30199, 0x96; + lop3.b32 %r16450, %r30238, %r30206, %r30204, 0x96; + lop3.b32 %r16450, %r16450, %r30202, %r30200, 0x96; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r16461, %r16414, %r16413, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16465, %r16413, %r16414, %r16500; + // end inline asm + xor.b32 %r16639, %r16461, %r16449; + xor.b32 %r16640, %r16465, %r16450; + xor.b32 %r16608, %r30235, %r16639; + xor.b32 %r16611, %r30236, %r16640; + xor.b32 %r16571, %r30232, %r16640; + xor.b32 %r16570, %r30231, %r16639; + st.local.v2.u32 [%rd149+104], {%r16570, %r16571}; + // begin inline asm + shf.l.wrap.b32 %r16469, %r16426, %r16425, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16473, %r16425, %r16426, %r16500; + // end inline asm + xor.b32 %r16641, %r16469, %r16401; + xor.b32 %r16642, %r16473, %r16402; + xor.b32 %r16507, %r30245, %r16641; + xor.b32 %r16506, %r30246, %r16642; + xor.b32 %r16546, %r30224, %r16642; + xor.b32 %r16547, %r30223, %r16641; + st.local.v2.u32 [%rd149+152], {%r16547, %r16546}; + // begin inline asm + shf.l.wrap.b32 %r16477, %r16438, %r16437, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16481, %r16437, %r16438, %r16500; + // end inline asm + xor.b32 %r16643, %r16477, %r16413; + xor.b32 %r16644, %r16481, %r16414; + xor.b32 %r16530, %r30220, %r16644; + xor.b32 %r16531, %r30219, %r16643; + st.local.v2.u32 [%rd149+120], {%r16531, %r16530}; + xor.b32 %r16522, %r30216, %r16644; + xor.b32 %r16523, %r30215, %r16643; + st.local.v2.u32 [%rd149+200], {%r16523, %r16522}; + // begin inline asm + shf.l.wrap.b32 %r16485, %r16450, %r16449, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16489, %r16449, %r16450, %r16500; + // end inline asm + xor.b32 %r16645, %r16485, %r16425; + xor.b32 %r16646, %r16489, %r16426; + xor.b32 %r16554, %r30239, %r16645; + xor.b32 %r16555, %r30240, %r16646; + xor.b32 %r16563, %r30210, %r16646; + xor.b32 %r16562, %r30209, %r16645; + st.local.v2.u32 [%rd149+168], {%r16562, %r16563}; + // begin inline asm + shf.l.wrap.b32 %r16493, %r16402, %r16401, %r16500; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16497, %r16401, %r16402, %r16500; + // end inline asm + xor.b32 %r16647, %r16493, %r16437; + xor.b32 %r16648, %r16497, %r16438; + xor.b32 %r16514, %r30205, %r16647; + xor.b32 %r16515, %r30206, %r16648; + xor.b32 %r16539, %r30200, %r16648; + xor.b32 %r16538, %r30199, %r16647; + st.local.v2.u32 [%rd149+216], {%r16538, %r16539}; + // begin inline asm + shf.l.wrap.b32 %r16501, %r16507, %r16506, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16505, %r16506, %r16507, %r16004; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16509, %r16515, %r16514, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16513, %r16514, %r16515, %r16012; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16521, %r16522, %r16523, %r16020; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16517, %r16523, %r16522, %r16020; + // end inline asm + st.local.v2.u32 [%rd149+96], {%r16517, %r16521}; + // begin inline asm + shf.l.wrap.b32 %r16525, %r16531, %r16530, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16529, %r16530, %r16531, %r16052; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16533, %r16539, %r16538, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16537, %r16538, %r16539, %r16100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16545, %r16546, %r16547, %r16124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16541, %r16547, %r16546, %r16124; + // end inline asm + 
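+ // ---- annotation (readability note, not nvcc output) --------------------
+ // The chi step is the lop3.b32 ... 0xD2 pairs: immLut 0xD2 encodes
+ // f(a,b,c) = a ^ (~b & c), so a single LOP3 per 32-bit half replaces the
+ // not/and/xor sequence. Each round ends with iota, xoring lane (0,0) with
+ // keccak_round_constants[i]; the setp.lt.u32/@%p bra pair above runs the
+ // first 23 rounds in that loop, and this block is the peeled 24th round
+ // (constant index 23) whose results are stored straight to the output.
+ // A rough CUDA-level shape (illustrative only):
+ //
+ //   for (int i = 0; i < 24; ++i) {
+ //       theta(A);
+ //       rho_pi(A);
+ //       chi(A);                              // lop3 immLut 0xD2
+ //       A[0] ^= keccak_round_constants[i];   // iota
+ //   }
+ //
+ // After the st.global writes below, the $L__BB2_69 block reduces 2*%r29
+ // modulo 1179641 (the item count it also stores at [%rd3+8]) with a
+ // multiply-shift "magic number" division -- mul.wide.u32 by 3340575429,
+ // printed here as -954391867 -- instead of a hardware divide; each
+ // remainder is then scaled by 64 to address a 64-byte record.
+ // -------------------------------------------------------------------------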
st.local.v2.u32 [%rd149+88], {%r16541, %r16545}; + // begin inline asm + shf.l.wrap.b32 %r16549, %r16555, %r16554, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16553, %r16554, %r16555, %r16140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16557, %r16563, %r16562, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16561, %r16562, %r16563, %r16148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16565, %r16571, %r16570, %r16180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r16569, %r16570, %r16571, %r16180; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16573, %r16608, %r16501, %r16525, 0xD2; + lop3.b32 %r16574, %r16611, %r16505, %r16529, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r16581, %r16501, %r16525, %r16557, 0xD2; + lop3.b32 %r16582, %r16505, %r16529, %r16561, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+32], {%r16581, %r16582}; + // begin inline asm + // chi + lop3.b32 %r16589, %r16525, %r16557, %r16533, 0xD2; + lop3.b32 %r16590, %r16529, %r16561, %r16537, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+40], {%r16589, %r16590}; + // begin inline asm + // chi + lop3.b32 %r16597, %r16557, %r16533, %r16608, 0xD2; + lop3.b32 %r16598, %r16561, %r16537, %r16611, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+48], {%r16597, %r16598}; + // begin inline asm + // chi + lop3.b32 %r16605, %r16533, %r16608, %r16501, 0xD2; + lop3.b32 %r16606, %r16537, %r16611, %r16505, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+56], {%r16605, %r16606}; + // begin inline asm + // chi + lop3.b32 %r16613, %r16549, %r16509, %r16565, 0xD2; + lop3.b32 %r16614, %r16553, %r16513, %r16569, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+64], {%r16613, %r16614}; + // begin inline asm + // chi + lop3.b32 %r16621, %r16509, %r16565, %r16541, 0xD2; + lop3.b32 %r16622, %r16513, %r16569, %r16545, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+72], {%r16621, %r16622}; + // begin inline asm + // chi + lop3.b32 %r16629, %r16565, %r16541, %r16517, 0xD2; + lop3.b32 %r16630, %r16569, %r16545, %r16521, 0xD2; + // end inline asm + st.local.v2.u32 [%rd149+80], {%r16629, %r16630}; + // begin inline asm + ld.global.nc.v2.u32 {%r16637,%r16638}, [%rd707]; + // end inline asm + xor.b32 %r16649, %r16574, %r16638; + xor.b32 %r16650, %r16573, %r16637; + st.local.v2.u32 [%rd149+24], {%r16650, %r16649}; + st.global.u64 [%rd130], %rd1265; + st.global.u64 [%rd130+8], %rd1266; + st.global.u64 [%rd130+16], %rd1267; + st.global.u64 [%rd130+24], %rd156; + st.global.u64 [%rd130+32], %rd1268; + st.global.u64 [%rd130+40], %rd158; + st.global.u64 [%rd130+48], %rd159; + st.global.u64 [%rd130+56], %rd160; + st.global.v2.u32 [%rd130+64], {%r16650, %r16649}; + st.global.v2.u32 [%rd130+72], {%r16581, %r16582}; + st.global.v2.u32 [%rd130+80], {%r16589, %r16590}; + st.global.v2.u32 [%rd130+88], {%r16597, %r16598}; + st.global.v2.u32 [%rd130+96], {%r16605, %r16606}; + st.global.v2.u32 [%rd130+104], {%r16613, %r16614}; + st.global.v2.u32 [%rd130+112], {%r16621, %r16622}; + st.global.v2.u32 [%rd130+120], {%r16629, %r16630}; + +$L__BB2_69: + shl.b32 %r3326, %r29, 1; + mul.wide.u32 %rd887, %r3326, -954391867; + shr.u64 %rd888, %rd887, 32; + cvt.u32.u64 %r19935, %rd888; + sub.s32 %r19936, %r3326, %r19935; + shr.u32 %r19937, %r19936, 1; + add.s32 %r19938, %r19937, %r19935; + shr.u32 %r19939, %r19938, 20; + mul.lo.s32 %r19940, %r19939, 1179641; + sub.s32 %r19941, %r3326, %r19940; + mul.wide.u32 %rd890, %r19941, 
64; + add.s64 %rd222, %rd471, %rd890; + or.b32 %r3327, %r3326, 1; + mul.wide.u32 %rd891, %r3327, -954391867; + shr.u64 %rd892, %rd891, 32; + cvt.u32.u64 %r19942, %rd892; + sub.s32 %r19943, %r3327, %r19942; + shr.u32 %r19944, %r19943, 1; + add.s32 %r19945, %r19944, %r19942; + shr.u32 %r19946, %r19945, 20; + mul.lo.s32 %r19947, %r19946, 1179641; + sub.s32 %r19948, %r3327, %r19947; + mul.wide.u32 %rd893, %r19948, 64; + add.s64 %rd223, %rd471, %rd893; + @%p16 bra $L__BB2_83; + + cvta.to.global.u64 %rd894, %rd353; + mul.wide.u32 %rd895, %r29, 128; + add.s64 %rd224, %rd894, %rd895; + ld.global.u64 %rd1269, [%rd224]; + setp.eq.s64 %p41, %rd1269, 0; + @%p41 bra $L__BB2_72; + + ld.global.u64 %rd1272, [%rd224+32]; + ld.global.u64 %rd1271, [%rd224+16]; + ld.global.u64 %rd1270, [%rd224+8]; + bra.uni $L__BB2_94; + +$L__BB2_83: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd1011, 1179641; + st.local.u64 [%rd3+8], %rd1011; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd1012, [%rd222]; + ld.global.u64 %rd1013, [%rd222+8]; + ld.global.u64 %rd1014, [%rd222+16]; + ld.global.u64 %rd1015, [%rd222+24]; + ld.global.u64 %rd1016, [%rd222+32]; + ld.global.u64 %rd1017, [%rd222+40]; + ld.global.u64 %rd1018, [%rd222+48]; + ld.global.u64 %rd1019, [%rd222+56]; + st.local.u64 [%rd3+24], %rd1012; + st.local.u64 [%rd3+32], %rd1013; + st.local.u64 [%rd3+40], %rd1014; + st.local.u64 [%rd3+48], %rd1015; + st.local.u64 [%rd3+56], %rd1016; + st.local.u64 [%rd3+64], %rd1017; + st.local.u64 [%rd3+72], %rd1018; + st.local.u64 [%rd3+80], %rd1019; + cvt.u32.u64 %r23275, %rd1012; + xor.b32 %r23276, %r3326, %r23275; + st.local.u32 [%rd3+24], %r23276; + mov.u32 %r30724, 0; + st.local.v2.u32 [%rd3+96], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+104], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+112], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+120], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+128], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+136], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+144], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+152], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+160], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+168], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+176], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+184], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+192], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+200], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+208], {%r30724, %r30724}; + st.local.v2.u32 [%rd3+216], {%r30724, %r30724}; + mov.u32 %r30739, -2147483648; + mov.u32 %r23248, 1; + st.local.v2.u32 [%rd3+88], {%r23248, %r30739}; + ld.local.v2.u32 {%r30760, %r30761}, [%rd3+24]; + mov.b64 {%r30758, %r30759}, %rd1017; + shr.u64 %rd1020, %rd1013, 32; + cvt.u32.u64 %r30772, %rd1013; + cvt.u32.u64 %r30773, %rd1020; + shr.u64 %rd1021, %rd1018, 32; + cvt.u32.u64 %r30770, %rd1018; + cvt.u32.u64 %r30771, %rd1021; + shr.u64 %rd1022, %rd1014, 32; + cvt.u32.u64 %r30768, %rd1014; + cvt.u32.u64 %r30769, %rd1022; + shr.u64 %rd1023, %rd1019, 32; + cvt.u32.u64 %r30766, %rd1019; + cvt.u32.u64 %r30767, %rd1023; + shr.u64 %rd1024, %rd1015, 32; + cvt.u32.u64 %r30764, %rd1015; + cvt.u32.u64 %r30765, %rd1024; + shr.u64 %rd1025, %rd1016, 32; + cvt.u32.u64 %r30762, %rd1016; + cvt.u32.u64 %r30763, %rd1025; + mov.u32 %r30725, %r30724; + mov.u32 %r30726, %r30724; + mov.u32 %r30727, %r30724; + mov.u32 %r30728, %r30724; + mov.u32 %r30729, %r30724; + mov.u32 %r30730, %r30724; + mov.u32 %r30731, %r30724; + mov.u32 %r30732, %r30724; + mov.u32 %r30733, %r30724; + mov.u32 %r30734, %r30724; + mov.u32 %r30735, %r30724; + mov.u32 %r30736, 
%r30724; + mov.u32 %r30737, %r30724; + mov.u32 %r30738, %r23248; + mov.u32 %r30740, %r30724; + mov.u32 %r30741, %r30724; + mov.u32 %r30742, %r30724; + mov.u32 %r30743, %r30724; + mov.u32 %r30744, %r30724; + mov.u32 %r30745, %r30724; + mov.u32 %r30746, %r30724; + mov.u32 %r30747, %r30724; + mov.u32 %r30748, %r30724; + mov.u32 %r30749, %r30724; + mov.u32 %r30750, %r30724; + mov.u32 %r30751, %r30724; + mov.u32 %r30752, %r30724; + mov.u32 %r30753, %r30724; + mov.u32 %r30754, %r30724; + mov.u32 %r30755, %r30724; + mov.u32 %r30756, %r30724; + mov.u32 %r30757, %r30724; + mov.u32 %r30774, %r30724; + +$L__BB2_84: + // begin inline asm + // xor5 + lop3.b32 %r23279, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23279, %r23279, %r30754, %r30752, 0x96; + lop3.b32 %r23280, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23280, %r23280, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23291, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23291, %r23291, %r30748, %r30746, 0x96; + lop3.b32 %r23292, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23292, %r23292, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23303, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23303, %r23303, %r30742, %r30740, 0x96; + lop3.b32 %r23304, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23304, %r23304, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23315, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23315, %r23315, %r30734, %r30732, 0x96; + lop3.b32 %r23316, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23316, %r23316, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23327, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23327, %r23327, %r30726, %r30724, 0x96; + lop3.b32 %r23328, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23328, %r23328, %r30727, %r30725, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23339, %r23292, %r23291, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23343, %r23291, %r23292, %r23248; + // end inline asm + xor.b32 %r23773, %r23339, %r23327; + xor.b32 %r23774, %r23343, %r23328; + xor.b32 %r23606, %r30760, %r23773; + xor.b32 %r23609, %r30761, %r23774; + xor.b32 %r23513, %r30758, %r23773; + xor.b32 %r23512, %r30759, %r23774; + xor.b32 %r23560, %r30756, %r23773; + xor.b32 %r23561, %r30757, %r23774; + xor.b32 %r23465, %r30754, %r23773; + xor.b32 %r23464, %r30755, %r23774; + xor.b32 %r23416, %r30752, %r23773; + xor.b32 %r23417, %r30753, %r23774; + // begin inline asm + shf.l.wrap.b32 %r23347, %r23304, %r23303, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23351, %r23303, %r23304, %r23248; + // end inline asm + xor.b32 %r23775, %r23347, %r23279; + xor.b32 %r23776, %r23351, %r23280; + xor.b32 %r23568, %r30772, %r23775; + xor.b32 %r23569, %r30773, %r23776; + xor.b32 %r23385, %r30770, %r23775; + xor.b32 %r23384, %r30771, %r23776; + xor.b32 %r23544, %r30750, %r23775; + xor.b32 %r23545, %r30751, %r23776; + xor.b32 %r23505, %r30748, %r23775; + xor.b32 %r23504, %r30749, %r23776; + xor.b32 %r23488, %r30746, %r23775; + xor.b32 %r23489, %r30747, %r23776; + // begin inline asm + shf.l.wrap.b32 %r23355, %r23316, %r23315, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23359, %r23315, %r23316, %r23248; + // end inline asm + xor.b32 %r23777, %r23355, %r23291; + xor.b32 %r23778, %r23359, %r23292; + xor.b32 %r23425, %r30768, %r23777; + xor.b32 %r23424, %r30769, %r23778; + xor.b32 
%r23552, %r30766, %r23777; + xor.b32 %r23553, %r30767, %r23778; + xor.b32 %r23433, %r30744, %r23777; + xor.b32 %r23432, %r30745, %r23778; + xor.b32 %r23536, %r30742, %r23777; + xor.b32 %r23537, %r30743, %r23778; + xor.b32 %r23401, %r30740, %r23777; + xor.b32 %r23400, %r30741, %r23778; + // begin inline asm + shf.l.wrap.b32 %r23363, %r23328, %r23327, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23367, %r23327, %r23328, %r23248; + // end inline asm + xor.b32 %r23779, %r23363, %r23303; + xor.b32 %r23780, %r23367, %r23304; + xor.b32 %r23520, %r30764, %r23779; + xor.b32 %r23521, %r30765, %r23780; + xor.b32 %r23497, %r30738, %r23779; + xor.b32 %r23496, %r30739, %r23780; + xor.b32 %r23440, %r30736, %r23779; + xor.b32 %r23441, %r30737, %r23780; + xor.b32 %r23528, %r30734, %r23779; + xor.b32 %r23529, %r30735, %r23780; + xor.b32 %r23457, %r30732, %r23779; + xor.b32 %r23456, %r30733, %r23780; + // begin inline asm + shf.l.wrap.b32 %r23371, %r23280, %r23279, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23375, %r23279, %r23280, %r23248; + // end inline asm + xor.b32 %r23781, %r23371, %r23315; + xor.b32 %r23782, %r23375, %r23316; + xor.b32 %r23472, %r30762, %r23781; + xor.b32 %r23473, %r30763, %r23782; + xor.b32 %r23392, %r30730, %r23781; + xor.b32 %r23393, %r30731, %r23782; + xor.b32 %r23409, %r30728, %r23781; + xor.b32 %r23408, %r30729, %r23782; + xor.b32 %r23448, %r30726, %r23781; + xor.b32 %r23449, %r30727, %r23782; + xor.b32 %r23480, %r30724, %r23781; + xor.b32 %r23481, %r30725, %r23782; + mov.u32 %r23386, 44; + // begin inline asm + shf.l.wrap.b32 %r23379, %r23385, %r23384, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23383, %r23384, %r23385, %r23386; + // end inline asm + mov.u32 %r23394, 20; + // begin inline asm + shf.l.wrap.b32 %r23387, %r23393, %r23392, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23391, %r23392, %r23393, %r23394; + // end inline asm + mov.u32 %r23402, 61; + // begin inline asm + shf.l.wrap.b32 %r23395, %r23401, %r23400, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23399, %r23400, %r23401, %r23402; + // end inline asm + mov.u32 %r23410, 39; + // begin inline asm + shf.l.wrap.b32 %r23403, %r23409, %r23408, %r23410; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23407, %r23408, %r23409, %r23410; + // end inline asm + mov.u32 %r23418, 18; + // begin inline asm + shf.l.wrap.b32 %r23411, %r23417, %r23416, %r23418; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23415, %r23416, %r23417, %r23418; + // end inline asm + mov.u32 %r23426, 62; + // begin inline asm + shf.l.wrap.b32 %r23419, %r23425, %r23424, %r23426; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23423, %r23424, %r23425, %r23426; + // end inline asm + mov.u32 %r23434, 43; + // begin inline asm + shf.l.wrap.b32 %r23427, %r23433, %r23432, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23431, %r23432, %r23433, %r23434; + // end inline asm + mov.u32 %r23442, 25; + // begin inline asm + shf.l.wrap.b32 %r23435, %r23441, %r23440, %r23442; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23439, %r23440, %r23441, %r23442; + // end inline asm + mov.u32 %r23450, 8; + // begin inline asm + shf.l.wrap.b32 %r23443, %r23449, %r23448, %r23450; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23447, %r23448, %r23449, %r23450; + // end inline asm + mov.u32 %r23458, 56; + // begin inline asm + shf.l.wrap.b32 %r23451, %r23457, 
%r23456, %r23458; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23455, %r23456, %r23457, %r23458; + // end inline asm + mov.u32 %r23466, 41; + // begin inline asm + shf.l.wrap.b32 %r23459, %r23465, %r23464, %r23466; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23463, %r23464, %r23465, %r23466; + // end inline asm + mov.u32 %r23474, 27; + // begin inline asm + shf.l.wrap.b32 %r23467, %r23473, %r23472, %r23474; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23471, %r23472, %r23473, %r23474; + // end inline asm + mov.u32 %r23482, 14; + // begin inline asm + shf.l.wrap.b32 %r23475, %r23481, %r23480, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23479, %r23480, %r23481, %r23482; + // end inline asm + mov.u32 %r23490, 2; + // begin inline asm + shf.l.wrap.b32 %r23483, %r23489, %r23488, %r23490; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23487, %r23488, %r23489, %r23490; + // end inline asm + mov.u32 %r23498, 55; + // begin inline asm + shf.l.wrap.b32 %r23491, %r23497, %r23496, %r23498; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23495, %r23496, %r23497, %r23498; + // end inline asm + mov.u32 %r23506, 45; + // begin inline asm + shf.l.wrap.b32 %r23499, %r23505, %r23504, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23503, %r23504, %r23505, %r23506; + // end inline asm + mov.u32 %r23514, 36; + // begin inline asm + shf.l.wrap.b32 %r23507, %r23513, %r23512, %r23514; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23511, %r23512, %r23513, %r23514; + // end inline asm + mov.u32 %r23522, 28; + // begin inline asm + shf.l.wrap.b32 %r23515, %r23521, %r23520, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23519, %r23520, %r23521, %r23522; + // end inline asm + mov.u32 %r23530, 21; + // begin inline asm + shf.l.wrap.b32 %r23523, %r23529, %r23528, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23527, %r23528, %r23529, %r23530; + // end inline asm + mov.u32 %r23538, 15; + // begin inline asm + shf.l.wrap.b32 %r23531, %r23537, %r23536, %r23538; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23535, %r23536, %r23537, %r23538; + // end inline asm + mov.u32 %r23546, 10; + // begin inline asm + shf.l.wrap.b32 %r23539, %r23545, %r23544, %r23546; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23543, %r23544, %r23545, %r23546; + // end inline asm + mov.u32 %r23554, 6; + // begin inline asm + shf.l.wrap.b32 %r23547, %r23553, %r23552, %r23554; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23551, %r23552, %r23553, %r23554; + // end inline asm + mov.u32 %r23562, 3; + // begin inline asm + shf.l.wrap.b32 %r23555, %r23561, %r23560, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23559, %r23560, %r23561, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23563, %r23569, %r23568, %r23248; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23567, %r23568, %r23569, %r23248; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23571, %r23606, %r23379, %r23427, 0xD2; + lop3.b32 %r23572, %r23609, %r23383, %r23431, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30772, %r23379, %r23427, %r23523, 0xD2; + lop3.b32 %r30773, %r23383, %r23431, %r23527, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30768, %r23427, %r23523, %r23475, 0xD2; + lop3.b32 %r30769, %r23431, %r23527, %r23479, 0xD2; + // end 
inline asm + // begin inline asm + // chi + lop3.b32 %r30764, %r23523, %r23475, %r23606, 0xD2; + lop3.b32 %r30765, %r23527, %r23479, %r23609, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30762, %r23475, %r23606, %r23379, 0xD2; + lop3.b32 %r30763, %r23479, %r23609, %r23383, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30758, %r23515, %r23387, %r23555, 0xD2; + lop3.b32 %r30759, %r23519, %r23391, %r23559, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30770, %r23387, %r23555, %r23499, 0xD2; + lop3.b32 %r30771, %r23391, %r23559, %r23503, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30766, %r23555, %r23499, %r23395, 0xD2; + lop3.b32 %r30767, %r23559, %r23503, %r23399, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30738, %r23499, %r23395, %r23515, 0xD2; + lop3.b32 %r30739, %r23503, %r23399, %r23519, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30738, %r30739}; + // begin inline asm + // chi + lop3.b32 %r30730, %r23395, %r23515, %r23387, 0xD2; + lop3.b32 %r30731, %r23399, %r23519, %r23391, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30730, %r30731}; + // begin inline asm + // chi + lop3.b32 %r30756, %r23563, %r23547, %r23435, 0xD2; + lop3.b32 %r30757, %r23567, %r23551, %r23439, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30756, %r30757}; + // begin inline asm + // chi + lop3.b32 %r30750, %r23547, %r23435, %r23443, 0xD2; + lop3.b32 %r30751, %r23551, %r23439, %r23447, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30750, %r30751}; + // begin inline asm + // chi + lop3.b32 %r30744, %r23435, %r23443, %r23411, 0xD2; + lop3.b32 %r30745, %r23439, %r23447, %r23415, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30744, %r30745}; + // begin inline asm + // chi + lop3.b32 %r30736, %r23443, %r23411, %r23563, 0xD2; + lop3.b32 %r30737, %r23447, %r23415, %r23567, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30736, %r30737}; + // begin inline asm + // chi + lop3.b32 %r30728, %r23411, %r23563, %r23547, 0xD2; + lop3.b32 %r30729, %r23415, %r23567, %r23551, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30728, %r30729}; + // begin inline asm + // chi + lop3.b32 %r30754, %r23467, %r23507, %r23539, 0xD2; + lop3.b32 %r30755, %r23471, %r23511, %r23543, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30754, %r30755}; + // begin inline asm + // chi + lop3.b32 %r30748, %r23507, %r23539, %r23531, 0xD2; + lop3.b32 %r30749, %r23511, %r23543, %r23535, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30748, %r30749}; + // begin inline asm + // chi + lop3.b32 %r30742, %r23539, %r23531, %r23451, 0xD2; + lop3.b32 %r30743, %r23543, %r23535, %r23455, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30742, %r30743}; + // begin inline asm + // chi + lop3.b32 %r30734, %r23531, %r23451, %r23467, 0xD2; + lop3.b32 %r30735, %r23535, %r23455, %r23471, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30734, %r30735}; + // begin inline asm + // chi + lop3.b32 %r30726, %r23451, %r23467, %r23507, 0xD2; + lop3.b32 %r30727, %r23455, %r23471, %r23511, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30726, %r30727}; + // begin inline asm + // chi + lop3.b32 %r30752, %r23419, %r23491, %r23403, 0xD2; + lop3.b32 %r30753, %r23423, %r23495, %r23407, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30752, %r30753}; + // begin inline asm + // chi + lop3.b32 %r30746, %r23491, %r23403, 
%r23459, 0xD2; + lop3.b32 %r30747, %r23495, %r23407, %r23463, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30746, %r30747}; + // begin inline asm + // chi + lop3.b32 %r30740, %r23403, %r23459, %r23483, 0xD2; + lop3.b32 %r30741, %r23407, %r23463, %r23487, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30740, %r30741}; + // begin inline asm + // chi + lop3.b32 %r30732, %r23459, %r23483, %r23419, 0xD2; + lop3.b32 %r30733, %r23463, %r23487, %r23423, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30732, %r30733}; + // begin inline asm + // chi + lop3.b32 %r30724, %r23483, %r23419, %r23491, 0xD2; + lop3.b32 %r30725, %r23487, %r23423, %r23495, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30724, %r30725}; + mul.wide.s32 %rd1027, %r30774, 8; + mov.u64 %rd1028, keccak_round_constants; + cvta.const.u64 %rd1029, %rd1028; + add.s64 %rd1026, %rd1029, %rd1027; + // begin inline asm + ld.global.nc.v2.u32 {%r23771,%r23772}, [%rd1026]; + // end inline asm + xor.b32 %r30760, %r23571, %r23771; + xor.b32 %r30761, %r23572, %r23772; + add.s32 %r30774, %r30774, 1; + setp.lt.u32 %p47, %r30774, 23; + @%p47 bra $L__BB2_84; + + add.u64 %rd272, %SPL, 1912; + st.local.v2.u32 [%rd3+32], {%r30772, %r30773}; + st.local.v2.u32 [%rd3+72], {%r30770, %r30771}; + st.local.v2.u32 [%rd3+40], {%r30768, %r30769}; + st.local.v2.u32 [%rd3+80], {%r30766, %r30767}; + st.local.v2.u32 [%rd3+48], {%r30764, %r30765}; + st.local.v2.u32 [%rd3+56], {%r30762, %r30763}; + st.local.v2.u32 [%rd3+24], {%r30760, %r30761}; + // begin inline asm + // xor5 + lop3.b32 %r23783, %r30760, %r30758, %r30756, 0x96; + lop3.b32 %r23783, %r23783, %r30754, %r30752, 0x96; + lop3.b32 %r23784, %r30761, %r30759, %r30757, 0x96; + lop3.b32 %r23784, %r23784, %r30755, %r30753, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23795, %r30772, %r30770, %r30750, 0x96; + lop3.b32 %r23795, %r23795, %r30748, %r30746, 0x96; + lop3.b32 %r23796, %r30773, %r30771, %r30751, 0x96; + lop3.b32 %r23796, %r23796, %r30749, %r30747, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23807, %r30768, %r30766, %r30744, 0x96; + lop3.b32 %r23807, %r23807, %r30742, %r30740, 0x96; + lop3.b32 %r23808, %r30769, %r30767, %r30745, 0x96; + lop3.b32 %r23808, %r23808, %r30743, %r30741, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23819, %r30764, %r30738, %r30736, 0x96; + lop3.b32 %r23819, %r23819, %r30734, %r30732, 0x96; + lop3.b32 %r23820, %r30765, %r30739, %r30737, 0x96; + lop3.b32 %r23820, %r23820, %r30735, %r30733, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23831, %r30762, %r30730, %r30728, 0x96; + lop3.b32 %r23831, %r23831, %r30726, %r30724, 0x96; + lop3.b32 %r23832, %r30763, %r30731, %r30729, 0x96; + lop3.b32 %r23832, %r23832, %r30727, %r30725, 0x96; + // end inline asm + mov.u32 %r24035, 1; + // begin inline asm + shf.l.wrap.b32 %r23843, %r23796, %r23795, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23847, %r23795, %r23796, %r24035; + // end inline asm + xor.b32 %r24062, %r23843, %r23831; + xor.b32 %r24063, %r23847, %r23832; + xor.b32 %r23990, %r30760, %r24062; + xor.b32 %r23993, %r30761, %r24063; + xor.b32 %r23953, %r30757, %r24063; + xor.b32 %r23952, %r30756, %r24062; + st.local.v2.u32 [%rd3+104], {%r23952, %r23953}; + // begin inline asm + shf.l.wrap.b32 %r23851, %r23808, %r23807, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23855, %r23807, %r23808, %r24035; + // end inline asm + xor.b32 %r24064, 
%r23851, %r23783; + xor.b32 %r24065, %r23855, %r23784; + xor.b32 %r23889, %r30770, %r24064; + xor.b32 %r23888, %r30771, %r24065; + xor.b32 %r23928, %r30749, %r24065; + xor.b32 %r23929, %r30748, %r24064; + st.local.v2.u32 [%rd3+152], {%r23929, %r23928}; + // begin inline asm + shf.l.wrap.b32 %r23859, %r23820, %r23819, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23863, %r23819, %r23820, %r24035; + // end inline asm + xor.b32 %r24066, %r23859, %r23795; + xor.b32 %r24067, %r23863, %r23796; + xor.b32 %r23912, %r30745, %r24067; + xor.b32 %r23913, %r30744, %r24066; + st.local.v2.u32 [%rd3+120], {%r23913, %r23912}; + xor.b32 %r23904, %r30741, %r24067; + xor.b32 %r23905, %r30740, %r24066; + st.local.v2.u32 [%rd3+200], {%r23905, %r23904}; + // begin inline asm + shf.l.wrap.b32 %r23867, %r23832, %r23831, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23871, %r23831, %r23832, %r24035; + // end inline asm + xor.b32 %r24068, %r23867, %r23807; + xor.b32 %r24069, %r23871, %r23808; + xor.b32 %r23936, %r30764, %r24068; + xor.b32 %r23937, %r30765, %r24069; + xor.b32 %r23945, %r30735, %r24069; + xor.b32 %r23944, %r30734, %r24068; + st.local.v2.u32 [%rd3+168], {%r23944, %r23945}; + // begin inline asm + shf.l.wrap.b32 %r23875, %r23784, %r23783, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23879, %r23783, %r23784, %r24035; + // end inline asm + xor.b32 %r24070, %r23875, %r23819; + xor.b32 %r24071, %r23879, %r23820; + xor.b32 %r23896, %r30730, %r24070; + xor.b32 %r23897, %r30731, %r24071; + xor.b32 %r23921, %r30725, %r24071; + xor.b32 %r23920, %r30724, %r24070; + st.local.v2.u32 [%rd3+216], {%r23920, %r23921}; + // begin inline asm + shf.l.wrap.b32 %r23883, %r23889, %r23888, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23887, %r23888, %r23889, %r23386; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23891, %r23897, %r23896, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23895, %r23896, %r23897, %r23394; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23903, %r23904, %r23905, %r23402; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23899, %r23905, %r23904, %r23402; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r23899, %r23903}; + // begin inline asm + shf.l.wrap.b32 %r23907, %r23913, %r23912, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23911, %r23912, %r23913, %r23434; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23915, %r23921, %r23920, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23919, %r23920, %r23921, %r23482; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23927, %r23928, %r23929, %r23506; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23923, %r23929, %r23928, %r23506; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r23923, %r23927}; + // begin inline asm + shf.l.wrap.b32 %r23931, %r23937, %r23936, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23935, %r23936, %r23937, %r23522; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23939, %r23945, %r23944, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23943, %r23944, %r23945, %r23530; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23947, %r23953, %r23952, %r23562; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23951, %r23952, %r23953, %r23562; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23955, 
%r23990, %r23883, %r23907, 0xD2; + lop3.b32 %r23956, %r23993, %r23887, %r23911, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r23883, %r23907, %r23939, 0xD2; + lop3.b32 %r30908, %r23887, %r23911, %r23943, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + // begin inline asm + // chi + lop3.b32 %r30903, %r23907, %r23939, %r23915, 0xD2; + lop3.b32 %r30904, %r23911, %r23943, %r23919, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + // begin inline asm + // chi + lop3.b32 %r30899, %r23939, %r23915, %r23990, 0xD2; + lop3.b32 %r30900, %r23943, %r23919, %r23993, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + // begin inline asm + // chi + lop3.b32 %r30897, %r23915, %r23990, %r23883, 0xD2; + lop3.b32 %r30898, %r23919, %r23993, %r23887, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + // begin inline asm + // chi + lop3.b32 %r30893, %r23931, %r23891, %r23947, 0xD2; + lop3.b32 %r30894, %r23935, %r23895, %r23951, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + // begin inline asm + // chi + lop3.b32 %r30905, %r23891, %r23947, %r23923, 0xD2; + lop3.b32 %r30906, %r23895, %r23951, %r23927, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + // begin inline asm + // chi + lop3.b32 %r30901, %r23947, %r23923, %r23899, 0xD2; + lop3.b32 %r30902, %r23951, %r23927, %r23903, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + add.s64 %rd1030, %rd1029, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r24019,%r24020}, [%rd1030]; + // end inline asm + xor.b32 %r30895, %r23955, %r24019; + xor.b32 %r30896, %r23956, %r24020; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.u64 [%rd272], %rd354; + mov.u64 %rd1034, 1179641; + st.local.u64 [%rd272+8], %rd1034; + st.local.u32 [%rd272+16], %r3327; + ld.global.u64 %rd1035, [%rd223]; + ld.global.u64 %rd1036, [%rd223+8]; + ld.global.u64 %rd1037, [%rd223+16]; + ld.global.u64 %rd1038, [%rd223+24]; + ld.global.u64 %rd1039, [%rd223+32]; + ld.global.u64 %rd1040, [%rd223+40]; + ld.global.u64 %rd1041, [%rd223+48]; + ld.global.u64 %rd1042, [%rd223+56]; + st.local.u64 [%rd272+32], %rd1036; + st.local.u64 [%rd272+40], %rd1037; + st.local.u64 [%rd272+48], %rd1038; + st.local.u64 [%rd272+56], %rd1039; + st.local.u64 [%rd272+64], %rd1040; + st.local.u64 [%rd272+72], %rd1041; + st.local.u64 [%rd272+80], %rd1042; + cvt.u32.u64 %r24072, %rd1035; + xor.b32 %r24073, %r3327, %r24072; + st.local.u64 [%rd272+24], %rd1035; + st.local.u32 [%rd272+24], %r24073; + mov.u32 %r30775, 0; + st.local.v2.u32 [%rd272+96], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+104], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+112], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+120], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+128], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+136], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+144], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+152], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+160], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+168], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+176], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+184], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+192], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+200], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+208], {%r30775, %r30775}; + st.local.v2.u32 [%rd272+216], {%r30775, %r30775}; + mov.u32 %r30790, -2147483648; + st.local.v2.u32 [%rd272+88], 
{%r24035, %r30790}; + ld.local.v2.u32 {%r30811, %r30812}, [%rd272+24]; + mov.b64 {%r30809, %r30810}, %rd1040; + shr.u64 %rd1043, %rd1036, 32; + cvt.u32.u64 %r30823, %rd1036; + cvt.u32.u64 %r30824, %rd1043; + shr.u64 %rd1044, %rd1041, 32; + cvt.u32.u64 %r30821, %rd1041; + cvt.u32.u64 %r30822, %rd1044; + shr.u64 %rd1045, %rd1037, 32; + cvt.u32.u64 %r30819, %rd1037; + cvt.u32.u64 %r30820, %rd1045; + shr.u64 %rd1046, %rd1042, 32; + cvt.u32.u64 %r30817, %rd1042; + cvt.u32.u64 %r30818, %rd1046; + shr.u64 %rd1047, %rd1038, 32; + cvt.u32.u64 %r30815, %rd1038; + cvt.u32.u64 %r30816, %rd1047; + shr.u64 %rd1048, %rd1039, 32; + cvt.u32.u64 %r30813, %rd1039; + cvt.u32.u64 %r30814, %rd1048; + mov.u32 %r30776, %r30775; + mov.u32 %r30777, %r30775; + mov.u32 %r30778, %r30775; + mov.u32 %r30779, %r30775; + mov.u32 %r30780, %r30775; + mov.u32 %r30781, %r30775; + mov.u32 %r30782, %r30775; + mov.u32 %r30783, %r30775; + mov.u32 %r30784, %r30775; + mov.u32 %r30785, %r30775; + mov.u32 %r30786, %r30775; + mov.u32 %r30787, %r30775; + mov.u32 %r30788, %r30775; + mov.u32 %r30789, %r24035; + mov.u32 %r30791, %r30775; + mov.u32 %r30792, %r30775; + mov.u32 %r30793, %r30775; + mov.u32 %r30794, %r30775; + mov.u32 %r30795, %r30775; + mov.u32 %r30796, %r30775; + mov.u32 %r30797, %r30775; + mov.u32 %r30798, %r30775; + mov.u32 %r30799, %r30775; + mov.u32 %r30800, %r30775; + mov.u32 %r30801, %r30775; + mov.u32 %r30802, %r30775; + mov.u32 %r30803, %r30775; + mov.u32 %r30804, %r30775; + mov.u32 %r30805, %r30775; + mov.u32 %r30806, %r30775; + mov.u32 %r30807, %r30775; + mov.u32 %r30808, %r30775; + mov.u32 %r30825, %r30775; + +$L__BB2_86: + // begin inline asm + // xor5 + lop3.b32 %r24076, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24076, %r24076, %r30805, %r30803, 0x96; + lop3.b32 %r24077, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24077, %r24077, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24088, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24088, %r24088, %r30799, %r30797, 0x96; + lop3.b32 %r24089, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24089, %r24089, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24100, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24100, %r24100, %r30793, %r30791, 0x96; + lop3.b32 %r24101, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24101, %r24101, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24112, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24112, %r24112, %r30785, %r30783, 0x96; + lop3.b32 %r24113, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24113, %r24113, %r30786, %r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24124, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24124, %r24124, %r30777, %r30775, 0x96; + lop3.b32 %r24125, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24125, %r24125, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24136, %r24089, %r24088, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24140, %r24088, %r24089, %r24035; + // end inline asm + xor.b32 %r24570, %r24136, %r24124; + xor.b32 %r24571, %r24140, %r24125; + xor.b32 %r24403, %r30811, %r24570; + xor.b32 %r24406, %r30812, %r24571; + xor.b32 %r24310, %r30809, %r24570; + xor.b32 %r24309, %r30810, %r24571; + xor.b32 %r24357, %r30807, %r24570; + xor.b32 %r24358, %r30808, %r24571; + xor.b32 %r24262, %r30805, %r24570; + xor.b32 %r24261, %r30806, %r24571; + xor.b32 %r24213, %r30803, %r24570; + xor.b32 
%r24214, %r30804, %r24571; + // begin inline asm + shf.l.wrap.b32 %r24144, %r24101, %r24100, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24148, %r24100, %r24101, %r24035; + // end inline asm + xor.b32 %r24572, %r24144, %r24076; + xor.b32 %r24573, %r24148, %r24077; + xor.b32 %r24365, %r30823, %r24572; + xor.b32 %r24366, %r30824, %r24573; + xor.b32 %r24182, %r30821, %r24572; + xor.b32 %r24181, %r30822, %r24573; + xor.b32 %r24341, %r30801, %r24572; + xor.b32 %r24342, %r30802, %r24573; + xor.b32 %r24302, %r30799, %r24572; + xor.b32 %r24301, %r30800, %r24573; + xor.b32 %r24285, %r30797, %r24572; + xor.b32 %r24286, %r30798, %r24573; + // begin inline asm + shf.l.wrap.b32 %r24152, %r24113, %r24112, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24156, %r24112, %r24113, %r24035; + // end inline asm + xor.b32 %r24574, %r24152, %r24088; + xor.b32 %r24575, %r24156, %r24089; + xor.b32 %r24222, %r30819, %r24574; + xor.b32 %r24221, %r30820, %r24575; + xor.b32 %r24349, %r30817, %r24574; + xor.b32 %r24350, %r30818, %r24575; + xor.b32 %r24230, %r30795, %r24574; + xor.b32 %r24229, %r30796, %r24575; + xor.b32 %r24333, %r30793, %r24574; + xor.b32 %r24334, %r30794, %r24575; + xor.b32 %r24198, %r30791, %r24574; + xor.b32 %r24197, %r30792, %r24575; + // begin inline asm + shf.l.wrap.b32 %r24160, %r24125, %r24124, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24164, %r24124, %r24125, %r24035; + // end inline asm + xor.b32 %r24576, %r24160, %r24100; + xor.b32 %r24577, %r24164, %r24101; + xor.b32 %r24317, %r30815, %r24576; + xor.b32 %r24318, %r30816, %r24577; + xor.b32 %r24294, %r30789, %r24576; + xor.b32 %r24293, %r30790, %r24577; + xor.b32 %r24237, %r30787, %r24576; + xor.b32 %r24238, %r30788, %r24577; + xor.b32 %r24325, %r30785, %r24576; + xor.b32 %r24326, %r30786, %r24577; + xor.b32 %r24254, %r30783, %r24576; + xor.b32 %r24253, %r30784, %r24577; + // begin inline asm + shf.l.wrap.b32 %r24168, %r24077, %r24076, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24172, %r24076, %r24077, %r24035; + // end inline asm + xor.b32 %r24578, %r24168, %r24112; + xor.b32 %r24579, %r24172, %r24113; + xor.b32 %r24269, %r30813, %r24578; + xor.b32 %r24270, %r30814, %r24579; + xor.b32 %r24189, %r30781, %r24578; + xor.b32 %r24190, %r30782, %r24579; + xor.b32 %r24206, %r30779, %r24578; + xor.b32 %r24205, %r30780, %r24579; + xor.b32 %r24245, %r30777, %r24578; + xor.b32 %r24246, %r30778, %r24579; + xor.b32 %r24277, %r30775, %r24578; + xor.b32 %r24278, %r30776, %r24579; + mov.u32 %r24183, 44; + // begin inline asm + shf.l.wrap.b32 %r24176, %r24182, %r24181, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24180, %r24181, %r24182, %r24183; + // end inline asm + mov.u32 %r24191, 20; + // begin inline asm + shf.l.wrap.b32 %r24184, %r24190, %r24189, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24188, %r24189, %r24190, %r24191; + // end inline asm + mov.u32 %r24199, 61; + // begin inline asm + shf.l.wrap.b32 %r24192, %r24198, %r24197, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24196, %r24197, %r24198, %r24199; + // end inline asm + mov.u32 %r24207, 39; + // begin inline asm + shf.l.wrap.b32 %r24200, %r24206, %r24205, %r24207; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24204, %r24205, %r24206, %r24207; + // end inline asm + mov.u32 %r24215, 18; + // begin inline asm + shf.l.wrap.b32 %r24208, %r24214, %r24213, %r24215; + // end inline asm + // begin inline asm 
+ shf.l.wrap.b32 %r24212, %r24213, %r24214, %r24215; + // end inline asm + mov.u32 %r24223, 62; + // begin inline asm + shf.l.wrap.b32 %r24216, %r24222, %r24221, %r24223; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24220, %r24221, %r24222, %r24223; + // end inline asm + mov.u32 %r24231, 43; + // begin inline asm + shf.l.wrap.b32 %r24224, %r24230, %r24229, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24228, %r24229, %r24230, %r24231; + // end inline asm + mov.u32 %r24239, 25; + // begin inline asm + shf.l.wrap.b32 %r24232, %r24238, %r24237, %r24239; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24236, %r24237, %r24238, %r24239; + // end inline asm + mov.u32 %r24247, 8; + // begin inline asm + shf.l.wrap.b32 %r24240, %r24246, %r24245, %r24247; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24244, %r24245, %r24246, %r24247; + // end inline asm + mov.u32 %r24255, 56; + // begin inline asm + shf.l.wrap.b32 %r24248, %r24254, %r24253, %r24255; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24252, %r24253, %r24254, %r24255; + // end inline asm + mov.u32 %r24263, 41; + // begin inline asm + shf.l.wrap.b32 %r24256, %r24262, %r24261, %r24263; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24260, %r24261, %r24262, %r24263; + // end inline asm + mov.u32 %r24271, 27; + // begin inline asm + shf.l.wrap.b32 %r24264, %r24270, %r24269, %r24271; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24268, %r24269, %r24270, %r24271; + // end inline asm + mov.u32 %r24279, 14; + // begin inline asm + shf.l.wrap.b32 %r24272, %r24278, %r24277, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24276, %r24277, %r24278, %r24279; + // end inline asm + mov.u32 %r24287, 2; + // begin inline asm + shf.l.wrap.b32 %r24280, %r24286, %r24285, %r24287; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24284, %r24285, %r24286, %r24287; + // end inline asm + mov.u32 %r24295, 55; + // begin inline asm + shf.l.wrap.b32 %r24288, %r24294, %r24293, %r24295; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24292, %r24293, %r24294, %r24295; + // end inline asm + mov.u32 %r24303, 45; + // begin inline asm + shf.l.wrap.b32 %r24296, %r24302, %r24301, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24300, %r24301, %r24302, %r24303; + // end inline asm + mov.u32 %r24311, 36; + // begin inline asm + shf.l.wrap.b32 %r24304, %r24310, %r24309, %r24311; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24308, %r24309, %r24310, %r24311; + // end inline asm + mov.u32 %r24319, 28; + // begin inline asm + shf.l.wrap.b32 %r24312, %r24318, %r24317, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24316, %r24317, %r24318, %r24319; + // end inline asm + mov.u32 %r24327, 21; + // begin inline asm + shf.l.wrap.b32 %r24320, %r24326, %r24325, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24324, %r24325, %r24326, %r24327; + // end inline asm + mov.u32 %r24335, 15; + // begin inline asm + shf.l.wrap.b32 %r24328, %r24334, %r24333, %r24335; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24332, %r24333, %r24334, %r24335; + // end inline asm + mov.u32 %r24343, 10; + // begin inline asm + shf.l.wrap.b32 %r24336, %r24342, %r24341, %r24343; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24340, %r24341, %r24342, %r24343; + // end inline asm + mov.u32 %r24351, 6; + // begin inline asm + shf.l.wrap.b32 %r24344, 
%r24350, %r24349, %r24351; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24348, %r24349, %r24350, %r24351; + // end inline asm + mov.u32 %r24359, 3; + // begin inline asm + shf.l.wrap.b32 %r24352, %r24358, %r24357, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24356, %r24357, %r24358, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24360, %r24366, %r24365, %r24035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24364, %r24365, %r24366, %r24035; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24368, %r24403, %r24176, %r24224, 0xD2; + lop3.b32 %r24369, %r24406, %r24180, %r24228, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30823, %r24176, %r24224, %r24320, 0xD2; + lop3.b32 %r30824, %r24180, %r24228, %r24324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30819, %r24224, %r24320, %r24272, 0xD2; + lop3.b32 %r30820, %r24228, %r24324, %r24276, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30815, %r24320, %r24272, %r24403, 0xD2; + lop3.b32 %r30816, %r24324, %r24276, %r24406, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30813, %r24272, %r24403, %r24176, 0xD2; + lop3.b32 %r30814, %r24276, %r24406, %r24180, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30809, %r24312, %r24184, %r24352, 0xD2; + lop3.b32 %r30810, %r24316, %r24188, %r24356, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30821, %r24184, %r24352, %r24296, 0xD2; + lop3.b32 %r30822, %r24188, %r24356, %r24300, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30817, %r24352, %r24296, %r24192, 0xD2; + lop3.b32 %r30818, %r24356, %r24300, %r24196, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30789, %r24296, %r24192, %r24312, 0xD2; + lop3.b32 %r30790, %r24300, %r24196, %r24316, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30789, %r30790}; + // begin inline asm + // chi + lop3.b32 %r30781, %r24192, %r24312, %r24184, 0xD2; + lop3.b32 %r30782, %r24196, %r24316, %r24188, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30781, %r30782}; + // begin inline asm + // chi + lop3.b32 %r30807, %r24360, %r24344, %r24232, 0xD2; + lop3.b32 %r30808, %r24364, %r24348, %r24236, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30807, %r30808}; + // begin inline asm + // chi + lop3.b32 %r30801, %r24344, %r24232, %r24240, 0xD2; + lop3.b32 %r30802, %r24348, %r24236, %r24244, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30801, %r30802}; + // begin inline asm + // chi + lop3.b32 %r30795, %r24232, %r24240, %r24208, 0xD2; + lop3.b32 %r30796, %r24236, %r24244, %r24212, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30795, %r30796}; + // begin inline asm + // chi + lop3.b32 %r30787, %r24240, %r24208, %r24360, 0xD2; + lop3.b32 %r30788, %r24244, %r24212, %r24364, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30787, %r30788}; + // begin inline asm + // chi + lop3.b32 %r30779, %r24208, %r24360, %r24344, 0xD2; + lop3.b32 %r30780, %r24212, %r24364, %r24348, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30779, %r30780}; + // begin inline asm + // chi + lop3.b32 %r30805, %r24264, %r24304, %r24336, 0xD2; + lop3.b32 %r30806, %r24268, %r24308, %r24340, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30805, %r30806}; + // begin inline asm + // chi + lop3.b32 %r30799, %r24304, %r24336, %r24328, 0xD2; 
+ lop3.b32 %r30800, %r24308, %r24340, %r24332, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30799, %r30800}; + // begin inline asm + // chi + lop3.b32 %r30793, %r24336, %r24328, %r24248, 0xD2; + lop3.b32 %r30794, %r24340, %r24332, %r24252, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30793, %r30794}; + // begin inline asm + // chi + lop3.b32 %r30785, %r24328, %r24248, %r24264, 0xD2; + lop3.b32 %r30786, %r24332, %r24252, %r24268, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30785, %r30786}; + // begin inline asm + // chi + lop3.b32 %r30777, %r24248, %r24264, %r24304, 0xD2; + lop3.b32 %r30778, %r24252, %r24268, %r24308, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30777, %r30778}; + // begin inline asm + // chi + lop3.b32 %r30803, %r24216, %r24288, %r24200, 0xD2; + lop3.b32 %r30804, %r24220, %r24292, %r24204, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+184], {%r30803, %r30804}; + // begin inline asm + // chi + lop3.b32 %r30797, %r24288, %r24200, %r24256, 0xD2; + lop3.b32 %r30798, %r24292, %r24204, %r24260, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30797, %r30798}; + // begin inline asm + // chi + lop3.b32 %r30791, %r24200, %r24256, %r24280, 0xD2; + lop3.b32 %r30792, %r24204, %r24260, %r24284, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30791, %r30792}; + // begin inline asm + // chi + lop3.b32 %r30783, %r24256, %r24280, %r24216, 0xD2; + lop3.b32 %r30784, %r24260, %r24284, %r24220, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30783, %r30784}; + // begin inline asm + // chi + lop3.b32 %r30775, %r24280, %r24216, %r24288, 0xD2; + lop3.b32 %r30776, %r24284, %r24220, %r24292, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30775, %r30776}; + mul.wide.s32 %rd1050, %r30825, 8; + add.s64 %rd1049, %rd1029, %rd1050; + // begin inline asm + ld.global.nc.v2.u32 {%r24568,%r24569}, [%rd1049]; + // end inline asm + xor.b32 %r30811, %r24368, %r24568; + xor.b32 %r30812, %r24369, %r24569; + add.s32 %r30825, %r30825, 1; + setp.lt.u32 %p48, %r30825, 23; + @%p48 bra $L__BB2_86; + + mov.u32 %r30858, 0; + mov.u32 %r24679, 1; + st.local.v2.u32 [%rd272+32], {%r30823, %r30824}; + st.local.v2.u32 [%rd272+72], {%r30821, %r30822}; + st.local.v2.u32 [%rd272+40], {%r30819, %r30820}; + st.local.v2.u32 [%rd272+80], {%r30817, %r30818}; + st.local.v2.u32 [%rd272+48], {%r30815, %r30816}; + st.local.v2.u32 [%rd272+56], {%r30813, %r30814}; + st.local.v2.u32 [%rd272+24], {%r30811, %r30812}; + // begin inline asm + // xor5 + lop3.b32 %r24580, %r30811, %r30809, %r30807, 0x96; + lop3.b32 %r24580, %r24580, %r30805, %r30803, 0x96; + lop3.b32 %r24581, %r30812, %r30810, %r30808, 0x96; + lop3.b32 %r24581, %r24581, %r30806, %r30804, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24592, %r30823, %r30821, %r30801, 0x96; + lop3.b32 %r24592, %r24592, %r30799, %r30797, 0x96; + lop3.b32 %r24593, %r30824, %r30822, %r30802, 0x96; + lop3.b32 %r24593, %r24593, %r30800, %r30798, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24604, %r30819, %r30817, %r30795, 0x96; + lop3.b32 %r24604, %r24604, %r30793, %r30791, 0x96; + lop3.b32 %r24605, %r30820, %r30818, %r30796, 0x96; + lop3.b32 %r24605, %r24605, %r30794, %r30792, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24616, %r30815, %r30789, %r30787, 0x96; + lop3.b32 %r24616, %r24616, %r30785, %r30783, 0x96; + lop3.b32 %r24617, %r30816, %r30790, %r30788, 0x96; + lop3.b32 %r24617, %r24617, %r30786, 
%r30784, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24628, %r30813, %r30781, %r30779, 0x96; + lop3.b32 %r24628, %r24628, %r30777, %r30775, 0x96; + lop3.b32 %r24629, %r30814, %r30782, %r30780, 0x96; + lop3.b32 %r24629, %r24629, %r30778, %r30776, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24640, %r24593, %r24592, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24644, %r24592, %r24593, %r24679; + // end inline asm + xor.b32 %r24819, %r24640, %r24628; + xor.b32 %r24820, %r24644, %r24629; + xor.b32 %r24787, %r30811, %r24819; + xor.b32 %r24790, %r30812, %r24820; + xor.b32 %r24750, %r30808, %r24820; + xor.b32 %r24749, %r30807, %r24819; + st.local.v2.u32 [%rd272+104], {%r24749, %r24750}; + // begin inline asm + shf.l.wrap.b32 %r24648, %r24605, %r24604, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24652, %r24604, %r24605, %r24679; + // end inline asm + xor.b32 %r24821, %r24648, %r24580; + xor.b32 %r24822, %r24652, %r24581; + xor.b32 %r24686, %r30821, %r24821; + xor.b32 %r24685, %r30822, %r24822; + xor.b32 %r24725, %r30800, %r24822; + xor.b32 %r24726, %r30799, %r24821; + st.local.v2.u32 [%rd272+152], {%r24726, %r24725}; + // begin inline asm + shf.l.wrap.b32 %r24656, %r24617, %r24616, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24660, %r24616, %r24617, %r24679; + // end inline asm + xor.b32 %r24823, %r24656, %r24592; + xor.b32 %r24824, %r24660, %r24593; + xor.b32 %r24709, %r30796, %r24824; + xor.b32 %r24710, %r30795, %r24823; + st.local.v2.u32 [%rd272+120], {%r24710, %r24709}; + xor.b32 %r24701, %r30792, %r24824; + xor.b32 %r24702, %r30791, %r24823; + st.local.v2.u32 [%rd272+200], {%r24702, %r24701}; + // begin inline asm + shf.l.wrap.b32 %r24664, %r24629, %r24628, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24668, %r24628, %r24629, %r24679; + // end inline asm + xor.b32 %r24825, %r24664, %r24604; + xor.b32 %r24826, %r24668, %r24605; + xor.b32 %r24733, %r30815, %r24825; + xor.b32 %r24734, %r30816, %r24826; + xor.b32 %r24742, %r30786, %r24826; + xor.b32 %r24741, %r30785, %r24825; + st.local.v2.u32 [%rd272+168], {%r24741, %r24742}; + // begin inline asm + shf.l.wrap.b32 %r24672, %r24581, %r24580, %r24679; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24676, %r24580, %r24581, %r24679; + // end inline asm + xor.b32 %r24827, %r24672, %r24616; + xor.b32 %r24828, %r24676, %r24617; + xor.b32 %r24693, %r30781, %r24827; + xor.b32 %r24694, %r30782, %r24828; + xor.b32 %r24718, %r30776, %r24828; + xor.b32 %r24717, %r30775, %r24827; + st.local.v2.u32 [%rd272+216], {%r24717, %r24718}; + // begin inline asm + shf.l.wrap.b32 %r24680, %r24686, %r24685, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24684, %r24685, %r24686, %r24183; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24688, %r24694, %r24693, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24692, %r24693, %r24694, %r24191; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24700, %r24701, %r24702, %r24199; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24696, %r24702, %r24701, %r24199; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r24696, %r24700}; + // begin inline asm + shf.l.wrap.b32 %r24704, %r24710, %r24709, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24708, %r24709, %r24710, %r24231; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24712, %r24718, %r24717, 
%r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24716, %r24717, %r24718, %r24279; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24724, %r24725, %r24726, %r24303; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24720, %r24726, %r24725, %r24303; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r24720, %r24724}; + // begin inline asm + shf.l.wrap.b32 %r24728, %r24734, %r24733, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24732, %r24733, %r24734, %r24319; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24736, %r24742, %r24741, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24740, %r24741, %r24742, %r24327; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24744, %r24750, %r24749, %r24359; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r24748, %r24749, %r24750, %r24359; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r24752, %r24787, %r24680, %r24704, 0xD2; + lop3.b32 %r24753, %r24790, %r24684, %r24708, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r24680, %r24704, %r24736, 0xD2; + lop3.b32 %r30959, %r24684, %r24708, %r24740, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + // begin inline asm + // chi + lop3.b32 %r30954, %r24704, %r24736, %r24712, 0xD2; + lop3.b32 %r30955, %r24708, %r24740, %r24716, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + // begin inline asm + // chi + lop3.b32 %r30950, %r24736, %r24712, %r24787, 0xD2; + lop3.b32 %r30951, %r24740, %r24716, %r24790, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + // begin inline asm + // chi + lop3.b32 %r30948, %r24712, %r24787, %r24680, 0xD2; + lop3.b32 %r30949, %r24716, %r24790, %r24684, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + // begin inline asm + // chi + lop3.b32 %r30944, %r24728, %r24688, %r24744, 0xD2; + lop3.b32 %r30945, %r24732, %r24692, %r24748, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + // begin inline asm + // chi + lop3.b32 %r30956, %r24688, %r24744, %r24720, 0xD2; + lop3.b32 %r30957, %r24692, %r24748, %r24724, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + // begin inline asm + // chi + lop3.b32 %r30952, %r24744, %r24720, %r24696, 0xD2; + lop3.b32 %r30953, %r24748, %r24724, %r24700, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + // begin inline asm + ld.global.nc.v2.u32 {%r24816,%r24817}, [%rd1030]; + // end inline asm + xor.b32 %r30946, %r24752, %r24816; + xor.b32 %r30947, %r24753, %r24817; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + add.s64 %rd275, %rd272, 24; + add.s64 %rd276, %rd3, 24; + +$L__BB2_88: + shl.b32 %r24829, %r30858, 2; + cvt.u64.u32 %rd1058, %r24829; + and.b64 %rd1059, %rd1058, 60; + add.s64 %rd1060, %rd276, %rd1059; + xor.b32 %r24830, %r3326, %r30858; + mul.lo.s32 %r24831, %r24830, 16777619; + ld.local.u32 %r24832, [%rd1060]; + xor.b32 %r24833, %r24831, %r24832; + mul.wide.u32 %rd1061, %r24833, -954391867; + shr.u64 %rd1062, %rd1061, 32; + cvt.u32.u64 %r24834, %rd1062; + sub.s32 %r24835, %r24833, %r24834; + shr.u32 %r24836, %r24835, 1; + add.s32 %r24837, %r24836, %r24834; + shr.u32 %r24838, %r24837, 20; + mul.lo.s32 %r24839, %r24838, 1179641; + sub.s32 %r24840, %r24833, %r24839; + mul.wide.u32 %rd1063, %r24840, 64; + add.s64 %rd1064, %rd471, %rd1063; + mul.lo.s32 %r24841, %r30895, 
16777619; + ld.global.u32 %r24842, [%rd1064]; + xor.b32 %r30895, %r24841, %r24842; + mul.lo.s32 %r24843, %r30896, 16777619; + ld.global.u32 %r24844, [%rd1064+4]; + xor.b32 %r30896, %r24843, %r24844; + mul.lo.s32 %r24845, %r30907, 16777619; + ld.global.u32 %r24846, [%rd1064+8]; + mul.lo.s32 %r24847, %r30908, 16777619; + ld.global.u32 %r24848, [%rd1064+12]; + xor.b32 %r24849, %r24847, %r24848; + xor.b32 %r30907, %r24845, %r24846; + mov.b64 %rd1065, {%r30907, %r24849}; + mul.lo.s32 %r24850, %r30903, 16777619; + ld.global.u32 %r24851, [%rd1064+16]; + mul.lo.s32 %r24852, %r30904, 16777619; + ld.global.u32 %r24853, [%rd1064+20]; + xor.b32 %r24854, %r24852, %r24853; + xor.b32 %r30903, %r24850, %r24851; + mov.b64 %rd1066, {%r30903, %r24854}; + mul.lo.s32 %r24855, %r30899, 16777619; + ld.global.u32 %r24856, [%rd1064+24]; + mul.lo.s32 %r24857, %r30900, 16777619; + ld.global.u32 %r24858, [%rd1064+28]; + xor.b32 %r24859, %r24857, %r24858; + xor.b32 %r30899, %r24855, %r24856; + mov.b64 %rd1067, {%r30899, %r24859}; + mul.lo.s32 %r24860, %r30897, 16777619; + ld.global.u32 %r24861, [%rd1064+32]; + mul.lo.s32 %r24862, %r30898, 16777619; + ld.global.u32 %r24863, [%rd1064+36]; + xor.b32 %r24864, %r24862, %r24863; + xor.b32 %r30897, %r24860, %r24861; + mov.b64 %rd1068, {%r30897, %r24864}; + mul.lo.s32 %r24865, %r30893, 16777619; + ld.global.u32 %r24866, [%rd1064+40]; + xor.b32 %r30893, %r24865, %r24866; + mul.lo.s32 %r24867, %r30894, 16777619; + ld.global.u32 %r24868, [%rd1064+44]; + xor.b32 %r30894, %r24867, %r24868; + mul.lo.s32 %r24869, %r30905, 16777619; + ld.global.u32 %r24870, [%rd1064+48]; + mul.lo.s32 %r24871, %r30906, 16777619; + ld.global.u32 %r24872, [%rd1064+52]; + xor.b32 %r24873, %r24871, %r24872; + xor.b32 %r30905, %r24869, %r24870; + mov.b64 %rd1069, {%r30905, %r24873}; + mul.lo.s32 %r24874, %r30901, 16777619; + ld.global.u32 %r24875, [%rd1064+56]; + mul.lo.s32 %r24876, %r30902, 16777619; + ld.global.u32 %r24877, [%rd1064+60]; + xor.b32 %r24878, %r24876, %r24877; + xor.b32 %r30901, %r24874, %r24875; + mov.b64 %rd1070, {%r30901, %r24878}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + st.local.v2.u32 [%rd3+32], {%r30907, %r24849}; + st.local.v2.u32 [%rd3+40], {%r30903, %r24854}; + st.local.v2.u32 [%rd3+48], {%r30899, %r24859}; + st.local.v2.u32 [%rd3+56], {%r30897, %r24864}; + st.local.v2.u32 [%rd3+64], {%r30893, %r30894}; + st.local.v2.u32 [%rd3+72], {%r30905, %r24873}; + st.local.v2.u32 [%rd3+80], {%r30901, %r24878}; + add.s64 %rd1071, %rd275, %rd1059; + xor.b32 %r24879, %r3327, %r30858; + mul.lo.s32 %r24880, %r24879, 16777619; + ld.local.u32 %r24881, [%rd1071]; + xor.b32 %r24882, %r24880, %r24881; + mul.wide.u32 %rd1072, %r24882, -954391867; + shr.u64 %rd1073, %rd1072, 32; + cvt.u32.u64 %r24883, %rd1073; + sub.s32 %r24884, %r24882, %r24883; + shr.u32 %r24885, %r24884, 1; + add.s32 %r24886, %r24885, %r24883; + shr.u32 %r24887, %r24886, 20; + mul.lo.s32 %r24888, %r24887, 1179641; + sub.s32 %r24889, %r24882, %r24888; + mul.wide.u32 %rd1074, %r24889, 64; + add.s64 %rd1075, %rd471, %rd1074; + mul.lo.s32 %r24890, %r30946, 16777619; + ld.global.u32 %r24891, [%rd1075]; + xor.b32 %r30946, %r24890, %r24891; + mul.lo.s32 %r24892, %r30947, 16777619; + ld.global.u32 %r24893, [%rd1075+4]; + xor.b32 %r30947, %r24892, %r24893; + mul.lo.s32 %r24894, %r30958, 16777619; + ld.global.u32 %r24895, [%rd1075+8]; + mul.lo.s32 %r24896, %r30959, 16777619; + ld.global.u32 %r24897, [%rd1075+12]; + xor.b32 %r24898, %r24896, %r24897; + xor.b32 %r30958, %r24894, %r24895; + mov.b64 %rd1076, {%r30958, %r24898}; + 
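+ // note: this is the inner mixing loop ($L__BB2_88, 512 iterations).
+ // The xor/mul.lo chain uses the FNV-1 prime 16777619 (0x01000193) to
+ // derive an index, and the mul.wide.u32 by a precomputed reciprocal
+ // followed by the shr/sub/add/shr.u32 20 sequence is a standard
+ // multiply-by-magic-number reduction computing (index mod 1179641).
+ // The remainder selects a 64-byte item from the global table at
+ // %rd471, whose sixteen u32 words are FNV-mixed into the running
+ // state -- consistent with a FishHash-style dataset lookup.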
mul.lo.s32 %r24899, %r30954, 16777619; + ld.global.u32 %r24900, [%rd1075+16]; + mul.lo.s32 %r24901, %r30955, 16777619; + ld.global.u32 %r24902, [%rd1075+20]; + xor.b32 %r24903, %r24901, %r24902; + xor.b32 %r30954, %r24899, %r24900; + mov.b64 %rd1077, {%r30954, %r24903}; + mul.lo.s32 %r24904, %r30950, 16777619; + ld.global.u32 %r24905, [%rd1075+24]; + mul.lo.s32 %r24906, %r30951, 16777619; + ld.global.u32 %r24907, [%rd1075+28]; + xor.b32 %r24908, %r24906, %r24907; + xor.b32 %r30950, %r24904, %r24905; + mov.b64 %rd1078, {%r30950, %r24908}; + mul.lo.s32 %r24909, %r30948, 16777619; + ld.global.u32 %r24910, [%rd1075+32]; + mul.lo.s32 %r24911, %r30949, 16777619; + ld.global.u32 %r24912, [%rd1075+36]; + xor.b32 %r24913, %r24911, %r24912; + xor.b32 %r30948, %r24909, %r24910; + mov.b64 %rd1079, {%r30948, %r24913}; + mul.lo.s32 %r24914, %r30944, 16777619; + ld.global.u32 %r24915, [%rd1075+40]; + xor.b32 %r30944, %r24914, %r24915; + mul.lo.s32 %r24916, %r30945, 16777619; + ld.global.u32 %r24917, [%rd1075+44]; + xor.b32 %r30945, %r24916, %r24917; + mul.lo.s32 %r24918, %r30956, 16777619; + ld.global.u32 %r24919, [%rd1075+48]; + mul.lo.s32 %r24920, %r30957, 16777619; + ld.global.u32 %r24921, [%rd1075+52]; + xor.b32 %r24922, %r24920, %r24921; + xor.b32 %r30956, %r24918, %r24919; + mov.b64 %rd1080, {%r30956, %r24922}; + mul.lo.s32 %r24923, %r30952, 16777619; + ld.global.u32 %r24924, [%rd1075+56]; + mul.lo.s32 %r24925, %r30953, 16777619; + ld.global.u32 %r24926, [%rd1075+60]; + xor.b32 %r24927, %r24925, %r24926; + xor.b32 %r30952, %r24923, %r24924; + mov.b64 %rd1081, {%r30952, %r24927}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + st.local.v2.u32 [%rd272+32], {%r30958, %r24898}; + st.local.v2.u32 [%rd272+40], {%r30954, %r24903}; + st.local.v2.u32 [%rd272+48], {%r30950, %r24908}; + st.local.v2.u32 [%rd272+56], {%r30948, %r24913}; + st.local.v2.u32 [%rd272+64], {%r30944, %r30945}; + st.local.v2.u32 [%rd272+72], {%r30956, %r24922}; + st.local.v2.u32 [%rd272+80], {%r30952, %r24927}; + add.s32 %r30858, %r30858, 1; + setp.lt.u32 %p49, %r30858, 512; + shr.u64 %rd1082, %rd1065, 32; + cvt.u32.u64 %r30908, %rd1082; + shr.u64 %rd1083, %rd1066, 32; + cvt.u32.u64 %r30904, %rd1083; + shr.u64 %rd1084, %rd1067, 32; + cvt.u32.u64 %r30900, %rd1084; + shr.u64 %rd1085, %rd1068, 32; + cvt.u32.u64 %r30898, %rd1085; + shr.u64 %rd1086, %rd1069, 32; + cvt.u32.u64 %r30906, %rd1086; + shr.u64 %rd1087, %rd1070, 32; + cvt.u32.u64 %r30902, %rd1087; + shr.u64 %rd1088, %rd1076, 32; + cvt.u32.u64 %r30959, %rd1088; + shr.u64 %rd1089, %rd1077, 32; + cvt.u32.u64 %r30955, %rd1089; + shr.u64 %rd1090, %rd1078, 32; + cvt.u32.u64 %r30951, %rd1090; + shr.u64 %rd1091, %rd1079, 32; + cvt.u32.u64 %r30949, %rd1091; + shr.u64 %rd1092, %rd1080, 32; + cvt.u32.u64 %r30957, %rd1092; + shr.u64 %rd1093, %rd1081, 32; + cvt.u32.u64 %r30953, %rd1093; + @%p49 bra $L__BB2_88; + + mov.u32 %r30859, 0; + st.local.v2.u32 [%rd3+96], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+104], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+112], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+120], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+128], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+136], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+144], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+152], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+160], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+168], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+176], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+184], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+192], {%r30859, %r30859}; + st.local.v2.u32 
[%rd3+200], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+208], {%r30859, %r30859}; + st.local.v2.u32 [%rd3+216], {%r30859, %r30859}; + mov.u32 %r30874, -2147483648; + mov.u32 %r24942, 1; + st.local.v2.u32 [%rd3+88], {%r24942, %r30874}; + mov.u32 %r30860, %r30859; + mov.u32 %r30861, %r30859; + mov.u32 %r30862, %r30859; + mov.u32 %r30863, %r30859; + mov.u32 %r30864, %r30859; + mov.u32 %r30865, %r30859; + mov.u32 %r30866, %r30859; + mov.u32 %r30867, %r30859; + mov.u32 %r30868, %r30859; + mov.u32 %r30869, %r30859; + mov.u32 %r30870, %r30859; + mov.u32 %r30871, %r30859; + mov.u32 %r30872, %r30859; + mov.u32 %r30873, %r24942; + mov.u32 %r30875, %r30859; + mov.u32 %r30876, %r30859; + mov.u32 %r30877, %r30859; + mov.u32 %r30878, %r30859; + mov.u32 %r30879, %r30859; + mov.u32 %r30880, %r30859; + mov.u32 %r30881, %r30859; + mov.u32 %r30882, %r30859; + mov.u32 %r30883, %r30859; + mov.u32 %r30884, %r30859; + mov.u32 %r30885, %r30859; + mov.u32 %r30886, %r30859; + mov.u32 %r30887, %r30859; + mov.u32 %r30888, %r30859; + mov.u32 %r30889, %r30859; + mov.u32 %r30890, %r30859; + mov.u32 %r30891, %r30859; + mov.u32 %r30892, %r30859; + mov.u32 %r30909, %r30859; + +$L__BB2_90: + // begin inline asm + // xor5 + lop3.b32 %r24969, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r24969, %r24969, %r30889, %r30887, 0x96; + lop3.b32 %r24970, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r24970, %r24970, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24981, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r24981, %r24981, %r30883, %r30881, 0x96; + lop3.b32 %r24982, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r24982, %r24982, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r24993, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r24993, %r24993, %r30877, %r30875, 0x96; + lop3.b32 %r24994, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r24994, %r24994, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25005, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25005, %r25005, %r30869, %r30867, 0x96; + lop3.b32 %r25006, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25006, %r25006, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25017, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25017, %r25017, %r30861, %r30859, 0x96; + lop3.b32 %r25018, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25018, %r25018, %r30862, %r30860, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25029, %r24982, %r24981, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25033, %r24981, %r24982, %r24942; + // end inline asm + xor.b32 %r25463, %r25029, %r25017; + xor.b32 %r25464, %r25033, %r25018; + xor.b32 %r25296, %r30895, %r25463; + xor.b32 %r25299, %r30896, %r25464; + xor.b32 %r25203, %r30893, %r25463; + xor.b32 %r25202, %r30894, %r25464; + xor.b32 %r25250, %r30891, %r25463; + xor.b32 %r25251, %r30892, %r25464; + xor.b32 %r25155, %r30889, %r25463; + xor.b32 %r25154, %r30890, %r25464; + xor.b32 %r25106, %r30887, %r25463; + xor.b32 %r25107, %r30888, %r25464; + // begin inline asm + shf.l.wrap.b32 %r25037, %r24994, %r24993, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25041, %r24993, %r24994, %r24942; + // end inline asm + xor.b32 %r25465, %r25037, %r24969; + xor.b32 %r25466, %r25041, %r24970; + xor.b32 %r25258, %r30907, %r25465; + xor.b32 %r25259, %r30908, %r25466; + xor.b32 %r25075, %r30905, %r25465; + xor.b32 %r25074, %r30906, %r25466; + xor.b32 
%r25234, %r30885, %r25465; + xor.b32 %r25235, %r30886, %r25466; + xor.b32 %r25195, %r30883, %r25465; + xor.b32 %r25194, %r30884, %r25466; + xor.b32 %r25178, %r30881, %r25465; + xor.b32 %r25179, %r30882, %r25466; + // begin inline asm + shf.l.wrap.b32 %r25045, %r25006, %r25005, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25049, %r25005, %r25006, %r24942; + // end inline asm + xor.b32 %r25467, %r25045, %r24981; + xor.b32 %r25468, %r25049, %r24982; + xor.b32 %r25115, %r30903, %r25467; + xor.b32 %r25114, %r30904, %r25468; + xor.b32 %r25242, %r30901, %r25467; + xor.b32 %r25243, %r30902, %r25468; + xor.b32 %r25123, %r30879, %r25467; + xor.b32 %r25122, %r30880, %r25468; + xor.b32 %r25226, %r30877, %r25467; + xor.b32 %r25227, %r30878, %r25468; + xor.b32 %r25091, %r30875, %r25467; + xor.b32 %r25090, %r30876, %r25468; + // begin inline asm + shf.l.wrap.b32 %r25053, %r25018, %r25017, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25057, %r25017, %r25018, %r24942; + // end inline asm + xor.b32 %r25469, %r25053, %r24993; + xor.b32 %r25470, %r25057, %r24994; + xor.b32 %r25210, %r30899, %r25469; + xor.b32 %r25211, %r30900, %r25470; + xor.b32 %r25187, %r30873, %r25469; + xor.b32 %r25186, %r30874, %r25470; + xor.b32 %r25130, %r30871, %r25469; + xor.b32 %r25131, %r30872, %r25470; + xor.b32 %r25218, %r30869, %r25469; + xor.b32 %r25219, %r30870, %r25470; + xor.b32 %r25147, %r30867, %r25469; + xor.b32 %r25146, %r30868, %r25470; + // begin inline asm + shf.l.wrap.b32 %r25061, %r24970, %r24969, %r24942; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25065, %r24969, %r24970, %r24942; + // end inline asm + xor.b32 %r25471, %r25061, %r25005; + xor.b32 %r25472, %r25065, %r25006; + xor.b32 %r25162, %r30897, %r25471; + xor.b32 %r25163, %r30898, %r25472; + xor.b32 %r25082, %r30865, %r25471; + xor.b32 %r25083, %r30866, %r25472; + xor.b32 %r25099, %r30863, %r25471; + xor.b32 %r25098, %r30864, %r25472; + xor.b32 %r25138, %r30861, %r25471; + xor.b32 %r25139, %r30862, %r25472; + xor.b32 %r25170, %r30859, %r25471; + xor.b32 %r25171, %r30860, %r25472; + mov.u32 %r25076, 44; + // begin inline asm + shf.l.wrap.b32 %r25069, %r25075, %r25074, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25073, %r25074, %r25075, %r25076; + // end inline asm + mov.u32 %r25084, 20; + // begin inline asm + shf.l.wrap.b32 %r25077, %r25083, %r25082, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25081, %r25082, %r25083, %r25084; + // end inline asm + mov.u32 %r25092, 61; + // begin inline asm + shf.l.wrap.b32 %r25085, %r25091, %r25090, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25089, %r25090, %r25091, %r25092; + // end inline asm + mov.u32 %r25100, 39; + // begin inline asm + shf.l.wrap.b32 %r25093, %r25099, %r25098, %r25100; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25097, %r25098, %r25099, %r25100; + // end inline asm + mov.u32 %r25108, 18; + // begin inline asm + shf.l.wrap.b32 %r25101, %r25107, %r25106, %r25108; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25105, %r25106, %r25107, %r25108; + // end inline asm + mov.u32 %r25116, 62; + // begin inline asm + shf.l.wrap.b32 %r25109, %r25115, %r25114, %r25116; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25113, %r25114, %r25115, %r25116; + // end inline asm + mov.u32 %r25124, 43; + // begin inline asm + shf.l.wrap.b32 %r25117, %r25123, %r25122, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 
%r25121, %r25122, %r25123, %r25124; + // end inline asm + mov.u32 %r25132, 25; + // begin inline asm + shf.l.wrap.b32 %r25125, %r25131, %r25130, %r25132; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25129, %r25130, %r25131, %r25132; + // end inline asm + mov.u32 %r25140, 8; + // begin inline asm + shf.l.wrap.b32 %r25133, %r25139, %r25138, %r25140; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25137, %r25138, %r25139, %r25140; + // end inline asm + mov.u32 %r25148, 56; + // begin inline asm + shf.l.wrap.b32 %r25141, %r25147, %r25146, %r25148; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25145, %r25146, %r25147, %r25148; + // end inline asm + mov.u32 %r25156, 41; + // begin inline asm + shf.l.wrap.b32 %r25149, %r25155, %r25154, %r25156; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25153, %r25154, %r25155, %r25156; + // end inline asm + mov.u32 %r25164, 27; + // begin inline asm + shf.l.wrap.b32 %r25157, %r25163, %r25162, %r25164; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25161, %r25162, %r25163, %r25164; + // end inline asm + mov.u32 %r25172, 14; + // begin inline asm + shf.l.wrap.b32 %r25165, %r25171, %r25170, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25169, %r25170, %r25171, %r25172; + // end inline asm + mov.u32 %r25180, 2; + // begin inline asm + shf.l.wrap.b32 %r25173, %r25179, %r25178, %r25180; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25177, %r25178, %r25179, %r25180; + // end inline asm + mov.u32 %r25188, 55; + // begin inline asm + shf.l.wrap.b32 %r25181, %r25187, %r25186, %r25188; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25185, %r25186, %r25187, %r25188; + // end inline asm + mov.u32 %r25196, 45; + // begin inline asm + shf.l.wrap.b32 %r25189, %r25195, %r25194, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25193, %r25194, %r25195, %r25196; + // end inline asm + mov.u32 %r25204, 36; + // begin inline asm + shf.l.wrap.b32 %r25197, %r25203, %r25202, %r25204; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25201, %r25202, %r25203, %r25204; + // end inline asm + mov.u32 %r25212, 28; + // begin inline asm + shf.l.wrap.b32 %r25205, %r25211, %r25210, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25209, %r25210, %r25211, %r25212; + // end inline asm + mov.u32 %r25220, 21; + // begin inline asm + shf.l.wrap.b32 %r25213, %r25219, %r25218, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25217, %r25218, %r25219, %r25220; + // end inline asm + mov.u32 %r25228, 15; + // begin inline asm + shf.l.wrap.b32 %r25221, %r25227, %r25226, %r25228; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25225, %r25226, %r25227, %r25228; + // end inline asm + mov.u32 %r25236, 10; + // begin inline asm + shf.l.wrap.b32 %r25229, %r25235, %r25234, %r25236; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25233, %r25234, %r25235, %r25236; + // end inline asm + mov.u32 %r25244, 6; + // begin inline asm + shf.l.wrap.b32 %r25237, %r25243, %r25242, %r25244; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25241, %r25242, %r25243, %r25244; + // end inline asm + mov.u32 %r25252, 3; + // begin inline asm + shf.l.wrap.b32 %r25245, %r25251, %r25250, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25249, %r25250, %r25251, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25253, %r25259, %r25258, %r24942; + // end inline asm 
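+ // note: the mov.u32/shf.l.wrap.b32 pairs above apply the fixed Keccak
+ // rho rotation offsets (44, 20, 61, 39, 18, 62, 43, 25, 8, 56, 41, 27,
+ // 14, 2, 55, 45, 36, 28, 21, 15, 10, 6, 3, 1); each 64-bit rotate is
+ // synthesized from two 32-bit funnel shifts over a lane's half-words,
+ // since the state is held as paired u32 registers. The chi step of
+ // this round follows.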
+ // begin inline asm + shf.l.wrap.b32 %r25257, %r25258, %r25259, %r24942; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25261, %r25296, %r25069, %r25117, 0xD2; + lop3.b32 %r25262, %r25299, %r25073, %r25121, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30907, %r25069, %r25117, %r25213, 0xD2; + lop3.b32 %r30908, %r25073, %r25121, %r25217, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30903, %r25117, %r25213, %r25165, 0xD2; + lop3.b32 %r30904, %r25121, %r25217, %r25169, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30899, %r25213, %r25165, %r25296, 0xD2; + lop3.b32 %r30900, %r25217, %r25169, %r25299, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30897, %r25165, %r25296, %r25069, 0xD2; + lop3.b32 %r30898, %r25169, %r25299, %r25073, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30893, %r25205, %r25077, %r25245, 0xD2; + lop3.b32 %r30894, %r25209, %r25081, %r25249, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30905, %r25077, %r25245, %r25189, 0xD2; + lop3.b32 %r30906, %r25081, %r25249, %r25193, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30901, %r25245, %r25189, %r25085, 0xD2; + lop3.b32 %r30902, %r25249, %r25193, %r25089, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30873, %r25189, %r25085, %r25205, 0xD2; + lop3.b32 %r30874, %r25193, %r25089, %r25209, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30873, %r30874}; + // begin inline asm + // chi + lop3.b32 %r30865, %r25085, %r25205, %r25077, 0xD2; + lop3.b32 %r30866, %r25089, %r25209, %r25081, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30865, %r30866}; + // begin inline asm + // chi + lop3.b32 %r30891, %r25253, %r25237, %r25125, 0xD2; + lop3.b32 %r30892, %r25257, %r25241, %r25129, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30891, %r30892}; + // begin inline asm + // chi + lop3.b32 %r30885, %r25237, %r25125, %r25133, 0xD2; + lop3.b32 %r30886, %r25241, %r25129, %r25137, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30885, %r30886}; + // begin inline asm + // chi + lop3.b32 %r30879, %r25125, %r25133, %r25101, 0xD2; + lop3.b32 %r30880, %r25129, %r25137, %r25105, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30879, %r30880}; + // begin inline asm + // chi + lop3.b32 %r30871, %r25133, %r25101, %r25253, 0xD2; + lop3.b32 %r30872, %r25137, %r25105, %r25257, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30871, %r30872}; + // begin inline asm + // chi + lop3.b32 %r30863, %r25101, %r25253, %r25237, 0xD2; + lop3.b32 %r30864, %r25105, %r25257, %r25241, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30863, %r30864}; + // begin inline asm + // chi + lop3.b32 %r30889, %r25157, %r25197, %r25229, 0xD2; + lop3.b32 %r30890, %r25161, %r25201, %r25233, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30889, %r30890}; + // begin inline asm + // chi + lop3.b32 %r30883, %r25197, %r25229, %r25221, 0xD2; + lop3.b32 %r30884, %r25201, %r25233, %r25225, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30883, %r30884}; + // begin inline asm + // chi + lop3.b32 %r30877, %r25229, %r25221, %r25141, 0xD2; + lop3.b32 %r30878, %r25233, %r25225, %r25145, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30877, %r30878}; + // begin inline asm + // chi + lop3.b32 %r30869, %r25221, %r25141, %r25157, 0xD2; + lop3.b32 %r30870, %r25225, %r25145, %r25161, 
0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30869, %r30870}; + // begin inline asm + // chi + lop3.b32 %r30861, %r25141, %r25157, %r25197, 0xD2; + lop3.b32 %r30862, %r25145, %r25161, %r25201, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30861, %r30862}; + // begin inline asm + // chi + lop3.b32 %r30887, %r25109, %r25181, %r25093, 0xD2; + lop3.b32 %r30888, %r25113, %r25185, %r25097, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30887, %r30888}; + // begin inline asm + // chi + lop3.b32 %r30881, %r25181, %r25093, %r25149, 0xD2; + lop3.b32 %r30882, %r25185, %r25097, %r25153, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30881, %r30882}; + // begin inline asm + // chi + lop3.b32 %r30875, %r25093, %r25149, %r25173, 0xD2; + lop3.b32 %r30876, %r25097, %r25153, %r25177, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30875, %r30876}; + // begin inline asm + // chi + lop3.b32 %r30867, %r25149, %r25173, %r25109, 0xD2; + lop3.b32 %r30868, %r25153, %r25177, %r25113, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30867, %r30868}; + // begin inline asm + // chi + lop3.b32 %r30859, %r25173, %r25109, %r25181, 0xD2; + lop3.b32 %r30860, %r25177, %r25113, %r25185, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30859, %r30860}; + mul.wide.s32 %rd1097, %r30909, 8; + add.s64 %rd1096, %rd1029, %rd1097; + // begin inline asm + ld.global.nc.v2.u32 {%r25461,%r25462}, [%rd1096]; + // end inline asm + xor.b32 %r30895, %r25261, %r25461; + xor.b32 %r30896, %r25262, %r25462; + add.s32 %r30909, %r30909, 1; + setp.lt.u32 %p50, %r30909, 23; + @%p50 bra $L__BB2_90; + + st.local.v2.u32 [%rd3+32], {%r30907, %r30908}; + st.local.v2.u32 [%rd3+72], {%r30905, %r30906}; + st.local.v2.u32 [%rd3+40], {%r30903, %r30904}; + st.local.v2.u32 [%rd3+80], {%r30901, %r30902}; + st.local.v2.u32 [%rd3+48], {%r30899, %r30900}; + st.local.v2.u32 [%rd3+56], {%r30897, %r30898}; + st.local.v2.u32 [%rd3+24], {%r30895, %r30896}; + // begin inline asm + // xor5 + lop3.b32 %r25473, %r30895, %r30893, %r30891, 0x96; + lop3.b32 %r25473, %r25473, %r30889, %r30887, 0x96; + lop3.b32 %r25474, %r30896, %r30894, %r30892, 0x96; + lop3.b32 %r25474, %r25474, %r30890, %r30888, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25485, %r30907, %r30905, %r30885, 0x96; + lop3.b32 %r25485, %r25485, %r30883, %r30881, 0x96; + lop3.b32 %r25486, %r30908, %r30906, %r30886, 0x96; + lop3.b32 %r25486, %r25486, %r30884, %r30882, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25497, %r30903, %r30901, %r30879, 0x96; + lop3.b32 %r25497, %r25497, %r30877, %r30875, 0x96; + lop3.b32 %r25498, %r30904, %r30902, %r30880, 0x96; + lop3.b32 %r25498, %r25498, %r30878, %r30876, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25509, %r30899, %r30873, %r30871, 0x96; + lop3.b32 %r25509, %r25509, %r30869, %r30867, 0x96; + lop3.b32 %r25510, %r30900, %r30874, %r30872, 0x96; + lop3.b32 %r25510, %r25510, %r30870, %r30868, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25521, %r30897, %r30865, %r30863, 0x96; + lop3.b32 %r25521, %r25521, %r30861, %r30859, 0x96; + lop3.b32 %r25522, %r30898, %r30866, %r30864, 0x96; + lop3.b32 %r25522, %r25522, %r30862, %r30860, 0x96; + // end inline asm + mov.u32 %r25725, 1; + // begin inline asm + shf.l.wrap.b32 %r25533, %r25486, %r25485, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25537, %r25485, %r25486, %r25725; + // end inline asm + xor.b32 %r25752, %r25533, 
%r25521; + xor.b32 %r25753, %r25537, %r25522; + xor.b32 %r25680, %r30895, %r25752; + xor.b32 %r25683, %r30896, %r25753; + xor.b32 %r25643, %r30892, %r25753; + xor.b32 %r25642, %r30891, %r25752; + st.local.v2.u32 [%rd3+104], {%r25642, %r25643}; + // begin inline asm + shf.l.wrap.b32 %r25541, %r25498, %r25497, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25545, %r25497, %r25498, %r25725; + // end inline asm + xor.b32 %r25754, %r25541, %r25473; + xor.b32 %r25755, %r25545, %r25474; + xor.b32 %r25579, %r30905, %r25754; + xor.b32 %r25578, %r30906, %r25755; + xor.b32 %r25618, %r30884, %r25755; + xor.b32 %r25619, %r30883, %r25754; + st.local.v2.u32 [%rd3+152], {%r25619, %r25618}; + // begin inline asm + shf.l.wrap.b32 %r25549, %r25510, %r25509, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25553, %r25509, %r25510, %r25725; + // end inline asm + xor.b32 %r25756, %r25549, %r25485; + xor.b32 %r25757, %r25553, %r25486; + xor.b32 %r25602, %r30880, %r25757; + xor.b32 %r25603, %r30879, %r25756; + st.local.v2.u32 [%rd3+120], {%r25603, %r25602}; + xor.b32 %r25594, %r30876, %r25757; + xor.b32 %r25595, %r30875, %r25756; + st.local.v2.u32 [%rd3+200], {%r25595, %r25594}; + // begin inline asm + shf.l.wrap.b32 %r25557, %r25522, %r25521, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25561, %r25521, %r25522, %r25725; + // end inline asm + xor.b32 %r25758, %r25557, %r25497; + xor.b32 %r25759, %r25561, %r25498; + xor.b32 %r25626, %r30899, %r25758; + xor.b32 %r25627, %r30900, %r25759; + xor.b32 %r25635, %r30870, %r25759; + xor.b32 %r25634, %r30869, %r25758; + st.local.v2.u32 [%rd3+168], {%r25634, %r25635}; + // begin inline asm + shf.l.wrap.b32 %r25565, %r25474, %r25473, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25569, %r25473, %r25474, %r25725; + // end inline asm + xor.b32 %r25760, %r25565, %r25509; + xor.b32 %r25761, %r25569, %r25510; + xor.b32 %r25586, %r30865, %r25760; + xor.b32 %r25587, %r30866, %r25761; + xor.b32 %r25611, %r30860, %r25761; + xor.b32 %r25610, %r30859, %r25760; + st.local.v2.u32 [%rd3+216], {%r25610, %r25611}; + // begin inline asm + shf.l.wrap.b32 %r25573, %r25579, %r25578, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25577, %r25578, %r25579, %r25076; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25581, %r25587, %r25586, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25585, %r25586, %r25587, %r25084; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25593, %r25594, %r25595, %r25092; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25589, %r25595, %r25594, %r25092; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r25589, %r25593}; + // begin inline asm + shf.l.wrap.b32 %r25597, %r25603, %r25602, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25601, %r25602, %r25603, %r25124; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25605, %r25611, %r25610, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25609, %r25610, %r25611, %r25172; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25617, %r25618, %r25619, %r25196; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25613, %r25619, %r25618, %r25196; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r25613, %r25617}; + // begin inline asm + shf.l.wrap.b32 %r25621, %r25627, %r25626, %r25212; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25625, %r25626, %r25627, %r25212; + // 
end inline asm + // begin inline asm + shf.l.wrap.b32 %r25629, %r25635, %r25634, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25633, %r25634, %r25635, %r25220; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25637, %r25643, %r25642, %r25252; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25641, %r25642, %r25643, %r25252; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25645, %r25680, %r25573, %r25597, 0xD2; + lop3.b32 %r25646, %r25683, %r25577, %r25601, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r25653, %r25573, %r25597, %r25629, 0xD2; + lop3.b32 %r25654, %r25577, %r25601, %r25633, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r25653, %r25654}; + // begin inline asm + // chi + lop3.b32 %r25661, %r25597, %r25629, %r25605, 0xD2; + lop3.b32 %r25662, %r25601, %r25633, %r25609, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r25661, %r25662}; + // begin inline asm + // chi + lop3.b32 %r25669, %r25629, %r25605, %r25680, 0xD2; + lop3.b32 %r25670, %r25633, %r25609, %r25683, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r25669, %r25670}; + // begin inline asm + // chi + lop3.b32 %r25677, %r25605, %r25680, %r25573, 0xD2; + lop3.b32 %r25678, %r25609, %r25683, %r25577, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r25677, %r25678}; + // begin inline asm + // chi + lop3.b32 %r25685, %r25621, %r25581, %r25637, 0xD2; + lop3.b32 %r25686, %r25625, %r25585, %r25641, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r25685, %r25686}; + // begin inline asm + // chi + lop3.b32 %r25693, %r25581, %r25637, %r25613, 0xD2; + lop3.b32 %r25694, %r25585, %r25641, %r25617, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r25693, %r25694}; + // begin inline asm + // chi + lop3.b32 %r25701, %r25637, %r25613, %r25589, 0xD2; + lop3.b32 %r25702, %r25641, %r25617, %r25593, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r25701, %r25702}; + // begin inline asm + ld.global.nc.v2.u32 {%r25709,%r25710}, [%rd1030]; + // end inline asm + xor.b32 %r25762, %r25646, %r25710; + xor.b32 %r25763, %r25645, %r25709; + mov.b64 %rd1269, {%r25763, %r25762}; + mov.b64 %rd1270, {%r25653, %r25654}; + mov.b64 %rd1271, {%r25661, %r25662}; + mov.b64 %rd1272, {%r25677, %r25678}; + mov.u32 %r30910, 0; + st.local.v2.u32 [%rd3+24], {%r25763, %r25762}; + st.local.v2.u32 [%rd272+96], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+104], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+112], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+120], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+128], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+136], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+144], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+152], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+160], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+168], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+176], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+184], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+192], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+200], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+208], {%r30910, %r30910}; + st.local.v2.u32 [%rd272+216], {%r30910, %r30910}; + mov.u32 %r30925, -2147483648; + st.local.v2.u32 [%rd272+88], {%r25725, %r30925}; + mov.u32 %r30911, %r30910; + mov.u32 %r30912, %r30910; + mov.u32 %r30913, %r30910; + mov.u32 %r30914, %r30910; + mov.u32 %r30915, %r30910; + mov.u32 %r30916, %r30910; + mov.u32 %r30917, %r30910; + mov.u32 %r30918, %r30910; + mov.u32 %r30919, %r30910; 
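+ // note: between permutations the sponge state is reset: lanes 9..24
+ // (local offsets +96..+216) are zeroed and lane 8 (+88) is written as
+ // {1, 0x80000000}, i.e. the 64-bit word 0x8000000000000001 -- consistent
+ // with pad10*1 padding of a 64-byte message in a 72-byte-rate Keccak
+ // block. The long mov.u32 chain zero-seeds the registers that carry
+ // the state into the next unrolled round loop.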
+ mov.u32 %r30920, %r30910; + mov.u32 %r30921, %r30910; + mov.u32 %r30922, %r30910; + mov.u32 %r30923, %r30910; + mov.u32 %r30924, %r25725; + mov.u32 %r30926, %r30910; + mov.u32 %r30927, %r30910; + mov.u32 %r30928, %r30910; + mov.u32 %r30929, %r30910; + mov.u32 %r30930, %r30910; + mov.u32 %r30931, %r30910; + mov.u32 %r30932, %r30910; + mov.u32 %r30933, %r30910; + mov.u32 %r30934, %r30910; + mov.u32 %r30935, %r30910; + mov.u32 %r30936, %r30910; + mov.u32 %r30937, %r30910; + mov.u32 %r30938, %r30910; + mov.u32 %r30939, %r30910; + mov.u32 %r30940, %r30910; + mov.u32 %r30941, %r30910; + mov.u32 %r30942, %r30910; + mov.u32 %r30943, %r30910; + mov.u32 %r30960, %r30910; + +$L__BB2_92: + // begin inline asm + // xor5 + lop3.b32 %r25764, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r25764, %r25764, %r30940, %r30938, 0x96; + lop3.b32 %r25765, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r25765, %r25765, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25776, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r25776, %r25776, %r30934, %r30932, 0x96; + lop3.b32 %r25777, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r25777, %r25777, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25788, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r25788, %r25788, %r30928, %r30926, 0x96; + lop3.b32 %r25789, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r25789, %r25789, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25800, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r25800, %r25800, %r30920, %r30918, 0x96; + lop3.b32 %r25801, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r25801, %r25801, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r25812, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r25812, %r25812, %r30912, %r30910, 0x96; + lop3.b32 %r25813, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r25813, %r25813, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25824, %r25777, %r25776, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25828, %r25776, %r25777, %r25725; + // end inline asm + xor.b32 %r26258, %r25824, %r25812; + xor.b32 %r26259, %r25828, %r25813; + xor.b32 %r26091, %r30946, %r26258; + xor.b32 %r26094, %r30947, %r26259; + xor.b32 %r25998, %r30944, %r26258; + xor.b32 %r25997, %r30945, %r26259; + xor.b32 %r26045, %r30942, %r26258; + xor.b32 %r26046, %r30943, %r26259; + xor.b32 %r25950, %r30940, %r26258; + xor.b32 %r25949, %r30941, %r26259; + xor.b32 %r25901, %r30938, %r26258; + xor.b32 %r25902, %r30939, %r26259; + // begin inline asm + shf.l.wrap.b32 %r25832, %r25789, %r25788, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25836, %r25788, %r25789, %r25725; + // end inline asm + xor.b32 %r26260, %r25832, %r25764; + xor.b32 %r26261, %r25836, %r25765; + xor.b32 %r26053, %r30958, %r26260; + xor.b32 %r26054, %r30959, %r26261; + xor.b32 %r25870, %r30956, %r26260; + xor.b32 %r25869, %r30957, %r26261; + xor.b32 %r26029, %r30936, %r26260; + xor.b32 %r26030, %r30937, %r26261; + xor.b32 %r25990, %r30934, %r26260; + xor.b32 %r25989, %r30935, %r26261; + xor.b32 %r25973, %r30932, %r26260; + xor.b32 %r25974, %r30933, %r26261; + // begin inline asm + shf.l.wrap.b32 %r25840, %r25801, %r25800, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25844, %r25800, %r25801, %r25725; + // end inline asm + xor.b32 %r26262, %r25840, %r25776; + xor.b32 %r26263, %r25844, %r25777; + xor.b32 
%r25910, %r30954, %r26262; + xor.b32 %r25909, %r30955, %r26263; + xor.b32 %r26037, %r30952, %r26262; + xor.b32 %r26038, %r30953, %r26263; + xor.b32 %r25918, %r30930, %r26262; + xor.b32 %r25917, %r30931, %r26263; + xor.b32 %r26021, %r30928, %r26262; + xor.b32 %r26022, %r30929, %r26263; + xor.b32 %r25886, %r30926, %r26262; + xor.b32 %r25885, %r30927, %r26263; + // begin inline asm + shf.l.wrap.b32 %r25848, %r25813, %r25812, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25852, %r25812, %r25813, %r25725; + // end inline asm + xor.b32 %r26264, %r25848, %r25788; + xor.b32 %r26265, %r25852, %r25789; + xor.b32 %r26005, %r30950, %r26264; + xor.b32 %r26006, %r30951, %r26265; + xor.b32 %r25982, %r30924, %r26264; + xor.b32 %r25981, %r30925, %r26265; + xor.b32 %r25925, %r30922, %r26264; + xor.b32 %r25926, %r30923, %r26265; + xor.b32 %r26013, %r30920, %r26264; + xor.b32 %r26014, %r30921, %r26265; + xor.b32 %r25942, %r30918, %r26264; + xor.b32 %r25941, %r30919, %r26265; + // begin inline asm + shf.l.wrap.b32 %r25856, %r25765, %r25764, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25860, %r25764, %r25765, %r25725; + // end inline asm + xor.b32 %r26266, %r25856, %r25800; + xor.b32 %r26267, %r25860, %r25801; + xor.b32 %r25957, %r30948, %r26266; + xor.b32 %r25958, %r30949, %r26267; + xor.b32 %r25877, %r30916, %r26266; + xor.b32 %r25878, %r30917, %r26267; + xor.b32 %r25894, %r30914, %r26266; + xor.b32 %r25893, %r30915, %r26267; + xor.b32 %r25933, %r30912, %r26266; + xor.b32 %r25934, %r30913, %r26267; + xor.b32 %r25965, %r30910, %r26266; + xor.b32 %r25966, %r30911, %r26267; + mov.u32 %r25871, 44; + // begin inline asm + shf.l.wrap.b32 %r25864, %r25870, %r25869, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25868, %r25869, %r25870, %r25871; + // end inline asm + mov.u32 %r25879, 20; + // begin inline asm + shf.l.wrap.b32 %r25872, %r25878, %r25877, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25876, %r25877, %r25878, %r25879; + // end inline asm + mov.u32 %r25887, 61; + // begin inline asm + shf.l.wrap.b32 %r25880, %r25886, %r25885, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25884, %r25885, %r25886, %r25887; + // end inline asm + mov.u32 %r25895, 39; + // begin inline asm + shf.l.wrap.b32 %r25888, %r25894, %r25893, %r25895; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25892, %r25893, %r25894, %r25895; + // end inline asm + mov.u32 %r25903, 18; + // begin inline asm + shf.l.wrap.b32 %r25896, %r25902, %r25901, %r25903; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25900, %r25901, %r25902, %r25903; + // end inline asm + mov.u32 %r25911, 62; + // begin inline asm + shf.l.wrap.b32 %r25904, %r25910, %r25909, %r25911; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25908, %r25909, %r25910, %r25911; + // end inline asm + mov.u32 %r25919, 43; + // begin inline asm + shf.l.wrap.b32 %r25912, %r25918, %r25917, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25916, %r25917, %r25918, %r25919; + // end inline asm + mov.u32 %r25927, 25; + // begin inline asm + shf.l.wrap.b32 %r25920, %r25926, %r25925, %r25927; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25924, %r25925, %r25926, %r25927; + // end inline asm + mov.u32 %r25935, 8; + // begin inline asm + shf.l.wrap.b32 %r25928, %r25934, %r25933, %r25935; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25932, %r25933, %r25934, %r25935; + // end inline asm + mov.u32 
%r25943, 56; + // begin inline asm + shf.l.wrap.b32 %r25936, %r25942, %r25941, %r25943; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25940, %r25941, %r25942, %r25943; + // end inline asm + mov.u32 %r25951, 41; + // begin inline asm + shf.l.wrap.b32 %r25944, %r25950, %r25949, %r25951; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25948, %r25949, %r25950, %r25951; + // end inline asm + mov.u32 %r25959, 27; + // begin inline asm + shf.l.wrap.b32 %r25952, %r25958, %r25957, %r25959; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25956, %r25957, %r25958, %r25959; + // end inline asm + mov.u32 %r25967, 14; + // begin inline asm + shf.l.wrap.b32 %r25960, %r25966, %r25965, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25964, %r25965, %r25966, %r25967; + // end inline asm + mov.u32 %r25975, 2; + // begin inline asm + shf.l.wrap.b32 %r25968, %r25974, %r25973, %r25975; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25972, %r25973, %r25974, %r25975; + // end inline asm + mov.u32 %r25983, 55; + // begin inline asm + shf.l.wrap.b32 %r25976, %r25982, %r25981, %r25983; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25980, %r25981, %r25982, %r25983; + // end inline asm + mov.u32 %r25991, 45; + // begin inline asm + shf.l.wrap.b32 %r25984, %r25990, %r25989, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25988, %r25989, %r25990, %r25991; + // end inline asm + mov.u32 %r25999, 36; + // begin inline asm + shf.l.wrap.b32 %r25992, %r25998, %r25997, %r25999; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r25996, %r25997, %r25998, %r25999; + // end inline asm + mov.u32 %r26007, 28; + // begin inline asm + shf.l.wrap.b32 %r26000, %r26006, %r26005, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26004, %r26005, %r26006, %r26007; + // end inline asm + mov.u32 %r26015, 21; + // begin inline asm + shf.l.wrap.b32 %r26008, %r26014, %r26013, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26012, %r26013, %r26014, %r26015; + // end inline asm + mov.u32 %r26023, 15; + // begin inline asm + shf.l.wrap.b32 %r26016, %r26022, %r26021, %r26023; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26020, %r26021, %r26022, %r26023; + // end inline asm + mov.u32 %r26031, 10; + // begin inline asm + shf.l.wrap.b32 %r26024, %r26030, %r26029, %r26031; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26028, %r26029, %r26030, %r26031; + // end inline asm + mov.u32 %r26039, 6; + // begin inline asm + shf.l.wrap.b32 %r26032, %r26038, %r26037, %r26039; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26036, %r26037, %r26038, %r26039; + // end inline asm + mov.u32 %r26047, 3; + // begin inline asm + shf.l.wrap.b32 %r26040, %r26046, %r26045, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26044, %r26045, %r26046, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26048, %r26054, %r26053, %r25725; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26052, %r26053, %r26054, %r25725; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26056, %r26091, %r25864, %r25912, 0xD2; + lop3.b32 %r26057, %r26094, %r25868, %r25916, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30958, %r25864, %r25912, %r26008, 0xD2; + lop3.b32 %r30959, %r25868, %r25916, %r26012, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30954, %r25912, %r26008, %r25960, 
0xD2; + lop3.b32 %r30955, %r25916, %r26012, %r25964, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30950, %r26008, %r25960, %r26091, 0xD2; + lop3.b32 %r30951, %r26012, %r25964, %r26094, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30948, %r25960, %r26091, %r25864, 0xD2; + lop3.b32 %r30949, %r25964, %r26094, %r25868, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30944, %r26000, %r25872, %r26040, 0xD2; + lop3.b32 %r30945, %r26004, %r25876, %r26044, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30956, %r25872, %r26040, %r25984, 0xD2; + lop3.b32 %r30957, %r25876, %r26044, %r25988, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30952, %r26040, %r25984, %r25880, 0xD2; + lop3.b32 %r30953, %r26044, %r25988, %r25884, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30924, %r25984, %r25880, %r26000, 0xD2; + lop3.b32 %r30925, %r25988, %r25884, %r26004, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r30924, %r30925}; + // begin inline asm + // chi + lop3.b32 %r30916, %r25880, %r26000, %r25872, 0xD2; + lop3.b32 %r30917, %r25884, %r26004, %r25876, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r30916, %r30917}; + // begin inline asm + // chi + lop3.b32 %r30942, %r26048, %r26032, %r25920, 0xD2; + lop3.b32 %r30943, %r26052, %r26036, %r25924, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+104], {%r30942, %r30943}; + // begin inline asm + // chi + lop3.b32 %r30936, %r26032, %r25920, %r25928, 0xD2; + lop3.b32 %r30937, %r26036, %r25924, %r25932, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+112], {%r30936, %r30937}; + // begin inline asm + // chi + lop3.b32 %r30930, %r25920, %r25928, %r25896, 0xD2; + lop3.b32 %r30931, %r25924, %r25932, %r25900, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+120], {%r30930, %r30931}; + // begin inline asm + // chi + lop3.b32 %r30922, %r25928, %r25896, %r26048, 0xD2; + lop3.b32 %r30923, %r25932, %r25900, %r26052, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+128], {%r30922, %r30923}; + // begin inline asm + // chi + lop3.b32 %r30914, %r25896, %r26048, %r26032, 0xD2; + lop3.b32 %r30915, %r25900, %r26052, %r26036, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+136], {%r30914, %r30915}; + // begin inline asm + // chi + lop3.b32 %r30940, %r25952, %r25992, %r26024, 0xD2; + lop3.b32 %r30941, %r25956, %r25996, %r26028, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+144], {%r30940, %r30941}; + // begin inline asm + // chi + lop3.b32 %r30934, %r25992, %r26024, %r26016, 0xD2; + lop3.b32 %r30935, %r25996, %r26028, %r26020, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+152], {%r30934, %r30935}; + // begin inline asm + // chi + lop3.b32 %r30928, %r26024, %r26016, %r25936, 0xD2; + lop3.b32 %r30929, %r26028, %r26020, %r25940, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+160], {%r30928, %r30929}; + // begin inline asm + // chi + lop3.b32 %r30920, %r26016, %r25936, %r25952, 0xD2; + lop3.b32 %r30921, %r26020, %r25940, %r25956, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+168], {%r30920, %r30921}; + // begin inline asm + // chi + lop3.b32 %r30912, %r25936, %r25952, %r25992, 0xD2; + lop3.b32 %r30913, %r25940, %r25956, %r25996, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+176], {%r30912, %r30913}; + // begin inline asm + // chi + lop3.b32 %r30938, %r25904, %r25976, %r25888, 0xD2; + lop3.b32 %r30939, %r25908, %r25980, %r25892, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd272+184], {%r30938, %r30939}; + // begin inline asm + // chi + lop3.b32 %r30932, %r25976, %r25888, %r25944, 0xD2; + lop3.b32 %r30933, %r25980, %r25892, %r25948, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+192], {%r30932, %r30933}; + // begin inline asm + // chi + lop3.b32 %r30926, %r25888, %r25944, %r25968, 0xD2; + lop3.b32 %r30927, %r25892, %r25948, %r25972, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+200], {%r30926, %r30927}; + // begin inline asm + // chi + lop3.b32 %r30918, %r25944, %r25968, %r25904, 0xD2; + lop3.b32 %r30919, %r25948, %r25972, %r25908, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+208], {%r30918, %r30919}; + // begin inline asm + // chi + lop3.b32 %r30910, %r25968, %r25904, %r25976, 0xD2; + lop3.b32 %r30911, %r25972, %r25908, %r25980, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+216], {%r30910, %r30911}; + mul.wide.s32 %rd1108, %r30960, 8; + add.s64 %rd1107, %rd1029, %rd1108; + // begin inline asm + ld.global.nc.v2.u32 {%r26256,%r26257}, [%rd1107]; + // end inline asm + xor.b32 %r30946, %r26056, %r26256; + xor.b32 %r30947, %r26057, %r26257; + add.s32 %r30960, %r30960, 1; + setp.lt.u32 %p51, %r30960, 23; + @%p51 bra $L__BB2_92; + + mov.u32 %r26367, 1; + st.local.v2.u32 [%rd272+32], {%r30958, %r30959}; + st.local.v2.u32 [%rd272+72], {%r30956, %r30957}; + st.local.v2.u32 [%rd272+40], {%r30954, %r30955}; + st.local.v2.u32 [%rd272+80], {%r30952, %r30953}; + st.local.v2.u32 [%rd272+48], {%r30950, %r30951}; + st.local.v2.u32 [%rd272+56], {%r30948, %r30949}; + st.local.v2.u32 [%rd272+24], {%r30946, %r30947}; + // begin inline asm + // xor5 + lop3.b32 %r26268, %r30946, %r30944, %r30942, 0x96; + lop3.b32 %r26268, %r26268, %r30940, %r30938, 0x96; + lop3.b32 %r26269, %r30947, %r30945, %r30943, 0x96; + lop3.b32 %r26269, %r26269, %r30941, %r30939, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26280, %r30958, %r30956, %r30936, 0x96; + lop3.b32 %r26280, %r26280, %r30934, %r30932, 0x96; + lop3.b32 %r26281, %r30959, %r30957, %r30937, 0x96; + lop3.b32 %r26281, %r26281, %r30935, %r30933, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26292, %r30954, %r30952, %r30930, 0x96; + lop3.b32 %r26292, %r26292, %r30928, %r30926, 0x96; + lop3.b32 %r26293, %r30955, %r30953, %r30931, 0x96; + lop3.b32 %r26293, %r26293, %r30929, %r30927, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26304, %r30950, %r30924, %r30922, 0x96; + lop3.b32 %r26304, %r26304, %r30920, %r30918, 0x96; + lop3.b32 %r26305, %r30951, %r30925, %r30923, 0x96; + lop3.b32 %r26305, %r26305, %r30921, %r30919, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r26316, %r30948, %r30916, %r30914, 0x96; + lop3.b32 %r26316, %r26316, %r30912, %r30910, 0x96; + lop3.b32 %r26317, %r30949, %r30917, %r30915, 0x96; + lop3.b32 %r26317, %r26317, %r30913, %r30911, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26328, %r26281, %r26280, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26332, %r26280, %r26281, %r26367; + // end inline asm + xor.b32 %r26506, %r26328, %r26316; + xor.b32 %r26507, %r26332, %r26317; + xor.b32 %r26475, %r30946, %r26506; + xor.b32 %r26478, %r30947, %r26507; + xor.b32 %r26438, %r30943, %r26507; + xor.b32 %r26437, %r30942, %r26506; + st.local.v2.u32 [%rd272+104], {%r26437, %r26438}; + // begin inline asm + shf.l.wrap.b32 %r26336, %r26293, %r26292, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26340, %r26292, %r26293, %r26367; + // end inline asm 
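+ // note: with the 23 looped rounds of $L__BB2_92 done, the final round
+ // is unrolled here: the "xor5" lop3.b32 ops (immediate 0x96 = a^b^c)
+ // form the theta column parities, the rotate-by-1 funnel shifts and
+ // xors distribute them across the lanes, and rho/pi/chi follow before
+ // the closing iota xor with the last round constant.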
+ xor.b32 %r26508, %r26336, %r26268; + xor.b32 %r26509, %r26340, %r26269; + xor.b32 %r26374, %r30956, %r26508; + xor.b32 %r26373, %r30957, %r26509; + xor.b32 %r26413, %r30935, %r26509; + xor.b32 %r26414, %r30934, %r26508; + st.local.v2.u32 [%rd272+152], {%r26414, %r26413}; + // begin inline asm + shf.l.wrap.b32 %r26344, %r26305, %r26304, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26348, %r26304, %r26305, %r26367; + // end inline asm + xor.b32 %r26510, %r26344, %r26280; + xor.b32 %r26511, %r26348, %r26281; + xor.b32 %r26397, %r30931, %r26511; + xor.b32 %r26398, %r30930, %r26510; + st.local.v2.u32 [%rd272+120], {%r26398, %r26397}; + xor.b32 %r26389, %r30927, %r26511; + xor.b32 %r26390, %r30926, %r26510; + st.local.v2.u32 [%rd272+200], {%r26390, %r26389}; + // begin inline asm + shf.l.wrap.b32 %r26352, %r26317, %r26316, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26356, %r26316, %r26317, %r26367; + // end inline asm + xor.b32 %r26512, %r26352, %r26292; + xor.b32 %r26513, %r26356, %r26293; + xor.b32 %r26421, %r30950, %r26512; + xor.b32 %r26422, %r30951, %r26513; + xor.b32 %r26430, %r30921, %r26513; + xor.b32 %r26429, %r30920, %r26512; + st.local.v2.u32 [%rd272+168], {%r26429, %r26430}; + // begin inline asm + shf.l.wrap.b32 %r26360, %r26269, %r26268, %r26367; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26364, %r26268, %r26269, %r26367; + // end inline asm + xor.b32 %r26514, %r26360, %r26304; + xor.b32 %r26515, %r26364, %r26305; + xor.b32 %r26381, %r30916, %r26514; + xor.b32 %r26382, %r30917, %r26515; + xor.b32 %r26406, %r30911, %r26515; + xor.b32 %r26405, %r30910, %r26514; + st.local.v2.u32 [%rd272+216], {%r26405, %r26406}; + // begin inline asm + shf.l.wrap.b32 %r26368, %r26374, %r26373, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26372, %r26373, %r26374, %r25871; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26376, %r26382, %r26381, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26380, %r26381, %r26382, %r25879; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26388, %r26389, %r26390, %r25887; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26384, %r26390, %r26389, %r25887; + // end inline asm + st.local.v2.u32 [%rd272+96], {%r26384, %r26388}; + // begin inline asm + shf.l.wrap.b32 %r26392, %r26398, %r26397, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26396, %r26397, %r26398, %r25919; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26400, %r26406, %r26405, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26404, %r26405, %r26406, %r25967; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26412, %r26413, %r26414, %r25991; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26408, %r26414, %r26413, %r25991; + // end inline asm + st.local.v2.u32 [%rd272+88], {%r26408, %r26412}; + // begin inline asm + shf.l.wrap.b32 %r26416, %r26422, %r26421, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26420, %r26421, %r26422, %r26007; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26424, %r26430, %r26429, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26428, %r26429, %r26430, %r26015; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26432, %r26438, %r26437, %r26047; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r26436, %r26437, %r26438, %r26047; + // end inline asm + // begin inline asm + 
// chi + lop3.b32 %r26440, %r26475, %r26368, %r26392, 0xD2; + lop3.b32 %r26441, %r26478, %r26372, %r26396, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r26448, %r26368, %r26392, %r26424, 0xD2; + lop3.b32 %r26449, %r26372, %r26396, %r26428, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+32], {%r26448, %r26449}; + // begin inline asm + // chi + lop3.b32 %r26456, %r26392, %r26424, %r26400, 0xD2; + lop3.b32 %r26457, %r26396, %r26428, %r26404, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+40], {%r26456, %r26457}; + // begin inline asm + // chi + lop3.b32 %r26464, %r26424, %r26400, %r26475, 0xD2; + lop3.b32 %r26465, %r26428, %r26404, %r26478, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+48], {%r26464, %r26465}; + // begin inline asm + // chi + lop3.b32 %r26472, %r26400, %r26475, %r26368, 0xD2; + lop3.b32 %r26473, %r26404, %r26478, %r26372, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+56], {%r26472, %r26473}; + // begin inline asm + // chi + lop3.b32 %r26480, %r26416, %r26376, %r26432, 0xD2; + lop3.b32 %r26481, %r26420, %r26380, %r26436, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+64], {%r26480, %r26481}; + // begin inline asm + // chi + lop3.b32 %r26488, %r26376, %r26432, %r26408, 0xD2; + lop3.b32 %r26489, %r26380, %r26436, %r26412, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+72], {%r26488, %r26489}; + // begin inline asm + // chi + lop3.b32 %r26496, %r26432, %r26408, %r26384, 0xD2; + lop3.b32 %r26497, %r26436, %r26412, %r26388, 0xD2; + // end inline asm + st.local.v2.u32 [%rd272+80], {%r26496, %r26497}; + // begin inline asm + ld.global.nc.v2.u32 {%r26504,%r26505}, [%rd1030]; + // end inline asm + xor.b32 %r26516, %r26441, %r26505; + xor.b32 %r26517, %r26440, %r26504; + st.local.v2.u32 [%rd272+24], {%r26517, %r26516}; + bra.uni $L__BB2_94; + +$L__BB2_72: + st.local.u64 [%rd3], %rd354; + mov.u64 %rd898, 1179641; + st.local.u64 [%rd3+8], %rd898; + st.local.u32 [%rd3+16], %r3326; + ld.global.u64 %rd899, [%rd222]; + ld.global.u64 %rd900, [%rd222+8]; + ld.global.u64 %rd901, [%rd222+16]; + ld.global.u64 %rd902, [%rd222+24]; + ld.global.u64 %rd903, [%rd222+32]; + ld.global.u64 %rd904, [%rd222+40]; + ld.global.u64 %rd905, [%rd222+48]; + ld.global.u64 %rd906, [%rd222+56]; + st.local.u64 [%rd3+24], %rd899; + st.local.u64 [%rd3+32], %rd900; + st.local.u64 [%rd3+40], %rd901; + st.local.u64 [%rd3+48], %rd902; + st.local.u64 [%rd3+56], %rd903; + st.local.u64 [%rd3+64], %rd904; + st.local.u64 [%rd3+72], %rd905; + st.local.u64 [%rd3+80], %rd906; + cvt.u32.u64 %r19990, %rd899; + xor.b32 %r19991, %r3326, %r19990; + st.local.u32 [%rd3+24], %r19991; + mov.u32 %r30487, 0; + st.local.v2.u32 [%rd3+96], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+104], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+112], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+120], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+128], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+136], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+144], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+152], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+160], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+168], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+176], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+184], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+192], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+200], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+208], {%r30487, %r30487}; + st.local.v2.u32 [%rd3+216], {%r30487, %r30487}; + mov.u32 %r30502, -2147483648; + mov.u32 %r19963, 1; + st.local.v2.u32 [%rd3+88], {%r19963, 
%r30502}; + ld.local.v2.u32 {%r30523, %r30524}, [%rd3+24]; + mov.b64 {%r30521, %r30522}, %rd904; + shr.u64 %rd907, %rd900, 32; + cvt.u32.u64 %r30535, %rd900; + cvt.u32.u64 %r30536, %rd907; + shr.u64 %rd908, %rd905, 32; + cvt.u32.u64 %r30533, %rd905; + cvt.u32.u64 %r30534, %rd908; + shr.u64 %rd909, %rd901, 32; + cvt.u32.u64 %r30531, %rd901; + cvt.u32.u64 %r30532, %rd909; + shr.u64 %rd910, %rd906, 32; + cvt.u32.u64 %r30529, %rd906; + cvt.u32.u64 %r30530, %rd910; + shr.u64 %rd911, %rd902, 32; + cvt.u32.u64 %r30527, %rd902; + cvt.u32.u64 %r30528, %rd911; + shr.u64 %rd912, %rd903, 32; + cvt.u32.u64 %r30525, %rd903; + cvt.u32.u64 %r30526, %rd912; + mov.u32 %r30488, %r30487; + mov.u32 %r30489, %r30487; + mov.u32 %r30490, %r30487; + mov.u32 %r30491, %r30487; + mov.u32 %r30492, %r30487; + mov.u32 %r30493, %r30487; + mov.u32 %r30494, %r30487; + mov.u32 %r30495, %r30487; + mov.u32 %r30496, %r30487; + mov.u32 %r30497, %r30487; + mov.u32 %r30498, %r30487; + mov.u32 %r30499, %r30487; + mov.u32 %r30500, %r30487; + mov.u32 %r30501, %r19963; + mov.u32 %r30503, %r30487; + mov.u32 %r30504, %r30487; + mov.u32 %r30505, %r30487; + mov.u32 %r30506, %r30487; + mov.u32 %r30507, %r30487; + mov.u32 %r30508, %r30487; + mov.u32 %r30509, %r30487; + mov.u32 %r30510, %r30487; + mov.u32 %r30511, %r30487; + mov.u32 %r30512, %r30487; + mov.u32 %r30513, %r30487; + mov.u32 %r30514, %r30487; + mov.u32 %r30515, %r30487; + mov.u32 %r30516, %r30487; + mov.u32 %r30517, %r30487; + mov.u32 %r30518, %r30487; + mov.u32 %r30519, %r30487; + mov.u32 %r30520, %r30487; + mov.u32 %r30537, %r30487; + +$L__BB2_73: + // begin inline asm + // xor5 + lop3.b32 %r19994, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r19994, %r19994, %r30517, %r30515, 0x96; + lop3.b32 %r19995, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r19995, %r19995, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20006, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20006, %r20006, %r30511, %r30509, 0x96; + lop3.b32 %r20007, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20007, %r20007, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20018, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20018, %r20018, %r30505, %r30503, 0x96; + lop3.b32 %r20019, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20019, %r20019, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20030, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20030, %r20030, %r30497, %r30495, 0x96; + lop3.b32 %r20031, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20031, %r20031, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20042, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20042, %r20042, %r30489, %r30487, 0x96; + lop3.b32 %r20043, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20043, %r20043, %r30490, %r30488, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20054, %r20007, %r20006, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20058, %r20006, %r20007, %r19963; + // end inline asm + xor.b32 %r20488, %r20054, %r20042; + xor.b32 %r20489, %r20058, %r20043; + xor.b32 %r20321, %r30523, %r20488; + xor.b32 %r20324, %r30524, %r20489; + xor.b32 %r20228, %r30521, %r20488; + xor.b32 %r20227, %r30522, %r20489; + xor.b32 %r20275, %r30519, %r20488; + xor.b32 %r20276, %r30520, %r20489; + xor.b32 %r20180, %r30517, %r20488; + xor.b32 %r20179, %r30518, %r20489; + xor.b32 %r20131, %r30515, %r20488; + xor.b32 %r20132, %r30516, %r20489; + // begin 
inline asm + shf.l.wrap.b32 %r20062, %r20019, %r20018, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20066, %r20018, %r20019, %r19963; + // end inline asm + xor.b32 %r20490, %r20062, %r19994; + xor.b32 %r20491, %r20066, %r19995; + xor.b32 %r20283, %r30535, %r20490; + xor.b32 %r20284, %r30536, %r20491; + xor.b32 %r20100, %r30533, %r20490; + xor.b32 %r20099, %r30534, %r20491; + xor.b32 %r20259, %r30513, %r20490; + xor.b32 %r20260, %r30514, %r20491; + xor.b32 %r20220, %r30511, %r20490; + xor.b32 %r20219, %r30512, %r20491; + xor.b32 %r20203, %r30509, %r20490; + xor.b32 %r20204, %r30510, %r20491; + // begin inline asm + shf.l.wrap.b32 %r20070, %r20031, %r20030, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20074, %r20030, %r20031, %r19963; + // end inline asm + xor.b32 %r20492, %r20070, %r20006; + xor.b32 %r20493, %r20074, %r20007; + xor.b32 %r20140, %r30531, %r20492; + xor.b32 %r20139, %r30532, %r20493; + xor.b32 %r20267, %r30529, %r20492; + xor.b32 %r20268, %r30530, %r20493; + xor.b32 %r20148, %r30507, %r20492; + xor.b32 %r20147, %r30508, %r20493; + xor.b32 %r20251, %r30505, %r20492; + xor.b32 %r20252, %r30506, %r20493; + xor.b32 %r20116, %r30503, %r20492; + xor.b32 %r20115, %r30504, %r20493; + // begin inline asm + shf.l.wrap.b32 %r20078, %r20043, %r20042, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20082, %r20042, %r20043, %r19963; + // end inline asm + xor.b32 %r20494, %r20078, %r20018; + xor.b32 %r20495, %r20082, %r20019; + xor.b32 %r20235, %r30527, %r20494; + xor.b32 %r20236, %r30528, %r20495; + xor.b32 %r20212, %r30501, %r20494; + xor.b32 %r20211, %r30502, %r20495; + xor.b32 %r20155, %r30499, %r20494; + xor.b32 %r20156, %r30500, %r20495; + xor.b32 %r20243, %r30497, %r20494; + xor.b32 %r20244, %r30498, %r20495; + xor.b32 %r20172, %r30495, %r20494; + xor.b32 %r20171, %r30496, %r20495; + // begin inline asm + shf.l.wrap.b32 %r20086, %r19995, %r19994, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20090, %r19994, %r19995, %r19963; + // end inline asm + xor.b32 %r20496, %r20086, %r20030; + xor.b32 %r20497, %r20090, %r20031; + xor.b32 %r20187, %r30525, %r20496; + xor.b32 %r20188, %r30526, %r20497; + xor.b32 %r20107, %r30493, %r20496; + xor.b32 %r20108, %r30494, %r20497; + xor.b32 %r20124, %r30491, %r20496; + xor.b32 %r20123, %r30492, %r20497; + xor.b32 %r20163, %r30489, %r20496; + xor.b32 %r20164, %r30490, %r20497; + xor.b32 %r20195, %r30487, %r20496; + xor.b32 %r20196, %r30488, %r20497; + mov.u32 %r20101, 44; + // begin inline asm + shf.l.wrap.b32 %r20094, %r20100, %r20099, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20098, %r20099, %r20100, %r20101; + // end inline asm + mov.u32 %r20109, 20; + // begin inline asm + shf.l.wrap.b32 %r20102, %r20108, %r20107, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20106, %r20107, %r20108, %r20109; + // end inline asm + mov.u32 %r20117, 61; + // begin inline asm + shf.l.wrap.b32 %r20110, %r20116, %r20115, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20114, %r20115, %r20116, %r20117; + // end inline asm + mov.u32 %r20125, 39; + // begin inline asm + shf.l.wrap.b32 %r20118, %r20124, %r20123, %r20125; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20122, %r20123, %r20124, %r20125; + // end inline asm + mov.u32 %r20133, 18; + // begin inline asm + shf.l.wrap.b32 %r20126, %r20132, %r20131, %r20133; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20130, %r20131, 
%r20132, %r20133; + // end inline asm + mov.u32 %r20141, 62; + // begin inline asm + shf.l.wrap.b32 %r20134, %r20140, %r20139, %r20141; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20138, %r20139, %r20140, %r20141; + // end inline asm + mov.u32 %r20149, 43; + // begin inline asm + shf.l.wrap.b32 %r20142, %r20148, %r20147, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20146, %r20147, %r20148, %r20149; + // end inline asm + mov.u32 %r20157, 25; + // begin inline asm + shf.l.wrap.b32 %r20150, %r20156, %r20155, %r20157; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20154, %r20155, %r20156, %r20157; + // end inline asm + mov.u32 %r20165, 8; + // begin inline asm + shf.l.wrap.b32 %r20158, %r20164, %r20163, %r20165; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20162, %r20163, %r20164, %r20165; + // end inline asm + mov.u32 %r20173, 56; + // begin inline asm + shf.l.wrap.b32 %r20166, %r20172, %r20171, %r20173; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20170, %r20171, %r20172, %r20173; + // end inline asm + mov.u32 %r20181, 41; + // begin inline asm + shf.l.wrap.b32 %r20174, %r20180, %r20179, %r20181; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20178, %r20179, %r20180, %r20181; + // end inline asm + mov.u32 %r20189, 27; + // begin inline asm + shf.l.wrap.b32 %r20182, %r20188, %r20187, %r20189; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20186, %r20187, %r20188, %r20189; + // end inline asm + mov.u32 %r20197, 14; + // begin inline asm + shf.l.wrap.b32 %r20190, %r20196, %r20195, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20194, %r20195, %r20196, %r20197; + // end inline asm + mov.u32 %r20205, 2; + // begin inline asm + shf.l.wrap.b32 %r20198, %r20204, %r20203, %r20205; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20202, %r20203, %r20204, %r20205; + // end inline asm + mov.u32 %r20213, 55; + // begin inline asm + shf.l.wrap.b32 %r20206, %r20212, %r20211, %r20213; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20210, %r20211, %r20212, %r20213; + // end inline asm + mov.u32 %r20221, 45; + // begin inline asm + shf.l.wrap.b32 %r20214, %r20220, %r20219, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20218, %r20219, %r20220, %r20221; + // end inline asm + mov.u32 %r20229, 36; + // begin inline asm + shf.l.wrap.b32 %r20222, %r20228, %r20227, %r20229; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20226, %r20227, %r20228, %r20229; + // end inline asm + mov.u32 %r20237, 28; + // begin inline asm + shf.l.wrap.b32 %r20230, %r20236, %r20235, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20234, %r20235, %r20236, %r20237; + // end inline asm + mov.u32 %r20245, 21; + // begin inline asm + shf.l.wrap.b32 %r20238, %r20244, %r20243, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20242, %r20243, %r20244, %r20245; + // end inline asm + mov.u32 %r20253, 15; + // begin inline asm + shf.l.wrap.b32 %r20246, %r20252, %r20251, %r20253; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20250, %r20251, %r20252, %r20253; + // end inline asm + mov.u32 %r20261, 10; + // begin inline asm + shf.l.wrap.b32 %r20254, %r20260, %r20259, %r20261; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20258, %r20259, %r20260, %r20261; + // end inline asm + mov.u32 %r20269, 6; + // begin inline asm + shf.l.wrap.b32 %r20262, %r20268, %r20267, %r20269; + // end 
inline asm + // begin inline asm + shf.l.wrap.b32 %r20266, %r20267, %r20268, %r20269; + // end inline asm + mov.u32 %r20277, 3; + // begin inline asm + shf.l.wrap.b32 %r20270, %r20276, %r20275, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20274, %r20275, %r20276, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20278, %r20284, %r20283, %r19963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20282, %r20283, %r20284, %r19963; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20286, %r20321, %r20094, %r20142, 0xD2; + lop3.b32 %r20287, %r20324, %r20098, %r20146, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30535, %r20094, %r20142, %r20238, 0xD2; + lop3.b32 %r30536, %r20098, %r20146, %r20242, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30531, %r20142, %r20238, %r20190, 0xD2; + lop3.b32 %r30532, %r20146, %r20242, %r20194, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30527, %r20238, %r20190, %r20321, 0xD2; + lop3.b32 %r30528, %r20242, %r20194, %r20324, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30525, %r20190, %r20321, %r20094, 0xD2; + lop3.b32 %r30526, %r20194, %r20324, %r20098, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30521, %r20230, %r20102, %r20270, 0xD2; + lop3.b32 %r30522, %r20234, %r20106, %r20274, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30533, %r20102, %r20270, %r20214, 0xD2; + lop3.b32 %r30534, %r20106, %r20274, %r20218, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30529, %r20270, %r20214, %r20110, 0xD2; + lop3.b32 %r30530, %r20274, %r20218, %r20114, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30501, %r20214, %r20110, %r20230, 0xD2; + lop3.b32 %r30502, %r20218, %r20114, %r20234, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30501, %r30502}; + // begin inline asm + // chi + lop3.b32 %r30493, %r20110, %r20230, %r20102, 0xD2; + lop3.b32 %r30494, %r20114, %r20234, %r20106, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30493, %r30494}; + // begin inline asm + // chi + lop3.b32 %r30519, %r20278, %r20262, %r20150, 0xD2; + lop3.b32 %r30520, %r20282, %r20266, %r20154, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30519, %r30520}; + // begin inline asm + // chi + lop3.b32 %r30513, %r20262, %r20150, %r20158, 0xD2; + lop3.b32 %r30514, %r20266, %r20154, %r20162, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30513, %r30514}; + // begin inline asm + // chi + lop3.b32 %r30507, %r20150, %r20158, %r20126, 0xD2; + lop3.b32 %r30508, %r20154, %r20162, %r20130, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30507, %r30508}; + // begin inline asm + // chi + lop3.b32 %r30499, %r20158, %r20126, %r20278, 0xD2; + lop3.b32 %r30500, %r20162, %r20130, %r20282, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30499, %r30500}; + // begin inline asm + // chi + lop3.b32 %r30491, %r20126, %r20278, %r20262, 0xD2; + lop3.b32 %r30492, %r20130, %r20282, %r20266, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30491, %r30492}; + // begin inline asm + // chi + lop3.b32 %r30517, %r20182, %r20222, %r20254, 0xD2; + lop3.b32 %r30518, %r20186, %r20226, %r20258, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30517, %r30518}; + // begin inline asm + // chi + lop3.b32 %r30511, %r20222, %r20254, %r20246, 0xD2; + lop3.b32 %r30512, %r20226, %r20258, %r20250, 0xD2; 
+ // end inline asm + st.local.v2.u32 [%rd3+152], {%r30511, %r30512}; + // begin inline asm + // chi + lop3.b32 %r30505, %r20254, %r20246, %r20166, 0xD2; + lop3.b32 %r30506, %r20258, %r20250, %r20170, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30505, %r30506}; + // begin inline asm + // chi + lop3.b32 %r30497, %r20246, %r20166, %r20182, 0xD2; + lop3.b32 %r30498, %r20250, %r20170, %r20186, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30497, %r30498}; + // begin inline asm + // chi + lop3.b32 %r30489, %r20166, %r20182, %r20222, 0xD2; + lop3.b32 %r30490, %r20170, %r20186, %r20226, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30489, %r30490}; + // begin inline asm + // chi + lop3.b32 %r30515, %r20134, %r20206, %r20118, 0xD2; + lop3.b32 %r30516, %r20138, %r20210, %r20122, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30515, %r30516}; + // begin inline asm + // chi + lop3.b32 %r30509, %r20206, %r20118, %r20174, 0xD2; + lop3.b32 %r30510, %r20210, %r20122, %r20178, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30509, %r30510}; + // begin inline asm + // chi + lop3.b32 %r30503, %r20118, %r20174, %r20198, 0xD2; + lop3.b32 %r30504, %r20122, %r20178, %r20202, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30503, %r30504}; + // begin inline asm + // chi + lop3.b32 %r30495, %r20174, %r20198, %r20134, 0xD2; + lop3.b32 %r30496, %r20178, %r20202, %r20138, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30495, %r30496}; + // begin inline asm + // chi + lop3.b32 %r30487, %r20198, %r20134, %r20206, 0xD2; + lop3.b32 %r30488, %r20202, %r20138, %r20210, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30487, %r30488}; + mul.wide.s32 %rd916, %r30537, 8; + mov.u64 %rd917, keccak_round_constants; + cvta.const.u64 %rd918, %rd917; + add.s64 %rd913, %rd918, %rd916; + // begin inline asm + ld.global.nc.v2.u32 {%r20486,%r20487}, [%rd913]; + // end inline asm + xor.b32 %r30523, %r20286, %r20486; + xor.b32 %r30524, %r20287, %r20487; + add.s32 %r30537, %r30537, 1; + setp.lt.u32 %p42, %r30537, 23; + @%p42 bra $L__BB2_73; + + st.local.v2.u32 [%rd3+32], {%r30535, %r30536}; + st.local.v2.u32 [%rd3+72], {%r30533, %r30534}; + st.local.v2.u32 [%rd3+40], {%r30531, %r30532}; + st.local.v2.u32 [%rd3+80], {%r30529, %r30530}; + st.local.v2.u32 [%rd3+48], {%r30527, %r30528}; + st.local.v2.u32 [%rd3+56], {%r30525, %r30526}; + st.local.v2.u32 [%rd3+24], {%r30523, %r30524}; + // begin inline asm + // xor5 + lop3.b32 %r20498, %r30523, %r30521, %r30519, 0x96; + lop3.b32 %r20498, %r20498, %r30517, %r30515, 0x96; + lop3.b32 %r20499, %r30524, %r30522, %r30520, 0x96; + lop3.b32 %r20499, %r20499, %r30518, %r30516, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20510, %r30535, %r30533, %r30513, 0x96; + lop3.b32 %r20510, %r20510, %r30511, %r30509, 0x96; + lop3.b32 %r20511, %r30536, %r30534, %r30514, 0x96; + lop3.b32 %r20511, %r20511, %r30512, %r30510, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20522, %r30531, %r30529, %r30507, 0x96; + lop3.b32 %r20522, %r20522, %r30505, %r30503, 0x96; + lop3.b32 %r20523, %r30532, %r30530, %r30508, 0x96; + lop3.b32 %r20523, %r20523, %r30506, %r30504, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20534, %r30527, %r30501, %r30499, 0x96; + lop3.b32 %r20534, %r20534, %r30497, %r30495, 0x96; + lop3.b32 %r20535, %r30528, %r30502, %r30500, 0x96; + lop3.b32 %r20535, %r20535, %r30498, %r30496, 0x96; + // end inline asm + // begin inline asm + // 
xor5 + lop3.b32 %r20546, %r30525, %r30493, %r30491, 0x96; + lop3.b32 %r20546, %r20546, %r30489, %r30487, 0x96; + lop3.b32 %r20547, %r30526, %r30494, %r30492, 0x96; + lop3.b32 %r20547, %r20547, %r30490, %r30488, 0x96; + // end inline asm + mov.u32 %r20750, 1; + // begin inline asm + shf.l.wrap.b32 %r20558, %r20511, %r20510, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20562, %r20510, %r20511, %r20750; + // end inline asm + xor.b32 %r20777, %r20558, %r20546; + xor.b32 %r20778, %r20562, %r20547; + xor.b32 %r20705, %r30523, %r20777; + xor.b32 %r20708, %r30524, %r20778; + xor.b32 %r20668, %r30520, %r20778; + xor.b32 %r20667, %r30519, %r20777; + st.local.v2.u32 [%rd3+104], {%r20667, %r20668}; + // begin inline asm + shf.l.wrap.b32 %r20566, %r20523, %r20522, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20570, %r20522, %r20523, %r20750; + // end inline asm + xor.b32 %r20779, %r20566, %r20498; + xor.b32 %r20780, %r20570, %r20499; + xor.b32 %r20604, %r30533, %r20779; + xor.b32 %r20603, %r30534, %r20780; + xor.b32 %r20643, %r30512, %r20780; + xor.b32 %r20644, %r30511, %r20779; + st.local.v2.u32 [%rd3+152], {%r20644, %r20643}; + // begin inline asm + shf.l.wrap.b32 %r20574, %r20535, %r20534, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20578, %r20534, %r20535, %r20750; + // end inline asm + xor.b32 %r20781, %r20574, %r20510; + xor.b32 %r20782, %r20578, %r20511; + xor.b32 %r20627, %r30508, %r20782; + xor.b32 %r20628, %r30507, %r20781; + st.local.v2.u32 [%rd3+120], {%r20628, %r20627}; + xor.b32 %r20619, %r30504, %r20782; + xor.b32 %r20620, %r30503, %r20781; + st.local.v2.u32 [%rd3+200], {%r20620, %r20619}; + // begin inline asm + shf.l.wrap.b32 %r20582, %r20547, %r20546, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20586, %r20546, %r20547, %r20750; + // end inline asm + xor.b32 %r20783, %r20582, %r20522; + xor.b32 %r20784, %r20586, %r20523; + xor.b32 %r20651, %r30527, %r20783; + xor.b32 %r20652, %r30528, %r20784; + xor.b32 %r20660, %r30498, %r20784; + xor.b32 %r20659, %r30497, %r20783; + st.local.v2.u32 [%rd3+168], {%r20659, %r20660}; + // begin inline asm + shf.l.wrap.b32 %r20590, %r20499, %r20498, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20594, %r20498, %r20499, %r20750; + // end inline asm + xor.b32 %r20785, %r20590, %r20534; + xor.b32 %r20786, %r20594, %r20535; + xor.b32 %r20611, %r30493, %r20785; + xor.b32 %r20612, %r30494, %r20786; + xor.b32 %r20636, %r30488, %r20786; + xor.b32 %r20635, %r30487, %r20785; + st.local.v2.u32 [%rd3+216], {%r20635, %r20636}; + // begin inline asm + shf.l.wrap.b32 %r20598, %r20604, %r20603, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20602, %r20603, %r20604, %r20101; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20606, %r20612, %r20611, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20610, %r20611, %r20612, %r20109; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20618, %r20619, %r20620, %r20117; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20614, %r20620, %r20619, %r20117; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r20614, %r20618}; + // begin inline asm + shf.l.wrap.b32 %r20622, %r20628, %r20627, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20626, %r20627, %r20628, %r20149; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20630, %r20636, %r20635, %r20197; + // end inline asm + // begin inline asm + 
shf.l.wrap.b32 %r20634, %r20635, %r20636, %r20197; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20642, %r20643, %r20644, %r20221; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20638, %r20644, %r20643, %r20221; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r20638, %r20642}; + // begin inline asm + shf.l.wrap.b32 %r20646, %r20652, %r20651, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20650, %r20651, %r20652, %r20237; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20654, %r20660, %r20659, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20658, %r20659, %r20660, %r20245; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20662, %r20668, %r20667, %r20277; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20666, %r20667, %r20668, %r20277; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r20670, %r20705, %r20598, %r20622, 0xD2; + lop3.b32 %r20671, %r20708, %r20602, %r20626, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r20598, %r20622, %r20654, 0xD2; + lop3.b32 %r30671, %r20602, %r20626, %r20658, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + // begin inline asm + // chi + lop3.b32 %r30666, %r20622, %r20654, %r20630, 0xD2; + lop3.b32 %r30667, %r20626, %r20658, %r20634, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + // begin inline asm + // chi + lop3.b32 %r30662, %r20654, %r20630, %r20705, 0xD2; + lop3.b32 %r30663, %r20658, %r20634, %r20708, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + // begin inline asm + // chi + lop3.b32 %r30660, %r20630, %r20705, %r20598, 0xD2; + lop3.b32 %r30661, %r20634, %r20708, %r20602, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + // begin inline asm + // chi + lop3.b32 %r30656, %r20646, %r20606, %r20662, 0xD2; + lop3.b32 %r30657, %r20650, %r20610, %r20666, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + // begin inline asm + // chi + lop3.b32 %r30668, %r20606, %r20662, %r20638, 0xD2; + lop3.b32 %r30669, %r20610, %r20666, %r20642, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + // begin inline asm + // chi + lop3.b32 %r30664, %r20662, %r20638, %r20614, 0xD2; + lop3.b32 %r30665, %r20666, %r20642, %r20618, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + add.s64 %rd919, %rd918, 184; + // begin inline asm + ld.global.nc.v2.u32 {%r20734,%r20735}, [%rd919]; + // end inline asm + xor.b32 %r30658, %r20670, %r20734; + xor.b32 %r30659, %r20671, %r20735; + add.u64 %rd925, %SPL, 1912; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.u64 [%rd925], %rd354; + mov.u64 %rd926, 1179641; + st.local.u64 [%rd925+8], %rd926; + add.s32 %r20787, %r3326, 1; + st.local.u32 [%rd925+16], %r20787; + ld.global.u64 %rd927, [%rd223]; + ld.global.u64 %rd928, [%rd223+8]; + ld.global.u64 %rd929, [%rd223+16]; + ld.global.u64 %rd930, [%rd223+24]; + ld.global.u64 %rd931, [%rd223+32]; + ld.global.u64 %rd932, [%rd223+40]; + ld.global.u64 %rd933, [%rd223+48]; + ld.global.u64 %rd934, [%rd223+56]; + st.local.u64 [%rd925+32], %rd928; + st.local.u64 [%rd925+40], %rd929; + st.local.u64 [%rd925+48], %rd930; + st.local.u64 [%rd925+56], %rd931; + st.local.u64 [%rd925+64], %rd932; + st.local.u64 [%rd925+72], %rd933; + st.local.u64 [%rd925+80], %rd934; + cvt.u32.u64 %r20788, %rd927; + xor.b32 %r20789, %r20787, %r20788; + st.local.u64 
[%rd925+24], %rd927; + st.local.u32 [%rd925+24], %r20789; + mov.u32 %r30538, 0; + st.local.v2.u32 [%rd925+96], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+104], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+112], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+120], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+128], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+136], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+144], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+152], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+160], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+168], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+176], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+184], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+192], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+200], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+208], {%r30538, %r30538}; + st.local.v2.u32 [%rd925+216], {%r30538, %r30538}; + mov.u32 %r30553, -2147483648; + st.local.v2.u32 [%rd925+88], {%r20750, %r30553}; + ld.local.v2.u32 {%r30574, %r30575}, [%rd925+24]; + mov.b64 {%r30572, %r30573}, %rd932; + shr.u64 %rd935, %rd928, 32; + cvt.u32.u64 %r30586, %rd928; + cvt.u32.u64 %r30587, %rd935; + shr.u64 %rd936, %rd933, 32; + cvt.u32.u64 %r30584, %rd933; + cvt.u32.u64 %r30585, %rd936; + shr.u64 %rd937, %rd929, 32; + cvt.u32.u64 %r30582, %rd929; + cvt.u32.u64 %r30583, %rd937; + shr.u64 %rd938, %rd934, 32; + cvt.u32.u64 %r30580, %rd934; + cvt.u32.u64 %r30581, %rd938; + shr.u64 %rd939, %rd930, 32; + cvt.u32.u64 %r30578, %rd930; + cvt.u32.u64 %r30579, %rd939; + shr.u64 %rd940, %rd931, 32; + cvt.u32.u64 %r30576, %rd931; + cvt.u32.u64 %r30577, %rd940; + mov.u32 %r30539, %r30538; + mov.u32 %r30540, %r30538; + mov.u32 %r30541, %r30538; + mov.u32 %r30542, %r30538; + mov.u32 %r30543, %r30538; + mov.u32 %r30544, %r30538; + mov.u32 %r30545, %r30538; + mov.u32 %r30546, %r30538; + mov.u32 %r30547, %r30538; + mov.u32 %r30548, %r30538; + mov.u32 %r30549, %r30538; + mov.u32 %r30550, %r30538; + mov.u32 %r30551, %r30538; + mov.u32 %r30552, %r20750; + mov.u32 %r30554, %r30538; + mov.u32 %r30555, %r30538; + mov.u32 %r30556, %r30538; + mov.u32 %r30557, %r30538; + mov.u32 %r30558, %r30538; + mov.u32 %r30559, %r30538; + mov.u32 %r30560, %r30538; + mov.u32 %r30561, %r30538; + mov.u32 %r30562, %r30538; + mov.u32 %r30563, %r30538; + mov.u32 %r30564, %r30538; + mov.u32 %r30565, %r30538; + mov.u32 %r30566, %r30538; + mov.u32 %r30567, %r30538; + mov.u32 %r30568, %r30538; + mov.u32 %r30569, %r30538; + mov.u32 %r30570, %r30538; + mov.u32 %r30571, %r30538; + mov.u32 %r30588, %r30538; + +$L__BB2_75: + // begin inline asm + // xor5 + lop3.b32 %r20792, %r30574, %r30572, %r30570, 0x96; + lop3.b32 %r20792, %r20792, %r30568, %r30566, 0x96; + lop3.b32 %r20793, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r20793, %r20793, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20804, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r20804, %r20804, %r30562, %r30560, 0x96; + lop3.b32 %r20805, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r20805, %r20805, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20816, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r20816, %r20816, %r30556, %r30554, 0x96; + lop3.b32 %r20817, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r20817, %r20817, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20828, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r20828, %r20828, %r30548, %r30546, 0x96; + lop3.b32 %r20829, %r30579, %r30553, %r30551, 
0x96; + lop3.b32 %r20829, %r20829, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r20840, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r20840, %r20840, %r30540, %r30538, 0x96; + lop3.b32 %r20841, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r20841, %r20841, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20852, %r20805, %r20804, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20856, %r20804, %r20805, %r20750; + // end inline asm + xor.b32 %r21286, %r20852, %r20840; + xor.b32 %r21287, %r20856, %r20841; + xor.b32 %r21119, %r30574, %r21286; + xor.b32 %r21122, %r30575, %r21287; + xor.b32 %r21026, %r30572, %r21286; + xor.b32 %r21025, %r30573, %r21287; + xor.b32 %r21073, %r30570, %r21286; + xor.b32 %r21074, %r30571, %r21287; + xor.b32 %r20978, %r30568, %r21286; + xor.b32 %r20977, %r30569, %r21287; + xor.b32 %r20929, %r30566, %r21286; + xor.b32 %r20930, %r30567, %r21287; + // begin inline asm + shf.l.wrap.b32 %r20860, %r20817, %r20816, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20864, %r20816, %r20817, %r20750; + // end inline asm + xor.b32 %r21288, %r20860, %r20792; + xor.b32 %r21289, %r20864, %r20793; + xor.b32 %r21081, %r30586, %r21288; + xor.b32 %r21082, %r30587, %r21289; + xor.b32 %r20898, %r30584, %r21288; + xor.b32 %r20897, %r30585, %r21289; + xor.b32 %r21057, %r30564, %r21288; + xor.b32 %r21058, %r30565, %r21289; + xor.b32 %r21018, %r30562, %r21288; + xor.b32 %r21017, %r30563, %r21289; + xor.b32 %r21001, %r30560, %r21288; + xor.b32 %r21002, %r30561, %r21289; + // begin inline asm + shf.l.wrap.b32 %r20868, %r20829, %r20828, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20872, %r20828, %r20829, %r20750; + // end inline asm + xor.b32 %r21290, %r20868, %r20804; + xor.b32 %r21291, %r20872, %r20805; + xor.b32 %r20938, %r30582, %r21290; + xor.b32 %r20937, %r30583, %r21291; + xor.b32 %r21065, %r30580, %r21290; + xor.b32 %r21066, %r30581, %r21291; + xor.b32 %r20946, %r30558, %r21290; + xor.b32 %r20945, %r30559, %r21291; + xor.b32 %r21049, %r30556, %r21290; + xor.b32 %r21050, %r30557, %r21291; + xor.b32 %r20914, %r30554, %r21290; + xor.b32 %r20913, %r30555, %r21291; + // begin inline asm + shf.l.wrap.b32 %r20876, %r20841, %r20840, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20880, %r20840, %r20841, %r20750; + // end inline asm + xor.b32 %r21292, %r20876, %r20816; + xor.b32 %r21293, %r20880, %r20817; + xor.b32 %r21033, %r30578, %r21292; + xor.b32 %r21034, %r30579, %r21293; + xor.b32 %r21010, %r30552, %r21292; + xor.b32 %r21009, %r30553, %r21293; + xor.b32 %r20953, %r30550, %r21292; + xor.b32 %r20954, %r30551, %r21293; + xor.b32 %r21041, %r30548, %r21292; + xor.b32 %r21042, %r30549, %r21293; + xor.b32 %r20970, %r30546, %r21292; + xor.b32 %r20969, %r30547, %r21293; + // begin inline asm + shf.l.wrap.b32 %r20884, %r20793, %r20792, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20888, %r20792, %r20793, %r20750; + // end inline asm + xor.b32 %r21294, %r20884, %r20828; + xor.b32 %r21295, %r20888, %r20829; + xor.b32 %r20985, %r30576, %r21294; + xor.b32 %r20986, %r30577, %r21295; + xor.b32 %r20905, %r30544, %r21294; + xor.b32 %r20906, %r30545, %r21295; + xor.b32 %r20922, %r30542, %r21294; + xor.b32 %r20921, %r30543, %r21295; + xor.b32 %r20961, %r30540, %r21294; + xor.b32 %r20962, %r30541, %r21295; + xor.b32 %r20993, %r30538, %r21294; + xor.b32 %r20994, %r30539, %r21295; + mov.u32 %r20899, 44; + // begin inline 
asm + shf.l.wrap.b32 %r20892, %r20898, %r20897, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20896, %r20897, %r20898, %r20899; + // end inline asm + mov.u32 %r20907, 20; + // begin inline asm + shf.l.wrap.b32 %r20900, %r20906, %r20905, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20904, %r20905, %r20906, %r20907; + // end inline asm + mov.u32 %r20915, 61; + // begin inline asm + shf.l.wrap.b32 %r20908, %r20914, %r20913, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20912, %r20913, %r20914, %r20915; + // end inline asm + mov.u32 %r20923, 39; + // begin inline asm + shf.l.wrap.b32 %r20916, %r20922, %r20921, %r20923; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20920, %r20921, %r20922, %r20923; + // end inline asm + mov.u32 %r20931, 18; + // begin inline asm + shf.l.wrap.b32 %r20924, %r20930, %r20929, %r20931; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20928, %r20929, %r20930, %r20931; + // end inline asm + mov.u32 %r20939, 62; + // begin inline asm + shf.l.wrap.b32 %r20932, %r20938, %r20937, %r20939; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20936, %r20937, %r20938, %r20939; + // end inline asm + mov.u32 %r20947, 43; + // begin inline asm + shf.l.wrap.b32 %r20940, %r20946, %r20945, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20944, %r20945, %r20946, %r20947; + // end inline asm + mov.u32 %r20955, 25; + // begin inline asm + shf.l.wrap.b32 %r20948, %r20954, %r20953, %r20955; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20952, %r20953, %r20954, %r20955; + // end inline asm + mov.u32 %r20963, 8; + // begin inline asm + shf.l.wrap.b32 %r20956, %r20962, %r20961, %r20963; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20960, %r20961, %r20962, %r20963; + // end inline asm + mov.u32 %r20971, 56; + // begin inline asm + shf.l.wrap.b32 %r20964, %r20970, %r20969, %r20971; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20968, %r20969, %r20970, %r20971; + // end inline asm + mov.u32 %r20979, 41; + // begin inline asm + shf.l.wrap.b32 %r20972, %r20978, %r20977, %r20979; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20976, %r20977, %r20978, %r20979; + // end inline asm + mov.u32 %r20987, 27; + // begin inline asm + shf.l.wrap.b32 %r20980, %r20986, %r20985, %r20987; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20984, %r20985, %r20986, %r20987; + // end inline asm + mov.u32 %r20995, 14; + // begin inline asm + shf.l.wrap.b32 %r20988, %r20994, %r20993, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r20992, %r20993, %r20994, %r20995; + // end inline asm + mov.u32 %r21003, 2; + // begin inline asm + shf.l.wrap.b32 %r20996, %r21002, %r21001, %r21003; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21000, %r21001, %r21002, %r21003; + // end inline asm + mov.u32 %r21011, 55; + // begin inline asm + shf.l.wrap.b32 %r21004, %r21010, %r21009, %r21011; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21008, %r21009, %r21010, %r21011; + // end inline asm + mov.u32 %r21019, 45; + // begin inline asm + shf.l.wrap.b32 %r21012, %r21018, %r21017, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21016, %r21017, %r21018, %r21019; + // end inline asm + mov.u32 %r21027, 36; + // begin inline asm + shf.l.wrap.b32 %r21020, %r21026, %r21025, %r21027; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21024, %r21025, %r21026, 
%r21027; + // end inline asm + mov.u32 %r21035, 28; + // begin inline asm + shf.l.wrap.b32 %r21028, %r21034, %r21033, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21032, %r21033, %r21034, %r21035; + // end inline asm + mov.u32 %r21043, 21; + // begin inline asm + shf.l.wrap.b32 %r21036, %r21042, %r21041, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21040, %r21041, %r21042, %r21043; + // end inline asm + mov.u32 %r21051, 15; + // begin inline asm + shf.l.wrap.b32 %r21044, %r21050, %r21049, %r21051; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21048, %r21049, %r21050, %r21051; + // end inline asm + mov.u32 %r21059, 10; + // begin inline asm + shf.l.wrap.b32 %r21052, %r21058, %r21057, %r21059; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21056, %r21057, %r21058, %r21059; + // end inline asm + mov.u32 %r21067, 6; + // begin inline asm + shf.l.wrap.b32 %r21060, %r21066, %r21065, %r21067; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21064, %r21065, %r21066, %r21067; + // end inline asm + mov.u32 %r21075, 3; + // begin inline asm + shf.l.wrap.b32 %r21068, %r21074, %r21073, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21072, %r21073, %r21074, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21076, %r21082, %r21081, %r20750; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21080, %r21081, %r21082, %r20750; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21084, %r21119, %r20892, %r20940, 0xD2; + lop3.b32 %r21085, %r21122, %r20896, %r20944, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30586, %r20892, %r20940, %r21036, 0xD2; + lop3.b32 %r30587, %r20896, %r20944, %r21040, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30582, %r20940, %r21036, %r20988, 0xD2; + lop3.b32 %r30583, %r20944, %r21040, %r20992, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30578, %r21036, %r20988, %r21119, 0xD2; + lop3.b32 %r30579, %r21040, %r20992, %r21122, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30576, %r20988, %r21119, %r20892, 0xD2; + lop3.b32 %r30577, %r20992, %r21122, %r20896, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30572, %r21028, %r20900, %r21068, 0xD2; + lop3.b32 %r30573, %r21032, %r20904, %r21072, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30584, %r20900, %r21068, %r21012, 0xD2; + lop3.b32 %r30585, %r20904, %r21072, %r21016, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30580, %r21068, %r21012, %r20908, 0xD2; + lop3.b32 %r30581, %r21072, %r21016, %r20912, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30552, %r21012, %r20908, %r21028, 0xD2; + lop3.b32 %r30553, %r21016, %r20912, %r21032, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30552, %r30553}; + // begin inline asm + // chi + lop3.b32 %r30544, %r20908, %r21028, %r20900, 0xD2; + lop3.b32 %r30545, %r20912, %r21032, %r20904, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30544, %r30545}; + // begin inline asm + // chi + lop3.b32 %r30570, %r21076, %r21060, %r20948, 0xD2; + lop3.b32 %r30571, %r21080, %r21064, %r20952, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30570, %r30571}; + // begin inline asm + // chi + lop3.b32 %r30564, %r21060, %r20948, %r20956, 0xD2; + lop3.b32 %r30565, %r21064, %r20952, %r20960, 0xD2; + // end inline asm + st.local.v2.u32 
[%rd925+112], {%r30564, %r30565}; + // begin inline asm + // chi + lop3.b32 %r30558, %r20948, %r20956, %r20924, 0xD2; + lop3.b32 %r30559, %r20952, %r20960, %r20928, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30558, %r30559}; + // begin inline asm + // chi + lop3.b32 %r30550, %r20956, %r20924, %r21076, 0xD2; + lop3.b32 %r30551, %r20960, %r20928, %r21080, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30550, %r30551}; + // begin inline asm + // chi + lop3.b32 %r30542, %r20924, %r21076, %r21060, 0xD2; + lop3.b32 %r30543, %r20928, %r21080, %r21064, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30542, %r30543}; + // begin inline asm + // chi + lop3.b32 %r30568, %r20980, %r21020, %r21052, 0xD2; + lop3.b32 %r30569, %r20984, %r21024, %r21056, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30568, %r30569}; + // begin inline asm + // chi + lop3.b32 %r30562, %r21020, %r21052, %r21044, 0xD2; + lop3.b32 %r30563, %r21024, %r21056, %r21048, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30562, %r30563}; + // begin inline asm + // chi + lop3.b32 %r30556, %r21052, %r21044, %r20964, 0xD2; + lop3.b32 %r30557, %r21056, %r21048, %r20968, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30556, %r30557}; + // begin inline asm + // chi + lop3.b32 %r30548, %r21044, %r20964, %r20980, 0xD2; + lop3.b32 %r30549, %r21048, %r20968, %r20984, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30548, %r30549}; + // begin inline asm + // chi + lop3.b32 %r30540, %r20964, %r20980, %r21020, 0xD2; + lop3.b32 %r30541, %r20968, %r20984, %r21024, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30540, %r30541}; + // begin inline asm + // chi + lop3.b32 %r30566, %r20932, %r21004, %r20916, 0xD2; + lop3.b32 %r30567, %r20936, %r21008, %r20920, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30566, %r30567}; + // begin inline asm + // chi + lop3.b32 %r30560, %r21004, %r20916, %r20972, 0xD2; + lop3.b32 %r30561, %r21008, %r20920, %r20976, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30560, %r30561}; + // begin inline asm + // chi + lop3.b32 %r30554, %r20916, %r20972, %r20996, 0xD2; + lop3.b32 %r30555, %r20920, %r20976, %r21000, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30554, %r30555}; + // begin inline asm + // chi + lop3.b32 %r30546, %r20972, %r20996, %r20932, 0xD2; + lop3.b32 %r30547, %r20976, %r21000, %r20936, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30546, %r30547}; + // begin inline asm + // chi + lop3.b32 %r30538, %r20996, %r20932, %r21004, 0xD2; + lop3.b32 %r30539, %r21000, %r20936, %r21008, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30538, %r30539}; + mul.wide.s32 %rd944, %r30588, 8; + add.s64 %rd941, %rd918, %rd944; + // begin inline asm + ld.global.nc.v2.u32 {%r21284,%r21285}, [%rd941]; + // end inline asm + xor.b32 %r30574, %r21084, %r21284; + xor.b32 %r30575, %r21085, %r21285; + add.s32 %r30588, %r30588, 1; + setp.lt.u32 %p43, %r30588, 23; + @%p43 bra $L__BB2_75; + + mov.u32 %r30621, 0; + mov.u32 %r21395, 1; + st.local.v2.u32 [%rd925+32], {%r30586, %r30587}; + st.local.v2.u32 [%rd925+72], {%r30584, %r30585}; + st.local.v2.u32 [%rd925+40], {%r30582, %r30583}; + st.local.v2.u32 [%rd925+80], {%r30580, %r30581}; + st.local.v2.u32 [%rd925+48], {%r30578, %r30579}; + st.local.v2.u32 [%rd925+56], {%r30576, %r30577}; + st.local.v2.u32 [%rd925+24], {%r30574, %r30575}; + // begin inline asm + // xor5 + lop3.b32 %r21296, %r30574, %r30572, 
%r30570, 0x96; + lop3.b32 %r21296, %r21296, %r30568, %r30566, 0x96; + lop3.b32 %r21297, %r30575, %r30573, %r30571, 0x96; + lop3.b32 %r21297, %r21297, %r30569, %r30567, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21308, %r30586, %r30584, %r30564, 0x96; + lop3.b32 %r21308, %r21308, %r30562, %r30560, 0x96; + lop3.b32 %r21309, %r30587, %r30585, %r30565, 0x96; + lop3.b32 %r21309, %r21309, %r30563, %r30561, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21320, %r30582, %r30580, %r30558, 0x96; + lop3.b32 %r21320, %r21320, %r30556, %r30554, 0x96; + lop3.b32 %r21321, %r30583, %r30581, %r30559, 0x96; + lop3.b32 %r21321, %r21321, %r30557, %r30555, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21332, %r30578, %r30552, %r30550, 0x96; + lop3.b32 %r21332, %r21332, %r30548, %r30546, 0x96; + lop3.b32 %r21333, %r30579, %r30553, %r30551, 0x96; + lop3.b32 %r21333, %r21333, %r30549, %r30547, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21344, %r30576, %r30544, %r30542, 0x96; + lop3.b32 %r21344, %r21344, %r30540, %r30538, 0x96; + lop3.b32 %r21345, %r30577, %r30545, %r30543, 0x96; + lop3.b32 %r21345, %r21345, %r30541, %r30539, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21356, %r21309, %r21308, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21360, %r21308, %r21309, %r21395; + // end inline asm + xor.b32 %r21535, %r21356, %r21344; + xor.b32 %r21536, %r21360, %r21345; + xor.b32 %r21503, %r30574, %r21535; + xor.b32 %r21506, %r30575, %r21536; + xor.b32 %r21466, %r30571, %r21536; + xor.b32 %r21465, %r30570, %r21535; + st.local.v2.u32 [%rd925+104], {%r21465, %r21466}; + // begin inline asm + shf.l.wrap.b32 %r21364, %r21321, %r21320, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21368, %r21320, %r21321, %r21395; + // end inline asm + xor.b32 %r21537, %r21364, %r21296; + xor.b32 %r21538, %r21368, %r21297; + xor.b32 %r21402, %r30584, %r21537; + xor.b32 %r21401, %r30585, %r21538; + xor.b32 %r21441, %r30563, %r21538; + xor.b32 %r21442, %r30562, %r21537; + st.local.v2.u32 [%rd925+152], {%r21442, %r21441}; + // begin inline asm + shf.l.wrap.b32 %r21372, %r21333, %r21332, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21376, %r21332, %r21333, %r21395; + // end inline asm + xor.b32 %r21539, %r21372, %r21308; + xor.b32 %r21540, %r21376, %r21309; + xor.b32 %r21425, %r30559, %r21540; + xor.b32 %r21426, %r30558, %r21539; + st.local.v2.u32 [%rd925+120], {%r21426, %r21425}; + xor.b32 %r21417, %r30555, %r21540; + xor.b32 %r21418, %r30554, %r21539; + st.local.v2.u32 [%rd925+200], {%r21418, %r21417}; + // begin inline asm + shf.l.wrap.b32 %r21380, %r21345, %r21344, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21384, %r21344, %r21345, %r21395; + // end inline asm + xor.b32 %r21541, %r21380, %r21320; + xor.b32 %r21542, %r21384, %r21321; + xor.b32 %r21449, %r30578, %r21541; + xor.b32 %r21450, %r30579, %r21542; + xor.b32 %r21458, %r30549, %r21542; + xor.b32 %r21457, %r30548, %r21541; + st.local.v2.u32 [%rd925+168], {%r21457, %r21458}; + // begin inline asm + shf.l.wrap.b32 %r21388, %r21297, %r21296, %r21395; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21392, %r21296, %r21297, %r21395; + // end inline asm + xor.b32 %r21543, %r21388, %r21332; + xor.b32 %r21544, %r21392, %r21333; + xor.b32 %r21409, %r30544, %r21543; + xor.b32 %r21410, %r30545, %r21544; + xor.b32 %r21434, %r30539, %r21544; + xor.b32 %r21433, 
%r30538, %r21543; + st.local.v2.u32 [%rd925+216], {%r21433, %r21434}; + // begin inline asm + shf.l.wrap.b32 %r21396, %r21402, %r21401, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21400, %r21401, %r21402, %r20899; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21404, %r21410, %r21409, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21408, %r21409, %r21410, %r20907; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21416, %r21417, %r21418, %r20915; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21412, %r21418, %r21417, %r20915; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r21412, %r21416}; + // begin inline asm + shf.l.wrap.b32 %r21420, %r21426, %r21425, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21424, %r21425, %r21426, %r20947; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21428, %r21434, %r21433, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21432, %r21433, %r21434, %r20995; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21440, %r21441, %r21442, %r21019; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21436, %r21442, %r21441, %r21019; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r21436, %r21440}; + // begin inline asm + shf.l.wrap.b32 %r21444, %r21450, %r21449, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21448, %r21449, %r21450, %r21035; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21452, %r21458, %r21457, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21456, %r21457, %r21458, %r21043; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21460, %r21466, %r21465, %r21075; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21464, %r21465, %r21466, %r21075; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21468, %r21503, %r21396, %r21420, 0xD2; + lop3.b32 %r21469, %r21506, %r21400, %r21424, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r21396, %r21420, %r21452, 0xD2; + lop3.b32 %r30722, %r21400, %r21424, %r21456, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + // begin inline asm + // chi + lop3.b32 %r30717, %r21420, %r21452, %r21428, 0xD2; + lop3.b32 %r30718, %r21424, %r21456, %r21432, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + // begin inline asm + // chi + lop3.b32 %r30713, %r21452, %r21428, %r21503, 0xD2; + lop3.b32 %r30714, %r21456, %r21432, %r21506, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + // begin inline asm + // chi + lop3.b32 %r30711, %r21428, %r21503, %r21396, 0xD2; + lop3.b32 %r30712, %r21432, %r21506, %r21400, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + // begin inline asm + // chi + lop3.b32 %r30707, %r21444, %r21404, %r21460, 0xD2; + lop3.b32 %r30708, %r21448, %r21408, %r21464, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + // begin inline asm + // chi + lop3.b32 %r30719, %r21404, %r21460, %r21436, 0xD2; + lop3.b32 %r30720, %r21408, %r21464, %r21440, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + // begin inline asm + // chi + lop3.b32 %r30715, %r21460, %r21436, %r21412, 0xD2; + lop3.b32 %r30716, %r21464, %r21440, %r21416, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + // begin inline asm + ld.global.nc.v2.u32 
{%r21532,%r21533}, [%rd919]; + // end inline asm + xor.b32 %r30709, %r21468, %r21532; + xor.b32 %r30710, %r21469, %r21533; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + add.s64 %rd244, %rd925, 24; + add.s64 %rd245, %rd3, 24; + +$L__BB2_77: + shl.b32 %r21545, %r30621, 2; + cvt.u64.u32 %rd952, %r21545; + and.b64 %rd953, %rd952, 60; + add.s64 %rd954, %rd245, %rd953; + xor.b32 %r21546, %r3326, %r30621; + mul.lo.s32 %r21547, %r21546, 16777619; + ld.local.u32 %r21548, [%rd954]; + xor.b32 %r21549, %r21547, %r21548; + mul.wide.u32 %rd955, %r21549, -954391867; + shr.u64 %rd956, %rd955, 32; + cvt.u32.u64 %r21550, %rd956; + sub.s32 %r21551, %r21549, %r21550; + shr.u32 %r21552, %r21551, 1; + add.s32 %r21553, %r21552, %r21550; + shr.u32 %r21554, %r21553, 20; + mul.lo.s32 %r21555, %r21554, 1179641; + sub.s32 %r21556, %r21549, %r21555; + mul.wide.u32 %rd957, %r21556, 64; + add.s64 %rd958, %rd471, %rd957; + mul.lo.s32 %r21557, %r30658, 16777619; + ld.global.u32 %r21558, [%rd958]; + xor.b32 %r30658, %r21557, %r21558; + mul.lo.s32 %r21559, %r30659, 16777619; + ld.global.u32 %r21560, [%rd958+4]; + xor.b32 %r30659, %r21559, %r21560; + mul.lo.s32 %r21561, %r30670, 16777619; + ld.global.u32 %r21562, [%rd958+8]; + mul.lo.s32 %r21563, %r30671, 16777619; + ld.global.u32 %r21564, [%rd958+12]; + xor.b32 %r21565, %r21563, %r21564; + xor.b32 %r30670, %r21561, %r21562; + mov.b64 %rd959, {%r30670, %r21565}; + mul.lo.s32 %r21566, %r30666, 16777619; + ld.global.u32 %r21567, [%rd958+16]; + mul.lo.s32 %r21568, %r30667, 16777619; + ld.global.u32 %r21569, [%rd958+20]; + xor.b32 %r21570, %r21568, %r21569; + xor.b32 %r30666, %r21566, %r21567; + mov.b64 %rd960, {%r30666, %r21570}; + mul.lo.s32 %r21571, %r30662, 16777619; + ld.global.u32 %r21572, [%rd958+24]; + mul.lo.s32 %r21573, %r30663, 16777619; + ld.global.u32 %r21574, [%rd958+28]; + xor.b32 %r21575, %r21573, %r21574; + xor.b32 %r30662, %r21571, %r21572; + mov.b64 %rd961, {%r30662, %r21575}; + mul.lo.s32 %r21576, %r30660, 16777619; + ld.global.u32 %r21577, [%rd958+32]; + mul.lo.s32 %r21578, %r30661, 16777619; + ld.global.u32 %r21579, [%rd958+36]; + xor.b32 %r21580, %r21578, %r21579; + xor.b32 %r30660, %r21576, %r21577; + mov.b64 %rd962, {%r30660, %r21580}; + mul.lo.s32 %r21581, %r30656, 16777619; + ld.global.u32 %r21582, [%rd958+40]; + xor.b32 %r30656, %r21581, %r21582; + mul.lo.s32 %r21583, %r30657, 16777619; + ld.global.u32 %r21584, [%rd958+44]; + xor.b32 %r30657, %r21583, %r21584; + mul.lo.s32 %r21585, %r30668, 16777619; + ld.global.u32 %r21586, [%rd958+48]; + mul.lo.s32 %r21587, %r30669, 16777619; + ld.global.u32 %r21588, [%rd958+52]; + xor.b32 %r21589, %r21587, %r21588; + xor.b32 %r30668, %r21585, %r21586; + mov.b64 %rd963, {%r30668, %r21589}; + mul.lo.s32 %r21590, %r30664, 16777619; + ld.global.u32 %r21591, [%rd958+56]; + mul.lo.s32 %r21592, %r30665, 16777619; + ld.global.u32 %r21593, [%rd958+60]; + xor.b32 %r21594, %r21592, %r21593; + xor.b32 %r30664, %r21590, %r21591; + mov.b64 %rd964, {%r30664, %r21594}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + st.local.v2.u32 [%rd3+32], {%r30670, %r21565}; + st.local.v2.u32 [%rd3+40], {%r30666, %r21570}; + st.local.v2.u32 [%rd3+48], {%r30662, %r21575}; + st.local.v2.u32 [%rd3+56], {%r30660, %r21580}; + st.local.v2.u32 [%rd3+64], {%r30656, %r30657}; + st.local.v2.u32 [%rd3+72], {%r30668, %r21589}; + st.local.v2.u32 [%rd3+80], {%r30664, %r21594}; + add.s64 %rd965, %rd244, %rd953; + xor.b32 %r21595, %r20787, %r30621; + mul.lo.s32 %r21596, %r21595, 16777619; + ld.local.u32 %r21597, [%rd965]; + xor.b32 %r21598, %r21596, 
%r21597; + mul.wide.u32 %rd966, %r21598, -954391867; + shr.u64 %rd967, %rd966, 32; + cvt.u32.u64 %r21599, %rd967; + sub.s32 %r21600, %r21598, %r21599; + shr.u32 %r21601, %r21600, 1; + add.s32 %r21602, %r21601, %r21599; + shr.u32 %r21603, %r21602, 20; + mul.lo.s32 %r21604, %r21603, 1179641; + sub.s32 %r21605, %r21598, %r21604; + mul.wide.u32 %rd968, %r21605, 64; + add.s64 %rd969, %rd471, %rd968; + mul.lo.s32 %r21606, %r30709, 16777619; + ld.global.u32 %r21607, [%rd969]; + xor.b32 %r30709, %r21606, %r21607; + mul.lo.s32 %r21608, %r30710, 16777619; + ld.global.u32 %r21609, [%rd969+4]; + xor.b32 %r30710, %r21608, %r21609; + mul.lo.s32 %r21610, %r30721, 16777619; + ld.global.u32 %r21611, [%rd969+8]; + mul.lo.s32 %r21612, %r30722, 16777619; + ld.global.u32 %r21613, [%rd969+12]; + xor.b32 %r21614, %r21612, %r21613; + xor.b32 %r30721, %r21610, %r21611; + mov.b64 %rd970, {%r30721, %r21614}; + mul.lo.s32 %r21615, %r30717, 16777619; + ld.global.u32 %r21616, [%rd969+16]; + mul.lo.s32 %r21617, %r30718, 16777619; + ld.global.u32 %r21618, [%rd969+20]; + xor.b32 %r21619, %r21617, %r21618; + xor.b32 %r30717, %r21615, %r21616; + mov.b64 %rd971, {%r30717, %r21619}; + mul.lo.s32 %r21620, %r30713, 16777619; + ld.global.u32 %r21621, [%rd969+24]; + mul.lo.s32 %r21622, %r30714, 16777619; + ld.global.u32 %r21623, [%rd969+28]; + xor.b32 %r21624, %r21622, %r21623; + xor.b32 %r30713, %r21620, %r21621; + mov.b64 %rd972, {%r30713, %r21624}; + mul.lo.s32 %r21625, %r30711, 16777619; + ld.global.u32 %r21626, [%rd969+32]; + mul.lo.s32 %r21627, %r30712, 16777619; + ld.global.u32 %r21628, [%rd969+36]; + xor.b32 %r21629, %r21627, %r21628; + xor.b32 %r30711, %r21625, %r21626; + mov.b64 %rd973, {%r30711, %r21629}; + mul.lo.s32 %r21630, %r30707, 16777619; + ld.global.u32 %r21631, [%rd969+40]; + xor.b32 %r30707, %r21630, %r21631; + mul.lo.s32 %r21632, %r30708, 16777619; + ld.global.u32 %r21633, [%rd969+44]; + xor.b32 %r30708, %r21632, %r21633; + mul.lo.s32 %r21634, %r30719, 16777619; + ld.global.u32 %r21635, [%rd969+48]; + mul.lo.s32 %r21636, %r30720, 16777619; + ld.global.u32 %r21637, [%rd969+52]; + xor.b32 %r21638, %r21636, %r21637; + xor.b32 %r30719, %r21634, %r21635; + mov.b64 %rd974, {%r30719, %r21638}; + mul.lo.s32 %r21639, %r30715, 16777619; + ld.global.u32 %r21640, [%rd969+56]; + mul.lo.s32 %r21641, %r30716, 16777619; + ld.global.u32 %r21642, [%rd969+60]; + xor.b32 %r21643, %r21641, %r21642; + xor.b32 %r30715, %r21639, %r21640; + mov.b64 %rd975, {%r30715, %r21643}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + st.local.v2.u32 [%rd925+32], {%r30721, %r21614}; + st.local.v2.u32 [%rd925+40], {%r30717, %r21619}; + st.local.v2.u32 [%rd925+48], {%r30713, %r21624}; + st.local.v2.u32 [%rd925+56], {%r30711, %r21629}; + st.local.v2.u32 [%rd925+64], {%r30707, %r30708}; + st.local.v2.u32 [%rd925+72], {%r30719, %r21638}; + st.local.v2.u32 [%rd925+80], {%r30715, %r21643}; + add.s32 %r30621, %r30621, 1; + setp.lt.u32 %p44, %r30621, 512; + shr.u64 %rd976, %rd959, 32; + cvt.u32.u64 %r30671, %rd976; + shr.u64 %rd977, %rd960, 32; + cvt.u32.u64 %r30667, %rd977; + shr.u64 %rd978, %rd961, 32; + cvt.u32.u64 %r30663, %rd978; + shr.u64 %rd979, %rd962, 32; + cvt.u32.u64 %r30661, %rd979; + shr.u64 %rd980, %rd963, 32; + cvt.u32.u64 %r30669, %rd980; + shr.u64 %rd981, %rd964, 32; + cvt.u32.u64 %r30665, %rd981; + shr.u64 %rd982, %rd970, 32; + cvt.u32.u64 %r30722, %rd982; + shr.u64 %rd983, %rd971, 32; + cvt.u32.u64 %r30718, %rd983; + shr.u64 %rd984, %rd972, 32; + cvt.u32.u64 %r30714, %rd984; + shr.u64 %rd985, %rd973, 32; + cvt.u32.u64 %r30712, 
%rd985; + shr.u64 %rd986, %rd974, 32; + cvt.u32.u64 %r30720, %rd986; + shr.u64 %rd987, %rd975, 32; + cvt.u32.u64 %r30716, %rd987; + @%p44 bra $L__BB2_77; + + mov.u32 %r30622, 0; + st.local.v2.u32 [%rd3+96], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+104], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+112], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+120], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+128], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+136], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+144], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+152], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+160], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+168], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+176], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+184], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+192], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+200], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+208], {%r30622, %r30622}; + st.local.v2.u32 [%rd3+216], {%r30622, %r30622}; + mov.u32 %r30637, -2147483648; + mov.u32 %r21658, 1; + st.local.v2.u32 [%rd3+88], {%r21658, %r30637}; + mov.u32 %r30623, %r30622; + mov.u32 %r30624, %r30622; + mov.u32 %r30625, %r30622; + mov.u32 %r30626, %r30622; + mov.u32 %r30627, %r30622; + mov.u32 %r30628, %r30622; + mov.u32 %r30629, %r30622; + mov.u32 %r30630, %r30622; + mov.u32 %r30631, %r30622; + mov.u32 %r30632, %r30622; + mov.u32 %r30633, %r30622; + mov.u32 %r30634, %r30622; + mov.u32 %r30635, %r30622; + mov.u32 %r30636, %r21658; + mov.u32 %r30638, %r30622; + mov.u32 %r30639, %r30622; + mov.u32 %r30640, %r30622; + mov.u32 %r30641, %r30622; + mov.u32 %r30642, %r30622; + mov.u32 %r30643, %r30622; + mov.u32 %r30644, %r30622; + mov.u32 %r30645, %r30622; + mov.u32 %r30646, %r30622; + mov.u32 %r30647, %r30622; + mov.u32 %r30648, %r30622; + mov.u32 %r30649, %r30622; + mov.u32 %r30650, %r30622; + mov.u32 %r30651, %r30622; + mov.u32 %r30652, %r30622; + mov.u32 %r30653, %r30622; + mov.u32 %r30654, %r30622; + mov.u32 %r30655, %r30622; + mov.u32 %r30672, %r30622; + +$L__BB2_79: + // begin inline asm + // xor5 + lop3.b32 %r21685, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r21685, %r21685, %r30652, %r30650, 0x96; + lop3.b32 %r21686, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r21686, %r21686, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21697, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r21697, %r21697, %r30646, %r30644, 0x96; + lop3.b32 %r21698, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r21698, %r21698, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21709, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r21709, %r21709, %r30640, %r30638, 0x96; + lop3.b32 %r21710, %r30667, %r30665, %r30643, 0x96; + lop3.b32 %r21710, %r21710, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21721, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r21721, %r21721, %r30632, %r30630, 0x96; + lop3.b32 %r21722, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r21722, %r21722, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r21733, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r21733, %r21733, %r30624, %r30622, 0x96; + lop3.b32 %r21734, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r21734, %r21734, %r30625, %r30623, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21745, %r21698, %r21697, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21749, %r21697, %r21698, %r21658; + // end inline asm + xor.b32 %r22179, 
%r21745, %r21733; + xor.b32 %r22180, %r21749, %r21734; + xor.b32 %r22012, %r30658, %r22179; + xor.b32 %r22015, %r30659, %r22180; + xor.b32 %r21919, %r30656, %r22179; + xor.b32 %r21918, %r30657, %r22180; + xor.b32 %r21966, %r30654, %r22179; + xor.b32 %r21967, %r30655, %r22180; + xor.b32 %r21871, %r30652, %r22179; + xor.b32 %r21870, %r30653, %r22180; + xor.b32 %r21822, %r30650, %r22179; + xor.b32 %r21823, %r30651, %r22180; + // begin inline asm + shf.l.wrap.b32 %r21753, %r21710, %r21709, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21757, %r21709, %r21710, %r21658; + // end inline asm + xor.b32 %r22181, %r21753, %r21685; + xor.b32 %r22182, %r21757, %r21686; + xor.b32 %r21974, %r30670, %r22181; + xor.b32 %r21975, %r30671, %r22182; + xor.b32 %r21791, %r30668, %r22181; + xor.b32 %r21790, %r30669, %r22182; + xor.b32 %r21950, %r30648, %r22181; + xor.b32 %r21951, %r30649, %r22182; + xor.b32 %r21911, %r30646, %r22181; + xor.b32 %r21910, %r30647, %r22182; + xor.b32 %r21894, %r30644, %r22181; + xor.b32 %r21895, %r30645, %r22182; + // begin inline asm + shf.l.wrap.b32 %r21761, %r21722, %r21721, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21765, %r21721, %r21722, %r21658; + // end inline asm + xor.b32 %r22183, %r21761, %r21697; + xor.b32 %r22184, %r21765, %r21698; + xor.b32 %r21831, %r30666, %r22183; + xor.b32 %r21830, %r30667, %r22184; + xor.b32 %r21958, %r30664, %r22183; + xor.b32 %r21959, %r30665, %r22184; + xor.b32 %r21839, %r30642, %r22183; + xor.b32 %r21838, %r30643, %r22184; + xor.b32 %r21942, %r30640, %r22183; + xor.b32 %r21943, %r30641, %r22184; + xor.b32 %r21807, %r30638, %r22183; + xor.b32 %r21806, %r30639, %r22184; + // begin inline asm + shf.l.wrap.b32 %r21769, %r21734, %r21733, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21773, %r21733, %r21734, %r21658; + // end inline asm + xor.b32 %r22185, %r21769, %r21709; + xor.b32 %r22186, %r21773, %r21710; + xor.b32 %r21926, %r30662, %r22185; + xor.b32 %r21927, %r30663, %r22186; + xor.b32 %r21903, %r30636, %r22185; + xor.b32 %r21902, %r30637, %r22186; + xor.b32 %r21846, %r30634, %r22185; + xor.b32 %r21847, %r30635, %r22186; + xor.b32 %r21934, %r30632, %r22185; + xor.b32 %r21935, %r30633, %r22186; + xor.b32 %r21863, %r30630, %r22185; + xor.b32 %r21862, %r30631, %r22186; + // begin inline asm + shf.l.wrap.b32 %r21777, %r21686, %r21685, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21781, %r21685, %r21686, %r21658; + // end inline asm + xor.b32 %r22187, %r21777, %r21721; + xor.b32 %r22188, %r21781, %r21722; + xor.b32 %r21878, %r30660, %r22187; + xor.b32 %r21879, %r30661, %r22188; + xor.b32 %r21798, %r30628, %r22187; + xor.b32 %r21799, %r30629, %r22188; + xor.b32 %r21815, %r30626, %r22187; + xor.b32 %r21814, %r30627, %r22188; + xor.b32 %r21854, %r30624, %r22187; + xor.b32 %r21855, %r30625, %r22188; + xor.b32 %r21886, %r30622, %r22187; + xor.b32 %r21887, %r30623, %r22188; + mov.u32 %r21792, 44; + // begin inline asm + shf.l.wrap.b32 %r21785, %r21791, %r21790, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21789, %r21790, %r21791, %r21792; + // end inline asm + mov.u32 %r21800, 20; + // begin inline asm + shf.l.wrap.b32 %r21793, %r21799, %r21798, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21797, %r21798, %r21799, %r21800; + // end inline asm + mov.u32 %r21808, 61; + // begin inline asm + shf.l.wrap.b32 %r21801, %r21807, %r21806, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21805, 
%r21806, %r21807, %r21808; + // end inline asm + mov.u32 %r21816, 39; + // begin inline asm + shf.l.wrap.b32 %r21809, %r21815, %r21814, %r21816; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21813, %r21814, %r21815, %r21816; + // end inline asm + mov.u32 %r21824, 18; + // begin inline asm + shf.l.wrap.b32 %r21817, %r21823, %r21822, %r21824; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21821, %r21822, %r21823, %r21824; + // end inline asm + mov.u32 %r21832, 62; + // begin inline asm + shf.l.wrap.b32 %r21825, %r21831, %r21830, %r21832; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21829, %r21830, %r21831, %r21832; + // end inline asm + mov.u32 %r21840, 43; + // begin inline asm + shf.l.wrap.b32 %r21833, %r21839, %r21838, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21837, %r21838, %r21839, %r21840; + // end inline asm + mov.u32 %r21848, 25; + // begin inline asm + shf.l.wrap.b32 %r21841, %r21847, %r21846, %r21848; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21845, %r21846, %r21847, %r21848; + // end inline asm + mov.u32 %r21856, 8; + // begin inline asm + shf.l.wrap.b32 %r21849, %r21855, %r21854, %r21856; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21853, %r21854, %r21855, %r21856; + // end inline asm + mov.u32 %r21864, 56; + // begin inline asm + shf.l.wrap.b32 %r21857, %r21863, %r21862, %r21864; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21861, %r21862, %r21863, %r21864; + // end inline asm + mov.u32 %r21872, 41; + // begin inline asm + shf.l.wrap.b32 %r21865, %r21871, %r21870, %r21872; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21869, %r21870, %r21871, %r21872; + // end inline asm + mov.u32 %r21880, 27; + // begin inline asm + shf.l.wrap.b32 %r21873, %r21879, %r21878, %r21880; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21877, %r21878, %r21879, %r21880; + // end inline asm + mov.u32 %r21888, 14; + // begin inline asm + shf.l.wrap.b32 %r21881, %r21887, %r21886, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21885, %r21886, %r21887, %r21888; + // end inline asm + mov.u32 %r21896, 2; + // begin inline asm + shf.l.wrap.b32 %r21889, %r21895, %r21894, %r21896; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21893, %r21894, %r21895, %r21896; + // end inline asm + mov.u32 %r21904, 55; + // begin inline asm + shf.l.wrap.b32 %r21897, %r21903, %r21902, %r21904; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21901, %r21902, %r21903, %r21904; + // end inline asm + mov.u32 %r21912, 45; + // begin inline asm + shf.l.wrap.b32 %r21905, %r21911, %r21910, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21909, %r21910, %r21911, %r21912; + // end inline asm + mov.u32 %r21920, 36; + // begin inline asm + shf.l.wrap.b32 %r21913, %r21919, %r21918, %r21920; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21917, %r21918, %r21919, %r21920; + // end inline asm + mov.u32 %r21928, 28; + // begin inline asm + shf.l.wrap.b32 %r21921, %r21927, %r21926, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21925, %r21926, %r21927, %r21928; + // end inline asm + mov.u32 %r21936, 21; + // begin inline asm + shf.l.wrap.b32 %r21929, %r21935, %r21934, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21933, %r21934, %r21935, %r21936; + // end inline asm + mov.u32 %r21944, 15; + // begin inline asm + shf.l.wrap.b32 %r21937, %r21943, %r21942, %r21944; + 
// end inline asm + // begin inline asm + shf.l.wrap.b32 %r21941, %r21942, %r21943, %r21944; + // end inline asm + mov.u32 %r21952, 10; + // begin inline asm + shf.l.wrap.b32 %r21945, %r21951, %r21950, %r21952; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21949, %r21950, %r21951, %r21952; + // end inline asm + mov.u32 %r21960, 6; + // begin inline asm + shf.l.wrap.b32 %r21953, %r21959, %r21958, %r21960; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21957, %r21958, %r21959, %r21960; + // end inline asm + mov.u32 %r21968, 3; + // begin inline asm + shf.l.wrap.b32 %r21961, %r21967, %r21966, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21965, %r21966, %r21967, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21969, %r21975, %r21974, %r21658; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r21973, %r21974, %r21975, %r21658; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r21977, %r22012, %r21785, %r21833, 0xD2; + lop3.b32 %r21978, %r22015, %r21789, %r21837, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30670, %r21785, %r21833, %r21929, 0xD2; + lop3.b32 %r30671, %r21789, %r21837, %r21933, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30666, %r21833, %r21929, %r21881, 0xD2; + lop3.b32 %r30667, %r21837, %r21933, %r21885, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30662, %r21929, %r21881, %r22012, 0xD2; + lop3.b32 %r30663, %r21933, %r21885, %r22015, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30660, %r21881, %r22012, %r21785, 0xD2; + lop3.b32 %r30661, %r21885, %r22015, %r21789, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30656, %r21921, %r21793, %r21961, 0xD2; + lop3.b32 %r30657, %r21925, %r21797, %r21965, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30668, %r21793, %r21961, %r21905, 0xD2; + lop3.b32 %r30669, %r21797, %r21965, %r21909, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30664, %r21961, %r21905, %r21801, 0xD2; + lop3.b32 %r30665, %r21965, %r21909, %r21805, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30636, %r21905, %r21801, %r21921, 0xD2; + lop3.b32 %r30637, %r21909, %r21805, %r21925, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r30636, %r30637}; + // begin inline asm + // chi + lop3.b32 %r30628, %r21801, %r21921, %r21793, 0xD2; + lop3.b32 %r30629, %r21805, %r21925, %r21797, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+96], {%r30628, %r30629}; + // begin inline asm + // chi + lop3.b32 %r30654, %r21969, %r21953, %r21841, 0xD2; + lop3.b32 %r30655, %r21973, %r21957, %r21845, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+104], {%r30654, %r30655}; + // begin inline asm + // chi + lop3.b32 %r30648, %r21953, %r21841, %r21849, 0xD2; + lop3.b32 %r30649, %r21957, %r21845, %r21853, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+112], {%r30648, %r30649}; + // begin inline asm + // chi + lop3.b32 %r30642, %r21841, %r21849, %r21817, 0xD2; + lop3.b32 %r30643, %r21845, %r21853, %r21821, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+120], {%r30642, %r30643}; + // begin inline asm + // chi + lop3.b32 %r30634, %r21849, %r21817, %r21969, 0xD2; + lop3.b32 %r30635, %r21853, %r21821, %r21973, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+128], {%r30634, %r30635}; + // begin inline asm + // chi + lop3.b32 %r30626, %r21817, %r21969, %r21953, 0xD2; + lop3.b32 %r30627, %r21821, 
%r21973, %r21957, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+136], {%r30626, %r30627}; + // begin inline asm + // chi + lop3.b32 %r30652, %r21873, %r21913, %r21945, 0xD2; + lop3.b32 %r30653, %r21877, %r21917, %r21949, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+144], {%r30652, %r30653}; + // begin inline asm + // chi + lop3.b32 %r30646, %r21913, %r21945, %r21937, 0xD2; + lop3.b32 %r30647, %r21917, %r21949, %r21941, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+152], {%r30646, %r30647}; + // begin inline asm + // chi + lop3.b32 %r30640, %r21945, %r21937, %r21857, 0xD2; + lop3.b32 %r30641, %r21949, %r21941, %r21861, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+160], {%r30640, %r30641}; + // begin inline asm + // chi + lop3.b32 %r30632, %r21937, %r21857, %r21873, 0xD2; + lop3.b32 %r30633, %r21941, %r21861, %r21877, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+168], {%r30632, %r30633}; + // begin inline asm + // chi + lop3.b32 %r30624, %r21857, %r21873, %r21913, 0xD2; + lop3.b32 %r30625, %r21861, %r21877, %r21917, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+176], {%r30624, %r30625}; + // begin inline asm + // chi + lop3.b32 %r30650, %r21825, %r21897, %r21809, 0xD2; + lop3.b32 %r30651, %r21829, %r21901, %r21813, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+184], {%r30650, %r30651}; + // begin inline asm + // chi + lop3.b32 %r30644, %r21897, %r21809, %r21865, 0xD2; + lop3.b32 %r30645, %r21901, %r21813, %r21869, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+192], {%r30644, %r30645}; + // begin inline asm + // chi + lop3.b32 %r30638, %r21809, %r21865, %r21889, 0xD2; + lop3.b32 %r30639, %r21813, %r21869, %r21893, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+200], {%r30638, %r30639}; + // begin inline asm + // chi + lop3.b32 %r30630, %r21865, %r21889, %r21825, 0xD2; + lop3.b32 %r30631, %r21869, %r21893, %r21829, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+208], {%r30630, %r30631}; + // begin inline asm + // chi + lop3.b32 %r30622, %r21889, %r21825, %r21897, 0xD2; + lop3.b32 %r30623, %r21893, %r21829, %r21901, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+216], {%r30622, %r30623}; + mul.wide.s32 %rd991, %r30672, 8; + add.s64 %rd990, %rd918, %rd991; + // begin inline asm + ld.global.nc.v2.u32 {%r22177,%r22178}, [%rd990]; + // end inline asm + xor.b32 %r30658, %r21977, %r22177; + xor.b32 %r30659, %r21978, %r22178; + add.s32 %r30672, %r30672, 1; + setp.lt.u32 %p45, %r30672, 23; + @%p45 bra $L__BB2_79; + + st.local.v2.u32 [%rd3+32], {%r30670, %r30671}; + st.local.v2.u32 [%rd3+72], {%r30668, %r30669}; + st.local.v2.u32 [%rd3+40], {%r30666, %r30667}; + st.local.v2.u32 [%rd3+80], {%r30664, %r30665}; + st.local.v2.u32 [%rd3+48], {%r30662, %r30663}; + st.local.v2.u32 [%rd3+56], {%r30660, %r30661}; + st.local.v2.u32 [%rd3+24], {%r30658, %r30659}; + // begin inline asm + // xor5 + lop3.b32 %r22189, %r30658, %r30656, %r30654, 0x96; + lop3.b32 %r22189, %r22189, %r30652, %r30650, 0x96; + lop3.b32 %r22190, %r30659, %r30657, %r30655, 0x96; + lop3.b32 %r22190, %r22190, %r30653, %r30651, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22201, %r30670, %r30668, %r30648, 0x96; + lop3.b32 %r22201, %r22201, %r30646, %r30644, 0x96; + lop3.b32 %r22202, %r30671, %r30669, %r30649, 0x96; + lop3.b32 %r22202, %r22202, %r30647, %r30645, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22213, %r30666, %r30664, %r30642, 0x96; + lop3.b32 %r22213, %r22213, %r30640, %r30638, 0x96; + lop3.b32 %r22214, %r30667, %r30665, %r30643, 0x96; + 
lop3.b32 %r22214, %r22214, %r30641, %r30639, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22225, %r30662, %r30636, %r30634, 0x96; + lop3.b32 %r22225, %r22225, %r30632, %r30630, 0x96; + lop3.b32 %r22226, %r30663, %r30637, %r30635, 0x96; + lop3.b32 %r22226, %r22226, %r30633, %r30631, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22237, %r30660, %r30628, %r30626, 0x96; + lop3.b32 %r22237, %r22237, %r30624, %r30622, 0x96; + lop3.b32 %r22238, %r30661, %r30629, %r30627, 0x96; + lop3.b32 %r22238, %r22238, %r30625, %r30623, 0x96; + // end inline asm + mov.u32 %r22441, 1; + // begin inline asm + shf.l.wrap.b32 %r22249, %r22202, %r22201, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22253, %r22201, %r22202, %r22441; + // end inline asm + xor.b32 %r22468, %r22249, %r22237; + xor.b32 %r22469, %r22253, %r22238; + xor.b32 %r22396, %r30658, %r22468; + xor.b32 %r22399, %r30659, %r22469; + xor.b32 %r22359, %r30655, %r22469; + xor.b32 %r22358, %r30654, %r22468; + st.local.v2.u32 [%rd3+104], {%r22358, %r22359}; + // begin inline asm + shf.l.wrap.b32 %r22257, %r22214, %r22213, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22261, %r22213, %r22214, %r22441; + // end inline asm + xor.b32 %r22470, %r22257, %r22189; + xor.b32 %r22471, %r22261, %r22190; + xor.b32 %r22295, %r30668, %r22470; + xor.b32 %r22294, %r30669, %r22471; + xor.b32 %r22334, %r30647, %r22471; + xor.b32 %r22335, %r30646, %r22470; + st.local.v2.u32 [%rd3+152], {%r22335, %r22334}; + // begin inline asm + shf.l.wrap.b32 %r22265, %r22226, %r22225, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22269, %r22225, %r22226, %r22441; + // end inline asm + xor.b32 %r22472, %r22265, %r22201; + xor.b32 %r22473, %r22269, %r22202; + xor.b32 %r22318, %r30643, %r22473; + xor.b32 %r22319, %r30642, %r22472; + st.local.v2.u32 [%rd3+120], {%r22319, %r22318}; + xor.b32 %r22310, %r30639, %r22473; + xor.b32 %r22311, %r30638, %r22472; + st.local.v2.u32 [%rd3+200], {%r22311, %r22310}; + // begin inline asm + shf.l.wrap.b32 %r22273, %r22238, %r22237, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22277, %r22237, %r22238, %r22441; + // end inline asm + xor.b32 %r22474, %r22273, %r22213; + xor.b32 %r22475, %r22277, %r22214; + xor.b32 %r22342, %r30662, %r22474; + xor.b32 %r22343, %r30663, %r22475; + xor.b32 %r22351, %r30633, %r22475; + xor.b32 %r22350, %r30632, %r22474; + st.local.v2.u32 [%rd3+168], {%r22350, %r22351}; + // begin inline asm + shf.l.wrap.b32 %r22281, %r22190, %r22189, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22285, %r22189, %r22190, %r22441; + // end inline asm + xor.b32 %r22476, %r22281, %r22225; + xor.b32 %r22477, %r22285, %r22226; + xor.b32 %r22302, %r30628, %r22476; + xor.b32 %r22303, %r30629, %r22477; + xor.b32 %r22327, %r30623, %r22477; + xor.b32 %r22326, %r30622, %r22476; + st.local.v2.u32 [%rd3+216], {%r22326, %r22327}; + // begin inline asm + shf.l.wrap.b32 %r22289, %r22295, %r22294, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22293, %r22294, %r22295, %r21792; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22297, %r22303, %r22302, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22301, %r22302, %r22303, %r21800; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22309, %r22310, %r22311, %r21808; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22305, %r22311, %r22310, %r21808; + // end inline asm + 
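+ // Rounds 0-22 of what is consistent with Keccak-f[1600] run in the $L__BB2_79 loop above: the "xor5" groups (lop3 ... 0x96) compute the theta column parities, the shf.l.wrap pairs implement the 64-bit rho rotations split across 32-bit halves, lop3 ... 0xD2 is chi, and the ld.global.nc load xored in at the loop tail is iota (the per-round constant, apparently from the RC table declared earlier in this file). The 24th round is unrolled here, reusing the rho offsets still held in registers from the loop body (44, 20, 61, 43, 14, 45, 28, 21, 3 for the lanes that survive into the truncated output).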
st.local.v2.u32 [%rd3+96], {%r22305, %r22309}; + // begin inline asm + shf.l.wrap.b32 %r22313, %r22319, %r22318, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22317, %r22318, %r22319, %r21840; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22321, %r22327, %r22326, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22325, %r22326, %r22327, %r21888; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22333, %r22334, %r22335, %r21912; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22329, %r22335, %r22334, %r21912; + // end inline asm + st.local.v2.u32 [%rd3+88], {%r22329, %r22333}; + // begin inline asm + shf.l.wrap.b32 %r22337, %r22343, %r22342, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22341, %r22342, %r22343, %r21928; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22345, %r22351, %r22350, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22349, %r22350, %r22351, %r21936; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22353, %r22359, %r22358, %r21968; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22357, %r22358, %r22359, %r21968; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22361, %r22396, %r22289, %r22313, 0xD2; + lop3.b32 %r22362, %r22399, %r22293, %r22317, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22369, %r22289, %r22313, %r22345, 0xD2; + lop3.b32 %r22370, %r22293, %r22317, %r22349, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+32], {%r22369, %r22370}; + // begin inline asm + // chi + lop3.b32 %r22377, %r22313, %r22345, %r22321, 0xD2; + lop3.b32 %r22378, %r22317, %r22349, %r22325, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+40], {%r22377, %r22378}; + // begin inline asm + // chi + lop3.b32 %r22385, %r22345, %r22321, %r22396, 0xD2; + lop3.b32 %r22386, %r22349, %r22325, %r22399, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+48], {%r22385, %r22386}; + // begin inline asm + // chi + lop3.b32 %r22393, %r22321, %r22396, %r22289, 0xD2; + lop3.b32 %r22394, %r22325, %r22399, %r22293, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+56], {%r22393, %r22394}; + // begin inline asm + // chi + lop3.b32 %r22401, %r22337, %r22297, %r22353, 0xD2; + lop3.b32 %r22402, %r22341, %r22301, %r22357, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+64], {%r22401, %r22402}; + // begin inline asm + // chi + lop3.b32 %r22409, %r22297, %r22353, %r22329, 0xD2; + lop3.b32 %r22410, %r22301, %r22357, %r22333, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+72], {%r22409, %r22410}; + // begin inline asm + // chi + lop3.b32 %r22417, %r22353, %r22329, %r22305, 0xD2; + lop3.b32 %r22418, %r22357, %r22333, %r22309, 0xD2; + // end inline asm + st.local.v2.u32 [%rd3+80], {%r22417, %r22418}; + // begin inline asm + ld.global.nc.v2.u32 {%r22425,%r22426}, [%rd919]; + // end inline asm + xor.b32 %r22478, %r22362, %r22426; + xor.b32 %r22479, %r22361, %r22425; + mov.b64 %rd1269, {%r22479, %r22478}; + mov.b64 %rd1270, {%r22369, %r22370}; + mov.b64 %rd1271, {%r22377, %r22378}; + mov.b64 %rd250, {%r22385, %r22386}; + mov.b64 %rd1272, {%r22393, %r22394}; + mov.b64 %rd252, {%r22401, %r22402}; + mov.b64 %rd253, {%r22409, %r22410}; + mov.b64 %rd254, {%r22417, %r22418}; + mov.u32 %r30673, 0; + st.local.v2.u32 [%rd3+24], {%r22479, %r22478}; + st.local.v2.u32 [%rd925+96], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+104], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+112], {%r30673, %r30673}; + st.local.v2.u32 
[%rd925+120], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+128], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+136], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+144], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+152], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+160], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+168], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+176], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+184], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+192], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+200], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+208], {%r30673, %r30673}; + st.local.v2.u32 [%rd925+216], {%r30673, %r30673}; + mov.u32 %r30688, -2147483648; + st.local.v2.u32 [%rd925+88], {%r22441, %r30688}; + mov.u32 %r30674, %r30673; + mov.u32 %r30675, %r30673; + mov.u32 %r30676, %r30673; + mov.u32 %r30677, %r30673; + mov.u32 %r30678, %r30673; + mov.u32 %r30679, %r30673; + mov.u32 %r30680, %r30673; + mov.u32 %r30681, %r30673; + mov.u32 %r30682, %r30673; + mov.u32 %r30683, %r30673; + mov.u32 %r30684, %r30673; + mov.u32 %r30685, %r30673; + mov.u32 %r30686, %r30673; + mov.u32 %r30687, %r22441; + mov.u32 %r30689, %r30673; + mov.u32 %r30690, %r30673; + mov.u32 %r30691, %r30673; + mov.u32 %r30692, %r30673; + mov.u32 %r30693, %r30673; + mov.u32 %r30694, %r30673; + mov.u32 %r30695, %r30673; + mov.u32 %r30696, %r30673; + mov.u32 %r30697, %r30673; + mov.u32 %r30698, %r30673; + mov.u32 %r30699, %r30673; + mov.u32 %r30700, %r30673; + mov.u32 %r30701, %r30673; + mov.u32 %r30702, %r30673; + mov.u32 %r30703, %r30673; + mov.u32 %r30704, %r30673; + mov.u32 %r30705, %r30673; + mov.u32 %r30706, %r30673; + mov.u32 %r30723, %r30673; + +$L__BB2_81: + // begin inline asm + // xor5 + lop3.b32 %r22480, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22480, %r22480, %r30703, %r30701, 0x96; + lop3.b32 %r22481, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22481, %r22481, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22492, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22492, %r22492, %r30697, %r30695, 0x96; + lop3.b32 %r22493, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22493, %r22493, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22504, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r22504, %r22504, %r30691, %r30689, 0x96; + lop3.b32 %r22505, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r22505, %r22505, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22516, %r30713, %r30687, %r30685, 0x96; + lop3.b32 %r22516, %r22516, %r30683, %r30681, 0x96; + lop3.b32 %r22517, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r22517, %r22517, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22528, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r22528, %r22528, %r30675, %r30673, 0x96; + lop3.b32 %r22529, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r22529, %r22529, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22540, %r22493, %r22492, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22544, %r22492, %r22493, %r22441; + // end inline asm + xor.b32 %r22974, %r22540, %r22528; + xor.b32 %r22975, %r22544, %r22529; + xor.b32 %r22807, %r30709, %r22974; + xor.b32 %r22810, %r30710, %r22975; + xor.b32 %r22714, %r30707, %r22974; + xor.b32 %r22713, %r30708, %r22975; + xor.b32 %r22761, %r30705, %r22974; + xor.b32 %r22762, %r30706, %r22975; + xor.b32 %r22666, %r30703, %r22974; + xor.b32 %r22665, %r30704, 
%r22975; + xor.b32 %r22617, %r30701, %r22974; + xor.b32 %r22618, %r30702, %r22975; + // begin inline asm + shf.l.wrap.b32 %r22548, %r22505, %r22504, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22552, %r22504, %r22505, %r22441; + // end inline asm + xor.b32 %r22976, %r22548, %r22480; + xor.b32 %r22977, %r22552, %r22481; + xor.b32 %r22769, %r30721, %r22976; + xor.b32 %r22770, %r30722, %r22977; + xor.b32 %r22586, %r30719, %r22976; + xor.b32 %r22585, %r30720, %r22977; + xor.b32 %r22745, %r30699, %r22976; + xor.b32 %r22746, %r30700, %r22977; + xor.b32 %r22706, %r30697, %r22976; + xor.b32 %r22705, %r30698, %r22977; + xor.b32 %r22689, %r30695, %r22976; + xor.b32 %r22690, %r30696, %r22977; + // begin inline asm + shf.l.wrap.b32 %r22556, %r22517, %r22516, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22560, %r22516, %r22517, %r22441; + // end inline asm + xor.b32 %r22978, %r22556, %r22492; + xor.b32 %r22979, %r22560, %r22493; + xor.b32 %r22626, %r30717, %r22978; + xor.b32 %r22625, %r30718, %r22979; + xor.b32 %r22753, %r30715, %r22978; + xor.b32 %r22754, %r30716, %r22979; + xor.b32 %r22634, %r30693, %r22978; + xor.b32 %r22633, %r30694, %r22979; + xor.b32 %r22737, %r30691, %r22978; + xor.b32 %r22738, %r30692, %r22979; + xor.b32 %r22602, %r30689, %r22978; + xor.b32 %r22601, %r30690, %r22979; + // begin inline asm + shf.l.wrap.b32 %r22564, %r22529, %r22528, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22568, %r22528, %r22529, %r22441; + // end inline asm + xor.b32 %r22980, %r22564, %r22504; + xor.b32 %r22981, %r22568, %r22505; + xor.b32 %r22721, %r30713, %r22980; + xor.b32 %r22722, %r30714, %r22981; + xor.b32 %r22698, %r30687, %r22980; + xor.b32 %r22697, %r30688, %r22981; + xor.b32 %r22641, %r30685, %r22980; + xor.b32 %r22642, %r30686, %r22981; + xor.b32 %r22729, %r30683, %r22980; + xor.b32 %r22730, %r30684, %r22981; + xor.b32 %r22658, %r30681, %r22980; + xor.b32 %r22657, %r30682, %r22981; + // begin inline asm + shf.l.wrap.b32 %r22572, %r22481, %r22480, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22576, %r22480, %r22481, %r22441; + // end inline asm + xor.b32 %r22982, %r22572, %r22516; + xor.b32 %r22983, %r22576, %r22517; + xor.b32 %r22673, %r30711, %r22982; + xor.b32 %r22674, %r30712, %r22983; + xor.b32 %r22593, %r30679, %r22982; + xor.b32 %r22594, %r30680, %r22983; + xor.b32 %r22610, %r30677, %r22982; + xor.b32 %r22609, %r30678, %r22983; + xor.b32 %r22649, %r30675, %r22982; + xor.b32 %r22650, %r30676, %r22983; + xor.b32 %r22681, %r30673, %r22982; + xor.b32 %r22682, %r30674, %r22983; + mov.u32 %r22587, 44; + // begin inline asm + shf.l.wrap.b32 %r22580, %r22586, %r22585, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22584, %r22585, %r22586, %r22587; + // end inline asm + mov.u32 %r22595, 20; + // begin inline asm + shf.l.wrap.b32 %r22588, %r22594, %r22593, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22592, %r22593, %r22594, %r22595; + // end inline asm + mov.u32 %r22603, 61; + // begin inline asm + shf.l.wrap.b32 %r22596, %r22602, %r22601, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22600, %r22601, %r22602, %r22603; + // end inline asm + mov.u32 %r22611, 39; + // begin inline asm + shf.l.wrap.b32 %r22604, %r22610, %r22609, %r22611; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22608, %r22609, %r22610, %r22611; + // end inline asm + mov.u32 %r22619, 18; + // begin inline asm + shf.l.wrap.b32 %r22612, %r22618, 
%r22617, %r22619; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22616, %r22617, %r22618, %r22619; + // end inline asm + mov.u32 %r22627, 62; + // begin inline asm + shf.l.wrap.b32 %r22620, %r22626, %r22625, %r22627; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22624, %r22625, %r22626, %r22627; + // end inline asm + mov.u32 %r22635, 43; + // begin inline asm + shf.l.wrap.b32 %r22628, %r22634, %r22633, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22632, %r22633, %r22634, %r22635; + // end inline asm + mov.u32 %r22643, 25; + // begin inline asm + shf.l.wrap.b32 %r22636, %r22642, %r22641, %r22643; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22640, %r22641, %r22642, %r22643; + // end inline asm + mov.u32 %r22651, 8; + // begin inline asm + shf.l.wrap.b32 %r22644, %r22650, %r22649, %r22651; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22648, %r22649, %r22650, %r22651; + // end inline asm + mov.u32 %r22659, 56; + // begin inline asm + shf.l.wrap.b32 %r22652, %r22658, %r22657, %r22659; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22656, %r22657, %r22658, %r22659; + // end inline asm + mov.u32 %r22667, 41; + // begin inline asm + shf.l.wrap.b32 %r22660, %r22666, %r22665, %r22667; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22664, %r22665, %r22666, %r22667; + // end inline asm + mov.u32 %r22675, 27; + // begin inline asm + shf.l.wrap.b32 %r22668, %r22674, %r22673, %r22675; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22672, %r22673, %r22674, %r22675; + // end inline asm + mov.u32 %r22683, 14; + // begin inline asm + shf.l.wrap.b32 %r22676, %r22682, %r22681, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22680, %r22681, %r22682, %r22683; + // end inline asm + mov.u32 %r22691, 2; + // begin inline asm + shf.l.wrap.b32 %r22684, %r22690, %r22689, %r22691; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22688, %r22689, %r22690, %r22691; + // end inline asm + mov.u32 %r22699, 55; + // begin inline asm + shf.l.wrap.b32 %r22692, %r22698, %r22697, %r22699; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22696, %r22697, %r22698, %r22699; + // end inline asm + mov.u32 %r22707, 45; + // begin inline asm + shf.l.wrap.b32 %r22700, %r22706, %r22705, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22704, %r22705, %r22706, %r22707; + // end inline asm + mov.u32 %r22715, 36; + // begin inline asm + shf.l.wrap.b32 %r22708, %r22714, %r22713, %r22715; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22712, %r22713, %r22714, %r22715; + // end inline asm + mov.u32 %r22723, 28; + // begin inline asm + shf.l.wrap.b32 %r22716, %r22722, %r22721, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22720, %r22721, %r22722, %r22723; + // end inline asm + mov.u32 %r22731, 21; + // begin inline asm + shf.l.wrap.b32 %r22724, %r22730, %r22729, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22728, %r22729, %r22730, %r22731; + // end inline asm + mov.u32 %r22739, 15; + // begin inline asm + shf.l.wrap.b32 %r22732, %r22738, %r22737, %r22739; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22736, %r22737, %r22738, %r22739; + // end inline asm + mov.u32 %r22747, 10; + // begin inline asm + shf.l.wrap.b32 %r22740, %r22746, %r22745, %r22747; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22744, %r22745, %r22746, %r22747; + // end inline asm + mov.u32 
%r22755, 6; + // begin inline asm + shf.l.wrap.b32 %r22748, %r22754, %r22753, %r22755; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22752, %r22753, %r22754, %r22755; + // end inline asm + mov.u32 %r22763, 3; + // begin inline asm + shf.l.wrap.b32 %r22756, %r22762, %r22761, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22760, %r22761, %r22762, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22764, %r22770, %r22769, %r22441; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r22768, %r22769, %r22770, %r22441; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r22772, %r22807, %r22580, %r22628, 0xD2; + lop3.b32 %r22773, %r22810, %r22584, %r22632, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30721, %r22580, %r22628, %r22724, 0xD2; + lop3.b32 %r30722, %r22584, %r22632, %r22728, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30717, %r22628, %r22724, %r22676, 0xD2; + lop3.b32 %r30718, %r22632, %r22728, %r22680, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30713, %r22724, %r22676, %r22807, 0xD2; + lop3.b32 %r30714, %r22728, %r22680, %r22810, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30711, %r22676, %r22807, %r22580, 0xD2; + lop3.b32 %r30712, %r22680, %r22810, %r22584, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30707, %r22716, %r22588, %r22756, 0xD2; + lop3.b32 %r30708, %r22720, %r22592, %r22760, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30719, %r22588, %r22756, %r22700, 0xD2; + lop3.b32 %r30720, %r22592, %r22760, %r22704, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30715, %r22756, %r22700, %r22596, 0xD2; + lop3.b32 %r30716, %r22760, %r22704, %r22600, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r30687, %r22700, %r22596, %r22716, 0xD2; + lop3.b32 %r30688, %r22704, %r22600, %r22720, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r30687, %r30688}; + // begin inline asm + // chi + lop3.b32 %r30679, %r22596, %r22716, %r22588, 0xD2; + lop3.b32 %r30680, %r22600, %r22720, %r22592, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r30679, %r30680}; + // begin inline asm + // chi + lop3.b32 %r30705, %r22764, %r22748, %r22636, 0xD2; + lop3.b32 %r30706, %r22768, %r22752, %r22640, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+104], {%r30705, %r30706}; + // begin inline asm + // chi + lop3.b32 %r30699, %r22748, %r22636, %r22644, 0xD2; + lop3.b32 %r30700, %r22752, %r22640, %r22648, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+112], {%r30699, %r30700}; + // begin inline asm + // chi + lop3.b32 %r30693, %r22636, %r22644, %r22612, 0xD2; + lop3.b32 %r30694, %r22640, %r22648, %r22616, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+120], {%r30693, %r30694}; + // begin inline asm + // chi + lop3.b32 %r30685, %r22644, %r22612, %r22764, 0xD2; + lop3.b32 %r30686, %r22648, %r22616, %r22768, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+128], {%r30685, %r30686}; + // begin inline asm + // chi + lop3.b32 %r30677, %r22612, %r22764, %r22748, 0xD2; + lop3.b32 %r30678, %r22616, %r22768, %r22752, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+136], {%r30677, %r30678}; + // begin inline asm + // chi + lop3.b32 %r30703, %r22668, %r22708, %r22740, 0xD2; + lop3.b32 %r30704, %r22672, %r22712, %r22744, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+144], {%r30703, %r30704}; + // begin inline asm + 
// chi + lop3.b32 %r30697, %r22708, %r22740, %r22732, 0xD2; + lop3.b32 %r30698, %r22712, %r22744, %r22736, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+152], {%r30697, %r30698}; + // begin inline asm + // chi + lop3.b32 %r30691, %r22740, %r22732, %r22652, 0xD2; + lop3.b32 %r30692, %r22744, %r22736, %r22656, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+160], {%r30691, %r30692}; + // begin inline asm + // chi + lop3.b32 %r30683, %r22732, %r22652, %r22668, 0xD2; + lop3.b32 %r30684, %r22736, %r22656, %r22672, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+168], {%r30683, %r30684}; + // begin inline asm + // chi + lop3.b32 %r30675, %r22652, %r22668, %r22708, 0xD2; + lop3.b32 %r30676, %r22656, %r22672, %r22712, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+176], {%r30675, %r30676}; + // begin inline asm + // chi + lop3.b32 %r30701, %r22620, %r22692, %r22604, 0xD2; + lop3.b32 %r30702, %r22624, %r22696, %r22608, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+184], {%r30701, %r30702}; + // begin inline asm + // chi + lop3.b32 %r30695, %r22692, %r22604, %r22660, 0xD2; + lop3.b32 %r30696, %r22696, %r22608, %r22664, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+192], {%r30695, %r30696}; + // begin inline asm + // chi + lop3.b32 %r30689, %r22604, %r22660, %r22684, 0xD2; + lop3.b32 %r30690, %r22608, %r22664, %r22688, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+200], {%r30689, %r30690}; + // begin inline asm + // chi + lop3.b32 %r30681, %r22660, %r22684, %r22620, 0xD2; + lop3.b32 %r30682, %r22664, %r22688, %r22624, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+208], {%r30681, %r30682}; + // begin inline asm + // chi + lop3.b32 %r30673, %r22684, %r22620, %r22692, 0xD2; + lop3.b32 %r30674, %r22688, %r22624, %r22696, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+216], {%r30673, %r30674}; + mul.wide.s32 %rd1002, %r30723, 8; + add.s64 %rd1001, %rd918, %rd1002; + // begin inline asm + ld.global.nc.v2.u32 {%r22972,%r22973}, [%rd1001]; + // end inline asm + xor.b32 %r30709, %r22772, %r22972; + xor.b32 %r30710, %r22773, %r22973; + add.s32 %r30723, %r30723, 1; + setp.lt.u32 %p46, %r30723, 23; + @%p46 bra $L__BB2_81; -$L__BB0_12: - ld.const.u64 %rd76, [target+16]; - setp.eq.s64 %p13, %rd479, %rd76; - @%p13 bra $L__BB0_14; - bra.uni $L__BB0_13; + mov.u32 %r23083, 1; + st.local.v2.u32 [%rd925+32], {%r30721, %r30722}; + st.local.v2.u32 [%rd925+72], {%r30719, %r30720}; + st.local.v2.u32 [%rd925+40], {%r30717, %r30718}; + st.local.v2.u32 [%rd925+80], {%r30715, %r30716}; + st.local.v2.u32 [%rd925+48], {%r30713, %r30714}; + st.local.v2.u32 [%rd925+56], {%r30711, %r30712}; + st.local.v2.u32 [%rd925+24], {%r30709, %r30710}; + // begin inline asm + // xor5 + lop3.b32 %r22984, %r30709, %r30707, %r30705, 0x96; + lop3.b32 %r22984, %r22984, %r30703, %r30701, 0x96; + lop3.b32 %r22985, %r30710, %r30708, %r30706, 0x96; + lop3.b32 %r22985, %r22985, %r30704, %r30702, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r22996, %r30721, %r30719, %r30699, 0x96; + lop3.b32 %r22996, %r22996, %r30697, %r30695, 0x96; + lop3.b32 %r22997, %r30722, %r30720, %r30700, 0x96; + lop3.b32 %r22997, %r22997, %r30698, %r30696, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23008, %r30717, %r30715, %r30693, 0x96; + lop3.b32 %r23008, %r23008, %r30691, %r30689, 0x96; + lop3.b32 %r23009, %r30718, %r30716, %r30694, 0x96; + lop3.b32 %r23009, %r23009, %r30692, %r30690, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23020, %r30713, %r30687, 
%r30685, 0x96; + lop3.b32 %r23020, %r23020, %r30683, %r30681, 0x96; + lop3.b32 %r23021, %r30714, %r30688, %r30686, 0x96; + lop3.b32 %r23021, %r23021, %r30684, %r30682, 0x96; + // end inline asm + // begin inline asm + // xor5 + lop3.b32 %r23032, %r30711, %r30679, %r30677, 0x96; + lop3.b32 %r23032, %r23032, %r30675, %r30673, 0x96; + lop3.b32 %r23033, %r30712, %r30680, %r30678, 0x96; + lop3.b32 %r23033, %r23033, %r30676, %r30674, 0x96; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23044, %r22997, %r22996, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23048, %r22996, %r22997, %r23083; + // end inline asm + xor.b32 %r23222, %r23044, %r23032; + xor.b32 %r23223, %r23048, %r23033; + xor.b32 %r23191, %r30709, %r23222; + xor.b32 %r23194, %r30710, %r23223; + xor.b32 %r23154, %r30706, %r23223; + xor.b32 %r23153, %r30705, %r23222; + st.local.v2.u32 [%rd925+104], {%r23153, %r23154}; + // begin inline asm + shf.l.wrap.b32 %r23052, %r23009, %r23008, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23056, %r23008, %r23009, %r23083; + // end inline asm + xor.b32 %r23224, %r23052, %r22984; + xor.b32 %r23225, %r23056, %r22985; + xor.b32 %r23090, %r30719, %r23224; + xor.b32 %r23089, %r30720, %r23225; + xor.b32 %r23129, %r30698, %r23225; + xor.b32 %r23130, %r30697, %r23224; + st.local.v2.u32 [%rd925+152], {%r23130, %r23129}; + // begin inline asm + shf.l.wrap.b32 %r23060, %r23021, %r23020, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23064, %r23020, %r23021, %r23083; + // end inline asm + xor.b32 %r23226, %r23060, %r22996; + xor.b32 %r23227, %r23064, %r22997; + xor.b32 %r23113, %r30694, %r23227; + xor.b32 %r23114, %r30693, %r23226; + st.local.v2.u32 [%rd925+120], {%r23114, %r23113}; + xor.b32 %r23105, %r30690, %r23227; + xor.b32 %r23106, %r30689, %r23226; + st.local.v2.u32 [%rd925+200], {%r23106, %r23105}; + // begin inline asm + shf.l.wrap.b32 %r23068, %r23033, %r23032, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23072, %r23032, %r23033, %r23083; + // end inline asm + xor.b32 %r23228, %r23068, %r23008; + xor.b32 %r23229, %r23072, %r23009; + xor.b32 %r23137, %r30713, %r23228; + xor.b32 %r23138, %r30714, %r23229; + xor.b32 %r23146, %r30684, %r23229; + xor.b32 %r23145, %r30683, %r23228; + st.local.v2.u32 [%rd925+168], {%r23145, %r23146}; + // begin inline asm + shf.l.wrap.b32 %r23076, %r22985, %r22984, %r23083; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23080, %r22984, %r22985, %r23083; + // end inline asm + xor.b32 %r23230, %r23076, %r23020; + xor.b32 %r23231, %r23080, %r23021; + xor.b32 %r23097, %r30679, %r23230; + xor.b32 %r23098, %r30680, %r23231; + xor.b32 %r23122, %r30674, %r23231; + xor.b32 %r23121, %r30673, %r23230; + st.local.v2.u32 [%rd925+216], {%r23121, %r23122}; + // begin inline asm + shf.l.wrap.b32 %r23084, %r23090, %r23089, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23088, %r23089, %r23090, %r22587; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23092, %r23098, %r23097, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23096, %r23097, %r23098, %r22595; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23104, %r23105, %r23106, %r22603; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23100, %r23106, %r23105, %r22603; + // end inline asm + st.local.v2.u32 [%rd925+96], {%r23100, %r23104}; + // begin inline asm + shf.l.wrap.b32 %r23108, %r23114, %r23113, %r22635; + // end inline asm + // 
begin inline asm + shf.l.wrap.b32 %r23112, %r23113, %r23114, %r22635; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23116, %r23122, %r23121, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23120, %r23121, %r23122, %r22683; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23128, %r23129, %r23130, %r22707; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23124, %r23130, %r23129, %r22707; + // end inline asm + st.local.v2.u32 [%rd925+88], {%r23124, %r23128}; + // begin inline asm + shf.l.wrap.b32 %r23132, %r23138, %r23137, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23136, %r23137, %r23138, %r22723; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23140, %r23146, %r23145, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23144, %r23145, %r23146, %r22731; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23148, %r23154, %r23153, %r22763; + // end inline asm + // begin inline asm + shf.l.wrap.b32 %r23152, %r23153, %r23154, %r22763; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23156, %r23191, %r23084, %r23108, 0xD2; + lop3.b32 %r23157, %r23194, %r23088, %r23112, 0xD2; + // end inline asm + // begin inline asm + // chi + lop3.b32 %r23164, %r23084, %r23108, %r23140, 0xD2; + lop3.b32 %r23165, %r23088, %r23112, %r23144, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+32], {%r23164, %r23165}; + // begin inline asm + // chi + lop3.b32 %r23172, %r23108, %r23140, %r23116, 0xD2; + lop3.b32 %r23173, %r23112, %r23144, %r23120, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+40], {%r23172, %r23173}; + // begin inline asm + // chi + lop3.b32 %r23180, %r23140, %r23116, %r23191, 0xD2; + lop3.b32 %r23181, %r23144, %r23120, %r23194, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+48], {%r23180, %r23181}; + // begin inline asm + // chi + lop3.b32 %r23188, %r23116, %r23191, %r23084, 0xD2; + lop3.b32 %r23189, %r23120, %r23194, %r23088, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+56], {%r23188, %r23189}; + // begin inline asm + // chi + lop3.b32 %r23196, %r23132, %r23092, %r23148, 0xD2; + lop3.b32 %r23197, %r23136, %r23096, %r23152, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+64], {%r23196, %r23197}; + // begin inline asm + // chi + lop3.b32 %r23204, %r23092, %r23148, %r23124, 0xD2; + lop3.b32 %r23205, %r23096, %r23152, %r23128, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+72], {%r23204, %r23205}; + // begin inline asm + // chi + lop3.b32 %r23212, %r23148, %r23124, %r23100, 0xD2; + lop3.b32 %r23213, %r23152, %r23128, %r23104, 0xD2; + // end inline asm + st.local.v2.u32 [%rd925+80], {%r23212, %r23213}; + // begin inline asm + ld.global.nc.v2.u32 {%r23220,%r23221}, [%rd919]; + // end inline asm + xor.b32 %r23232, %r23157, %r23221; + xor.b32 %r23233, %r23156, %r23220; + st.local.v2.u32 [%rd925+24], {%r23233, %r23232}; + st.global.u64 [%rd224], %rd1269; + st.global.u64 [%rd224+8], %rd1270; + st.global.u64 [%rd224+16], %rd1271; + st.global.u64 [%rd224+24], %rd250; + st.global.u64 [%rd224+32], %rd1272; + st.global.u64 [%rd224+40], %rd252; + st.global.u64 [%rd224+48], %rd253; + st.global.u64 [%rd224+56], %rd254; + st.global.v2.u32 [%rd224+64], {%r23233, %r23232}; + st.global.v2.u32 [%rd224+72], {%r23164, %r23165}; + st.global.v2.u32 [%rd224+80], {%r23172, %r23173}; + st.global.v2.u32 [%rd224+88], {%r23180, %r23181}; + st.global.v2.u32 [%rd224+96], {%r23188, %r23189}; + st.global.v2.u32 [%rd224+104], {%r23196, %r23197}; + st.global.v2.u32 
[%rd224+112], {%r23204, %r23205}; + st.global.v2.u32 [%rd224+120], {%r23212, %r23213}; -$L__BB0_14: - ld.const.u64 %rd77, [target+8]; - setp.eq.s64 %p14, %rd484, %rd77; - @%p14 bra $L__BB0_16; - bra.uni $L__BB0_15; +$L__BB2_94: + mul.lo.s32 %r26518, %r20, 16777619; + mov.b64 {%r26519, %r26520}, %rd1265; + mul.lo.s32 %r26521, %r19, 16777619; + xor.b32 %r26522, %r26518, %r26519; + xor.b32 %r26523, %r26521, %r26520; + mov.b64 %rd1116, {%r26522, %r26523}; + mov.b64 {%r26524, %r26525}, %rd1269; + xor.b32 %r26526, %r26525, %r19; + xor.b32 %r26527, %r26524, %r20; + mov.b64 %rd1117, {%r26527, %r26526}; + mov.b64 {%r26528, %r26529}, %rd1259; + mul.lo.s32 %r26530, %r26528, 16777619; + mov.b64 {%r26531, %r26532}, %rd1266; + mul.lo.s32 %r26533, %r26529, 16777619; + xor.b32 %r26534, %r26533, %r26532; + xor.b32 %r26535, %r26530, %r26531; + mov.b64 %rd1118, {%r26535, %r26534}; + mov.b64 {%r26536, %r26537}, %rd1270; + xor.b32 %r26538, %r26537, %r26529; + xor.b32 %r26539, %r26536, %r26528; + mov.b64 %rd1119, {%r26539, %r26538}; + mul.lo.s32 %r26540, %r24, 16777619; + mov.b64 {%r26541, %r26542}, %rd1267; + mul.lo.s32 %r26543, %r23, 16777619; + xor.b32 %r26544, %r26543, %r26542; + xor.b32 %r26545, %r26540, %r26541; + mov.b64 %rd1120, {%r26545, %r26544}; + mov.b64 {%r26546, %r26547}, %rd1271; + xor.b32 %r26548, %r26547, %r23; + xor.b32 %r26549, %r26546, %r24; + mov.b64 %rd1121, {%r26549, %r26548}; + mul.lo.s32 %r26550, %r28, 16777619; + mov.b64 {%r26551, %r26552}, %rd1268; + mul.lo.s32 %r26553, %r27, 16777619; + xor.b32 %r26554, %r26553, %r26552; + xor.b32 %r26555, %r26550, %r26551; + mov.b64 %rd1122, {%r26555, %r26554}; + mov.b64 {%r26556, %r26557}, %rd1272; + xor.b32 %r26558, %r26557, %r27; + xor.b32 %r26559, %r26556, %r28; + mov.b64 %rd1123, {%r26559, %r26558}; + mul.lo.s64 %rd1124, %rd1261, %rd1116; + add.s64 %rd1260, %rd1124, %rd1117; + mul.lo.s64 %rd1125, %rd1262, %rd1118; + add.s64 %rd1259, %rd1125, %rd1119; + mul.lo.s64 %rd1126, %rd1263, %rd1120; + add.s64 %rd1258, %rd1126, %rd1121; + mul.lo.s64 %rd1127, %rd1264, %rd1122; + add.s64 %rd1257, %rd1127, %rd1123; + add.s32 %r29538, %r29538, 1; + setp.lt.u32 %p52, %r29538, 32; + @%p52 bra $L__BB2_19; -$L__BB0_16: - ld.const.u64 %rd454, [target]; - setp.lt.u64 %p16, %rd73, %rd454; - bra.uni $L__BB0_17; + add.u64 %rd1250, %SPL, 2016; + add.u64 %rd1242, %SP, 2016; + add.u64 %rd1241, %SP, 0; + mov.u64 %rd1128, 0; + mov.b64 {%r26560, %r26561}, %rd1260; + mul.lo.s32 %r26562, %r26560, 16777619; + xor.b32 %r26563, %r26562, %r26561; + mul.lo.s32 %r26564, %r26563, 16777619; + mov.b64 {%r26565, %r26566}, %rd1259; + xor.b32 %r26567, %r26564, %r26565; + mul.lo.s32 %r26568, %r26567, 16777619; + xor.b32 %r26569, %r26568, %r26566; + mov.b32 {%rs498, %rs499}, %r26569; + mov.u32 %r26570, 0; + st.local.v4.u32 [%rd1250+32], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+48], {%r26570, %r26570, %r26570, %r26570}; + st.local.v4.u32 [%rd1250+64], {%r26570, %r26570, %r26570, %r26570}; + cvt.u64.u16 %rd1131, %rs498; + and.b64 %rd1132, %rd1131, 255; + or.b64 %rd1133, %rd26, %rd1132; + st.local.v2.u64 [%rd1250], {%rd1133, %rd23}; + st.local.v2.u64 [%rd1250+16], {%rd24, %rd25}; + mov.u32 %r26571, -1150833019; + mov.u32 %r26572, 1779033703; + st.local.v2.u32 [%rd3], {%r26572, %r26571}; + mov.u32 %r26573, -1521486534; + mov.u32 %r26574, 1013904242; + st.local.v2.u32 [%rd3+8], {%r26574, %r26573}; + mov.u32 %r26575, -1694144372; + mov.u32 %r26576, 1359893119; + st.local.v2.u32 [%rd3+16], {%r26576, %r26575}; + mov.u32 %r26577, 1541459225; + mov.u32 %r26578, 528734635; + 
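+ // The mov.u32 constants above are the eight BLAKE3 IV words (0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19) rendered by the compiler as signed decimals; $L__BB2_94 earlier in this hunk folds the two sponge lanes together with FNV-style mixing (mul.lo.s32 by the 32-bit FNV prime 16777619, then xor) over 32 iterations before the BLAKE3 hasher state is initialized.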
st.local.v2.u32 [%rd3+24], {%r26578, %r26577}; + st.local.v2.u32 [%rd3+32], {%r26572, %r26571}; + st.local.v2.u32 [%rd3+40], {%r26574, %r26573}; + st.local.v2.u32 [%rd3+48], {%r26576, %r26575}; + st.local.v2.u32 [%rd3+56], {%r26578, %r26577}; + st.local.u64 [%rd3+64], %rd1128; + st.local.v2.u32 [%rd3+72], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+80], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+88], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+96], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+104], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+112], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+120], {%r26570, %r26570}; + st.local.v2.u32 [%rd3+128], {%r26570, %r26570}; + mov.u16 %rs500, 0; + st.local.v2.u8 [%rd3+136], {%rs500, %rs500}; + st.local.u8 [%rd3+138], %rs500; + st.local.u8 [%rd3+144], %rs500; + { // callseq 13, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd1241; + .param .b64 param1; + st.param.b64 [param1+0], %rd1242; + call.uni + _Z20blake3_hasher_updateP13blake3_hasherPKvy, + ( + param0, + param1 + ); + } // callseq 13 + ld.local.u8 %rd1275, [%rd3+144]; + setp.eq.s64 %p53, %rd1275, 0; + @%p53 bra $L__BB2_103; -$L__BB0_11: - setp.lt.u64 %p16, %rd474, %rd75; - bra.uni $L__BB0_17; + ld.local.v2.u8 {%rs864, %rs502}, [%rd3+136]; + cvt.u32.u16 %r26579, %rs502; + mul.wide.u32 %rd1135, %r26579, 64; + cvt.u64.u16 %rd1136, %rs864; + neg.s64 %rd1137, %rd1136; + setp.eq.s64 %p54, %rd1135, %rd1137; + @%p54 bra $L__BB2_98; + bra.uni $L__BB2_97; -$L__BB0_13: - setp.lt.u64 %p16, %rd479, %rd76; - bra.uni $L__BB0_17; +$L__BB2_98: + add.s64 %rd1275, %rd1275, -2; + shl.b64 %rd1139, %rd1275, 5; + add.s64 %rd1142, %rd3, %rd1139; + ld.local.u8 %rs667, [%rd3+138]; + mov.u64 %rd1276, 0; + or.b16 %rs734, %rs667, 4; + ld.local.v2.u32 {%r30976, %r30975}, [%rd3]; + ld.local.v2.u32 {%r30974, %r30973}, [%rd3+8]; + ld.local.v2.u32 {%r30972, %r30971}, [%rd3+16]; + ld.local.v2.u32 {%r30970, %r30969}, [%rd3+24]; + ld.local.u8 %rs800, [%rd1142+145]; + ld.local.u8 %rs801, [%rd1142+146]; + ld.local.u8 %rs802, [%rd1142+147]; + ld.local.u8 %rs803, [%rd1142+148]; + ld.local.u8 %rs804, [%rd1142+149]; + ld.local.u8 %rs805, [%rd1142+150]; + ld.local.u8 %rs806, [%rd1142+151]; + ld.local.u8 %rs807, [%rd1142+152]; + ld.local.u8 %rs808, [%rd1142+153]; + ld.local.u8 %rs809, [%rd1142+154]; + ld.local.u8 %rs810, [%rd1142+155]; + ld.local.u8 %rs811, [%rd1142+156]; + ld.local.u8 %rs812, [%rd1142+157]; + ld.local.u8 %rs813, [%rd1142+158]; + ld.local.u8 %rs814, [%rd1142+159]; + ld.local.u8 %rs815, [%rd1142+160]; + ld.local.u8 %rs816, [%rd1142+161]; + ld.local.u8 %rs817, [%rd1142+162]; + ld.local.u8 %rs818, [%rd1142+163]; + ld.local.u8 %rs819, [%rd1142+164]; + ld.local.u8 %rs820, [%rd1142+165]; + ld.local.u8 %rs821, [%rd1142+166]; + ld.local.u8 %rs822, [%rd1142+167]; + ld.local.u8 %rs823, [%rd1142+168]; + ld.local.u8 %rs824, [%rd1142+169]; + ld.local.u8 %rs825, [%rd1142+170]; + ld.local.u8 %rs826, [%rd1142+171]; + ld.local.u8 %rs827, [%rd1142+172]; + ld.local.u8 %rs828, [%rd1142+173]; + ld.local.u8 %rs829, [%rd1142+174]; + ld.local.u8 %rs830, [%rd1142+175]; + ld.local.u8 %rs831, [%rd1142+176]; + ld.local.u8 %rs832, [%rd1142+177]; + ld.local.u8 %rs833, [%rd1142+178]; + ld.local.u8 %rs834, [%rd1142+179]; + ld.local.u8 %rs835, [%rd1142+180]; + ld.local.u8 %rs836, [%rd1142+181]; + ld.local.u8 %rs837, [%rd1142+182]; + ld.local.u8 %rs838, [%rd1142+183]; + ld.local.u8 %rs839, [%rd1142+184]; + ld.local.u8 %rs840, [%rd1142+185]; + ld.local.u8 %rs841, [%rd1142+186]; + ld.local.u8 %rs842, [%rd1142+187]; + 
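+ // callseq 13 above invokes _Z20blake3_hasher_updateP13blake3_hasherPKvy, which demangles to blake3_hasher_update(blake3_hasher*, const void*, unsigned long long); the long ld.local.u8 run in $L__BB2_98 appears to reload the hasher's buffered block a byte at a time ahead of the final chunk-state compression.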
ld.local.u8 %rs843, [%rd1142+188]; + ld.local.u8 %rs844, [%rd1142+189]; + ld.local.u8 %rs845, [%rd1142+190]; + ld.local.u8 %rs846, [%rd1142+191]; + ld.local.u8 %rs847, [%rd1142+192]; + ld.local.u8 %rs848, [%rd1142+193]; + ld.local.u8 %rs849, [%rd1142+194]; + ld.local.u8 %rs850, [%rd1142+195]; + ld.local.u8 %rs851, [%rd1142+196]; + ld.local.u8 %rs852, [%rd1142+197]; + ld.local.u8 %rs853, [%rd1142+198]; + ld.local.u8 %rs854, [%rd1142+199]; + ld.local.v4.u16 {%rs855, %rs857, %rs859, %rs861}, [%rd1142+200]; + shr.u16 %rs856, %rs855, 8; + shr.u16 %rs858, %rs857, 8; + shr.u16 %rs860, %rs859, 8; + shr.u16 %rs862, %rs861, 8; + ld.local.u8 %rs863, [%rd1142+208]; + mov.u16 %rs864, 64; + bra.uni $L__BB2_99; -$L__BB0_15: - setp.lt.u64 %p16, %rd484, %rd77; +$L__BB2_103: + ld.local.v4.u8 {%rs570, %rs571, %rs572, %rs573}, [%rd3+136]; + setp.eq.s16 %p58, %rs571, 0; + selp.u16 %rs575, 1, 0, %p58; + ld.local.v2.u32 {%r28596, %r28597}, [%rd3+32]; + ld.local.v2.u32 {%r28600, %r28601}, [%rd3+40]; + ld.local.v2.u32 {%r28604, %r28605}, [%rd3+48]; + ld.local.v2.u32 {%r28608, %r28609}, [%rd3+56]; + ld.local.v4.u16 {%rs576, %rs577, %rs578, %rs579}, [%rd3+72]; + shr.u16 %rs581, %rs576, 8; + shr.u16 %rs583, %rs577, 8; + shr.u16 %rs585, %rs578, 8; + shr.u16 %rs587, %rs579, 8; + ld.local.v4.u16 {%rs588, %rs589, %rs590, %rs591}, [%rd3+80]; + shr.u16 %rs593, %rs588, 8; + shr.u16 %rs595, %rs589, 8; + shr.u16 %rs597, %rs590, 8; + shr.u16 %rs599, %rs591, 8; + ld.local.v4.u16 {%rs600, %rs601, %rs602, %rs603}, [%rd3+88]; + shr.u16 %rs605, %rs600, 8; + shr.u16 %rs607, %rs601, 8; + shr.u16 %rs609, %rs602, 8; + shr.u16 %rs611, %rs603, 8; + ld.local.v4.u16 {%rs612, %rs613, %rs614, %rs615}, [%rd3+96]; + shr.u16 %rs617, %rs612, 8; + shr.u16 %rs619, %rs613, 8; + shr.u16 %rs621, %rs614, 8; + shr.u16 %rs623, %rs615, 8; + ld.local.v4.u16 {%rs624, %rs625, %rs626, %rs627}, [%rd3+104]; + shr.u16 %rs629, %rs624, 8; + shr.u16 %rs631, %rs625, 8; + shr.u16 %rs633, %rs626, 8; + shr.u16 %rs635, %rs627, 8; + ld.local.v4.u16 {%rs636, %rs637, %rs638, %rs639}, [%rd3+112]; + shr.u16 %rs641, %rs636, 8; + shr.u16 %rs643, %rs637, 8; + shr.u16 %rs645, %rs638, 8; + shr.u16 %rs647, %rs639, 8; + ld.local.v4.u16 {%rs648, %rs649, %rs650, %rs651}, [%rd3+120]; + shr.u16 %rs653, %rs648, 8; + shr.u16 %rs655, %rs649, 8; + ld.local.v2.u8 {%rs657, %rs658}, [%rd3+126]; + ld.local.u16 %r28612, [%rd3+132]; + ld.local.v2.u8 {%rs661, %rs662}, [%rd3+134]; + or.b16 %rs665, %rs572, %rs575; + or.b16 %rs666, %rs665, 10; + cvt.u32.u16 %r28613, %rs576; + and.b32 %r28614, %r28613, 255; + cvt.u32.u16 %r28615, %rs581; + prmt.b32 %r28616, %r28615, %r28614, 30212; + cvt.u32.u16 %r28617, %rs577; + prmt.b32 %r28618, %r28617, %r28616, 28756; + cvt.u32.u16 %r28619, %rs583; + prmt.b32 %r28620, %r28619, %r28618, 1620; + cvt.u32.u16 %r28621, %rs578; + and.b32 %r28622, %r28621, 255; + cvt.u32.u16 %r28623, %rs585; + prmt.b32 %r28624, %r28623, %r28622, 30212; + cvt.u32.u16 %r28625, %rs579; + prmt.b32 %r28626, %r28625, %r28624, 28756; + cvt.u32.u16 %r28627, %rs587; + prmt.b32 %r28628, %r28627, %r28626, 1620; + cvt.u32.u16 %r28629, %rs588; + and.b32 %r28630, %r28629, 255; + cvt.u32.u16 %r28631, %rs593; + prmt.b32 %r28632, %r28631, %r28630, 30212; + cvt.u32.u16 %r28633, %rs589; + prmt.b32 %r28634, %r28633, %r28632, 28756; + cvt.u32.u16 %r28635, %rs595; + prmt.b32 %r28636, %r28635, %r28634, 1620; + cvt.u32.u16 %r28637, %rs590; + and.b32 %r28638, %r28637, 255; + cvt.u32.u16 %r28639, %rs597; + prmt.b32 %r28640, %r28639, %r28638, 30212; + cvt.u32.u16 %r28641, %rs591; + prmt.b32 %r28642, %r28641, 
%r28640, 28756; + cvt.u32.u16 %r28643, %rs599; + prmt.b32 %r28644, %r28643, %r28642, 1620; + cvt.u32.u16 %r28645, %rs600; + and.b32 %r28646, %r28645, 255; + cvt.u32.u16 %r28647, %rs605; + prmt.b32 %r28648, %r28647, %r28646, 30212; + cvt.u32.u16 %r28649, %rs601; + prmt.b32 %r28650, %r28649, %r28648, 28756; + cvt.u32.u16 %r28651, %rs607; + prmt.b32 %r28652, %r28651, %r28650, 1620; + cvt.u32.u16 %r28653, %rs602; + and.b32 %r28654, %r28653, 255; + cvt.u32.u16 %r28655, %rs609; + prmt.b32 %r28656, %r28655, %r28654, 30212; + cvt.u32.u16 %r28657, %rs603; + prmt.b32 %r28658, %r28657, %r28656, 28756; + cvt.u32.u16 %r28659, %rs611; + prmt.b32 %r28660, %r28659, %r28658, 1620; + cvt.u32.u16 %r28661, %rs612; + and.b32 %r28662, %r28661, 255; + cvt.u32.u16 %r28663, %rs617; + prmt.b32 %r28664, %r28663, %r28662, 30212; + cvt.u32.u16 %r28665, %rs613; + prmt.b32 %r28666, %r28665, %r28664, 28756; + cvt.u32.u16 %r28667, %rs619; + prmt.b32 %r28668, %r28667, %r28666, 1620; + cvt.u32.u16 %r28669, %rs614; + and.b32 %r28670, %r28669, 255; + cvt.u32.u16 %r28671, %rs621; + prmt.b32 %r28672, %r28671, %r28670, 30212; + cvt.u32.u16 %r28673, %rs615; + prmt.b32 %r28674, %r28673, %r28672, 28756; + cvt.u32.u16 %r28675, %rs623; + prmt.b32 %r28676, %r28675, %r28674, 1620; + cvt.u32.u16 %r28677, %rs624; + and.b32 %r28678, %r28677, 255; + cvt.u32.u16 %r28679, %rs629; + prmt.b32 %r28680, %r28679, %r28678, 30212; + cvt.u32.u16 %r28681, %rs625; + prmt.b32 %r28682, %r28681, %r28680, 28756; + cvt.u32.u16 %r28683, %rs631; + prmt.b32 %r28684, %r28683, %r28682, 1620; + cvt.u32.u16 %r28685, %rs626; + and.b32 %r28686, %r28685, 255; + cvt.u32.u16 %r28687, %rs633; + prmt.b32 %r28688, %r28687, %r28686, 30212; + cvt.u32.u16 %r28689, %rs627; + prmt.b32 %r28690, %r28689, %r28688, 28756; + cvt.u32.u16 %r28691, %rs635; + prmt.b32 %r28692, %r28691, %r28690, 1620; + cvt.u32.u16 %r28693, %rs636; + and.b32 %r28694, %r28693, 255; + cvt.u32.u16 %r28695, %rs641; + prmt.b32 %r28696, %r28695, %r28694, 30212; + cvt.u32.u16 %r28697, %rs637; + prmt.b32 %r28698, %r28697, %r28696, 28756; + cvt.u32.u16 %r28699, %rs643; + prmt.b32 %r28700, %r28699, %r28698, 1620; + cvt.u32.u16 %r28701, %rs638; + and.b32 %r28702, %r28701, 255; + cvt.u32.u16 %r28703, %rs645; + prmt.b32 %r28704, %r28703, %r28702, 30212; + cvt.u32.u16 %r28705, %rs639; + prmt.b32 %r28706, %r28705, %r28704, 28756; + cvt.u32.u16 %r28707, %rs647; + prmt.b32 %r28708, %r28707, %r28706, 1620; + cvt.u32.u16 %r28709, %rs648; + and.b32 %r28710, %r28709, 255; + cvt.u32.u16 %r28711, %rs653; + prmt.b32 %r28712, %r28711, %r28710, 30212; + cvt.u32.u16 %r28713, %rs649; + prmt.b32 %r28714, %r28713, %r28712, 28756; + cvt.u32.u16 %r28715, %rs655; + prmt.b32 %r28716, %r28715, %r28714, 1620; + cvt.u32.u16 %r28717, %rs650; + and.b32 %r28718, %r28717, 255; + ld.local.u8 %r28719, [%rd3+125]; + prmt.b32 %r28720, %r28719, %r28718, 30212; + cvt.u32.u16 %r28721, %rs657; + prmt.b32 %r28722, %r28721, %r28720, 28756; + cvt.u32.u16 %r28723, %rs658; + prmt.b32 %r28724, %r28723, %r28722, 1620; + ld.local.u32 %r28725, [%rd3+128]; + cvt.u32.u16 %r28726, %rs661; + prmt.b32 %r28727, %r28726, %r28612, 28756; + cvt.u32.u16 %r28728, %rs662; + prmt.b32 %r28729, %r28728, %r28727, 1620; + cvt.u32.u16 %r28730, %rs570; + cvt.u32.u16 %r28731, %rs666; + and.b32 %r28732, %r28731, 255; + add.s32 %r28733, %r28604, %r28596; + add.s32 %r28734, %r28733, %r28620; + add.s32 %r28735, %r28628, %r28734; + add.s32 %r28736, %r28605, %r28597; + add.s32 %r28737, %r28736, %r28636; + add.s32 %r28738, %r28644, %r28737; + add.s32 %r28739, %r28608, %r28600; + 
add.s32 %r28740, %r28739, %r28652; + xor.b32 %r28741, %r28740, %r28730; + shr.u32 %r28742, %r28740, 16; + shl.b32 %r28743, %r28741, 16; + or.b32 %r28744, %r28743, %r28742; + add.s32 %r28745, %r28744, 1013904242; + xor.b32 %r28746, %r28745, %r28608; + shf.l.wrap.b32 %r28747, %r28746, %r28746, 20; + add.s32 %r28748, %r28660, %r28740; + add.s32 %r28749, %r28748, %r28747; + xor.b32 %r28750, %r28749, %r28744; + shf.l.wrap.b32 %r28751, %r28750, %r28750, 24; + add.s32 %r28752, %r28751, %r28745; + xor.b32 %r28753, %r28752, %r28747; + shf.l.wrap.b32 %r28754, %r28753, %r28753, 25; + add.s32 %r28755, %r28609, %r28601; + add.s32 %r28756, %r28755, %r28668; + xor.b32 %r28757, %r28756, %r28732; + shr.u32 %r28758, %r28756, 16; + shl.b32 %r28759, %r28757, 16; + or.b32 %r28760, %r28759, %r28758; + add.s32 %r28761, %r28760, -1521486534; + xor.b32 %r28762, %r28761, %r28609; + shf.l.wrap.b32 %r28763, %r28762, %r28762, 20; + add.s32 %r28764, %r28676, %r28756; + add.s32 %r28765, %r28764, %r28763; + xor.b32 %r28766, %r28765, %r28760; + shf.l.wrap.b32 %r28767, %r28766, %r28766, 24; + add.s32 %r28768, %r28767, %r28761; + xor.b32 %r28769, %r28768, %r28763; + shf.l.wrap.b32 %r28770, %r28769, %r28769, 25; + add.s32 %r28771, %r28700, %r28754; + add.s32 %r28772, %r28770, %r28749; + add.s32 %r28773, %r28772, %r28716; + add.s32 %r28774, %r28724, %r28773; + add.s32 %r28775, %r28725, %r28765; + shf.l.wrap.b32 %r28776, %r28734, %r28734, 16; + add.s32 %r28777, %r28776, 1779033703; + xor.b32 %r28778, %r28777, %r28604; + shf.l.wrap.b32 %r28779, %r28778, %r28778, 20; + add.s32 %r28780, %r28735, %r28779; + xor.b32 %r28781, %r28780, %r28776; + shf.l.wrap.b32 %r28782, %r28781, %r28781, 24; + add.s32 %r28783, %r28782, %r28777; + xor.b32 %r28784, %r28783, %r28779; + shf.l.wrap.b32 %r28785, %r28784, %r28784, 25; + shf.l.wrap.b32 %r28786, %r28737, %r28737, 16; + add.s32 %r28787, %r28786, -1150833019; + xor.b32 %r28788, %r28787, %r28605; + shf.l.wrap.b32 %r28789, %r28788, %r28788, 20; + add.s32 %r28790, %r28738, %r28789; + xor.b32 %r28791, %r28790, %r28786; + shf.l.wrap.b32 %r28792, %r28791, %r28791, 24; + add.s32 %r28793, %r28792, %r28787; + xor.b32 %r28794, %r28793, %r28789; + shf.l.wrap.b32 %r28795, %r28794, %r28794, 25; + add.s32 %r28796, %r28780, %r28684; + add.s32 %r28797, %r28796, %r28795; + xor.b32 %r28798, %r28797, %r28767; + shf.l.wrap.b32 %r28799, %r28798, %r28798, 16; + add.s32 %r28800, %r28799, %r28752; + xor.b32 %r28801, %r28800, %r28795; + shf.l.wrap.b32 %r28802, %r28801, %r28801, 20; + add.s32 %r28803, %r28797, %r28692; + add.s32 %r28804, %r28803, %r28802; + xor.b32 %r28805, %r28804, %r28799; + shf.l.wrap.b32 %r28806, %r28805, %r28805, 24; + add.s32 %r28807, %r28806, %r28800; + xor.b32 %r28808, %r28807, %r28802; + shf.l.wrap.b32 %r28809, %r28808, %r28808, 25; + add.s32 %r28810, %r28771, %r28790; + xor.b32 %r28811, %r28782, %r28810; + shf.l.wrap.b32 %r28812, %r28811, %r28811, 16; + add.s32 %r28813, %r28812, %r28768; + xor.b32 %r28814, %r28813, %r28754; + shf.l.wrap.b32 %r28815, %r28814, %r28814, 20; + add.s32 %r28816, %r28810, %r28708; + add.s32 %r28817, %r28816, %r28815; + xor.b32 %r28818, %r28817, %r28812; + shf.l.wrap.b32 %r28819, %r28818, %r28818, 24; + add.s32 %r28820, %r28819, %r28813; + xor.b32 %r28821, %r28820, %r28815; + shf.l.wrap.b32 %r28822, %r28821, %r28821, 25; + xor.b32 %r28823, %r28792, %r28773; + shf.l.wrap.b32 %r28824, %r28823, %r28823, 16; + add.s32 %r28825, %r28824, %r28783; + xor.b32 %r28826, %r28825, %r28770; + shf.l.wrap.b32 %r28827, %r28826, %r28826, 20; + add.s32 %r28828, %r28774, %r28827; + 
xor.b32 %r28829, %r28828, %r28824; + shf.l.wrap.b32 %r28830, %r28829, %r28829, 24; + add.s32 %r28831, %r28830, %r28825; + xor.b32 %r28832, %r28831, %r28827; + shf.l.wrap.b32 %r28833, %r28832, %r28832, 25; + add.s32 %r28834, %r28775, %r28785; + xor.b32 %r28835, %r28834, %r28751; + shf.l.wrap.b32 %r28836, %r28835, %r28835, 16; + add.s32 %r28837, %r28836, %r28793; + xor.b32 %r28838, %r28837, %r28785; + shf.l.wrap.b32 %r28839, %r28838, %r28838, 20; + add.s32 %r28840, %r28834, %r28729; + add.s32 %r28841, %r28840, %r28839; + xor.b32 %r28842, %r28841, %r28836; + shf.l.wrap.b32 %r28843, %r28842, %r28842, 24; + add.s32 %r28844, %r28843, %r28837; + xor.b32 %r28845, %r28844, %r28839; + shf.l.wrap.b32 %r28846, %r28845, %r28845, 25; + add.s32 %r28847, %r28804, %r28636; + add.s32 %r28848, %r28847, %r28846; + xor.b32 %r28849, %r28848, %r28819; + shf.l.wrap.b32 %r28850, %r28849, %r28849, 16; + add.s32 %r28851, %r28850, %r28831; + xor.b32 %r28852, %r28851, %r28846; + shf.l.wrap.b32 %r28853, %r28852, %r28852, 20; + add.s32 %r28854, %r28848, %r28668; + add.s32 %r28855, %r28854, %r28853; + xor.b32 %r28856, %r28855, %r28850; + shf.l.wrap.b32 %r28857, %r28856, %r28856, 24; + add.s32 %r28858, %r28857, %r28851; + xor.b32 %r28859, %r28858, %r28853; + shf.l.wrap.b32 %r28860, %r28859, %r28859, 25; + add.s32 %r28861, %r28817, %r28644; + add.s32 %r28862, %r28861, %r28809; + xor.b32 %r28863, %r28862, %r28830; + shf.l.wrap.b32 %r28864, %r28863, %r28863, 16; + add.s32 %r28865, %r28864, %r28844; + xor.b32 %r28866, %r28865, %r28809; + shf.l.wrap.b32 %r28867, %r28866, %r28866, 20; + add.s32 %r28868, %r28862, %r28700; + add.s32 %r28869, %r28868, %r28867; + xor.b32 %r28870, %r28869, %r28864; + shf.l.wrap.b32 %r28871, %r28870, %r28870, 24; + add.s32 %r28872, %r28871, %r28865; + xor.b32 %r28873, %r28872, %r28867; + shf.l.wrap.b32 %r28874, %r28873, %r28873, 25; + add.s32 %r28875, %r28828, %r28676; + add.s32 %r28876, %r28875, %r28822; + xor.b32 %r28877, %r28843, %r28876; + shf.l.wrap.b32 %r28878, %r28877, %r28877, 16; + add.s32 %r28879, %r28878, %r28807; + xor.b32 %r28880, %r28879, %r28822; + shf.l.wrap.b32 %r28881, %r28880, %r28880, 20; + add.s32 %r28882, %r28876, %r28620; + add.s32 %r28883, %r28882, %r28881; + xor.b32 %r28884, %r28883, %r28878; + shf.l.wrap.b32 %r28885, %r28884, %r28884, 24; + add.s32 %r28886, %r28885, %r28879; + xor.b32 %r28887, %r28886, %r28881; + shf.l.wrap.b32 %r28888, %r28887, %r28887, 25; + add.s32 %r28889, %r28841, %r28652; + add.s32 %r28890, %r28889, %r28833; + xor.b32 %r28891, %r28806, %r28890; + shf.l.wrap.b32 %r28892, %r28891, %r28891, 16; + add.s32 %r28893, %r28892, %r28820; + xor.b32 %r28894, %r28893, %r28833; + shf.l.wrap.b32 %r28895, %r28894, %r28894, 20; + add.s32 %r28896, %r28890, %r28724; + add.s32 %r28897, %r28896, %r28895; + xor.b32 %r28898, %r28897, %r28892; + shf.l.wrap.b32 %r28899, %r28898, %r28898, 24; + add.s32 %r28900, %r28899, %r28893; + xor.b32 %r28901, %r28900, %r28895; + shf.l.wrap.b32 %r28902, %r28901, %r28901, 25; + add.s32 %r28903, %r28855, %r28628; + add.s32 %r28904, %r28903, %r28874; + xor.b32 %r28905, %r28904, %r28899; + shf.l.wrap.b32 %r28906, %r28905, %r28905, 16; + add.s32 %r28907, %r28906, %r28886; + xor.b32 %r28908, %r28907, %r28874; + shf.l.wrap.b32 %r28909, %r28908, %r28908, 20; + add.s32 %r28910, %r28904, %r28708; + add.s32 %r28911, %r28910, %r28909; + xor.b32 %r28912, %r28911, %r28906; + shf.l.wrap.b32 %r28913, %r28912, %r28912, 24; + add.s32 %r28914, %r28913, %r28907; + xor.b32 %r28915, %r28914, %r28909; + shf.l.wrap.b32 %r28916, %r28915, %r28915, 25; + add.s32 
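// Each add/xor/shf.l.wrap cluster in this run is one BLAKE3 G mixing step:
+ // shf.l.wrap.b32 x, x, n is rotl32(x, n), and the left-rotates by 16, 20, 24
+ // and 25 are exactly the spec's right-rotates by 16, 12, 8 and 7.
+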
%r28917, %r28888, %r28716; + add.s32 %r28918, %r28917, %r28869; + xor.b32 %r28919, %r28857, %r28918; + shf.l.wrap.b32 %r28920, %r28919, %r28919, 16; + add.s32 %r28921, %r28920, %r28900; + xor.b32 %r28922, %r28921, %r28888; + shf.l.wrap.b32 %r28923, %r28922, %r28922, 20; + add.s32 %r28924, %r28918, %r28660; + add.s32 %r28925, %r28924, %r28923; + xor.b32 %r28926, %r28925, %r28920; + shf.l.wrap.b32 %r28927, %r28926, %r28926, 24; + add.s32 %r28928, %r28927, %r28921; + xor.b32 %r28929, %r28928, %r28923; + shf.l.wrap.b32 %r28930, %r28929, %r28929, 25; + add.s32 %r28931, %r28883, %r28692; + add.s32 %r28932, %r28931, %r28902; + xor.b32 %r28933, %r28871, %r28932; + shf.l.wrap.b32 %r28934, %r28933, %r28933, 16; + add.s32 %r28935, %r28934, %r28858; + xor.b32 %r28936, %r28935, %r28902; + shf.l.wrap.b32 %r28937, %r28936, %r28936, 20; + add.s32 %r28938, %r28932, %r28725; + add.s32 %r28939, %r28938, %r28937; + xor.b32 %r28940, %r28939, %r28934; + shf.l.wrap.b32 %r28941, %r28940, %r28940, 24; + add.s32 %r28942, %r28941, %r28935; + xor.b32 %r28943, %r28942, %r28937; + shf.l.wrap.b32 %r28944, %r28943, %r28943, 25; + add.s32 %r28945, %r28897, %r28729; + add.s32 %r28946, %r28945, %r28860; + xor.b32 %r28947, %r28946, %r28885; + shf.l.wrap.b32 %r28948, %r28947, %r28947, 16; + add.s32 %r28949, %r28948, %r28872; + xor.b32 %r28950, %r28949, %r28860; + shf.l.wrap.b32 %r28951, %r28950, %r28950, 20; + add.s32 %r28952, %r28946, %r28684; + add.s32 %r28953, %r28952, %r28951; + xor.b32 %r28954, %r28953, %r28948; + shf.l.wrap.b32 %r28955, %r28954, %r28954, 24; + add.s32 %r28956, %r28955, %r28949; + xor.b32 %r28957, %r28956, %r28951; + shf.l.wrap.b32 %r28958, %r28957, %r28957, 25; + add.s32 %r28959, %r28911, %r28644; + add.s32 %r28960, %r28959, %r28958; + xor.b32 %r28961, %r28960, %r28927; + shf.l.wrap.b32 %r28962, %r28961, %r28961, 16; + add.s32 %r28963, %r28962, %r28942; + xor.b32 %r28964, %r28963, %r28958; + shf.l.wrap.b32 %r28965, %r28964, %r28964, 20; + add.s32 %r28966, %r28960, %r28652; + add.s32 %r28967, %r28966, %r28965; + xor.b32 %r28968, %r28967, %r28962; + shf.l.wrap.b32 %r28969, %r28968, %r28968, 24; + add.s32 %r28970, %r28969, %r28963; + xor.b32 %r28971, %r28970, %r28965; + shf.l.wrap.b32 %r28972, %r28971, %r28971, 25; + add.s32 %r28973, %r28925, %r28700; + add.s32 %r28974, %r28973, %r28916; + xor.b32 %r28975, %r28974, %r28941; + shf.l.wrap.b32 %r28976, %r28975, %r28975, 16; + add.s32 %r28977, %r28976, %r28956; + xor.b32 %r28978, %r28977, %r28916; + shf.l.wrap.b32 %r28979, %r28978, %r28978, 20; + add.s32 %r28980, %r28974, %r28716; + add.s32 %r28981, %r28980, %r28979; + xor.b32 %r28982, %r28981, %r28976; + shf.l.wrap.b32 %r28983, %r28982, %r28982, 24; + add.s32 %r28984, %r28983, %r28977; + xor.b32 %r28985, %r28984, %r28979; + shf.l.wrap.b32 %r28986, %r28985, %r28985, 25; + add.s32 %r28987, %r28939, %r28724; + add.s32 %r28988, %r28987, %r28930; + xor.b32 %r28989, %r28955, %r28988; + shf.l.wrap.b32 %r28990, %r28989, %r28989, 16; + add.s32 %r28991, %r28990, %r28914; + xor.b32 %r28992, %r28991, %r28930; + shf.l.wrap.b32 %r28993, %r28992, %r28992, 20; + add.s32 %r28994, %r28988, %r28636; + add.s32 %r28995, %r28994, %r28993; + xor.b32 %r28996, %r28995, %r28990; + shf.l.wrap.b32 %r28997, %r28996, %r28996, 24; + add.s32 %r28998, %r28997, %r28991; + xor.b32 %r28999, %r28998, %r28993; + shf.l.wrap.b32 %r29000, %r28999, %r28999, 25; + add.s32 %r29001, %r28953, %r28676; + add.s32 %r29002, %r29001, %r28944; + xor.b32 %r29003, %r28913, %r29002; + shf.l.wrap.b32 %r29004, %r29003, %r29003, 16; + add.s32 %r29005, %r29004, 
%r28928; + xor.b32 %r29006, %r29005, %r28944; + shf.l.wrap.b32 %r29007, %r29006, %r29006, 20; + add.s32 %r29008, %r29002, %r28725; + add.s32 %r29009, %r29008, %r29007; + xor.b32 %r29010, %r29009, %r29004; + shf.l.wrap.b32 %r29011, %r29010, %r29010, 24; + add.s32 %r29012, %r29011, %r29005; + xor.b32 %r29013, %r29012, %r29007; + shf.l.wrap.b32 %r29014, %r29013, %r29013, 25; + add.s32 %r29015, %r28967, %r28668; + add.s32 %r29016, %r29015, %r28986; + xor.b32 %r29017, %r29016, %r29011; + shf.l.wrap.b32 %r29018, %r29017, %r29017, 16; + add.s32 %r29019, %r29018, %r28998; + xor.b32 %r29020, %r29019, %r28986; + shf.l.wrap.b32 %r29021, %r29020, %r29020, 20; + add.s32 %r29022, %r29016, %r28660; + add.s32 %r29023, %r29022, %r29021; + xor.b32 %r29024, %r29023, %r29018; + shf.l.wrap.b32 %r29025, %r29024, %r29024, 24; + add.s32 %r29026, %r29025, %r29019; + xor.b32 %r29027, %r29026, %r29021; + shf.l.wrap.b32 %r29028, %r29027, %r29027, 25; + add.s32 %r29029, %r29000, %r28692; + add.s32 %r29030, %r29029, %r28981; + xor.b32 %r29031, %r28969, %r29030; + shf.l.wrap.b32 %r29032, %r29031, %r29031, 16; + add.s32 %r29033, %r29032, %r29012; + xor.b32 %r29034, %r29033, %r29000; + shf.l.wrap.b32 %r29035, %r29034, %r29034, 20; + add.s32 %r29036, %r29030, %r28620; + add.s32 %r29037, %r29036, %r29035; + xor.b32 %r29038, %r29037, %r29032; + shf.l.wrap.b32 %r29039, %r29038, %r29038, 24; + add.s32 %r29040, %r29039, %r29033; + xor.b32 %r29041, %r29040, %r29035; + shf.l.wrap.b32 %r29042, %r29041, %r29041, 25; + add.s32 %r29043, %r28995, %r28708; + add.s32 %r29044, %r29043, %r29014; + xor.b32 %r29045, %r28983, %r29044; + shf.l.wrap.b32 %r29046, %r29045, %r29045, 16; + add.s32 %r29047, %r29046, %r28970; + xor.b32 %r29048, %r29047, %r29014; + shf.l.wrap.b32 %r29049, %r29048, %r29048, 20; + add.s32 %r29050, %r29044, %r28729; + add.s32 %r29051, %r29050, %r29049; + xor.b32 %r29052, %r29051, %r29046; + shf.l.wrap.b32 %r29053, %r29052, %r29052, 24; + add.s32 %r29054, %r29053, %r29047; + xor.b32 %r29055, %r29054, %r29049; + shf.l.wrap.b32 %r29056, %r29055, %r29055, 25; + add.s32 %r29057, %r29009, %r28684; + add.s32 %r29058, %r29057, %r28972; + xor.b32 %r29059, %r29058, %r28997; + shf.l.wrap.b32 %r29060, %r29059, %r29059, 16; + add.s32 %r29061, %r29060, %r28984; + xor.b32 %r29062, %r29061, %r28972; + shf.l.wrap.b32 %r29063, %r29062, %r29062, 20; + add.s32 %r29064, %r29058, %r28628; + add.s32 %r29065, %r29064, %r29063; + xor.b32 %r29066, %r29065, %r29060; + shf.l.wrap.b32 %r29067, %r29066, %r29066, 24; + add.s32 %r29068, %r29067, %r29061; + xor.b32 %r29069, %r29068, %r29063; + shf.l.wrap.b32 %r29070, %r29069, %r29069, 25; + add.s32 %r29071, %r29023, %r28700; + add.s32 %r29072, %r29071, %r29070; + xor.b32 %r29073, %r29072, %r29039; + shf.l.wrap.b32 %r29074, %r29073, %r29073, 16; + add.s32 %r29075, %r29074, %r29054; + xor.b32 %r29076, %r29075, %r29070; + shf.l.wrap.b32 %r29077, %r29076, %r29076, 20; + add.s32 %r29078, %r29072, %r28676; + add.s32 %r29079, %r29078, %r29077; + xor.b32 %r29080, %r29079, %r29074; + shf.l.wrap.b32 %r29081, %r29080, %r29080, 24; + add.s32 %r29082, %r29081, %r29075; + xor.b32 %r29083, %r29082, %r29077; + shf.l.wrap.b32 %r29084, %r29083, %r29083, 25; + add.s32 %r29085, %r29037, %r28716; + add.s32 %r29086, %r29085, %r29028; + xor.b32 %r29087, %r29086, %r29053; + shf.l.wrap.b32 %r29088, %r29087, %r29087, 16; + add.s32 %r29089, %r29088, %r29068; + xor.b32 %r29090, %r29089, %r29028; + shf.l.wrap.b32 %r29091, %r29090, %r29090, 20; + add.s32 %r29092, %r29086, %r28692; + add.s32 %r29093, %r29092, %r29091; + xor.b32 
%r29094, %r29093, %r29088; + shf.l.wrap.b32 %r29095, %r29094, %r29094, 24; + add.s32 %r29096, %r29095, %r29089; + xor.b32 %r29097, %r29096, %r29091; + shf.l.wrap.b32 %r29098, %r29097, %r29097, 25; + add.s32 %r29099, %r29051, %r28725; + add.s32 %r29100, %r29099, %r29042; + xor.b32 %r29101, %r29067, %r29100; + shf.l.wrap.b32 %r29102, %r29101, %r29101, 16; + add.s32 %r29103, %r29102, %r29026; + xor.b32 %r29104, %r29103, %r29042; + shf.l.wrap.b32 %r29105, %r29104, %r29104, 20; + add.s32 %r29106, %r29100, %r28644; + add.s32 %r29107, %r29106, %r29105; + xor.b32 %r29108, %r29107, %r29102; + shf.l.wrap.b32 %r29109, %r29108, %r29108, 24; + add.s32 %r29110, %r29109, %r29103; + xor.b32 %r29111, %r29110, %r29105; + shf.l.wrap.b32 %r29112, %r29111, %r29111, 25; + add.s32 %r29113, %r29065, %r28724; + add.s32 %r29114, %r29113, %r29056; + xor.b32 %r29115, %r29025, %r29114; + shf.l.wrap.b32 %r29116, %r29115, %r29115, 16; + add.s32 %r29117, %r29116, %r29040; + xor.b32 %r29118, %r29117, %r29056; + shf.l.wrap.b32 %r29119, %r29118, %r29118, 20; + add.s32 %r29120, %r29114, %r28729; + add.s32 %r29121, %r29120, %r29119; + xor.b32 %r29122, %r29121, %r29116; + shf.l.wrap.b32 %r29123, %r29122, %r29122, 24; + add.s32 %r29124, %r29123, %r29117; + xor.b32 %r29125, %r29124, %r29119; + shf.l.wrap.b32 %r29126, %r29125, %r29125, 25; + add.s32 %r29127, %r29079, %r28652; + add.s32 %r29128, %r29127, %r29098; + xor.b32 %r29129, %r29128, %r29123; + shf.l.wrap.b32 %r29130, %r29129, %r29129, 16; + add.s32 %r29131, %r29130, %r29110; + xor.b32 %r29132, %r29131, %r29098; + shf.l.wrap.b32 %r29133, %r29132, %r29132, 20; + add.s32 %r29134, %r29128, %r28620; + add.s32 %r29135, %r29134, %r29133; + xor.b32 %r29136, %r29135, %r29130; + shf.l.wrap.b32 %r29137, %r29136, %r29136, 24; + add.s32 %r29138, %r29137, %r29131; + xor.b32 %r29139, %r29138, %r29133; + shf.l.wrap.b32 %r29140, %r29139, %r29139, 25; + add.s32 %r29141, %r29112, %r28708; + add.s32 %r29142, %r29141, %r29093; + xor.b32 %r29143, %r29081, %r29142; + shf.l.wrap.b32 %r29144, %r29143, %r29143, 16; + add.s32 %r29145, %r29144, %r29124; + xor.b32 %r29146, %r29145, %r29112; + shf.l.wrap.b32 %r29147, %r29146, %r29146, 20; + add.s32 %r29148, %r29142, %r28636; + add.s32 %r29149, %r29148, %r29147; + xor.b32 %r29150, %r29149, %r29144; + shf.l.wrap.b32 %r29151, %r29150, %r29150, 24; + add.s32 %r29152, %r29151, %r29145; + xor.b32 %r29153, %r29152, %r29147; + shf.l.wrap.b32 %r29154, %r29153, %r29153, 25; + add.s32 %r29155, %r29107, %r28660; + add.s32 %r29156, %r29155, %r29126; + xor.b32 %r29157, %r29095, %r29156; + shf.l.wrap.b32 %r29158, %r29157, %r29157, 16; + add.s32 %r29159, %r29158, %r29082; + xor.b32 %r29160, %r29159, %r29126; + shf.l.wrap.b32 %r29161, %r29160, %r29160, 20; + add.s32 %r29162, %r29156, %r28684; + add.s32 %r29163, %r29162, %r29161; + xor.b32 %r29164, %r29163, %r29158; + shf.l.wrap.b32 %r29165, %r29164, %r29164, 24; + add.s32 %r29166, %r29165, %r29159; + xor.b32 %r29167, %r29166, %r29161; + shf.l.wrap.b32 %r29168, %r29167, %r29167, 25; + add.s32 %r29169, %r29121, %r28628; + add.s32 %r29170, %r29169, %r29084; + xor.b32 %r29171, %r29170, %r29109; + shf.l.wrap.b32 %r29172, %r29171, %r29171, 16; + add.s32 %r29173, %r29172, %r29096; + xor.b32 %r29174, %r29173, %r29084; + shf.l.wrap.b32 %r29175, %r29174, %r29174, 20; + add.s32 %r29176, %r29170, %r28668; + add.s32 %r29177, %r29176, %r29175; + xor.b32 %r29178, %r29177, %r29172; + shf.l.wrap.b32 %r29179, %r29178, %r29178, 24; + add.s32 %r29180, %r29179, %r29173; + xor.b32 %r29181, %r29180, %r29175; + shf.l.wrap.b32 %r29182, 
%r29181, %r29181, 25; + add.s32 %r29183, %r29135, %r28716; + add.s32 %r29184, %r29183, %r29182; + xor.b32 %r29185, %r29184, %r29151; + shf.l.wrap.b32 %r29186, %r29185, %r29185, 16; + add.s32 %r29187, %r29186, %r29166; + xor.b32 %r29188, %r29187, %r29182; + shf.l.wrap.b32 %r29189, %r29188, %r29188, 20; + add.s32 %r29190, %r29184, %r28724; + add.s32 %r29191, %r29190, %r29189; + xor.b32 %r29192, %r29191, %r29186; + shf.l.wrap.b32 %r29193, %r29192, %r29192, 24; + add.s32 %r29194, %r29193, %r29187; + xor.b32 %r29195, %r29194, %r29189; + shf.l.wrap.b32 %r29196, %r29195, %r29195, 25; + add.s32 %r29197, %r29149, %r28692; + add.s32 %r29198, %r29197, %r29140; + xor.b32 %r29199, %r29198, %r29165; + shf.l.wrap.b32 %r29200, %r29199, %r29199, 16; + add.s32 %r29201, %r29200, %r29180; + xor.b32 %r29202, %r29201, %r29140; + shf.l.wrap.b32 %r29203, %r29202, %r29202, 20; + add.s32 %r29204, %r29198, %r28708; + add.s32 %r29205, %r29204, %r29203; + xor.b32 %r29206, %r29205, %r29200; + shf.l.wrap.b32 %r29207, %r29206, %r29206, 24; + add.s32 %r29208, %r29207, %r29201; + xor.b32 %r29209, %r29208, %r29203; + shf.l.wrap.b32 %r29210, %r29209, %r29209, 25; + add.s32 %r29211, %r29163, %r28729; + add.s32 %r29212, %r29211, %r29154; + xor.b32 %r29213, %r29179, %r29212; + shf.l.wrap.b32 %r29214, %r29213, %r29213, 16; + add.s32 %r29215, %r29214, %r29138; + xor.b32 %r29216, %r29215, %r29154; + shf.l.wrap.b32 %r29217, %r29216, %r29216, 20; + add.s32 %r29218, %r29212, %r28700; + add.s32 %r29219, %r29218, %r29217; + xor.b32 %r29220, %r29219, %r29214; + shf.l.wrap.b32 %r29221, %r29220, %r29220, 24; + add.s32 %r29222, %r29221, %r29215; + xor.b32 %r29223, %r29222, %r29217; + shf.l.wrap.b32 %r29224, %r29223, %r29223, 25; + add.s32 %r29225, %r29177, %r28725; + add.s32 %r29226, %r29225, %r29168; + xor.b32 %r29227, %r29137, %r29226; + shf.l.wrap.b32 %r29228, %r29227, %r29227, 16; + add.s32 %r29229, %r29228, %r29152; + xor.b32 %r29230, %r29229, %r29168; + shf.l.wrap.b32 %r29231, %r29230, %r29230, 20; + add.s32 %r29232, %r29226, %r28684; + add.s32 %r29233, %r29232, %r29231; + xor.b32 %r29234, %r29233, %r29228; + shf.l.wrap.b32 %r29235, %r29234, %r29234, 24; + add.s32 %r29236, %r29235, %r29229; + xor.b32 %r29237, %r29236, %r29231; + shf.l.wrap.b32 %r29238, %r29237, %r29237, 25; + add.s32 %r29239, %r29191, %r28676; + add.s32 %r29240, %r29239, %r29210; + xor.b32 %r29241, %r29240, %r29235; + shf.l.wrap.b32 %r29242, %r29241, %r29241, 16; + add.s32 %r29243, %r29242, %r29222; + xor.b32 %r29244, %r29243, %r29210; + shf.l.wrap.b32 %r29245, %r29244, %r29244, 20; + add.s32 %r29246, %r29240, %r28636; + add.s32 %r29247, %r29246, %r29245; + xor.b32 %r29248, %r29247, %r29242; + shf.l.wrap.b32 %r29249, %r29248, %r29248, 24; + add.s32 %r29250, %r29249, %r29243; + xor.b32 %r29251, %r29250, %r29245; + shf.l.wrap.b32 %r29252, %r29251, %r29251, 25; + add.s32 %r29253, %r29224, %r28660; + add.s32 %r29254, %r29253, %r29205; + xor.b32 %r29255, %r29193, %r29254; + shf.l.wrap.b32 %r29256, %r29255, %r29255, 16; + add.s32 %r29257, %r29256, %r29236; + xor.b32 %r29258, %r29257, %r29224; + shf.l.wrap.b32 %r29259, %r29258, %r29258, 20; + add.s32 %r29260, %r29254, %r28644; + add.s32 %r29261, %r29260, %r29259; + xor.b32 %r29262, %r29261, %r29256; + shf.l.wrap.b32 %r29263, %r29262, %r29262, 24; + add.s32 %r29264, %r29263, %r29257; + xor.b32 %r29265, %r29264, %r29259; + shf.l.wrap.b32 %r29266, %r29265, %r29265, 25; + add.s32 %r29267, %r29219, %r28620; + add.s32 %r29268, %r29267, %r29238; + xor.b32 %r29269, %r29207, %r29268; + shf.l.wrap.b32 %r29270, %r29269, %r29269, 16; + 
add.s32 %r29271, %r29270, %r29194; + xor.b32 %r29272, %r29271, %r29238; + shf.l.wrap.b32 %r29273, %r29272, %r29272, 20; + add.s32 %r29274, %r29268, %r28628; + add.s32 %r29275, %r29274, %r29273; + xor.b32 %r29276, %r29275, %r29270; + shf.l.wrap.b32 %r29277, %r29276, %r29276, 24; + add.s32 %r29278, %r29277, %r29271; + xor.b32 %r29279, %r29278, %r29273; + shf.l.wrap.b32 %r29280, %r29279, %r29279, 25; + add.s32 %r29281, %r29233, %r28668; + add.s32 %r29282, %r29281, %r29196; + xor.b32 %r29283, %r29282, %r29221; + shf.l.wrap.b32 %r29284, %r29283, %r29283, 16; + add.s32 %r29285, %r29284, %r29208; + xor.b32 %r29286, %r29285, %r29196; + shf.l.wrap.b32 %r29287, %r29286, %r29286, 20; + add.s32 %r29288, %r29282, %r28652; + add.s32 %r29289, %r29288, %r29287; + xor.b32 %r29290, %r29289, %r29284; + shf.l.wrap.b32 %r29291, %r29290, %r29290, 24; + add.s32 %r29292, %r29291, %r29285; + xor.b32 %r29293, %r29292, %r29287; + shf.l.wrap.b32 %r29294, %r29293, %r29293, 25; + add.s32 %r29295, %r29247, %r28692; + add.s32 %r29296, %r29295, %r29294; + xor.b32 %r29297, %r29296, %r29263; + shf.l.wrap.b32 %r29298, %r29297, %r29297, 16; + add.s32 %r29299, %r29298, %r29278; + xor.b32 %r29300, %r29299, %r29294; + shf.l.wrap.b32 %r29301, %r29300, %r29300, 20; + add.s32 %r29302, %r29296, %r28725; + add.s32 %r29303, %r29302, %r29301; + xor.b32 %r29304, %r29303, %r29298; + shf.l.wrap.b32 %r29305, %r29304, %r29304, 24; + add.s32 %r29306, %r29305, %r29299; + xor.b32 %r29307, %r29306, %r29301; + shf.l.wrap.b32 %r29308, %r29307, %r29307, 25; + add.s32 %r29309, %r29261, %r28708; + add.s32 %r29310, %r29309, %r29252; + xor.b32 %r29311, %r29310, %r29277; + shf.l.wrap.b32 %r29312, %r29311, %r29311, 16; + add.s32 %r29313, %r29312, %r29292; + xor.b32 %r29314, %r29313, %r29252; + shf.l.wrap.b32 %r29315, %r29314, %r29314, 20; + add.s32 %r29316, %r29310, %r28660; + add.s32 %r29317, %r29316, %r29315; + xor.b32 %r29318, %r29317, %r29312; + shf.l.wrap.b32 %r29319, %r29318, %r29318, 24; + add.s32 %r29320, %r29319, %r29313; + xor.b32 %r29321, %r29320, %r29315; + shf.l.wrap.b32 %r29322, %r29321, %r29321, 25; + add.s32 %r29323, %r29275, %r28684; + add.s32 %r29324, %r29323, %r29266; + xor.b32 %r29325, %r29291, %r29324; + shf.l.wrap.b32 %r29326, %r29325, %r29325, 16; + add.s32 %r29327, %r29326, %r29250; + xor.b32 %r29328, %r29327, %r29266; + shf.l.wrap.b32 %r29329, %r29328, %r29328, 20; + add.s32 %r29330, %r29324, %r28716; + add.s32 %r29331, %r29330, %r29329; + xor.b32 %r29332, %r29331, %r29326; + shf.l.wrap.b32 %r29333, %r29332, %r29332, 24; + add.s32 %r29334, %r29333, %r29327; + xor.b32 %r29335, %r29334, %r29329; + shf.l.wrap.b32 %r29336, %r29335, %r29335, 25; + add.s32 %r29337, %r29289, %r28729; + add.s32 %r29338, %r29337, %r29280; + xor.b32 %r29339, %r29249, %r29338; + shf.l.wrap.b32 %r29340, %r29339, %r29339, 16; + add.s32 %r29341, %r29340, %r29264; + xor.b32 %r29342, %r29341, %r29280; + shf.l.wrap.b32 %r29343, %r29342, %r29342, 20; + add.s32 %r29344, %r29338, %r28628; + add.s32 %r29345, %r29344, %r29343; + xor.b32 %r29346, %r29345, %r29340; + shf.l.wrap.b32 %r29347, %r29346, %r29346, 24; + add.s32 %r29348, %r29347, %r29341; + xor.b32 %r29349, %r29348, %r29343; + shf.l.wrap.b32 %r29350, %r29349, %r29349, 25; + add.s32 %r29351, %r29303, %r28724; + add.s32 %r29352, %r29351, %r29322; + xor.b32 %r29353, %r29352, %r29347; + shf.l.wrap.b32 %r29354, %r29353, %r29353, 16; + add.s32 %r29355, %r29354, %r29334; + xor.b32 %r29356, %r29355, %r29322; + shf.l.wrap.b32 %r29357, %r29356, %r29356, 20; + add.s32 %r29358, %r29352, %r28644; + add.s32 %r29359, 
%r29358, %r29357; + xor.b32 %r29360, %r29359, %r29354; + shf.l.wrap.b32 %r29361, %r29360, %r29360, 24; + add.s32 %r29362, %r29361, %r29355; + xor.b32 %r29363, %r29362, %r29357; + shf.l.wrap.b32 %r29364, %r29363, %r29363, 25; + add.s32 %r29365, %r29336, %r28620; + add.s32 %r29366, %r29365, %r29317; + xor.b32 %r29367, %r29305, %r29366; + shf.l.wrap.b32 %r29368, %r29367, %r29367, 16; + add.s32 %r29369, %r29368, %r29348; + xor.b32 %r29370, %r29369, %r29336; + shf.l.wrap.b32 %r29371, %r29370, %r29370, 20; + add.s32 %r29372, %r29366, %r28700; + add.s32 %r29373, %r29372, %r29371; + xor.b32 %r29374, %r29373, %r29368; + shf.l.wrap.b32 %r29375, %r29374, %r29374, 24; + add.s32 %r29376, %r29375, %r29369; + xor.b32 %r29377, %r29376, %r29371; + shf.l.wrap.b32 %r29378, %r29377, %r29377, 25; + add.s32 %r29379, %r29331, %r28636; + add.s32 %r29380, %r29379, %r29350; + xor.b32 %r29381, %r29319, %r29380; + shf.l.wrap.b32 %r29382, %r29381, %r29381, 16; + add.s32 %r29383, %r29382, %r29306; + xor.b32 %r29384, %r29383, %r29350; + shf.l.wrap.b32 %r29385, %r29384, %r29384, 20; + add.s32 %r29386, %r29380, %r28668; + add.s32 %r29387, %r29386, %r29385; + xor.b32 %r29388, %r29387, %r29382; + shf.l.wrap.b32 %r29389, %r29388, %r29388, 24; + add.s32 %r29390, %r29389, %r29383; + xor.b32 %r29391, %r29390, %r29385; + shf.l.wrap.b32 %r29392, %r29391, %r29391, 25; + add.s32 %r29393, %r29345, %r28652; + add.s32 %r29394, %r29393, %r29308; + xor.b32 %r29395, %r29394, %r29333; + shf.l.wrap.b32 %r29396, %r29395, %r29395, 16; + add.s32 %r29397, %r29396, %r29320; + xor.b32 %r29398, %r29397, %r29308; + shf.l.wrap.b32 %r29399, %r29398, %r29398, 20; + add.s32 %r29400, %r29394, %r28676; + add.s32 %r29401, %r29400, %r29399; + xor.b32 %r29402, %r29401, %r29396; + shf.l.wrap.b32 %r29403, %r29402, %r29402, 24; + add.s32 %r29404, %r29403, %r29397; + xor.b32 %r29405, %r29404, %r29399; + shf.l.wrap.b32 %r29406, %r29405, %r29405, 25; + add.s32 %r29407, %r29359, %r28708; + add.s32 %r29408, %r29407, %r29406; + xor.b32 %r29409, %r29408, %r29375; + shf.l.wrap.b32 %r29410, %r29409, %r29409, 16; + add.s32 %r29411, %r29410, %r29390; + xor.b32 %r29412, %r29411, %r29406; + shf.l.wrap.b32 %r29413, %r29412, %r29412, 20; + add.s32 %r29414, %r29408, %r28729; + add.s32 %r29415, %r29414, %r29413; + xor.b32 %r29416, %r29415, %r29410; + shf.l.wrap.b32 %r29417, %r29416, %r29416, 24; + add.s32 %r29418, %r29417, %r29411; + xor.b32 %r29419, %r29418, %r29413; + shf.l.wrap.b32 %r29420, %r29419, %r29419, 25; + add.s32 %r29421, %r29373, %r28660; + add.s32 %r29422, %r29421, %r29364; + xor.b32 %r29423, %r29422, %r29389; + shf.l.wrap.b32 %r29424, %r29423, %r29423, 16; + add.s32 %r29425, %r29424, %r29404; + xor.b32 %r29426, %r29425, %r29364; + shf.l.wrap.b32 %r29427, %r29426, %r29426, 20; + add.s32 %r29428, %r29422, %r28620; + add.s32 %r29429, %r29428, %r29427; + xor.b32 %r29430, %r29429, %r29424; + shf.l.wrap.b32 %r29431, %r29430, %r29430, 24; + add.s32 %r29432, %r29431, %r29425; + xor.b32 %r29433, %r29432, %r29427; + shf.l.wrap.b32 %r29434, %r29433, %r29433, 25; + add.s32 %r29435, %r29387, %r28628; + add.s32 %r29436, %r29435, %r29378; + xor.b32 %r29437, %r29403, %r29436; + shf.l.wrap.b32 %r29438, %r29437, %r29437, 16; + add.s32 %r29439, %r29438, %r29362; + xor.b32 %r29440, %r29439, %r29378; + shf.l.wrap.b32 %r29441, %r29440, %r29440, 20; + add.s32 %r29442, %r29436, %r28692; + add.s32 %r29443, %r29442, %r29441; + xor.b32 %r29444, %r29443, %r29438; + shf.l.wrap.b32 %r29445, %r29444, %r29444, 24; + add.s32 %r29446, %r29445, %r29439; + xor.b32 %r29447, %r29446, %r29441; + 
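// The rounds continue below; after the last one the two 8-word state halves
+ // are XORed pairwise (h[i] = v[i] ^ v[i+8]) and repacked, with per-byte
+ // shuffling via cvt/shl/bfi/or, into the four 64-bit words %rd341, %rd345,
+ // %rd1280 and %rd1279.
+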
shf.l.wrap.b32 %r29448, %r29447, %r29447, 25; + add.s32 %r29449, %r29401, %r28684; + add.s32 %r29450, %r29449, %r29392; + xor.b32 %r29451, %r29361, %r29450; + shf.l.wrap.b32 %r29452, %r29451, %r29451, 16; + add.s32 %r29453, %r29452, %r29376; + xor.b32 %r29454, %r29453, %r29392; + shf.l.wrap.b32 %r29455, %r29454, %r29454, 20; + add.s32 %r29456, %r29450, %r28668; + add.s32 %r29457, %r29456, %r29455; + xor.b32 %r29458, %r29457, %r29452; + shf.l.wrap.b32 %r29459, %r29458, %r29458, 24; + add.s32 %r29460, %r29459, %r29453; + xor.b32 %r29461, %r29460, %r29455; + shf.l.wrap.b32 %r29462, %r29461, %r29461, 25; + add.s32 %r29463, %r29415, %r28725; + add.s32 %r29464, %r29463, %r29434; + xor.b32 %r29465, %r29464, %r29459; + shf.l.wrap.b32 %r29466, %r29465, %r29465, 16; + add.s32 %r29467, %r29466, %r29446; + xor.b32 %r29468, %r29467, %r29434; + shf.l.wrap.b32 %r29469, %r29468, %r29468, 20; + add.s32 %r29470, %r29464, %r28700; + add.s32 %r29471, %r29470, %r29469; + xor.b32 %r29472, %r29471, %r29466; + shf.l.wrap.b32 %r29473, %r29472, %r29472, 24; + add.s32 %r29474, %r29473, %r29467; + xor.b32 %r29475, %r29474, %r29469; + shf.l.wrap.b32 %r29476, %r29475, %r29475, 25; + add.s32 %r29477, %r29448, %r28636; + add.s32 %r29478, %r29477, %r29429; + xor.b32 %r29479, %r29417, %r29478; + shf.l.wrap.b32 %r29480, %r29479, %r29479, 16; + add.s32 %r29481, %r29480, %r29460; + xor.b32 %r29482, %r29481, %r29448; + shf.l.wrap.b32 %r29483, %r29482, %r29482, 20; + add.s32 %r29484, %r29478, %r28716; + add.s32 %r29485, %r29484, %r29483; + xor.b32 %r29486, %r29485, %r29480; + shf.l.wrap.b32 %r29487, %r29486, %r29486, 24; + add.s32 %r29488, %r29487, %r29481; + xor.b32 %r29489, %r29488, %r29483; + shf.l.wrap.b32 %r29490, %r29489, %r29489, 25; + add.s32 %r29491, %r29443, %r28644; + add.s32 %r29492, %r29491, %r29462; + xor.b32 %r29493, %r29431, %r29492; + shf.l.wrap.b32 %r29494, %r29493, %r29493, 16; + add.s32 %r29495, %r29494, %r29418; + xor.b32 %r29496, %r29495, %r29462; + shf.l.wrap.b32 %r29497, %r29496, %r29496, 20; + add.s32 %r29498, %r29492, %r28652; + add.s32 %r29499, %r29498, %r29497; + xor.b32 %r29500, %r29499, %r29494; + shf.l.wrap.b32 %r29501, %r29500, %r29500, 24; + add.s32 %r29502, %r29501, %r29495; + xor.b32 %r29503, %r29502, %r29497; + shf.l.wrap.b32 %r29504, %r29503, %r29503, 25; + add.s32 %r29505, %r29457, %r28676; + add.s32 %r29506, %r29505, %r29420; + xor.b32 %r29507, %r29506, %r29445; + shf.l.wrap.b32 %r29508, %r29507, %r29507, 16; + add.s32 %r29509, %r29508, %r29432; + xor.b32 %r29510, %r29509, %r29420; + shf.l.wrap.b32 %r29511, %r29510, %r29510, 20; + add.s32 %r29512, %r29506, %r28724; + add.s32 %r29513, %r29512, %r29511; + xor.b32 %r29514, %r29513, %r29508; + shf.l.wrap.b32 %r29515, %r29514, %r29514, 24; + add.s32 %r29516, %r29515, %r29509; + xor.b32 %r29517, %r29516, %r29511; + shf.l.wrap.b32 %r29518, %r29517, %r29517, 25; + xor.b32 %r29519, %r29471, %r29502; + cvt.u64.u32 %rd1190, %r29519; + xor.b32 %r29520, %r29516, %r29485; + and.b32 %r29521, %r29520, 255; + cvt.u64.u32 %rd1191, %r29521; + cvt.u64.u32 %rd1192, %r29520; + shl.b64 %rd1193, %rd1192, 32; + and.b64 %rd1194, %rd1193, 280375465082880; + and.b64 %rd1195, %rd1193, 71776119061217280; + shr.u32 %r29522, %r29520, 24; + cvt.u64.u32 %rd1196, %r29522; + shl.b64 %rd1197, %rd1196, 56; + bfi.b64 %rd1198, %rd1191, %rd1190, 32, 32; + or.b64 %rd1199, %rd1198, %rd1194; + or.b64 %rd1200, %rd1199, %rd1195; + or.b64 %rd341, %rd1200, %rd1197; + xor.b32 %r29523, %r29474, %r29499; + cvt.u64.u32 %rd1201, %r29523; + xor.b32 %r29524, %r29513, %r29488; + and.b32 
%r29525, %r29524, 255; + cvt.u64.u32 %rd1202, %r29525; + cvt.u64.u32 %rd1203, %r29524; + shl.b64 %rd1204, %rd1203, 32; + and.b64 %rd1205, %rd1204, 280375465082880; + and.b64 %rd1206, %rd1204, 71776119061217280; + shr.u32 %r29526, %r29524, 24; + cvt.u64.u32 %rd1207, %r29526; + shl.b64 %rd1208, %rd1207, 56; + bfi.b64 %rd1209, %rd1202, %rd1201, 32, 32; + or.b64 %rd1210, %rd1209, %rd1205; + or.b64 %rd1211, %rd1210, %rd1206; + or.b64 %rd345, %rd1211, %rd1208; + xor.b32 %r29527, %r29518, %r29487; + cvt.u64.u32 %rd1212, %r29527; + xor.b32 %r29528, %r29476, %r29501; + and.b32 %r29529, %r29528, 255; + cvt.u64.u32 %rd1213, %r29529; + cvt.u64.u32 %rd1214, %r29528; + shl.b64 %rd1215, %rd1214, 32; + and.b64 %rd1216, %rd1215, 280375465082880; + and.b64 %rd1217, %rd1215, 71776119061217280; + shr.u32 %r29530, %r29528, 24; + cvt.u64.u32 %rd1218, %r29530; + shl.b64 %rd1219, %rd1218, 56; + bfi.b64 %rd1220, %rd1213, %rd1212, 32, 32; + or.b64 %rd1221, %rd1220, %rd1216; + or.b64 %rd1222, %rd1221, %rd1217; + or.b64 %rd1280, %rd1222, %rd1219; + xor.b32 %r29531, %r29515, %r29490; + cvt.u64.u32 %rd1223, %r29531; + xor.b32 %r29532, %r29473, %r29504; + and.b32 %r29533, %r29532, 255; + cvt.u64.u32 %rd1224, %r29533; + cvt.u64.u32 %rd1225, %r29532; + shl.b64 %rd1226, %rd1225, 32; + and.b64 %rd1227, %rd1226, 280375465082880; + and.b64 %rd1228, %rd1226, 71776119061217280; + shr.u32 %r29534, %r29532, 24; + cvt.u64.u32 %rd1229, %r29534; + shl.b64 %rd1230, %rd1229, 56; + bfi.b64 %rd1231, %rd1224, %rd1223, 32, 32; + or.b64 %rd1232, %rd1231, %rd1227; + or.b64 %rd1233, %rd1232, %rd1228; + or.b64 %rd1279, %rd1233, %rd1230; + mov.u64 %rd342, %rd341; + bra.uni $L__BB2_104; -$L__BB0_17: - not.pred %p15, %p16; - @%p15 bra $L__BB0_19; +$L__BB2_97: + setp.eq.s16 %p55, %rs502, 0; + selp.u16 %rs504, 1, 0, %p55; + ld.local.u8 %rs667, [%rd3+138]; + or.b16 %rs505, %rs667, %rs504; + or.b16 %rs734, %rs505, 2; + ld.local.u64 %rd1276, [%rd3+64]; + ld.local.v2.u32 {%r30976, %r30975}, [%rd3+32]; + ld.local.v2.u32 {%r30974, %r30973}, [%rd3+40]; + ld.local.v2.u32 {%r30972, %r30971}, [%rd3+48]; + ld.local.v2.u32 {%r30970, %r30969}, [%rd3+56]; + ld.local.v4.u16 {%rs800, %rs802, %rs804, %rs806}, [%rd3+72]; + shr.u16 %rs801, %rs800, 8; + shr.u16 %rs803, %rs802, 8; + shr.u16 %rs805, %rs804, 8; + shr.u16 %rs807, %rs806, 8; + ld.local.v4.u16 {%rs808, %rs810, %rs812, %rs814}, [%rd3+80]; + shr.u16 %rs809, %rs808, 8; + shr.u16 %rs811, %rs810, 8; + shr.u16 %rs813, %rs812, 8; + shr.u16 %rs815, %rs814, 8; + ld.local.v4.u16 {%rs816, %rs818, %rs820, %rs822}, [%rd3+88]; + shr.u16 %rs817, %rs816, 8; + shr.u16 %rs819, %rs818, 8; + shr.u16 %rs821, %rs820, 8; + shr.u16 %rs823, %rs822, 8; + ld.local.v4.u16 {%rs824, %rs826, %rs828, %rs830}, [%rd3+96]; + shr.u16 %rs825, %rs824, 8; + shr.u16 %rs827, %rs826, 8; + shr.u16 %rs829, %rs828, 8; + shr.u16 %rs831, %rs830, 8; + ld.local.v4.u16 {%rs832, %rs834, %rs836, %rs838}, [%rd3+104]; + shr.u16 %rs833, %rs832, 8; + shr.u16 %rs835, %rs834, 8; + shr.u16 %rs837, %rs836, 8; + shr.u16 %rs839, %rs838, 8; + ld.local.v4.u16 {%rs840, %rs842, %rs844, %rs846}, [%rd3+112]; + shr.u16 %rs841, %rs840, 8; + shr.u16 %rs843, %rs842, 8; + shr.u16 %rs845, %rs844, 8; + shr.u16 %rs847, %rs846, 8; + ld.local.v4.u8 {%rs848, %rs849, %rs850, %rs851}, [%rd3+120]; + ld.local.v2.u8 {%rs852, %rs853}, [%rd3+124]; + ld.local.v2.u8 {%rs854, %rs855}, [%rd3+126]; + ld.local.v4.u8 {%rs856, %rs857, %rs858, %rs859}, [%rd3+128]; + ld.local.v2.u8 {%rs860, %rs861}, [%rd3+132]; + ld.local.v2.u8 {%rs862, %rs863}, [%rd3+134]; - ld.param.u64 %rd462, 
[heavy_hash_param_0]; - ld.param.u64 %rd461, [heavy_hash_param_1]; - and.b64 %rd460, %rd463, %rd462; - or.b64 %rd459, %rd460, %rd461; - ld.param.u64 %rd458, [heavy_hash_param_5]; - cvta.to.global.u64 %rd457, %rd458; - mov.u64 %rd455, 0; - atom.global.cas.b64 %rd456, [%rd457], %rd455, %rd459; +$L__BB2_99: + setp.eq.s64 %p56, %rd1275, 0; + mov.u32 %r30977, %r30976; + mov.u32 %r30978, %r30975; + mov.u32 %r30979, %r30974; + mov.u32 %r30980, %r30973; + mov.u32 %r30981, %r30972; + mov.u32 %r30982, %r30971; + mov.u32 %r30983, %r30970; + mov.u32 %r30984, %r30969; + mov.u16 %rs865, %rs734; + @%p56 bra $L__BB2_102; -$L__BB0_19: + or.b16 %rs865, %rs667, 4; + ld.local.v2.u32 {%r30977, %r30978}, [%rd3]; + ld.local.v2.u32 {%r30979, %r30980}, [%rd3+8]; + ld.local.v2.u32 {%r30981, %r30982}, [%rd3+16]; + ld.local.v2.u32 {%r30983, %r30984}, [%rd3+24]; + mov.u16 %rs768, %rs831; + mov.u16 %rs769, %rs830; + mov.u16 %rs770, %rs829; + mov.u16 %rs771, %rs828; + mov.u16 %rs772, %rs827; + mov.u16 %rs773, %rs826; + mov.u16 %rs774, %rs825; + mov.u16 %rs775, %rs824; + mov.u16 %rs776, %rs823; + mov.u16 %rs777, %rs822; + mov.u16 %rs778, %rs821; + mov.u16 %rs779, %rs820; + mov.u16 %rs780, %rs819; + mov.u16 %rs781, %rs818; + mov.u16 %rs782, %rs817; + mov.u16 %rs783, %rs816; + mov.u16 %rs784, %rs815; + mov.u16 %rs785, %rs814; + mov.u16 %rs786, %rs813; + mov.u16 %rs787, %rs812; + mov.u16 %rs788, %rs811; + mov.u16 %rs789, %rs810; + mov.u16 %rs790, %rs809; + mov.u16 %rs791, %rs808; + mov.u16 %rs792, %rs807; + mov.u16 %rs793, %rs806; + mov.u16 %rs794, %rs805; + mov.u16 %rs795, %rs804; + mov.u16 %rs796, %rs803; + mov.u16 %rs797, %rs802; + mov.u16 %rs798, %rs801; + mov.u16 %rs799, %rs800; + +$L__BB2_101: + add.s64 %rd1275, %rd1275, -1; + shl.b64 %rd1144, %rd1275, 5; + add.s64 %rd1145, %rd3, %rd1144; + ld.local.u8 %rs800, [%rd1145+145]; + mov.u64 %rd1143, 0; + ld.local.u8 %rs801, [%rd1145+146]; + ld.local.u8 %rs802, [%rd1145+147]; + ld.local.u8 %rs803, [%rd1145+148]; + ld.local.u8 %rs804, [%rd1145+149]; + ld.local.u8 %rs805, [%rd1145+150]; + ld.local.u8 %rs806, [%rd1145+151]; + ld.local.u8 %rs807, [%rd1145+152]; + ld.local.u8 %rs808, [%rd1145+153]; + ld.local.u8 %rs809, [%rd1145+154]; + ld.local.u8 %rs810, [%rd1145+155]; + ld.local.u8 %rs811, [%rd1145+156]; + ld.local.u8 %rs812, [%rd1145+157]; + ld.local.u8 %rs813, [%rd1145+158]; + ld.local.u8 %rs814, [%rd1145+159]; + ld.local.u8 %rs815, [%rd1145+160]; + ld.local.u8 %rs816, [%rd1145+161]; + ld.local.u8 %rs817, [%rd1145+162]; + ld.local.u8 %rs818, [%rd1145+163]; + ld.local.u8 %rs819, [%rd1145+164]; + ld.local.u8 %rs820, [%rd1145+165]; + ld.local.u8 %rs821, [%rd1145+166]; + ld.local.u8 %rs822, [%rd1145+167]; + ld.local.u8 %rs823, [%rd1145+168]; + ld.local.u8 %rs824, [%rd1145+169]; + ld.local.u8 %rs825, [%rd1145+170]; + ld.local.u8 %rs826, [%rd1145+171]; + ld.local.u8 %rs827, [%rd1145+172]; + ld.local.u8 %rs828, [%rd1145+173]; + ld.local.u8 %rs829, [%rd1145+174]; + ld.local.u8 %rs830, [%rd1145+175]; + ld.local.u8 %rs831, [%rd1145+176]; + cvt.u32.u16 %r26604, %rs799; + and.b32 %r26605, %r26604, 255; + cvt.u32.u16 %r26606, %rs798; + prmt.b32 %r26607, %r26606, %r26605, 30212; + cvt.u32.u16 %r26608, %rs797; + shl.b32 %r26609, %r26608, 16; + and.b32 %r26610, %r26609, 16711680; + or.b32 %r26611, %r26607, %r26610; + cvt.u32.u16 %r26612, %rs796; + shl.b32 %r26613, %r26612, 24; + or.b32 %r26614, %r26611, %r26613; + cvt.u32.u16 %r26615, %rs795; + and.b32 %r26616, %r26615, 255; + cvt.u32.u16 %r26617, %rs794; + prmt.b32 %r26618, %r26617, %r26616, 30212; + cvt.u32.u16 %r26619, %rs793; + shl.b32 
%r26620, %r26619, 16; + and.b32 %r26621, %r26620, 16711680; + or.b32 %r26622, %r26618, %r26621; + cvt.u32.u16 %r26623, %rs792; + shl.b32 %r26624, %r26623, 24; + or.b32 %r26625, %r26622, %r26624; + cvt.u32.u16 %r26626, %rs791; + and.b32 %r26627, %r26626, 255; + cvt.u32.u16 %r26628, %rs790; + prmt.b32 %r26629, %r26628, %r26627, 30212; + cvt.u32.u16 %r26630, %rs789; + shl.b32 %r26631, %r26630, 16; + and.b32 %r26632, %r26631, 16711680; + or.b32 %r26633, %r26629, %r26632; + cvt.u32.u16 %r26634, %rs788; + shl.b32 %r26635, %r26634, 24; + or.b32 %r26636, %r26633, %r26635; + cvt.u32.u16 %r26637, %rs787; + and.b32 %r26638, %r26637, 255; + cvt.u32.u16 %r26639, %rs786; + prmt.b32 %r26640, %r26639, %r26638, 30212; + cvt.u32.u16 %r26641, %rs785; + shl.b32 %r26642, %r26641, 16; + and.b32 %r26643, %r26642, 16711680; + or.b32 %r26644, %r26640, %r26643; + cvt.u32.u16 %r26645, %rs784; + shl.b32 %r26646, %r26645, 24; + or.b32 %r26647, %r26644, %r26646; + cvt.u32.u16 %r26648, %rs783; + and.b32 %r26649, %r26648, 255; + cvt.u32.u16 %r26650, %rs782; + prmt.b32 %r26651, %r26650, %r26649, 30212; + cvt.u32.u16 %r26652, %rs781; + shl.b32 %r26653, %r26652, 16; + and.b32 %r26654, %r26653, 16711680; + or.b32 %r26655, %r26651, %r26654; + cvt.u32.u16 %r26656, %rs780; + shl.b32 %r26657, %r26656, 24; + or.b32 %r26658, %r26655, %r26657; + cvt.u32.u16 %r26659, %rs779; + and.b32 %r26660, %r26659, 255; + cvt.u32.u16 %r26661, %rs778; + prmt.b32 %r26662, %r26661, %r26660, 30212; + cvt.u32.u16 %r26663, %rs777; + shl.b32 %r26664, %r26663, 16; + and.b32 %r26665, %r26664, 16711680; + or.b32 %r26666, %r26662, %r26665; + cvt.u32.u16 %r26667, %rs776; + shl.b32 %r26668, %r26667, 24; + or.b32 %r26669, %r26666, %r26668; + cvt.u32.u16 %r26670, %rs775; + and.b32 %r26671, %r26670, 255; + cvt.u32.u16 %r26672, %rs774; + prmt.b32 %r26673, %r26672, %r26671, 30212; + cvt.u32.u16 %r26674, %rs773; + shl.b32 %r26675, %r26674, 16; + and.b32 %r26676, %r26675, 16711680; + or.b32 %r26677, %r26673, %r26676; + cvt.u32.u16 %r26678, %rs772; + shl.b32 %r26679, %r26678, 24; + or.b32 %r26680, %r26677, %r26679; + cvt.u32.u16 %r26681, %rs771; + and.b32 %r26682, %r26681, 255; + cvt.u32.u16 %r26683, %rs770; + prmt.b32 %r26684, %r26683, %r26682, 30212; + cvt.u32.u16 %r26685, %rs769; + shl.b32 %r26686, %r26685, 16; + and.b32 %r26687, %r26686, 16711680; + or.b32 %r26688, %r26684, %r26687; + cvt.u32.u16 %r26689, %rs768; + shl.b32 %r26690, %r26689, 24; + or.b32 %r26691, %r26688, %r26690; + cvt.u32.u16 %r26692, %rs832; + and.b32 %r26693, %r26692, 255; + cvt.u32.u16 %r26694, %rs833; + prmt.b32 %r26695, %r26694, %r26693, 30212; + cvt.u32.u16 %r26696, %rs834; + shl.b32 %r26697, %r26696, 16; + and.b32 %r26698, %r26697, 16711680; + or.b32 %r26699, %r26695, %r26698; + cvt.u32.u16 %r26700, %rs835; + shl.b32 %r26701, %r26700, 24; + or.b32 %r26702, %r26699, %r26701; + cvt.u32.u16 %r26703, %rs836; + and.b32 %r26704, %r26703, 255; + cvt.u32.u16 %r26705, %rs837; + prmt.b32 %r26706, %r26705, %r26704, 30212; + cvt.u32.u16 %r26707, %rs838; + shl.b32 %r26708, %r26707, 16; + and.b32 %r26709, %r26708, 16711680; + or.b32 %r26710, %r26706, %r26709; + cvt.u32.u16 %r26711, %rs839; + shl.b32 %r26712, %r26711, 24; + or.b32 %r26713, %r26710, %r26712; + cvt.u32.u16 %r26714, %rs840; + and.b32 %r26715, %r26714, 255; + cvt.u32.u16 %r26716, %rs841; + prmt.b32 %r26717, %r26716, %r26715, 30212; + cvt.u32.u16 %r26718, %rs842; + shl.b32 %r26719, %r26718, 16; + and.b32 %r26720, %r26719, 16711680; + or.b32 %r26721, %r26717, %r26720; + cvt.u32.u16 %r26722, %rs843; + shl.b32 %r26723, %r26722, 24; + or.b32 
%r26724, %r26721, %r26723; + cvt.u32.u16 %r26725, %rs844; + and.b32 %r26726, %r26725, 255; + cvt.u32.u16 %r26727, %rs845; + prmt.b32 %r26728, %r26727, %r26726, 30212; + cvt.u32.u16 %r26729, %rs846; + shl.b32 %r26730, %r26729, 16; + and.b32 %r26731, %r26730, 16711680; + or.b32 %r26732, %r26728, %r26731; + cvt.u32.u16 %r26733, %rs847; + shl.b32 %r26734, %r26733, 24; + or.b32 %r26735, %r26732, %r26734; + cvt.u32.u16 %r26736, %rs848; + and.b32 %r26737, %r26736, 255; + cvt.u32.u16 %r26738, %rs849; + prmt.b32 %r26739, %r26738, %r26737, 30212; + cvt.u32.u16 %r26740, %rs850; + shl.b32 %r26741, %r26740, 16; + and.b32 %r26742, %r26741, 16711680; + or.b32 %r26743, %r26739, %r26742; + cvt.u32.u16 %r26744, %rs851; + shl.b32 %r26745, %r26744, 24; + or.b32 %r26746, %r26743, %r26745; + cvt.u32.u16 %r26747, %rs852; + and.b32 %r26748, %r26747, 255; + cvt.u32.u16 %r26749, %rs853; + prmt.b32 %r26750, %r26749, %r26748, 30212; + cvt.u32.u16 %r26751, %rs854; + shl.b32 %r26752, %r26751, 16; + and.b32 %r26753, %r26752, 16711680; + or.b32 %r26754, %r26750, %r26753; + cvt.u32.u16 %r26755, %rs855; + shl.b32 %r26756, %r26755, 24; + or.b32 %r26757, %r26754, %r26756; + cvt.u32.u16 %r26758, %rs856; + and.b32 %r26759, %r26758, 255; + cvt.u32.u16 %r26760, %rs857; + prmt.b32 %r26761, %r26760, %r26759, 30212; + cvt.u32.u16 %r26762, %rs858; + shl.b32 %r26763, %r26762, 16; + and.b32 %r26764, %r26763, 16711680; + or.b32 %r26765, %r26761, %r26764; + cvt.u32.u16 %r26766, %rs859; + shl.b32 %r26767, %r26766, 24; + or.b32 %r26768, %r26765, %r26767; + cvt.u32.u16 %r26769, %rs860; + and.b32 %r26770, %r26769, 255; + cvt.u32.u16 %r26771, %rs861; + prmt.b32 %r26772, %r26771, %r26770, 30212; + cvt.u32.u16 %r26773, %rs862; + shl.b32 %r26774, %r26773, 16; + and.b32 %r26775, %r26774, 16711680; + or.b32 %r26776, %r26772, %r26775; + cvt.u32.u16 %r26777, %rs863; + shl.b32 %r26778, %r26777, 24; + or.b32 %r26779, %r26776, %r26778; + shr.u64 %rd1146, %rd1276, 32; + cvt.u32.u64 %r26780, %rd1146; + add.s32 %r26781, %r30972, %r30976; + add.s32 %r26782, %r26781, %r26614; + cvt.u32.u64 %r26783, %rd1276; + xor.b32 %r26784, %r26782, %r26783; + shf.l.wrap.b32 %r26785, %r26784, %r26784, 16; + add.s32 %r26786, %r26785, 1779033703; + xor.b32 %r26787, %r26786, %r30972; + shf.l.wrap.b32 %r26788, %r26787, %r26787, 20; + add.s32 %r26789, %r26625, %r26782; + add.s32 %r26790, %r26789, %r26788; + xor.b32 %r26791, %r26790, %r26785; + shf.l.wrap.b32 %r26792, %r26791, %r26791, 24; + add.s32 %r26793, %r26792, %r26786; + xor.b32 %r26794, %r26793, %r26788; + shf.l.wrap.b32 %r26795, %r26794, %r26794, 25; + add.s32 %r26796, %r30971, %r30975; + add.s32 %r26797, %r26796, %r26636; + xor.b32 %r26798, %r26797, %r26780; + shf.l.wrap.b32 %r26799, %r26798, %r26798, 16; + add.s32 %r26800, %r26799, -1150833019; + xor.b32 %r26801, %r26800, %r30971; + shf.l.wrap.b32 %r26802, %r26801, %r26801, 20; + add.s32 %r26803, %r26647, %r26797; + add.s32 %r26804, %r26803, %r26802; + xor.b32 %r26805, %r26804, %r26799; + shf.l.wrap.b32 %r26806, %r26805, %r26805, 24; + add.s32 %r26807, %r26806, %r26800; + xor.b32 %r26808, %r26807, %r26802; + shf.l.wrap.b32 %r26809, %r26808, %r26808, 25; + add.s32 %r26810, %r30970, %r30974; + add.s32 %r26811, %r26810, %r26658; + cvt.u32.u16 %r26812, %rs864; + and.b32 %r26813, %r26812, 255; + xor.b32 %r26814, %r26811, %r26813; + shr.u32 %r26815, %r26811, 16; + shl.b32 %r26816, %r26814, 16; + or.b32 %r26817, %r26816, %r26815; + add.s32 %r26818, %r26817, 1013904242; + xor.b32 %r26819, %r26818, %r30970; + shf.l.wrap.b32 %r26820, %r26819, %r26819, 20; + add.s32 %r26821, 
%r26669, %r26811; + add.s32 %r26822, %r26821, %r26820; + xor.b32 %r26823, %r26822, %r26817; + shf.l.wrap.b32 %r26824, %r26823, %r26823, 24; + add.s32 %r26825, %r26824, %r26818; + xor.b32 %r26826, %r26825, %r26820; + shf.l.wrap.b32 %r26827, %r26826, %r26826, 25; + add.s32 %r26828, %r30969, %r30973; + add.s32 %r26829, %r26828, %r26680; + cvt.u32.u16 %r26830, %rs734; + and.b32 %r26831, %r26830, 255; + xor.b32 %r26832, %r26829, %r26831; + shr.u32 %r26833, %r26829, 16; + shl.b32 %r26834, %r26832, 16; + or.b32 %r26835, %r26834, %r26833; + add.s32 %r26836, %r26835, -1521486534; + xor.b32 %r26837, %r26836, %r30969; + shf.l.wrap.b32 %r26838, %r26837, %r26837, 20; + add.s32 %r26839, %r26691, %r26829; + add.s32 %r26840, %r26839, %r26838; + xor.b32 %r26841, %r26840, %r26835; + shf.l.wrap.b32 %r26842, %r26841, %r26841, 24; + add.s32 %r26843, %r26842, %r26836; + xor.b32 %r26844, %r26843, %r26838; + shf.l.wrap.b32 %r26845, %r26844, %r26844, 25; + add.s32 %r26846, %r26809, %r26790; + add.s32 %r26847, %r26846, %r26702; + xor.b32 %r26848, %r26842, %r26847; + shf.l.wrap.b32 %r26849, %r26848, %r26848, 16; + add.s32 %r26850, %r26849, %r26825; + xor.b32 %r26851, %r26850, %r26809; + shf.l.wrap.b32 %r26852, %r26851, %r26851, 20; + add.s32 %r26853, %r26713, %r26847; + add.s32 %r26854, %r26853, %r26852; + xor.b32 %r26855, %r26854, %r26849; + shf.l.wrap.b32 %r26856, %r26855, %r26855, 24; + add.s32 %r26857, %r26856, %r26850; + xor.b32 %r26858, %r26857, %r26852; + shf.l.wrap.b32 %r26859, %r26858, %r26858, 25; + add.s32 %r26860, %r26724, %r26804; + add.s32 %r26861, %r26860, %r26827; + xor.b32 %r26862, %r26861, %r26792; + shf.l.wrap.b32 %r26863, %r26862, %r26862, 16; + add.s32 %r26864, %r26863, %r26843; + xor.b32 %r26865, %r26864, %r26827; + shf.l.wrap.b32 %r26866, %r26865, %r26865, 20; + add.s32 %r26867, %r26861, %r26735; + add.s32 %r26868, %r26867, %r26866; + xor.b32 %r26869, %r26868, %r26863; + shf.l.wrap.b32 %r26870, %r26869, %r26869, 24; + add.s32 %r26871, %r26870, %r26864; + xor.b32 %r26872, %r26871, %r26866; + shf.l.wrap.b32 %r26873, %r26872, %r26872, 25; + add.s32 %r26874, %r26822, %r26746; + add.s32 %r26875, %r26874, %r26845; + xor.b32 %r26876, %r26875, %r26806; + shf.l.wrap.b32 %r26877, %r26876, %r26876, 16; + add.s32 %r26878, %r26877, %r26793; + xor.b32 %r26879, %r26878, %r26845; + shf.l.wrap.b32 %r26880, %r26879, %r26879, 20; + add.s32 %r26881, %r26875, %r26757; + add.s32 %r26882, %r26881, %r26880; + xor.b32 %r26883, %r26882, %r26877; + shf.l.wrap.b32 %r26884, %r26883, %r26883, 24; + add.s32 %r26885, %r26884, %r26878; + xor.b32 %r26886, %r26885, %r26880; + shf.l.wrap.b32 %r26887, %r26886, %r26886, 25; + add.s32 %r26888, %r26768, %r26795; + add.s32 %r26889, %r26888, %r26840; + xor.b32 %r26890, %r26824, %r26889; + shf.l.wrap.b32 %r26891, %r26890, %r26890, 16; + add.s32 %r26892, %r26891, %r26807; + xor.b32 %r26893, %r26892, %r26795; + shf.l.wrap.b32 %r26894, %r26893, %r26893, 20; + add.s32 %r26895, %r26889, %r26779; + add.s32 %r26896, %r26895, %r26894; + xor.b32 %r26897, %r26896, %r26891; + shf.l.wrap.b32 %r26898, %r26897, %r26897, 24; + add.s32 %r26899, %r26898, %r26892; + xor.b32 %r26900, %r26899, %r26894; + shf.l.wrap.b32 %r26901, %r26900, %r26900, 25; + add.s32 %r26902, %r26854, %r26636; + add.s32 %r26903, %r26902, %r26901; + xor.b32 %r26904, %r26870, %r26903; + shf.l.wrap.b32 %r26905, %r26904, %r26904, 16; + add.s32 %r26906, %r26905, %r26885; + xor.b32 %r26907, %r26906, %r26901; + shf.l.wrap.b32 %r26908, %r26907, %r26907, 20; + add.s32 %r26909, %r26903, %r26680; + add.s32 %r26910, %r26909, %r26908; + 
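// Same BLAKE3 compression inlined a second time: this copy follows the
+ // $L__BB2_101 reload above, which appears to step through the remaining
+ // CV-stack entries one 32-byte level at a time (%rd1275 counts down to zero).
+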
xor.b32 %r26911, %r26910, %r26905; + shf.l.wrap.b32 %r26912, %r26911, %r26911, 24; + add.s32 %r26913, %r26912, %r26906; + xor.b32 %r26914, %r26913, %r26908; + shf.l.wrap.b32 %r26915, %r26914, %r26914, 25; + add.s32 %r26916, %r26868, %r26647; + add.s32 %r26917, %r26916, %r26859; + xor.b32 %r26918, %r26884, %r26917; + shf.l.wrap.b32 %r26919, %r26918, %r26918, 16; + add.s32 %r26920, %r26919, %r26899; + xor.b32 %r26921, %r26920, %r26859; + shf.l.wrap.b32 %r26922, %r26921, %r26921, 20; + add.s32 %r26923, %r26917, %r26724; + add.s32 %r26924, %r26923, %r26922; + xor.b32 %r26925, %r26924, %r26919; + shf.l.wrap.b32 %r26926, %r26925, %r26925, 24; + add.s32 %r26927, %r26926, %r26920; + xor.b32 %r26928, %r26927, %r26922; + shf.l.wrap.b32 %r26929, %r26928, %r26928, 25; + add.s32 %r26930, %r26882, %r26691; + add.s32 %r26931, %r26930, %r26873; + xor.b32 %r26932, %r26931, %r26898; + shf.l.wrap.b32 %r26933, %r26932, %r26932, 16; + add.s32 %r26934, %r26933, %r26857; + xor.b32 %r26935, %r26934, %r26873; + shf.l.wrap.b32 %r26936, %r26935, %r26935, 20; + add.s32 %r26937, %r26931, %r26614; + add.s32 %r26938, %r26937, %r26936; + xor.b32 %r26939, %r26938, %r26933; + shf.l.wrap.b32 %r26940, %r26939, %r26939, 24; + add.s32 %r26941, %r26940, %r26934; + xor.b32 %r26942, %r26941, %r26936; + shf.l.wrap.b32 %r26943, %r26942, %r26942, 25; + add.s32 %r26944, %r26896, %r26658; + add.s32 %r26945, %r26944, %r26887; + xor.b32 %r26946, %r26945, %r26856; + shf.l.wrap.b32 %r26947, %r26946, %r26946, 16; + add.s32 %r26948, %r26947, %r26871; + xor.b32 %r26949, %r26948, %r26887; + shf.l.wrap.b32 %r26950, %r26949, %r26949, 20; + add.s32 %r26951, %r26945, %r26757; + add.s32 %r26952, %r26951, %r26950; + xor.b32 %r26953, %r26952, %r26947; + shf.l.wrap.b32 %r26954, %r26953, %r26953, 24; + add.s32 %r26955, %r26954, %r26948; + xor.b32 %r26956, %r26955, %r26950; + shf.l.wrap.b32 %r26957, %r26956, %r26956, 25; + add.s32 %r26958, %r26910, %r26625; + add.s32 %r26959, %r26958, %r26929; + xor.b32 %r26960, %r26954, %r26959; + shf.l.wrap.b32 %r26961, %r26960, %r26960, 16; + add.s32 %r26962, %r26961, %r26941; + xor.b32 %r26963, %r26962, %r26929; + shf.l.wrap.b32 %r26964, %r26963, %r26963, 20; + add.s32 %r26965, %r26959, %r26735; + add.s32 %r26966, %r26965, %r26964; + xor.b32 %r26967, %r26966, %r26961; + shf.l.wrap.b32 %r26968, %r26967, %r26967, 24; + add.s32 %r26969, %r26968, %r26962; + xor.b32 %r26970, %r26969, %r26964; + shf.l.wrap.b32 %r26971, %r26970, %r26970, 25; + add.s32 %r26972, %r26924, %r26746; + add.s32 %r26973, %r26972, %r26943; + xor.b32 %r26974, %r26973, %r26912; + shf.l.wrap.b32 %r26975, %r26974, %r26974, 16; + add.s32 %r26976, %r26975, %r26955; + xor.b32 %r26977, %r26976, %r26943; + shf.l.wrap.b32 %r26978, %r26977, %r26977, 20; + add.s32 %r26979, %r26973, %r26669; + add.s32 %r26980, %r26979, %r26978; + xor.b32 %r26981, %r26980, %r26975; + shf.l.wrap.b32 %r26982, %r26981, %r26981, 24; + add.s32 %r26983, %r26982, %r26976; + xor.b32 %r26984, %r26983, %r26978; + shf.l.wrap.b32 %r26985, %r26984, %r26984, 25; + add.s32 %r26986, %r26938, %r26713; + add.s32 %r26987, %r26986, %r26957; + xor.b32 %r26988, %r26987, %r26926; + shf.l.wrap.b32 %r26989, %r26988, %r26988, 16; + add.s32 %r26990, %r26989, %r26913; + xor.b32 %r26991, %r26990, %r26957; + shf.l.wrap.b32 %r26992, %r26991, %r26991, 20; + add.s32 %r26993, %r26987, %r26768; + add.s32 %r26994, %r26993, %r26992; + xor.b32 %r26995, %r26994, %r26989; + shf.l.wrap.b32 %r26996, %r26995, %r26995, 24; + add.s32 %r26997, %r26996, %r26990; + xor.b32 %r26998, %r26997, %r26992; + shf.l.wrap.b32 %r26999, 
%r26998, %r26998, 25; + add.s32 %r27000, %r26915, %r26779; + add.s32 %r27001, %r27000, %r26952; + xor.b32 %r27002, %r26940, %r27001; + shf.l.wrap.b32 %r27003, %r27002, %r27002, 16; + add.s32 %r27004, %r27003, %r26927; + xor.b32 %r27005, %r27004, %r26915; + shf.l.wrap.b32 %r27006, %r27005, %r27005, 20; + add.s32 %r27007, %r27001, %r26702; + add.s32 %r27008, %r27007, %r27006; + xor.b32 %r27009, %r27008, %r27003; + shf.l.wrap.b32 %r27010, %r27009, %r27009, 24; + add.s32 %r27011, %r27010, %r27004; + xor.b32 %r27012, %r27011, %r27006; + shf.l.wrap.b32 %r27013, %r27012, %r27012, 25; + add.s32 %r27014, %r26966, %r26647; + add.s32 %r27015, %r27014, %r27013; + xor.b32 %r27016, %r26982, %r27015; + shf.l.wrap.b32 %r27017, %r27016, %r27016, 16; + add.s32 %r27018, %r27017, %r26997; + xor.b32 %r27019, %r27018, %r27013; + shf.l.wrap.b32 %r27020, %r27019, %r27019, 20; + add.s32 %r27021, %r27015, %r26658; + add.s32 %r27022, %r27021, %r27020; + xor.b32 %r27023, %r27022, %r27017; + shf.l.wrap.b32 %r27024, %r27023, %r27023, 24; + add.s32 %r27025, %r27024, %r27018; + xor.b32 %r27026, %r27025, %r27020; + shf.l.wrap.b32 %r27027, %r27026, %r27026, 25; + add.s32 %r27028, %r26980, %r26724; + add.s32 %r27029, %r27028, %r26971; + xor.b32 %r27030, %r26996, %r27029; + shf.l.wrap.b32 %r27031, %r27030, %r27030, 16; + add.s32 %r27032, %r27031, %r27011; + xor.b32 %r27033, %r27032, %r26971; + shf.l.wrap.b32 %r27034, %r27033, %r27033, 20; + add.s32 %r27035, %r27029, %r26746; + add.s32 %r27036, %r27035, %r27034; + xor.b32 %r27037, %r27036, %r27031; + shf.l.wrap.b32 %r27038, %r27037, %r27037, 24; + add.s32 %r27039, %r27038, %r27032; + xor.b32 %r27040, %r27039, %r27034; + shf.l.wrap.b32 %r27041, %r27040, %r27040, 25; + add.s32 %r27042, %r26994, %r26757; + add.s32 %r27043, %r27042, %r26985; + xor.b32 %r27044, %r27043, %r27010; + shf.l.wrap.b32 %r27045, %r27044, %r27044, 16; + add.s32 %r27046, %r27045, %r26969; + xor.b32 %r27047, %r27046, %r26985; + shf.l.wrap.b32 %r27048, %r27047, %r27047, 20; + add.s32 %r27049, %r27043, %r26636; + add.s32 %r27050, %r27049, %r27048; + xor.b32 %r27051, %r27050, %r27045; + shf.l.wrap.b32 %r27052, %r27051, %r27051, 24; + add.s32 %r27053, %r27052, %r27046; + xor.b32 %r27054, %r27053, %r27048; + shf.l.wrap.b32 %r27055, %r27054, %r27054, 25; + add.s32 %r27056, %r27008, %r26691; + add.s32 %r27057, %r27056, %r26999; + xor.b32 %r27058, %r27057, %r26968; + shf.l.wrap.b32 %r27059, %r27058, %r27058, 16; + add.s32 %r27060, %r27059, %r26983; + xor.b32 %r27061, %r27060, %r26999; + shf.l.wrap.b32 %r27062, %r27061, %r27061, 20; + add.s32 %r27063, %r27057, %r26768; + add.s32 %r27064, %r27063, %r27062; + xor.b32 %r27065, %r27064, %r27059; + shf.l.wrap.b32 %r27066, %r27065, %r27065, 24; + add.s32 %r27067, %r27066, %r27060; + xor.b32 %r27068, %r27067, %r27062; + shf.l.wrap.b32 %r27069, %r27068, %r27068, 25; + add.s32 %r27070, %r27022, %r26680; + add.s32 %r27071, %r27070, %r27041; + xor.b32 %r27072, %r27066, %r27071; + shf.l.wrap.b32 %r27073, %r27072, %r27072, 16; + add.s32 %r27074, %r27073, %r27053; + xor.b32 %r27075, %r27074, %r27041; + shf.l.wrap.b32 %r27076, %r27075, %r27075, 20; + add.s32 %r27077, %r27071, %r26669; + add.s32 %r27078, %r27077, %r27076; + xor.b32 %r27079, %r27078, %r27073; + shf.l.wrap.b32 %r27080, %r27079, %r27079, 24; + add.s32 %r27081, %r27080, %r27074; + xor.b32 %r27082, %r27081, %r27076; + shf.l.wrap.b32 %r27083, %r27082, %r27082, 25; + add.s32 %r27084, %r27036, %r26713; + add.s32 %r27085, %r27084, %r27055; + xor.b32 %r27086, %r27085, %r27024; + shf.l.wrap.b32 %r27087, %r27086, %r27086, 16; + 
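+ // The unrolled add/xor/shf.l.wrap(16, 20, 24, 25) pattern above and below appears to be the 32-bit G mixing function of a BLAKE3-style compression, emitted fully unrolled by the compiler.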
add.s32 %r27088, %r27087, %r27067; + xor.b32 %r27089, %r27088, %r27055; + shf.l.wrap.b32 %r27090, %r27089, %r27089, 20; + add.s32 %r27091, %r27085, %r26614; + add.s32 %r27092, %r27091, %r27090; + xor.b32 %r27093, %r27092, %r27087; + shf.l.wrap.b32 %r27094, %r27093, %r27093, 24; + add.s32 %r27095, %r27094, %r27088; + xor.b32 %r27096, %r27095, %r27090; + shf.l.wrap.b32 %r27097, %r27096, %r27096, 25; + add.s32 %r27098, %r27050, %r26735; + add.s32 %r27099, %r27098, %r27069; + xor.b32 %r27100, %r27099, %r27038; + shf.l.wrap.b32 %r27101, %r27100, %r27100, 16; + add.s32 %r27102, %r27101, %r27025; + xor.b32 %r27103, %r27102, %r27069; + shf.l.wrap.b32 %r27104, %r27103, %r27103, 20; + add.s32 %r27105, %r27099, %r26779; + add.s32 %r27106, %r27105, %r27104; + xor.b32 %r27107, %r27106, %r27101; + shf.l.wrap.b32 %r27108, %r27107, %r27107, 24; + add.s32 %r27109, %r27108, %r27102; + xor.b32 %r27110, %r27109, %r27104; + shf.l.wrap.b32 %r27111, %r27110, %r27110, 25; + add.s32 %r27112, %r27027, %r26702; + add.s32 %r27113, %r27112, %r27064; + xor.b32 %r27114, %r27052, %r27113; + shf.l.wrap.b32 %r27115, %r27114, %r27114, 16; + add.s32 %r27116, %r27115, %r27039; + xor.b32 %r27117, %r27116, %r27027; + shf.l.wrap.b32 %r27118, %r27117, %r27117, 20; + add.s32 %r27119, %r27113, %r26625; + add.s32 %r27120, %r27119, %r27118; + xor.b32 %r27121, %r27120, %r27115; + shf.l.wrap.b32 %r27122, %r27121, %r27121, 24; + add.s32 %r27123, %r27122, %r27116; + xor.b32 %r27124, %r27123, %r27118; + shf.l.wrap.b32 %r27125, %r27124, %r27124, 25; + add.s32 %r27126, %r27078, %r26724; + add.s32 %r27127, %r27126, %r27125; + xor.b32 %r27128, %r27094, %r27127; + shf.l.wrap.b32 %r27129, %r27128, %r27128, 16; + add.s32 %r27130, %r27129, %r27109; + xor.b32 %r27131, %r27130, %r27125; + shf.l.wrap.b32 %r27132, %r27131, %r27131, 20; + add.s32 %r27133, %r27127, %r26691; + add.s32 %r27134, %r27133, %r27132; + xor.b32 %r27135, %r27134, %r27129; + shf.l.wrap.b32 %r27136, %r27135, %r27135, 24; + add.s32 %r27137, %r27136, %r27130; + xor.b32 %r27138, %r27137, %r27132; + shf.l.wrap.b32 %r27139, %r27138, %r27138, 25; + add.s32 %r27140, %r27092, %r26746; + add.s32 %r27141, %r27140, %r27083; + xor.b32 %r27142, %r27108, %r27141; + shf.l.wrap.b32 %r27143, %r27142, %r27142, 16; + add.s32 %r27144, %r27143, %r27123; + xor.b32 %r27145, %r27144, %r27083; + shf.l.wrap.b32 %r27146, %r27145, %r27145, 20; + add.s32 %r27147, %r27141, %r26713; + add.s32 %r27148, %r27147, %r27146; + xor.b32 %r27149, %r27148, %r27143; + shf.l.wrap.b32 %r27150, %r27149, %r27149, 24; + add.s32 %r27151, %r27150, %r27144; + xor.b32 %r27152, %r27151, %r27146; + shf.l.wrap.b32 %r27153, %r27152, %r27152, 25; + add.s32 %r27154, %r27106, %r26768; + add.s32 %r27155, %r27154, %r27097; + xor.b32 %r27156, %r27155, %r27122; + shf.l.wrap.b32 %r27157, %r27156, %r27156, 16; + add.s32 %r27158, %r27157, %r27081; + xor.b32 %r27159, %r27158, %r27097; + shf.l.wrap.b32 %r27160, %r27159, %r27159, 20; + add.s32 %r27161, %r27155, %r26647; + add.s32 %r27162, %r27161, %r27160; + xor.b32 %r27163, %r27162, %r27157; + shf.l.wrap.b32 %r27164, %r27163, %r27163, 24; + add.s32 %r27165, %r27164, %r27158; + xor.b32 %r27166, %r27165, %r27160; + shf.l.wrap.b32 %r27167, %r27166, %r27166, 25; + add.s32 %r27168, %r27120, %r26757; + add.s32 %r27169, %r27168, %r27111; + xor.b32 %r27170, %r27169, %r27080; + shf.l.wrap.b32 %r27171, %r27170, %r27170, 16; + add.s32 %r27172, %r27171, %r27095; + xor.b32 %r27173, %r27172, %r27111; + shf.l.wrap.b32 %r27174, %r27173, %r27173, 20; + add.s32 %r27175, %r27169, %r26779; + add.s32 %r27176, 
%r27175, %r27174; + xor.b32 %r27177, %r27176, %r27171; + shf.l.wrap.b32 %r27178, %r27177, %r27177, 24; + add.s32 %r27179, %r27178, %r27172; + xor.b32 %r27180, %r27179, %r27174; + shf.l.wrap.b32 %r27181, %r27180, %r27180, 25; + add.s32 %r27182, %r27134, %r26658; + add.s32 %r27183, %r27182, %r27153; + xor.b32 %r27184, %r27178, %r27183; + shf.l.wrap.b32 %r27185, %r27184, %r27184, 16; + add.s32 %r27186, %r27185, %r27165; + xor.b32 %r27187, %r27186, %r27153; + shf.l.wrap.b32 %r27188, %r27187, %r27187, 20; + add.s32 %r27189, %r27183, %r26614; + add.s32 %r27190, %r27189, %r27188; + xor.b32 %r27191, %r27190, %r27185; + shf.l.wrap.b32 %r27192, %r27191, %r27191, 24; + add.s32 %r27193, %r27192, %r27186; + xor.b32 %r27194, %r27193, %r27188; + shf.l.wrap.b32 %r27195, %r27194, %r27194, 25; + add.s32 %r27196, %r27148, %r26735; + add.s32 %r27197, %r27196, %r27167; + xor.b32 %r27198, %r27197, %r27136; + shf.l.wrap.b32 %r27199, %r27198, %r27198, 16; + add.s32 %r27200, %r27199, %r27179; + xor.b32 %r27201, %r27200, %r27167; + shf.l.wrap.b32 %r27202, %r27201, %r27201, 20; + add.s32 %r27203, %r27197, %r26636; + add.s32 %r27204, %r27203, %r27202; + xor.b32 %r27205, %r27204, %r27199; + shf.l.wrap.b32 %r27206, %r27205, %r27205, 24; + add.s32 %r27207, %r27206, %r27200; + xor.b32 %r27208, %r27207, %r27202; + shf.l.wrap.b32 %r27209, %r27208, %r27208, 25; + add.s32 %r27210, %r27162, %r26669; + add.s32 %r27211, %r27210, %r27181; + xor.b32 %r27212, %r27211, %r27150; + shf.l.wrap.b32 %r27213, %r27212, %r27212, 16; + add.s32 %r27214, %r27213, %r27137; + xor.b32 %r27215, %r27214, %r27181; + shf.l.wrap.b32 %r27216, %r27215, %r27215, 20; + add.s32 %r27217, %r27211, %r26702; + add.s32 %r27218, %r27217, %r27216; + xor.b32 %r27219, %r27218, %r27213; + shf.l.wrap.b32 %r27220, %r27219, %r27219, 24; + add.s32 %r27221, %r27220, %r27214; + xor.b32 %r27222, %r27221, %r27216; + shf.l.wrap.b32 %r27223, %r27222, %r27222, 25; + add.s32 %r27224, %r27139, %r26625; + add.s32 %r27225, %r27224, %r27176; + xor.b32 %r27226, %r27164, %r27225; + shf.l.wrap.b32 %r27227, %r27226, %r27226, 16; + add.s32 %r27228, %r27227, %r27151; + xor.b32 %r27229, %r27228, %r27139; + shf.l.wrap.b32 %r27230, %r27229, %r27229, 20; + add.s32 %r27231, %r27225, %r26680; + add.s32 %r27232, %r27231, %r27230; + xor.b32 %r27233, %r27232, %r27227; + shf.l.wrap.b32 %r27234, %r27233, %r27233, 24; + add.s32 %r27235, %r27234, %r27228; + xor.b32 %r27236, %r27235, %r27230; + shf.l.wrap.b32 %r27237, %r27236, %r27236, 25; + add.s32 %r27238, %r27190, %r26746; + add.s32 %r27239, %r27238, %r27237; + xor.b32 %r27240, %r27206, %r27239; + shf.l.wrap.b32 %r27241, %r27240, %r27240, 16; + add.s32 %r27242, %r27241, %r27221; + xor.b32 %r27243, %r27242, %r27237; + shf.l.wrap.b32 %r27244, %r27243, %r27243, 20; + add.s32 %r27245, %r27239, %r26757; + add.s32 %r27246, %r27245, %r27244; + xor.b32 %r27247, %r27246, %r27241; + shf.l.wrap.b32 %r27248, %r27247, %r27247, 24; + add.s32 %r27249, %r27248, %r27242; + xor.b32 %r27250, %r27249, %r27244; + shf.l.wrap.b32 %r27251, %r27250, %r27250, 25; + add.s32 %r27252, %r27204, %r26713; + add.s32 %r27253, %r27252, %r27195; + xor.b32 %r27254, %r27220, %r27253; + shf.l.wrap.b32 %r27255, %r27254, %r27254, 16; + add.s32 %r27256, %r27255, %r27235; + xor.b32 %r27257, %r27256, %r27195; + shf.l.wrap.b32 %r27258, %r27257, %r27257, 20; + add.s32 %r27259, %r27253, %r26735; + add.s32 %r27260, %r27259, %r27258; + xor.b32 %r27261, %r27260, %r27255; + shf.l.wrap.b32 %r27262, %r27261, %r27261, 24; + add.s32 %r27263, %r27262, %r27256; + xor.b32 %r27264, %r27263, %r27258; + 
shf.l.wrap.b32 %r27265, %r27264, %r27264, 25; + add.s32 %r27266, %r27218, %r26779; + add.s32 %r27267, %r27266, %r27209; + xor.b32 %r27268, %r27267, %r27234; + shf.l.wrap.b32 %r27269, %r27268, %r27268, 16; + add.s32 %r27270, %r27269, %r27193; + xor.b32 %r27271, %r27270, %r27209; + shf.l.wrap.b32 %r27272, %r27271, %r27271, 20; + add.s32 %r27273, %r27267, %r26724; + add.s32 %r27274, %r27273, %r27272; + xor.b32 %r27275, %r27274, %r27269; + shf.l.wrap.b32 %r27276, %r27275, %r27275, 24; + add.s32 %r27277, %r27276, %r27270; + xor.b32 %r27278, %r27277, %r27272; + shf.l.wrap.b32 %r27279, %r27278, %r27278, 25; + add.s32 %r27280, %r27232, %r26768; + add.s32 %r27281, %r27280, %r27223; + xor.b32 %r27282, %r27281, %r27192; + shf.l.wrap.b32 %r27283, %r27282, %r27282, 16; + add.s32 %r27284, %r27283, %r27207; + xor.b32 %r27285, %r27284, %r27223; + shf.l.wrap.b32 %r27286, %r27285, %r27285, 20; + add.s32 %r27287, %r27281, %r26702; + add.s32 %r27288, %r27287, %r27286; + xor.b32 %r27289, %r27288, %r27283; + shf.l.wrap.b32 %r27290, %r27289, %r27289, 24; + add.s32 %r27291, %r27290, %r27284; + xor.b32 %r27292, %r27291, %r27286; + shf.l.wrap.b32 %r27293, %r27292, %r27292, 25; + add.s32 %r27294, %r27246, %r26691; + add.s32 %r27295, %r27294, %r27265; + xor.b32 %r27296, %r27290, %r27295; + shf.l.wrap.b32 %r27297, %r27296, %r27296, 16; + add.s32 %r27298, %r27297, %r27277; + xor.b32 %r27299, %r27298, %r27265; + shf.l.wrap.b32 %r27300, %r27299, %r27299, 20; + add.s32 %r27301, %r27295, %r26636; + add.s32 %r27302, %r27301, %r27300; + xor.b32 %r27303, %r27302, %r27297; + shf.l.wrap.b32 %r27304, %r27303, %r27303, 24; + add.s32 %r27305, %r27304, %r27298; + xor.b32 %r27306, %r27305, %r27300; + shf.l.wrap.b32 %r27307, %r27306, %r27306, 25; + add.s32 %r27308, %r27260, %r26669; + add.s32 %r27309, %r27308, %r27279; + xor.b32 %r27310, %r27309, %r27248; + shf.l.wrap.b32 %r27311, %r27310, %r27310, 16; + add.s32 %r27312, %r27311, %r27291; + xor.b32 %r27313, %r27312, %r27279; + shf.l.wrap.b32 %r27314, %r27313, %r27313, 20; + add.s32 %r27315, %r27309, %r26647; + add.s32 %r27316, %r27315, %r27314; + xor.b32 %r27317, %r27316, %r27311; + shf.l.wrap.b32 %r27318, %r27317, %r27317, 24; + add.s32 %r27319, %r27318, %r27312; + xor.b32 %r27320, %r27319, %r27314; + shf.l.wrap.b32 %r27321, %r27320, %r27320, 25; + add.s32 %r27322, %r27274, %r26614; + add.s32 %r27323, %r27322, %r27293; + xor.b32 %r27324, %r27323, %r27262; + shf.l.wrap.b32 %r27325, %r27324, %r27324, 16; + add.s32 %r27326, %r27325, %r27249; + xor.b32 %r27327, %r27326, %r27293; + shf.l.wrap.b32 %r27328, %r27327, %r27327, 20; + add.s32 %r27329, %r27323, %r26625; + add.s32 %r27330, %r27329, %r27328; + xor.b32 %r27331, %r27330, %r27325; + shf.l.wrap.b32 %r27332, %r27331, %r27331, 24; + add.s32 %r27333, %r27332, %r27326; + xor.b32 %r27334, %r27333, %r27328; + shf.l.wrap.b32 %r27335, %r27334, %r27334, 25; + add.s32 %r27336, %r27251, %r26680; + add.s32 %r27337, %r27336, %r27288; + xor.b32 %r27338, %r27276, %r27337; + shf.l.wrap.b32 %r27339, %r27338, %r27338, 16; + add.s32 %r27340, %r27339, %r27263; + xor.b32 %r27341, %r27340, %r27251; + shf.l.wrap.b32 %r27342, %r27341, %r27341, 20; + add.s32 %r27343, %r27337, %r26658; + add.s32 %r27344, %r27343, %r27342; + xor.b32 %r27345, %r27344, %r27339; + shf.l.wrap.b32 %r27346, %r27345, %r27345, 24; + add.s32 %r27347, %r27346, %r27340; + xor.b32 %r27348, %r27347, %r27342; + shf.l.wrap.b32 %r27349, %r27348, %r27348, 25; + add.s32 %r27350, %r27302, %r26713; + add.s32 %r27351, %r27350, %r27349; + xor.b32 %r27352, %r27318, %r27351; + shf.l.wrap.b32 %r27353, 
%r27352, %r27352, 16; + add.s32 %r27354, %r27353, %r27333; + xor.b32 %r27355, %r27354, %r27349; + shf.l.wrap.b32 %r27356, %r27355, %r27355, 20; + add.s32 %r27357, %r27351, %r26768; + add.s32 %r27358, %r27357, %r27356; + xor.b32 %r27359, %r27358, %r27353; + shf.l.wrap.b32 %r27360, %r27359, %r27359, 24; + add.s32 %r27361, %r27360, %r27354; + xor.b32 %r27362, %r27361, %r27356; + shf.l.wrap.b32 %r27363, %r27362, %r27362, 25; + add.s32 %r27364, %r27316, %r26735; + add.s32 %r27365, %r27364, %r27307; + xor.b32 %r27366, %r27332, %r27365; + shf.l.wrap.b32 %r27367, %r27366, %r27366, 16; + add.s32 %r27368, %r27367, %r27347; + xor.b32 %r27369, %r27368, %r27307; + shf.l.wrap.b32 %r27370, %r27369, %r27369, 20; + add.s32 %r27371, %r27365, %r26669; + add.s32 %r27372, %r27371, %r27370; + xor.b32 %r27373, %r27372, %r27367; + shf.l.wrap.b32 %r27374, %r27373, %r27373, 24; + add.s32 %r27375, %r27374, %r27368; + xor.b32 %r27376, %r27375, %r27370; + shf.l.wrap.b32 %r27377, %r27376, %r27376, 25; + add.s32 %r27378, %r27330, %r26702; + add.s32 %r27379, %r27378, %r27321; + xor.b32 %r27380, %r27379, %r27346; + shf.l.wrap.b32 %r27381, %r27380, %r27380, 16; + add.s32 %r27382, %r27381, %r27305; + xor.b32 %r27383, %r27382, %r27321; + shf.l.wrap.b32 %r27384, %r27383, %r27383, 20; + add.s32 %r27385, %r27379, %r26746; + add.s32 %r27386, %r27385, %r27384; + xor.b32 %r27387, %r27386, %r27381; + shf.l.wrap.b32 %r27388, %r27387, %r27387, 24; + add.s32 %r27389, %r27388, %r27382; + xor.b32 %r27390, %r27389, %r27384; + shf.l.wrap.b32 %r27391, %r27390, %r27390, 25; + add.s32 %r27392, %r27344, %r26779; + add.s32 %r27393, %r27392, %r27335; + xor.b32 %r27394, %r27393, %r27304; + shf.l.wrap.b32 %r27395, %r27394, %r27394, 16; + add.s32 %r27396, %r27395, %r27319; + xor.b32 %r27397, %r27396, %r27335; + shf.l.wrap.b32 %r27398, %r27397, %r27397, 20; + add.s32 %r27399, %r27393, %r26625; + add.s32 %r27400, %r27399, %r27398; + xor.b32 %r27401, %r27400, %r27395; + shf.l.wrap.b32 %r27402, %r27401, %r27401, 24; + add.s32 %r27403, %r27402, %r27396; + xor.b32 %r27404, %r27403, %r27398; + shf.l.wrap.b32 %r27405, %r27404, %r27404, 25; + add.s32 %r27406, %r27358, %r26757; + add.s32 %r27407, %r27406, %r27377; + xor.b32 %r27408, %r27402, %r27407; + shf.l.wrap.b32 %r27409, %r27408, %r27408, 16; + add.s32 %r27410, %r27409, %r27389; + xor.b32 %r27411, %r27410, %r27377; + shf.l.wrap.b32 %r27412, %r27411, %r27411, 20; + add.s32 %r27413, %r27407, %r26647; + add.s32 %r27414, %r27413, %r27412; + xor.b32 %r27415, %r27414, %r27409; + shf.l.wrap.b32 %r27416, %r27415, %r27415, 24; + add.s32 %r27417, %r27416, %r27410; + xor.b32 %r27418, %r27417, %r27412; + shf.l.wrap.b32 %r27419, %r27418, %r27418, 25; + add.s32 %r27420, %r27372, %r26614; + add.s32 %r27421, %r27420, %r27391; + xor.b32 %r27422, %r27421, %r27360; + shf.l.wrap.b32 %r27423, %r27422, %r27422, 16; + add.s32 %r27424, %r27423, %r27403; + xor.b32 %r27425, %r27424, %r27391; + shf.l.wrap.b32 %r27426, %r27425, %r27425, 20; + add.s32 %r27427, %r27421, %r26724; + add.s32 %r27428, %r27427, %r27426; + xor.b32 %r27429, %r27428, %r27423; + shf.l.wrap.b32 %r27430, %r27429, %r27429, 24; + add.s32 %r27431, %r27430, %r27424; + xor.b32 %r27432, %r27431, %r27426; + shf.l.wrap.b32 %r27433, %r27432, %r27432, 25; + add.s32 %r27434, %r27386, %r26636; + add.s32 %r27435, %r27434, %r27405; + xor.b32 %r27436, %r27435, %r27374; + shf.l.wrap.b32 %r27437, %r27436, %r27436, 16; + add.s32 %r27438, %r27437, %r27361; + xor.b32 %r27439, %r27438, %r27405; + shf.l.wrap.b32 %r27440, %r27439, %r27439, 20; + add.s32 %r27441, %r27435, %r26680; + 
add.s32 %r27442, %r27441, %r27440; + xor.b32 %r27443, %r27442, %r27437; + shf.l.wrap.b32 %r27444, %r27443, %r27443, 24; + add.s32 %r27445, %r27444, %r27438; + xor.b32 %r27446, %r27445, %r27440; + shf.l.wrap.b32 %r27447, %r27446, %r27446, 25; + add.s32 %r27448, %r27363, %r26658; + add.s32 %r27449, %r27448, %r27400; + xor.b32 %r27450, %r27388, %r27449; + shf.l.wrap.b32 %r27451, %r27450, %r27450, 16; + add.s32 %r27452, %r27451, %r27375; + xor.b32 %r27453, %r27452, %r27363; + shf.l.wrap.b32 %r27454, %r27453, %r27453, 20; + add.s32 %r27455, %r27449, %r26691; + add.s32 %r27456, %r27455, %r27454; + xor.b32 %r27457, %r27456, %r27451; + shf.l.wrap.b32 %r27458, %r27457, %r27457, 24; + add.s32 %r27459, %r27458, %r27452; + xor.b32 %r27460, %r27459, %r27454; + shf.l.wrap.b32 %r27461, %r27460, %r27460, 25; + add.s32 %r27462, %r27414, %r26735; + add.s32 %r27463, %r27462, %r27461; + xor.b32 %r27464, %r27430, %r27463; + shf.l.wrap.b32 %r27465, %r27464, %r27464, 16; + add.s32 %r27466, %r27465, %r27445; + xor.b32 %r27467, %r27466, %r27461; + shf.l.wrap.b32 %r27468, %r27467, %r27467, 20; + add.s32 %r27469, %r27463, %r26779; + add.s32 %r27470, %r27469, %r27468; + xor.b32 %r27471, %r27470, %r27465; + shf.l.wrap.b32 %r27472, %r27471, %r27471, 24; + add.s32 %r27473, %r27472, %r27466; + xor.b32 %r27474, %r27473, %r27468; + shf.l.wrap.b32 %r27475, %r27474, %r27474, 25; + add.s32 %r27476, %r27428, %r26669; + add.s32 %r27477, %r27476, %r27419; + xor.b32 %r27478, %r27444, %r27477; + shf.l.wrap.b32 %r27479, %r27478, %r27478, 16; + add.s32 %r27480, %r27479, %r27459; + xor.b32 %r27481, %r27480, %r27419; + shf.l.wrap.b32 %r27482, %r27481, %r27481, 20; + add.s32 %r27483, %r27477, %r26614; + add.s32 %r27484, %r27483, %r27482; + xor.b32 %r27485, %r27484, %r27479; + shf.l.wrap.b32 %r27486, %r27485, %r27485, 24; + add.s32 %r27487, %r27486, %r27480; + xor.b32 %r27488, %r27487, %r27482; + shf.l.wrap.b32 %r27489, %r27488, %r27488, 25; + add.s32 %r27490, %r27442, %r26625; + add.s32 %r27491, %r27490, %r27433; + xor.b32 %r27492, %r27491, %r27458; + shf.l.wrap.b32 %r27493, %r27492, %r27492, 16; + add.s32 %r27494, %r27493, %r27417; + xor.b32 %r27495, %r27494, %r27433; + shf.l.wrap.b32 %r27496, %r27495, %r27495, 20; + add.s32 %r27497, %r27491, %r26713; + add.s32 %r27498, %r27497, %r27496; + xor.b32 %r27499, %r27498, %r27493; + shf.l.wrap.b32 %r27500, %r27499, %r27499, 24; + add.s32 %r27501, %r27500, %r27494; + xor.b32 %r27502, %r27501, %r27496; + shf.l.wrap.b32 %r27503, %r27502, %r27502, 25; + add.s32 %r27504, %r27456, %r26702; + add.s32 %r27505, %r27504, %r27447; + xor.b32 %r27506, %r27505, %r27416; + shf.l.wrap.b32 %r27507, %r27506, %r27506, 16; + add.s32 %r27508, %r27507, %r27431; + xor.b32 %r27509, %r27508, %r27447; + shf.l.wrap.b32 %r27510, %r27509, %r27509, 20; + add.s32 %r27511, %r27505, %r26680; + add.s32 %r27512, %r27511, %r27510; + xor.b32 %r27513, %r27512, %r27507; + shf.l.wrap.b32 %r27514, %r27513, %r27513, 24; + add.s32 %r27515, %r27514, %r27508; + xor.b32 %r27516, %r27515, %r27510; + shf.l.wrap.b32 %r27517, %r27516, %r27516, 25; + add.s32 %r27518, %r27470, %r26768; + add.s32 %r27519, %r27518, %r27489; + xor.b32 %r27520, %r27514, %r27519; + shf.l.wrap.b32 %r27521, %r27520, %r27520, 16; + add.s32 %r27522, %r27521, %r27501; + xor.b32 %r27523, %r27522, %r27489; + shf.l.wrap.b32 %r27524, %r27523, %r27523, 20; + add.s32 %r27525, %r27519, %r26724; + add.s32 %r27526, %r27525, %r27524; + xor.b32 %r27527, %r27526, %r27521; + shr.u32 %r27528, %r27527, 8; + shf.l.wrap.b32 %r27529, %r27527, %r27527, 24; + add.s32 %r27530, %r27529, 
%r27522; + xor.b32 %r27531, %r27530, %r27524; + shr.u32 %r27532, %r27531, 7; + shf.l.wrap.b32 %r27533, %r27531, %r27531, 25; + add.s32 %r27534, %r27484, %r26636; + add.s32 %r27535, %r27534, %r27503; + xor.b32 %r27536, %r27535, %r27472; + shf.l.wrap.b32 %r27537, %r27536, %r27536, 16; + add.s32 %r27538, %r27537, %r27515; + xor.b32 %r27539, %r27538, %r27503; + shf.l.wrap.b32 %r27540, %r27539, %r27539, 20; + add.s32 %r27541, %r27535, %r26746; + add.s32 %r27542, %r27541, %r27540; + xor.b32 %r27543, %r27542, %r27537; + shr.u32 %r27544, %r27543, 8; + shf.l.wrap.b32 %r27545, %r27543, %r27543, 24; + add.s32 %r27546, %r27545, %r27538; + xor.b32 %r27547, %r27546, %r27540; + shr.u32 %r27548, %r27547, 7; + shf.l.wrap.b32 %r27549, %r27547, %r27547, 25; + add.s32 %r27550, %r27498, %r26647; + add.s32 %r27551, %r27550, %r27517; + xor.b32 %r27552, %r27551, %r27486; + shf.l.wrap.b32 %r27553, %r27552, %r27552, 16; + add.s32 %r27554, %r27553, %r27473; + xor.b32 %r27555, %r27554, %r27517; + shf.l.wrap.b32 %r27556, %r27555, %r27555, 20; + add.s32 %r27557, %r27551, %r26658; + add.s32 %r27558, %r27557, %r27556; + xor.b32 %r27559, %r27558, %r27553; + shr.u32 %r27560, %r27559, 8; + shf.l.wrap.b32 %r27561, %r27559, %r27559, 24; + add.s32 %r27562, %r27561, %r27554; + xor.b32 %r27563, %r27562, %r27556; + shr.u32 %r27564, %r27563, 7; + shf.l.wrap.b32 %r27565, %r27563, %r27563, 25; + add.s32 %r27566, %r27475, %r26691; + add.s32 %r27567, %r27566, %r27512; + xor.b32 %r27568, %r27500, %r27567; + shf.l.wrap.b32 %r27569, %r27568, %r27568, 16; + add.s32 %r27570, %r27569, %r27487; + xor.b32 %r27571, %r27570, %r27475; + shf.l.wrap.b32 %r27572, %r27571, %r27571, 20; + add.s32 %r27573, %r27567, %r26757; + add.s32 %r27574, %r27573, %r27572; + xor.b32 %r27575, %r27574, %r27569; + shr.u32 %r27576, %r27575, 8; + shf.l.wrap.b32 %r27577, %r27575, %r27575, 24; + add.s32 %r27578, %r27577, %r27570; + xor.b32 %r27579, %r27578, %r27572; + shr.u32 %r27580, %r27579, 7; + shf.l.wrap.b32 %r27581, %r27579, %r27579, 25; + xor.b32 %r27582, %r27562, %r27526; + xor.b32 %r27583, %r27542, %r27578; + xor.b32 %r27584, %r27558, %r27530; + xor.b32 %r27585, %r27546, %r27574; + xor.b32 %r27586, %r27545, %r27581; + xor.b32 %r27587, %r27561, %r27533; + xor.b32 %r27588, %r27549, %r27577; + xor.b32 %r27589, %r27565, %r27529; + cvt.u16.u32 %rs553, %r27526; + cvt.u16.u32 %rs554, %r27562; + xor.b16 %rs832, %rs554, %rs553; + shr.u32 %r27590, %r27582, 8; + cvt.u16.u32 %rs833, %r27590; + shr.u32 %r27591, %r27582, 16; + cvt.u16.u32 %rs834, %r27591; + shr.u32 %r27592, %r27582, 24; + cvt.u16.u32 %rs835, %r27592; + cvt.u16.u32 %rs555, %r27578; + cvt.u16.u32 %rs556, %r27542; + xor.b16 %rs836, %rs556, %rs555; + shr.u32 %r27593, %r27583, 8; + cvt.u16.u32 %rs837, %r27593; + shr.u32 %r27594, %r27583, 16; + cvt.u16.u32 %rs838, %r27594; + shr.u32 %r27595, %r27583, 24; + cvt.u16.u32 %rs839, %r27595; + cvt.u16.u32 %rs557, %r27530; + cvt.u16.u32 %rs558, %r27558; + xor.b16 %rs840, %rs558, %rs557; + shr.u32 %r27596, %r27584, 8; + cvt.u16.u32 %rs841, %r27596; + shr.u32 %r27597, %r27584, 16; + cvt.u16.u32 %rs842, %r27597; + shr.u32 %r27598, %r27584, 24; + cvt.u16.u32 %rs843, %r27598; + cvt.u16.u32 %rs559, %r27546; + cvt.u16.u32 %rs560, %r27574; + xor.b16 %rs844, %rs559, %rs560; + shr.u32 %r27599, %r27585, 8; + cvt.u16.u32 %rs845, %r27599; + shr.u32 %r27600, %r27585, 16; + cvt.u16.u32 %rs846, %r27600; + shr.u32 %r27601, %r27585, 24; + cvt.u16.u32 %rs847, %r27601; + cvt.u16.u32 %rs561, %r27580; + cvt.u16.u32 %rs562, %r27544; + xor.b16 %rs848, %rs562, %rs561; + shr.u32 %r27602, %r27586, 
8; + cvt.u16.u32 %rs849, %r27602; + shr.u32 %r27603, %r27586, 16; + cvt.u16.u32 %rs850, %r27603; + shr.u32 %r27604, %r27586, 24; + cvt.u16.u32 %rs851, %r27604; + cvt.u16.u32 %rs563, %r27532; + cvt.u16.u32 %rs564, %r27560; + xor.b16 %rs852, %rs564, %rs563; + shr.u32 %r27605, %r27587, 8; + cvt.u16.u32 %rs853, %r27605; + shr.u32 %r27606, %r27587, 16; + cvt.u16.u32 %rs854, %r27606; + shr.u32 %r27607, %r27587, 24; + cvt.u16.u32 %rs855, %r27607; + cvt.u16.u32 %rs565, %r27548; + cvt.u16.u32 %rs566, %r27576; + xor.b16 %rs856, %rs565, %rs566; + shr.u32 %r27608, %r27588, 8; + cvt.u16.u32 %rs857, %r27608; + shr.u32 %r27609, %r27588, 16; + cvt.u16.u32 %rs858, %r27609; + shr.u32 %r27610, %r27588, 24; + cvt.u16.u32 %rs859, %r27610; + cvt.u16.u32 %rs567, %r27528; + cvt.u16.u32 %rs568, %r27564; + xor.b16 %rs860, %rs568, %rs567; + shr.u32 %r27611, %r27589, 8; + cvt.u16.u32 %rs861, %r27611; + shr.u32 %r27612, %r27589, 16; + cvt.u16.u32 %rs862, %r27612; + shr.u32 %r27613, %r27589, 24; + cvt.u16.u32 %rs863, %r27613; + setp.ne.s64 %p57, %rd1275, 0; + mov.u16 %rs864, 64; + mov.u16 %rs734, %rs865; + mov.u16 %rs768, %rs831; + mov.u16 %rs769, %rs830; + mov.u16 %rs770, %rs829; + mov.u16 %rs771, %rs828; + mov.u16 %rs772, %rs827; + mov.u16 %rs773, %rs826; + mov.u16 %rs774, %rs825; + mov.u16 %rs775, %rs824; + mov.u16 %rs776, %rs823; + mov.u16 %rs777, %rs822; + mov.u16 %rs778, %rs821; + mov.u16 %rs779, %rs820; + mov.u16 %rs780, %rs819; + mov.u16 %rs781, %rs818; + mov.u16 %rs782, %rs817; + mov.u16 %rs783, %rs816; + mov.u16 %rs784, %rs815; + mov.u16 %rs785, %rs814; + mov.u16 %rs786, %rs813; + mov.u16 %rs787, %rs812; + mov.u16 %rs788, %rs811; + mov.u16 %rs789, %rs810; + mov.u16 %rs790, %rs809; + mov.u16 %rs791, %rs808; + mov.u16 %rs792, %rs807; + mov.u16 %rs793, %rs806; + mov.u16 %rs794, %rs805; + mov.u16 %rs795, %rs804; + mov.u16 %rs796, %rs803; + mov.u16 %rs797, %rs802; + mov.u16 %rs798, %rs801; + mov.u16 %rs799, %rs800; + mov.u64 %rd1276, %rd1143; + mov.u32 %r30969, %r30984; + mov.u32 %r30970, %r30983; + mov.u32 %r30971, %r30982; + mov.u32 %r30972, %r30981; + mov.u32 %r30973, %r30980; + mov.u32 %r30974, %r30979; + mov.u32 %r30975, %r30978; + mov.u32 %r30976, %r30977; + @%p57 bra $L__BB2_101; + +$L__BB2_102: + cvt.u32.u16 %r27614, %rs800; + and.b32 %r27615, %r27614, 255; + cvt.u32.u16 %r27616, %rs801; + prmt.b32 %r27617, %r27616, %r27615, 30212; + cvt.u32.u16 %r27618, %rs802; + shl.b32 %r27619, %r27618, 16; + and.b32 %r27620, %r27619, 16711680; + or.b32 %r27621, %r27617, %r27620; + cvt.u32.u16 %r27622, %rs803; + shl.b32 %r27623, %r27622, 24; + or.b32 %r27624, %r27621, %r27623; + cvt.u32.u16 %r27625, %rs804; + and.b32 %r27626, %r27625, 255; + cvt.u32.u16 %r27627, %rs805; + prmt.b32 %r27628, %r27627, %r27626, 30212; + cvt.u32.u16 %r27629, %rs806; + shl.b32 %r27630, %r27629, 16; + and.b32 %r27631, %r27630, 16711680; + or.b32 %r27632, %r27628, %r27631; + cvt.u32.u16 %r27633, %rs807; + shl.b32 %r27634, %r27633, 24; + or.b32 %r27635, %r27632, %r27634; + cvt.u32.u16 %r27636, %rs808; + and.b32 %r27637, %r27636, 255; + cvt.u32.u16 %r27638, %rs809; + prmt.b32 %r27639, %r27638, %r27637, 30212; + cvt.u32.u16 %r27640, %rs810; + shl.b32 %r27641, %r27640, 16; + and.b32 %r27642, %r27641, 16711680; + or.b32 %r27643, %r27639, %r27642; + cvt.u32.u16 %r27644, %rs811; + shl.b32 %r27645, %r27644, 24; + or.b32 %r27646, %r27643, %r27645; + cvt.u32.u16 %r27647, %rs812; + and.b32 %r27648, %r27647, 255; + cvt.u32.u16 %r27649, %rs813; + prmt.b32 %r27650, %r27649, %r27648, 30212; + cvt.u32.u16 %r27651, %rs814; + shl.b32 %r27652, %r27651, 16; + 
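+ // ($L__BB2_102) The surrounding cvt/prmt.b32/shl/or sequence repacks the per-byte state registers (%rs8xx) into little-endian 32-bit message words for the next compression.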
and.b32 %r27653, %r27652, 16711680; + or.b32 %r27654, %r27650, %r27653; + cvt.u32.u16 %r27655, %rs815; + shl.b32 %r27656, %r27655, 24; + or.b32 %r27657, %r27654, %r27656; + cvt.u32.u16 %r27658, %rs816; + and.b32 %r27659, %r27658, 255; + cvt.u32.u16 %r27660, %rs817; + prmt.b32 %r27661, %r27660, %r27659, 30212; + cvt.u32.u16 %r27662, %rs818; + shl.b32 %r27663, %r27662, 16; + and.b32 %r27664, %r27663, 16711680; + or.b32 %r27665, %r27661, %r27664; + cvt.u32.u16 %r27666, %rs819; + shl.b32 %r27667, %r27666, 24; + or.b32 %r27668, %r27665, %r27667; + cvt.u32.u16 %r27669, %rs820; + and.b32 %r27670, %r27669, 255; + cvt.u32.u16 %r27671, %rs821; + prmt.b32 %r27672, %r27671, %r27670, 30212; + cvt.u32.u16 %r27673, %rs822; + shl.b32 %r27674, %r27673, 16; + and.b32 %r27675, %r27674, 16711680; + or.b32 %r27676, %r27672, %r27675; + cvt.u32.u16 %r27677, %rs823; + shl.b32 %r27678, %r27677, 24; + or.b32 %r27679, %r27676, %r27678; + cvt.u32.u16 %r27680, %rs824; + and.b32 %r27681, %r27680, 255; + cvt.u32.u16 %r27682, %rs825; + prmt.b32 %r27683, %r27682, %r27681, 30212; + cvt.u32.u16 %r27684, %rs826; + shl.b32 %r27685, %r27684, 16; + and.b32 %r27686, %r27685, 16711680; + or.b32 %r27687, %r27683, %r27686; + cvt.u32.u16 %r27688, %rs827; + shl.b32 %r27689, %r27688, 24; + or.b32 %r27690, %r27687, %r27689; + cvt.u32.u16 %r27691, %rs828; + and.b32 %r27692, %r27691, 255; + cvt.u32.u16 %r27693, %rs829; + prmt.b32 %r27694, %r27693, %r27692, 30212; + cvt.u32.u16 %r27695, %rs830; + shl.b32 %r27696, %r27695, 16; + and.b32 %r27697, %r27696, 16711680; + or.b32 %r27698, %r27694, %r27697; + cvt.u32.u16 %r27699, %rs831; + shl.b32 %r27700, %r27699, 24; + or.b32 %r27701, %r27698, %r27700; + cvt.u32.u16 %r27702, %rs832; + and.b32 %r27703, %r27702, 255; + cvt.u32.u16 %r27704, %rs833; + prmt.b32 %r27705, %r27704, %r27703, 30212; + cvt.u32.u16 %r27706, %rs834; + shl.b32 %r27707, %r27706, 16; + and.b32 %r27708, %r27707, 16711680; + or.b32 %r27709, %r27705, %r27708; + cvt.u32.u16 %r27710, %rs835; + shl.b32 %r27711, %r27710, 24; + or.b32 %r27712, %r27709, %r27711; + cvt.u32.u16 %r27713, %rs836; + and.b32 %r27714, %r27713, 255; + cvt.u32.u16 %r27715, %rs837; + prmt.b32 %r27716, %r27715, %r27714, 30212; + cvt.u32.u16 %r27717, %rs838; + shl.b32 %r27718, %r27717, 16; + and.b32 %r27719, %r27718, 16711680; + or.b32 %r27720, %r27716, %r27719; + cvt.u32.u16 %r27721, %rs839; + shl.b32 %r27722, %r27721, 24; + or.b32 %r27723, %r27720, %r27722; + cvt.u32.u16 %r27724, %rs840; + and.b32 %r27725, %r27724, 255; + cvt.u32.u16 %r27726, %rs841; + prmt.b32 %r27727, %r27726, %r27725, 30212; + cvt.u32.u16 %r27728, %rs842; + shl.b32 %r27729, %r27728, 16; + and.b32 %r27730, %r27729, 16711680; + or.b32 %r27731, %r27727, %r27730; + cvt.u32.u16 %r27732, %rs843; + shl.b32 %r27733, %r27732, 24; + or.b32 %r27734, %r27731, %r27733; + cvt.u32.u16 %r27735, %rs844; + and.b32 %r27736, %r27735, 255; + cvt.u32.u16 %r27737, %rs845; + prmt.b32 %r27738, %r27737, %r27736, 30212; + cvt.u32.u16 %r27739, %rs846; + shl.b32 %r27740, %r27739, 16; + and.b32 %r27741, %r27740, 16711680; + or.b32 %r27742, %r27738, %r27741; + cvt.u32.u16 %r27743, %rs847; + shl.b32 %r27744, %r27743, 24; + or.b32 %r27745, %r27742, %r27744; + cvt.u32.u16 %r27746, %rs848; + and.b32 %r27747, %r27746, 255; + cvt.u32.u16 %r27748, %rs849; + prmt.b32 %r27749, %r27748, %r27747, 30212; + cvt.u32.u16 %r27750, %rs850; + shl.b32 %r27751, %r27750, 16; + and.b32 %r27752, %r27751, 16711680; + or.b32 %r27753, %r27749, %r27752; + cvt.u32.u16 %r27754, %rs851; + shl.b32 %r27755, %r27754, 24; + or.b32 %r27756, %r27753, %r27755; 
+ cvt.u32.u16 %r27757, %rs852; + and.b32 %r27758, %r27757, 255; + cvt.u32.u16 %r27759, %rs853; + prmt.b32 %r27760, %r27759, %r27758, 30212; + cvt.u32.u16 %r27761, %rs854; + shl.b32 %r27762, %r27761, 16; + and.b32 %r27763, %r27762, 16711680; + or.b32 %r27764, %r27760, %r27763; + cvt.u32.u16 %r27765, %rs855; + shl.b32 %r27766, %r27765, 24; + or.b32 %r27767, %r27764, %r27766; + cvt.u32.u16 %r27768, %rs856; + and.b32 %r27769, %r27768, 255; + cvt.u32.u16 %r27770, %rs857; + prmt.b32 %r27771, %r27770, %r27769, 30212; + cvt.u32.u16 %r27772, %rs858; + shl.b32 %r27773, %r27772, 16; + and.b32 %r27774, %r27773, 16711680; + or.b32 %r27775, %r27771, %r27774; + cvt.u32.u16 %r27776, %rs859; + shl.b32 %r27777, %r27776, 24; + or.b32 %r27778, %r27775, %r27777; + cvt.u32.u16 %r27779, %rs860; + and.b32 %r27780, %r27779, 255; + cvt.u32.u16 %r27781, %rs861; + prmt.b32 %r27782, %r27781, %r27780, 30212; + cvt.u32.u16 %r27783, %rs862; + shl.b32 %r27784, %r27783, 16; + and.b32 %r27785, %r27784, 16711680; + or.b32 %r27786, %r27782, %r27785; + cvt.u32.u16 %r27787, %rs863; + shl.b32 %r27788, %r27787, 24; + or.b32 %r27789, %r27786, %r27788; + or.b16 %rs569, %rs865, 8; + cvt.u32.u16 %r27790, %rs569; + and.b32 %r27791, %r27790, 255; + add.s32 %r27792, %r30981, %r30977; + add.s32 %r27793, %r27792, %r27624; + add.s32 %r27794, %r27635, %r27793; + add.s32 %r27795, %r30982, %r30978; + add.s32 %r27796, %r27795, %r27646; + add.s32 %r27797, %r27657, %r27796; + add.s32 %r27798, %r30983, %r30979; + add.s32 %r27799, %r27798, %r27668; + cvt.u32.u16 %r27800, %rs864; + and.b32 %r27801, %r27800, 255; + xor.b32 %r27802, %r27799, %r27801; + shr.u32 %r27803, %r27799, 16; + shl.b32 %r27804, %r27802, 16; + or.b32 %r27805, %r27804, %r27803; + add.s32 %r27806, %r27805, 1013904242; + xor.b32 %r27807, %r27806, %r30983; + shf.l.wrap.b32 %r27808, %r27807, %r27807, 20; + add.s32 %r27809, %r27679, %r27799; + add.s32 %r27810, %r27809, %r27808; + xor.b32 %r27811, %r27810, %r27805; + shf.l.wrap.b32 %r27812, %r27811, %r27811, 24; + add.s32 %r27813, %r27812, %r27806; + xor.b32 %r27814, %r27813, %r27808; + shf.l.wrap.b32 %r27815, %r27814, %r27814, 25; + add.s32 %r27816, %r30984, %r30980; + add.s32 %r27817, %r27816, %r27690; + xor.b32 %r27818, %r27817, %r27791; + shr.u32 %r27819, %r27817, 16; + shl.b32 %r27820, %r27818, 16; + or.b32 %r27821, %r27820, %r27819; + add.s32 %r27822, %r27821, -1521486534; + xor.b32 %r27823, %r27822, %r30984; + shf.l.wrap.b32 %r27824, %r27823, %r27823, 20; + add.s32 %r27825, %r27701, %r27817; + add.s32 %r27826, %r27825, %r27824; + xor.b32 %r27827, %r27826, %r27821; + shf.l.wrap.b32 %r27828, %r27827, %r27827, 24; + add.s32 %r27829, %r27828, %r27822; + xor.b32 %r27830, %r27829, %r27824; + shf.l.wrap.b32 %r27831, %r27830, %r27830, 25; + add.s32 %r27832, %r27815, %r27734; + add.s32 %r27833, %r27810, %r27756; + add.s32 %r27834, %r27833, %r27831; + add.s32 %r27835, %r27834, %r27767; + add.s32 %r27836, %r27826, %r27778; + shf.l.wrap.b32 %r27837, %r27793, %r27793, 16; + add.s32 %r27838, %r27837, 1779033703; + xor.b32 %r27839, %r27838, %r30981; + shf.l.wrap.b32 %r27840, %r27839, %r27839, 20; + add.s32 %r27841, %r27794, %r27840; + xor.b32 %r27842, %r27841, %r27837; + shf.l.wrap.b32 %r27843, %r27842, %r27842, 24; + add.s32 %r27844, %r27843, %r27838; + xor.b32 %r27845, %r27844, %r27840; + shf.l.wrap.b32 %r27846, %r27845, %r27845, 25; + shf.l.wrap.b32 %r27847, %r27796, %r27796, 16; + add.s32 %r27848, %r27847, -1150833019; + xor.b32 %r27849, %r27848, %r30982; + shf.l.wrap.b32 %r27850, %r27849, %r27849, 20; + add.s32 %r27851, %r27797, %r27850; 
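+ // The constants 1779033703, -1150833019, 1013904242 and -1521486534 added above are 0x6A09E667, 0xBB67AE85, 0x3C6EF372 and 0xA54FF53A: the first four BLAKE3 IV words (the SHA-256 H0..H3 values), matching the IV table declared at the top of this file.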
+ xor.b32 %r27852, %r27851, %r27847; + shf.l.wrap.b32 %r27853, %r27852, %r27852, 24; + add.s32 %r27854, %r27853, %r27848; + xor.b32 %r27855, %r27854, %r27850; + shf.l.wrap.b32 %r27856, %r27855, %r27855, 25; + add.s32 %r27857, %r27841, %r27712; + add.s32 %r27858, %r27857, %r27856; + xor.b32 %r27859, %r27858, %r27828; + shf.l.wrap.b32 %r27860, %r27859, %r27859, 16; + add.s32 %r27861, %r27860, %r27813; + xor.b32 %r27862, %r27861, %r27856; + shf.l.wrap.b32 %r27863, %r27862, %r27862, 20; + add.s32 %r27864, %r27858, %r27723; + add.s32 %r27865, %r27864, %r27863; + xor.b32 %r27866, %r27865, %r27860; + shf.l.wrap.b32 %r27867, %r27866, %r27866, 24; + add.s32 %r27868, %r27867, %r27861; + xor.b32 %r27869, %r27868, %r27863; + shf.l.wrap.b32 %r27870, %r27869, %r27869, 25; + add.s32 %r27871, %r27832, %r27851; + xor.b32 %r27872, %r27843, %r27871; + shf.l.wrap.b32 %r27873, %r27872, %r27872, 16; + add.s32 %r27874, %r27873, %r27829; + xor.b32 %r27875, %r27874, %r27815; + shf.l.wrap.b32 %r27876, %r27875, %r27875, 20; + add.s32 %r27877, %r27871, %r27745; + add.s32 %r27878, %r27877, %r27876; + xor.b32 %r27879, %r27878, %r27873; + shf.l.wrap.b32 %r27880, %r27879, %r27879, 24; + add.s32 %r27881, %r27880, %r27874; + xor.b32 %r27882, %r27881, %r27876; + shf.l.wrap.b32 %r27883, %r27882, %r27882, 25; + xor.b32 %r27884, %r27853, %r27834; + shf.l.wrap.b32 %r27885, %r27884, %r27884, 16; + add.s32 %r27886, %r27885, %r27844; + xor.b32 %r27887, %r27886, %r27831; + shf.l.wrap.b32 %r27888, %r27887, %r27887, 20; + add.s32 %r27889, %r27835, %r27888; + xor.b32 %r27890, %r27889, %r27885; + shf.l.wrap.b32 %r27891, %r27890, %r27890, 24; + add.s32 %r27892, %r27891, %r27886; + xor.b32 %r27893, %r27892, %r27888; + shf.l.wrap.b32 %r27894, %r27893, %r27893, 25; + add.s32 %r27895, %r27836, %r27846; + xor.b32 %r27896, %r27895, %r27812; + shf.l.wrap.b32 %r27897, %r27896, %r27896, 16; + add.s32 %r27898, %r27897, %r27854; + xor.b32 %r27899, %r27898, %r27846; + shf.l.wrap.b32 %r27900, %r27899, %r27899, 20; + add.s32 %r27901, %r27895, %r27789; + add.s32 %r27902, %r27901, %r27900; + xor.b32 %r27903, %r27902, %r27897; + shf.l.wrap.b32 %r27904, %r27903, %r27903, 24; + add.s32 %r27905, %r27904, %r27898; + xor.b32 %r27906, %r27905, %r27900; + shf.l.wrap.b32 %r27907, %r27906, %r27906, 25; + add.s32 %r27908, %r27865, %r27646; + add.s32 %r27909, %r27908, %r27907; + xor.b32 %r27910, %r27909, %r27880; + shf.l.wrap.b32 %r27911, %r27910, %r27910, 16; + add.s32 %r27912, %r27911, %r27892; + xor.b32 %r27913, %r27912, %r27907; + shf.l.wrap.b32 %r27914, %r27913, %r27913, 20; + add.s32 %r27915, %r27909, %r27690; + add.s32 %r27916, %r27915, %r27914; + xor.b32 %r27917, %r27916, %r27911; + shf.l.wrap.b32 %r27918, %r27917, %r27917, 24; + add.s32 %r27919, %r27918, %r27912; + xor.b32 %r27920, %r27919, %r27914; + shf.l.wrap.b32 %r27921, %r27920, %r27920, 25; + add.s32 %r27922, %r27878, %r27657; + add.s32 %r27923, %r27922, %r27870; + xor.b32 %r27924, %r27923, %r27891; + shf.l.wrap.b32 %r27925, %r27924, %r27924, 16; + add.s32 %r27926, %r27925, %r27905; + xor.b32 %r27927, %r27926, %r27870; + shf.l.wrap.b32 %r27928, %r27927, %r27927, 20; + add.s32 %r27929, %r27923, %r27734; + add.s32 %r27930, %r27929, %r27928; + xor.b32 %r27931, %r27930, %r27925; + shf.l.wrap.b32 %r27932, %r27931, %r27931, 24; + add.s32 %r27933, %r27932, %r27926; + xor.b32 %r27934, %r27933, %r27928; + shf.l.wrap.b32 %r27935, %r27934, %r27934, 25; + add.s32 %r27936, %r27889, %r27701; + add.s32 %r27937, %r27936, %r27883; + xor.b32 %r27938, %r27904, %r27937; + shf.l.wrap.b32 %r27939, %r27938, %r27938, 16; + 
add.s32 %r27940, %r27939, %r27868; + xor.b32 %r27941, %r27940, %r27883; + shf.l.wrap.b32 %r27942, %r27941, %r27941, 20; + add.s32 %r27943, %r27937, %r27624; + add.s32 %r27944, %r27943, %r27942; + xor.b32 %r27945, %r27944, %r27939; + shf.l.wrap.b32 %r27946, %r27945, %r27945, 24; + add.s32 %r27947, %r27946, %r27940; + xor.b32 %r27948, %r27947, %r27942; + shf.l.wrap.b32 %r27949, %r27948, %r27948, 25; + add.s32 %r27950, %r27902, %r27668; + add.s32 %r27951, %r27950, %r27894; + xor.b32 %r27952, %r27867, %r27951; + shf.l.wrap.b32 %r27953, %r27952, %r27952, 16; + add.s32 %r27954, %r27953, %r27881; + xor.b32 %r27955, %r27954, %r27894; + shf.l.wrap.b32 %r27956, %r27955, %r27955, 20; + add.s32 %r27957, %r27951, %r27767; + add.s32 %r27958, %r27957, %r27956; + xor.b32 %r27959, %r27958, %r27953; + shf.l.wrap.b32 %r27960, %r27959, %r27959, 24; + add.s32 %r27961, %r27960, %r27954; + xor.b32 %r27962, %r27961, %r27956; + shf.l.wrap.b32 %r27963, %r27962, %r27962, 25; + add.s32 %r27964, %r27916, %r27635; + add.s32 %r27965, %r27964, %r27935; + xor.b32 %r27966, %r27965, %r27960; + shf.l.wrap.b32 %r27967, %r27966, %r27966, 16; + add.s32 %r27968, %r27967, %r27947; + xor.b32 %r27969, %r27968, %r27935; + shf.l.wrap.b32 %r27970, %r27969, %r27969, 20; + add.s32 %r27971, %r27965, %r27745; + add.s32 %r27972, %r27971, %r27970; + xor.b32 %r27973, %r27972, %r27967; + shf.l.wrap.b32 %r27974, %r27973, %r27973, 24; + add.s32 %r27975, %r27974, %r27968; + xor.b32 %r27976, %r27975, %r27970; + shf.l.wrap.b32 %r27977, %r27976, %r27976, 25; + add.s32 %r27978, %r27949, %r27756; + add.s32 %r27979, %r27978, %r27930; + xor.b32 %r27980, %r27918, %r27979; + shf.l.wrap.b32 %r27981, %r27980, %r27980, 16; + add.s32 %r27982, %r27981, %r27961; + xor.b32 %r27983, %r27982, %r27949; + shf.l.wrap.b32 %r27984, %r27983, %r27983, 20; + add.s32 %r27985, %r27979, %r27679; + add.s32 %r27986, %r27985, %r27984; + xor.b32 %r27987, %r27986, %r27981; + shf.l.wrap.b32 %r27988, %r27987, %r27987, 24; + add.s32 %r27989, %r27988, %r27982; + xor.b32 %r27990, %r27989, %r27984; + shf.l.wrap.b32 %r27991, %r27990, %r27990, 25; + add.s32 %r27992, %r27944, %r27723; + add.s32 %r27993, %r27992, %r27963; + xor.b32 %r27994, %r27932, %r27993; + shf.l.wrap.b32 %r27995, %r27994, %r27994, 16; + add.s32 %r27996, %r27995, %r27919; + xor.b32 %r27997, %r27996, %r27963; + shf.l.wrap.b32 %r27998, %r27997, %r27997, 20; + add.s32 %r27999, %r27993, %r27778; + add.s32 %r28000, %r27999, %r27998; + xor.b32 %r28001, %r28000, %r27995; + shf.l.wrap.b32 %r28002, %r28001, %r28001, 24; + add.s32 %r28003, %r28002, %r27996; + xor.b32 %r28004, %r28003, %r27998; + shf.l.wrap.b32 %r28005, %r28004, %r28004, 25; + add.s32 %r28006, %r27958, %r27789; + add.s32 %r28007, %r28006, %r27921; + xor.b32 %r28008, %r28007, %r27946; + shf.l.wrap.b32 %r28009, %r28008, %r28008, 16; + add.s32 %r28010, %r28009, %r27933; + xor.b32 %r28011, %r28010, %r27921; + shf.l.wrap.b32 %r28012, %r28011, %r28011, 20; + add.s32 %r28013, %r28007, %r27712; + add.s32 %r28014, %r28013, %r28012; + xor.b32 %r28015, %r28014, %r28009; + shf.l.wrap.b32 %r28016, %r28015, %r28015, 24; + add.s32 %r28017, %r28016, %r28010; + xor.b32 %r28018, %r28017, %r28012; + shf.l.wrap.b32 %r28019, %r28018, %r28018, 25; + add.s32 %r28020, %r27972, %r27657; + add.s32 %r28021, %r28020, %r28019; + xor.b32 %r28022, %r28021, %r27988; + shf.l.wrap.b32 %r28023, %r28022, %r28022, 16; + add.s32 %r28024, %r28023, %r28003; + xor.b32 %r28025, %r28024, %r28019; + shf.l.wrap.b32 %r28026, %r28025, %r28025, 20; + add.s32 %r28027, %r28021, %r27668; + add.s32 %r28028, 
%r28027, %r28026; + xor.b32 %r28029, %r28028, %r28023; + shf.l.wrap.b32 %r28030, %r28029, %r28029, 24; + add.s32 %r28031, %r28030, %r28024; + xor.b32 %r28032, %r28031, %r28026; + shf.l.wrap.b32 %r28033, %r28032, %r28032, 25; + add.s32 %r28034, %r27986, %r27734; + add.s32 %r28035, %r28034, %r27977; + xor.b32 %r28036, %r28035, %r28002; + shf.l.wrap.b32 %r28037, %r28036, %r28036, 16; + add.s32 %r28038, %r28037, %r28017; + xor.b32 %r28039, %r28038, %r27977; + shf.l.wrap.b32 %r28040, %r28039, %r28039, 20; + add.s32 %r28041, %r28035, %r27756; + add.s32 %r28042, %r28041, %r28040; + xor.b32 %r28043, %r28042, %r28037; + shf.l.wrap.b32 %r28044, %r28043, %r28043, 24; + add.s32 %r28045, %r28044, %r28038; + xor.b32 %r28046, %r28045, %r28040; + shf.l.wrap.b32 %r28047, %r28046, %r28046, 25; + add.s32 %r28048, %r28000, %r27767; + add.s32 %r28049, %r28048, %r27991; + xor.b32 %r28050, %r28016, %r28049; + shf.l.wrap.b32 %r28051, %r28050, %r28050, 16; + add.s32 %r28052, %r28051, %r27975; + xor.b32 %r28053, %r28052, %r27991; + shf.l.wrap.b32 %r28054, %r28053, %r28053, 20; + add.s32 %r28055, %r28049, %r27646; + add.s32 %r28056, %r28055, %r28054; + xor.b32 %r28057, %r28056, %r28051; + shf.l.wrap.b32 %r28058, %r28057, %r28057, 24; + add.s32 %r28059, %r28058, %r28052; + xor.b32 %r28060, %r28059, %r28054; + shf.l.wrap.b32 %r28061, %r28060, %r28060, 25; + add.s32 %r28062, %r28014, %r27701; + add.s32 %r28063, %r28062, %r28005; + xor.b32 %r28064, %r27974, %r28063; + shf.l.wrap.b32 %r28065, %r28064, %r28064, 16; + add.s32 %r28066, %r28065, %r27989; + xor.b32 %r28067, %r28066, %r28005; + shf.l.wrap.b32 %r28068, %r28067, %r28067, 20; + add.s32 %r28069, %r28063, %r27778; + add.s32 %r28070, %r28069, %r28068; + xor.b32 %r28071, %r28070, %r28065; + shf.l.wrap.b32 %r28072, %r28071, %r28071, 24; + add.s32 %r28073, %r28072, %r28066; + xor.b32 %r28074, %r28073, %r28068; + shf.l.wrap.b32 %r28075, %r28074, %r28074, 25; + add.s32 %r28076, %r28028, %r27690; + add.s32 %r28077, %r28076, %r28047; + xor.b32 %r28078, %r28077, %r28072; + shf.l.wrap.b32 %r28079, %r28078, %r28078, 16; + add.s32 %r28080, %r28079, %r28059; + xor.b32 %r28081, %r28080, %r28047; + shf.l.wrap.b32 %r28082, %r28081, %r28081, 20; + add.s32 %r28083, %r28077, %r27679; + add.s32 %r28084, %r28083, %r28082; + xor.b32 %r28085, %r28084, %r28079; + shf.l.wrap.b32 %r28086, %r28085, %r28085, 24; + add.s32 %r28087, %r28086, %r28080; + xor.b32 %r28088, %r28087, %r28082; + shf.l.wrap.b32 %r28089, %r28088, %r28088, 25; + add.s32 %r28090, %r28061, %r27723; + add.s32 %r28091, %r28090, %r28042; + xor.b32 %r28092, %r28030, %r28091; + shf.l.wrap.b32 %r28093, %r28092, %r28092, 16; + add.s32 %r28094, %r28093, %r28073; + xor.b32 %r28095, %r28094, %r28061; + shf.l.wrap.b32 %r28096, %r28095, %r28095, 20; + add.s32 %r28097, %r28091, %r27624; + add.s32 %r28098, %r28097, %r28096; + xor.b32 %r28099, %r28098, %r28093; + shf.l.wrap.b32 %r28100, %r28099, %r28099, 24; + add.s32 %r28101, %r28100, %r28094; + xor.b32 %r28102, %r28101, %r28096; + shf.l.wrap.b32 %r28103, %r28102, %r28102, 25; + add.s32 %r28104, %r28056, %r27745; + add.s32 %r28105, %r28104, %r28075; + xor.b32 %r28106, %r28044, %r28105; + shf.l.wrap.b32 %r28107, %r28106, %r28106, 16; + add.s32 %r28108, %r28107, %r28031; + xor.b32 %r28109, %r28108, %r28075; + shf.l.wrap.b32 %r28110, %r28109, %r28109, 20; + add.s32 %r28111, %r28105, %r27789; + add.s32 %r28112, %r28111, %r28110; + xor.b32 %r28113, %r28112, %r28107; + shf.l.wrap.b32 %r28114, %r28113, %r28113, 24; + add.s32 %r28115, %r28114, %r28108; + xor.b32 %r28116, %r28115, %r28110; + 
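+ // Compression rounds continue fully unrolled; each round appears to consume the sixteen message words in a permuted order (cf. the MSG_SCHEDULE table declared above).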
shf.l.wrap.b32 %r28117, %r28116, %r28116, 25; + add.s32 %r28118, %r28070, %r27712; + add.s32 %r28119, %r28118, %r28033; + xor.b32 %r28120, %r28119, %r28058; + shf.l.wrap.b32 %r28121, %r28120, %r28120, 16; + add.s32 %r28122, %r28121, %r28045; + xor.b32 %r28123, %r28122, %r28033; + shf.l.wrap.b32 %r28124, %r28123, %r28123, 20; + add.s32 %r28125, %r28119, %r27635; + add.s32 %r28126, %r28125, %r28124; + xor.b32 %r28127, %r28126, %r28121; + shf.l.wrap.b32 %r28128, %r28127, %r28127, 24; + add.s32 %r28129, %r28128, %r28122; + xor.b32 %r28130, %r28129, %r28124; + shf.l.wrap.b32 %r28131, %r28130, %r28130, 25; + add.s32 %r28132, %r28084, %r27734; + add.s32 %r28133, %r28132, %r28131; + xor.b32 %r28134, %r28133, %r28100; + shf.l.wrap.b32 %r28135, %r28134, %r28134, 16; + add.s32 %r28136, %r28135, %r28115; + xor.b32 %r28137, %r28136, %r28131; + shf.l.wrap.b32 %r28138, %r28137, %r28137, 20; + add.s32 %r28139, %r28133, %r27701; + add.s32 %r28140, %r28139, %r28138; + xor.b32 %r28141, %r28140, %r28135; + shf.l.wrap.b32 %r28142, %r28141, %r28141, 24; + add.s32 %r28143, %r28142, %r28136; + xor.b32 %r28144, %r28143, %r28138; + shf.l.wrap.b32 %r28145, %r28144, %r28144, 25; + add.s32 %r28146, %r28098, %r27756; + add.s32 %r28147, %r28146, %r28089; + xor.b32 %r28148, %r28147, %r28114; + shf.l.wrap.b32 %r28149, %r28148, %r28148, 16; + add.s32 %r28150, %r28149, %r28129; + xor.b32 %r28151, %r28150, %r28089; + shf.l.wrap.b32 %r28152, %r28151, %r28151, 20; + add.s32 %r28153, %r28147, %r27723; + add.s32 %r28154, %r28153, %r28152; + xor.b32 %r28155, %r28154, %r28149; + shf.l.wrap.b32 %r28156, %r28155, %r28155, 24; + add.s32 %r28157, %r28156, %r28150; + xor.b32 %r28158, %r28157, %r28152; + shf.l.wrap.b32 %r28159, %r28158, %r28158, 25; + add.s32 %r28160, %r28112, %r27778; + add.s32 %r28161, %r28160, %r28103; + xor.b32 %r28162, %r28128, %r28161; + shf.l.wrap.b32 %r28163, %r28162, %r28162, 16; + add.s32 %r28164, %r28163, %r28087; + xor.b32 %r28165, %r28164, %r28103; + shf.l.wrap.b32 %r28166, %r28165, %r28165, 20; + add.s32 %r28167, %r28161, %r27657; + add.s32 %r28168, %r28167, %r28166; + xor.b32 %r28169, %r28168, %r28163; + shf.l.wrap.b32 %r28170, %r28169, %r28169, 24; + add.s32 %r28171, %r28170, %r28164; + xor.b32 %r28172, %r28171, %r28166; + shf.l.wrap.b32 %r28173, %r28172, %r28172, 25; + add.s32 %r28174, %r28126, %r27767; + add.s32 %r28175, %r28174, %r28117; + xor.b32 %r28176, %r28086, %r28175; + shf.l.wrap.b32 %r28177, %r28176, %r28176, 16; + add.s32 %r28178, %r28177, %r28101; + xor.b32 %r28179, %r28178, %r28117; + shf.l.wrap.b32 %r28180, %r28179, %r28179, 20; + add.s32 %r28181, %r28175, %r27789; + add.s32 %r28182, %r28181, %r28180; + xor.b32 %r28183, %r28182, %r28177; + shf.l.wrap.b32 %r28184, %r28183, %r28183, 24; + add.s32 %r28185, %r28184, %r28178; + xor.b32 %r28186, %r28185, %r28180; + shf.l.wrap.b32 %r28187, %r28186, %r28186, 25; + add.s32 %r28188, %r28140, %r27668; + add.s32 %r28189, %r28188, %r28159; + xor.b32 %r28190, %r28189, %r28184; + shf.l.wrap.b32 %r28191, %r28190, %r28190, 16; + add.s32 %r28192, %r28191, %r28171; + xor.b32 %r28193, %r28192, %r28159; + shf.l.wrap.b32 %r28194, %r28193, %r28193, 20; + add.s32 %r28195, %r28189, %r27624; + add.s32 %r28196, %r28195, %r28194; + xor.b32 %r28197, %r28196, %r28191; + shf.l.wrap.b32 %r28198, %r28197, %r28197, 24; + add.s32 %r28199, %r28198, %r28192; + xor.b32 %r28200, %r28199, %r28194; + shf.l.wrap.b32 %r28201, %r28200, %r28200, 25; + add.s32 %r28202, %r28173, %r27745; + add.s32 %r28203, %r28202, %r28154; + xor.b32 %r28204, %r28142, %r28203; + shf.l.wrap.b32 %r28205, 
%r28204, %r28204, 16; + add.s32 %r28206, %r28205, %r28185; + xor.b32 %r28207, %r28206, %r28173; + shf.l.wrap.b32 %r28208, %r28207, %r28207, 20; + add.s32 %r28209, %r28203, %r27646; + add.s32 %r28210, %r28209, %r28208; + xor.b32 %r28211, %r28210, %r28205; + shf.l.wrap.b32 %r28212, %r28211, %r28211, 24; + add.s32 %r28213, %r28212, %r28206; + xor.b32 %r28214, %r28213, %r28208; + shf.l.wrap.b32 %r28215, %r28214, %r28214, 25; + add.s32 %r28216, %r28168, %r27679; + add.s32 %r28217, %r28216, %r28187; + xor.b32 %r28218, %r28156, %r28217; + shf.l.wrap.b32 %r28219, %r28218, %r28218, 16; + add.s32 %r28220, %r28219, %r28143; + xor.b32 %r28221, %r28220, %r28187; + shf.l.wrap.b32 %r28222, %r28221, %r28221, 20; + add.s32 %r28223, %r28217, %r27712; + add.s32 %r28224, %r28223, %r28222; + xor.b32 %r28225, %r28224, %r28219; + shf.l.wrap.b32 %r28226, %r28225, %r28225, 24; + add.s32 %r28227, %r28226, %r28220; + xor.b32 %r28228, %r28227, %r28222; + shf.l.wrap.b32 %r28229, %r28228, %r28228, 25; + add.s32 %r28230, %r28182, %r27635; + add.s32 %r28231, %r28230, %r28145; + xor.b32 %r28232, %r28231, %r28170; + shf.l.wrap.b32 %r28233, %r28232, %r28232, 16; + add.s32 %r28234, %r28233, %r28157; + xor.b32 %r28235, %r28234, %r28145; + shf.l.wrap.b32 %r28236, %r28235, %r28235, 20; + add.s32 %r28237, %r28231, %r27690; + add.s32 %r28238, %r28237, %r28236; + xor.b32 %r28239, %r28238, %r28233; + shf.l.wrap.b32 %r28240, %r28239, %r28239, 24; + add.s32 %r28241, %r28240, %r28234; + xor.b32 %r28242, %r28241, %r28236; + shf.l.wrap.b32 %r28243, %r28242, %r28242, 25; + add.s32 %r28244, %r28196, %r27756; + add.s32 %r28245, %r28244, %r28243; + xor.b32 %r28246, %r28245, %r28212; + shf.l.wrap.b32 %r28247, %r28246, %r28246, 16; + add.s32 %r28248, %r28247, %r28227; + xor.b32 %r28249, %r28248, %r28243; + shf.l.wrap.b32 %r28250, %r28249, %r28249, 20; + add.s32 %r28251, %r28245, %r27767; + add.s32 %r28252, %r28251, %r28250; + xor.b32 %r28253, %r28252, %r28247; + shf.l.wrap.b32 %r28254, %r28253, %r28253, 24; + add.s32 %r28255, %r28254, %r28248; + xor.b32 %r28256, %r28255, %r28250; + shf.l.wrap.b32 %r28257, %r28256, %r28256, 25; + add.s32 %r28258, %r28210, %r27723; + add.s32 %r28259, %r28258, %r28201; + xor.b32 %r28260, %r28259, %r28226; + shf.l.wrap.b32 %r28261, %r28260, %r28260, 16; + add.s32 %r28262, %r28261, %r28241; + xor.b32 %r28263, %r28262, %r28201; + shf.l.wrap.b32 %r28264, %r28263, %r28263, 20; + add.s32 %r28265, %r28259, %r27745; + add.s32 %r28266, %r28265, %r28264; + xor.b32 %r28267, %r28266, %r28261; + shf.l.wrap.b32 %r28268, %r28267, %r28267, 24; + add.s32 %r28269, %r28268, %r28262; + xor.b32 %r28270, %r28269, %r28264; + shf.l.wrap.b32 %r28271, %r28270, %r28270, 25; + add.s32 %r28272, %r28224, %r27789; + add.s32 %r28273, %r28272, %r28215; + xor.b32 %r28274, %r28240, %r28273; + shf.l.wrap.b32 %r28275, %r28274, %r28274, 16; + add.s32 %r28276, %r28275, %r28199; + xor.b32 %r28277, %r28276, %r28215; + shf.l.wrap.b32 %r28278, %r28277, %r28277, 20; + add.s32 %r28279, %r28273, %r27734; + add.s32 %r28280, %r28279, %r28278; + xor.b32 %r28281, %r28280, %r28275; + shf.l.wrap.b32 %r28282, %r28281, %r28281, 24; + add.s32 %r28283, %r28282, %r28276; + xor.b32 %r28284, %r28283, %r28278; + shf.l.wrap.b32 %r28285, %r28284, %r28284, 25; + add.s32 %r28286, %r28238, %r27778; + add.s32 %r28287, %r28286, %r28229; + xor.b32 %r28288, %r28198, %r28287; + shf.l.wrap.b32 %r28289, %r28288, %r28288, 16; + add.s32 %r28290, %r28289, %r28213; + xor.b32 %r28291, %r28290, %r28229; + shf.l.wrap.b32 %r28292, %r28291, %r28291, 20; + add.s32 %r28293, %r28287, %r27712; + 
add.s32     %r28294, %r28293, %r28292;
+    xor.b32     %r28295, %r28294, %r28289;
+    shf.l.wrap.b32     %r28296, %r28295, %r28295, 24;
+    add.s32     %r28297, %r28296, %r28290;
+    xor.b32     %r28298, %r28297, %r28292;
+    shf.l.wrap.b32     %r28299, %r28298, %r28298, 25;
+    add.s32     %r28300, %r28252, %r27701;
+    add.s32     %r28301, %r28300, %r28271;
+    xor.b32     %r28302, %r28301, %r28296;
+    shf.l.wrap.b32     %r28303, %r28302, %r28302, 16;
+    add.s32     %r28304, %r28303, %r28283;
+    xor.b32     %r28305, %r28304, %r28271;
+    shf.l.wrap.b32     %r28306, %r28305, %r28305, 20;
+    add.s32     %r28307, %r28301, %r27646;
+    add.s32     %r28308, %r28307, %r28306;
+    xor.b32     %r28309, %r28308, %r28303;
+    shf.l.wrap.b32     %r28310, %r28309, %r28309, 24;
+    add.s32     %r28311, %r28310, %r28304;
+    xor.b32     %r28312, %r28311, %r28306;
+    shf.l.wrap.b32     %r28313, %r28312, %r28312, 25;
+    add.s32     %r28314, %r28285, %r27679;
+    add.s32     %r28315, %r28314, %r28266;
+    xor.b32     %r28316, %r28254, %r28315;
+    shf.l.wrap.b32     %r28317, %r28316, %r28316, 16;
+    add.s32     %r28318, %r28317, %r28297;
+    xor.b32     %r28319, %r28318, %r28285;
+    shf.l.wrap.b32     %r28320, %r28319, %r28319, 20;
+    add.s32     %r28321, %r28315, %r27657;
+    add.s32     %r28322, %r28321, %r28320;
+    xor.b32     %r28323, %r28322, %r28317;
+    shf.l.wrap.b32     %r28324, %r28323, %r28323, 24;
+    add.s32     %r28325, %r28324, %r28318;
+    xor.b32     %r28326, %r28325, %r28320;
+    shf.l.wrap.b32     %r28327, %r28326, %r28326, 25;
+    add.s32     %r28328, %r28280, %r27624;
+    add.s32     %r28329, %r28328, %r28299;
+    xor.b32     %r28330, %r28268, %r28329;
+    shf.l.wrap.b32     %r28331, %r28330, %r28330, 16;
+    add.s32     %r28332, %r28331, %r28255;
+    xor.b32     %r28333, %r28332, %r28299;
+    shf.l.wrap.b32     %r28334, %r28333, %r28333, 20;
+    add.s32     %r28335, %r28329, %r27635;
+    add.s32     %r28336, %r28335, %r28334;
+    xor.b32     %r28337, %r28336, %r28331;
+    shf.l.wrap.b32     %r28338, %r28337, %r28337, 24;
+    add.s32     %r28339, %r28338, %r28332;
+    xor.b32     %r28340, %r28339, %r28334;
+    shf.l.wrap.b32     %r28341, %r28340, %r28340, 25;
+    add.s32     %r28342, %r28294, %r27690;
+    add.s32     %r28343, %r28342, %r28257;
+    xor.b32     %r28344, %r28343, %r28282;
+    shf.l.wrap.b32     %r28345, %r28344, %r28344, 16;
+    add.s32     %r28346, %r28345, %r28269;
+    xor.b32     %r28347, %r28346, %r28257;
+    shf.l.wrap.b32     %r28348, %r28347, %r28347, 20;
+    add.s32     %r28349, %r28343, %r27668;
+    add.s32     %r28350, %r28349, %r28348;
+    xor.b32     %r28351, %r28350, %r28345;
+    shf.l.wrap.b32     %r28352, %r28351, %r28351, 24;
+    add.s32     %r28353, %r28352, %r28346;
+    xor.b32     %r28354, %r28353, %r28348;
+    shf.l.wrap.b32     %r28355, %r28354, %r28354, 25;
+    add.s32     %r28356, %r28308, %r27723;
+    add.s32     %r28357, %r28356, %r28355;
+    xor.b32     %r28358, %r28357, %r28324;
+    shf.l.wrap.b32     %r28359, %r28358, %r28358, 16;
+    add.s32     %r28360, %r28359, %r28339;
+    xor.b32     %r28361, %r28360, %r28355;
+    shf.l.wrap.b32     %r28362, %r28361, %r28361, 20;
+    add.s32     %r28363, %r28357, %r27778;
+    add.s32     %r28364, %r28363, %r28362;
+    xor.b32     %r28365, %r28364, %r28359;
+    shf.l.wrap.b32     %r28366, %r28365, %r28365, 24;
+    add.s32     %r28367, %r28366, %r28360;
+    xor.b32     %r28368, %r28367, %r28362;
+    shf.l.wrap.b32     %r28369, %r28368, %r28368, 25;
+    add.s32     %r28370, %r28322, %r27745;
+    add.s32     %r28371, %r28370, %r28313;
+    xor.b32     %r28372, %r28371, %r28338;
+    shf.l.wrap.b32     %r28373, %r28372, %r28372, 16;
+    add.s32     %r28374, %r28373, %r28353;
+    xor.b32     %r28375, %r28374, %r28313;
+    shf.l.wrap.b32     %r28376, %r28375, %r28375, 20;
+    add.s32     %r28377, %r28371, %r27679;
+    add.s32     %r28378, %r28377, %r28376;
+    xor.b32     %r28379, %r28378, %r28373;
+    shf.l.wrap.b32     %r28380, %r28379, %r28379, 24;
+    add.s32     %r28381, %r28380, %r28374;
+    xor.b32     %r28382, %r28381, %r28376;
+    shf.l.wrap.b32     %r28383, %r28382, %r28382, 25;
+    add.s32     %r28384, %r28336, %r27712;
+    add.s32     %r28385, %r28384, %r28327;
+    xor.b32     %r28386, %r28352, %r28385;
+    shf.l.wrap.b32     %r28387, %r28386, %r28386, 16;
+    add.s32     %r28388, %r28387, %r28311;
+    xor.b32     %r28389, %r28388, %r28327;
+    shf.l.wrap.b32     %r28390, %r28389, %r28389, 20;
+    add.s32     %r28391, %r28385, %r27756;
+    add.s32     %r28392, %r28391, %r28390;
+    xor.b32     %r28393, %r28392, %r28387;
+    shf.l.wrap.b32     %r28394, %r28393, %r28393, 24;
+    add.s32     %r28395, %r28394, %r28388;
+    xor.b32     %r28396, %r28395, %r28390;
+    shf.l.wrap.b32     %r28397, %r28396, %r28396, 25;
+    add.s32     %r28398, %r28350, %r27789;
+    add.s32     %r28399, %r28398, %r28341;
+    xor.b32     %r28400, %r28310, %r28399;
+    shf.l.wrap.b32     %r28401, %r28400, %r28400, 16;
+    add.s32     %r28402, %r28401, %r28325;
+    xor.b32     %r28403, %r28402, %r28341;
+    shf.l.wrap.b32     %r28404, %r28403, %r28403, 20;
+    add.s32     %r28405, %r28399, %r27635;
+    add.s32     %r28406, %r28405, %r28404;
+    xor.b32     %r28407, %r28406, %r28401;
+    shf.l.wrap.b32     %r28408, %r28407, %r28407, 24;
+    add.s32     %r28409, %r28408, %r28402;
+    xor.b32     %r28410, %r28409, %r28404;
+    shf.l.wrap.b32     %r28411, %r28410, %r28410, 25;
+    add.s32     %r28412, %r28364, %r27767;
+    add.s32     %r28413, %r28412, %r28383;
+    xor.b32     %r28414, %r28413, %r28408;
+    shf.l.wrap.b32     %r28415, %r28414, %r28414, 16;
+    add.s32     %r28416, %r28415, %r28395;
+    xor.b32     %r28417, %r28416, %r28383;
+    shf.l.wrap.b32     %r28418, %r28417, %r28417, 20;
+    add.s32     %r28419, %r28413, %r27657;
+    add.s32     %r28420, %r28419, %r28418;
+    xor.b32     %r28421, %r28420, %r28415;
+    shf.l.wrap.b32     %r28422, %r28421, %r28421, 24;
+    add.s32     %r28423, %r28422, %r28416;
+    xor.b32     %r28424, %r28423, %r28418;
+    shf.l.wrap.b32     %r28425, %r28424, %r28424, 25;
+    add.s32     %r28426, %r28397, %r27624;
+    add.s32     %r28427, %r28426, %r28378;
+    xor.b32     %r28428, %r28366, %r28427;
+    shf.l.wrap.b32     %r28429, %r28428, %r28428, 16;
+    add.s32     %r28430, %r28429, %r28409;
+    xor.b32     %r28431, %r28430, %r28397;
+    shf.l.wrap.b32     %r28432, %r28431, %r28431, 20;
+    add.s32     %r28433, %r28427, %r27734;
+    add.s32     %r28434, %r28433, %r28432;
+    xor.b32     %r28435, %r28434, %r28429;
+    shf.l.wrap.b32     %r28436, %r28435, %r28435, 24;
+    add.s32     %r28437, %r28436, %r28430;
+    xor.b32     %r28438, %r28437, %r28432;
+    shf.l.wrap.b32     %r28439, %r28438, %r28438, 25;
+    add.s32     %r28440, %r28392, %r27646;
+    add.s32     %r28441, %r28440, %r28411;
+    xor.b32     %r28442, %r28380, %r28441;
+    shf.l.wrap.b32     %r28443, %r28442, %r28442, 16;
+    add.s32     %r28444, %r28443, %r28367;
+    xor.b32     %r28445, %r28444, %r28411;
+    shf.l.wrap.b32     %r28446, %r28445, %r28445, 20;
+    add.s32     %r28447, %r28441, %r27690;
+    add.s32     %r28448, %r28447, %r28446;
+    xor.b32     %r28449, %r28448, %r28443;
+    shf.l.wrap.b32     %r28450, %r28449, %r28449, 24;
+    add.s32     %r28451, %r28450, %r28444;
+    xor.b32     %r28452, %r28451, %r28446;
+    shf.l.wrap.b32     %r28453, %r28452, %r28452, 25;
+    add.s32     %r28454, %r28406, %r27668;
+    add.s32     %r28455, %r28454, %r28369;
+    xor.b32     %r28456, %r28455, %r28394;
+    shf.l.wrap.b32     %r28457, %r28456, %r28456, 16;
+    add.s32     %r28458, %r28457, %r28381;
+    xor.b32     %r28459, %r28458, %r28369;
+    shf.l.wrap.b32     %r28460, %r28459, %r28459, 20;
+    add.s32     %r28461, %r28455, %r27701;
+    add.s32     %r28462, %r28461, %r28460;
+    xor.b32     %r28463, %r28462, %r28457;
+    shf.l.wrap.b32     %r28464, %r28463, %r28463, 24;
+    add.s32     %r28465, %r28464, %r28458;
+    xor.b32     %r28466, %r28465, %r28460;
+    shf.l.wrap.b32     %r28467, %r28466, %r28466, 25;
+    add.s32     %r28468, %r28420, %r27745;
+    add.s32     %r28469, %r28468, %r28467;
+    xor.b32     %r28470, %r28469, %r28436;
+    shf.l.wrap.b32     %r28471, %r28470, %r28470, 16;
+    add.s32     %r28472, %r28471, %r28451;
+    xor.b32     %r28473, %r28472, %r28467;
+    shf.l.wrap.b32     %r28474, %r28473, %r28473, 20;
+    add.s32     %r28475, %r28469, %r27789;
+    add.s32     %r28476, %r28475, %r28474;
+    xor.b32     %r28477, %r28476, %r28471;
+    shf.l.wrap.b32     %r28478, %r28477, %r28477, 24;
+    add.s32     %r28479, %r28478, %r28472;
+    xor.b32     %r28480, %r28479, %r28474;
+    shf.l.wrap.b32     %r28481, %r28480, %r28480, 25;
+    add.s32     %r28482, %r28434, %r27679;
+    add.s32     %r28483, %r28482, %r28425;
+    xor.b32     %r28484, %r28483, %r28450;
+    shf.l.wrap.b32     %r28485, %r28484, %r28484, 16;
+    add.s32     %r28486, %r28485, %r28465;
+    xor.b32     %r28487, %r28486, %r28425;
+    shf.l.wrap.b32     %r28488, %r28487, %r28487, 20;
+    add.s32     %r28489, %r28483, %r27624;
+    add.s32     %r28490, %r28489, %r28488;
+    xor.b32     %r28491, %r28490, %r28485;
+    shf.l.wrap.b32     %r28492, %r28491, %r28491, 24;
+    add.s32     %r28493, %r28492, %r28486;
+    xor.b32     %r28494, %r28493, %r28488;
+    shf.l.wrap.b32     %r28495, %r28494, %r28494, 25;
+    add.s32     %r28496, %r28448, %r27635;
+    add.s32     %r28497, %r28496, %r28439;
+    xor.b32     %r28498, %r28464, %r28497;
+    shf.l.wrap.b32     %r28499, %r28498, %r28498, 16;
+    add.s32     %r28500, %r28499, %r28423;
+    xor.b32     %r28501, %r28500, %r28439;
+    shf.l.wrap.b32     %r28502, %r28501, %r28501, 20;
+    add.s32     %r28503, %r28497, %r27723;
+    add.s32     %r28504, %r28503, %r28502;
+    xor.b32     %r28505, %r28504, %r28499;
+    shf.l.wrap.b32     %r28506, %r28505, %r28505, 24;
+    add.s32     %r28507, %r28506, %r28500;
+    xor.b32     %r28508, %r28507, %r28502;
+    shf.l.wrap.b32     %r28509, %r28508, %r28508, 25;
+    add.s32     %r28510, %r28462, %r27712;
+    add.s32     %r28511, %r28510, %r28453;
+    xor.b32     %r28512, %r28422, %r28511;
+    shf.l.wrap.b32     %r28513, %r28512, %r28512, 16;
+    add.s32     %r28514, %r28513, %r28437;
+    xor.b32     %r28515, %r28514, %r28453;
+    shf.l.wrap.b32     %r28516, %r28515, %r28515, 20;
+    add.s32     %r28517, %r28511, %r27690;
+    add.s32     %r28518, %r28517, %r28516;
+    xor.b32     %r28519, %r28518, %r28513;
+    shf.l.wrap.b32     %r28520, %r28519, %r28519, 24;
+    add.s32     %r28521, %r28520, %r28514;
+    xor.b32     %r28522, %r28521, %r28516;
+    shf.l.wrap.b32     %r28523, %r28522, %r28522, 25;
+    add.s32     %r28524, %r28476, %r27778;
+    add.s32     %r28525, %r28524, %r28495;
+    xor.b32     %r28526, %r28525, %r28520;
+    shf.l.wrap.b32     %r28527, %r28526, %r28526, 16;
+    add.s32     %r28528, %r28527, %r28507;
+    xor.b32     %r28529, %r28528, %r28495;
+    shf.l.wrap.b32     %r28530, %r28529, %r28529, 20;
+    add.s32     %r28531, %r28525, %r27734;
+    add.s32     %r28532, %r28531, %r28530;
+    xor.b32     %r28533, %r28532, %r28527;
+    shf.l.wrap.b32     %r28534, %r28533, %r28533, 24;
+    add.s32     %r28535, %r28534, %r28528;
+    xor.b32     %r28536, %r28535, %r28530;
+    shf.l.wrap.b32     %r28537, %r28536, %r28536, 25;
+    add.s32     %r28538, %r28509, %r27646;
+    add.s32     %r28539, %r28538, %r28490;
+    xor.b32     %r28540, %r28478, %r28539;
+    shf.l.wrap.b32     %r28541, %r28540, %r28540, 16;
+    add.s32     %r28542, %r28541, %r28521;
+    xor.b32     %r28543, %r28542, %r28509;
+    shf.l.wrap.b32     %r28544, %r28543, %r28543, 20;
+    add.s32     %r28545, %r28539, %r27756;
+    add.s32     %r28546, %r28545, %r28544;
+    xor.b32     %r28547, %r28546, %r28541;
+    shf.l.wrap.b32     %r28548, %r28547, %r28547, 24;
+    add.s32     %r28549, %r28548, %r28542;
+    xor.b32     %r28550, %r28549, %r28544;
+    shf.l.wrap.b32     %r28551, %r28550, %r28550, 25;
+    add.s32     %r28552, %r28504, %r27657;
+    add.s32     %r28553, %r28552, %r28523;
+    xor.b32     %r28554, %r28492, %r28553;
+    shf.l.wrap.b32     %r28555, %r28554, %r28554, 16;
+    add.s32     %r28556, %r28555, %r28479;
+    xor.b32     %r28557, %r28556, %r28523;
+    shf.l.wrap.b32     %r28558, %r28557, %r28557, 20;
+    add.s32     %r28559, %r28553, %r27668;
+    add.s32     %r28560, %r28559, %r28558;
+    xor.b32     %r28561, %r28560, %r28555;
+    shf.l.wrap.b32     %r28562, %r28561, %r28561, 24;
+    add.s32     %r28563, %r28562, %r28556;
+    xor.b32     %r28564, %r28563, %r28558;
+    shf.l.wrap.b32     %r28565, %r28564, %r28564, 25;
+    add.s32     %r28566, %r28518, %r27701;
+    add.s32     %r28567, %r28566, %r28481;
+    xor.b32     %r28568, %r28567, %r28506;
+    shf.l.wrap.b32     %r28569, %r28568, %r28568, 16;
+    add.s32     %r28570, %r28569, %r28493;
+    xor.b32     %r28571, %r28570, %r28481;
+    shf.l.wrap.b32     %r28572, %r28571, %r28571, 20;
+    add.s32     %r28573, %r28567, %r27767;
+    add.s32     %r28574, %r28573, %r28572;
+    xor.b32     %r28575, %r28574, %r28569;
+    shf.l.wrap.b32     %r28576, %r28575, %r28575, 24;
+    add.s32     %r28577, %r28576, %r28570;
+    xor.b32     %r28578, %r28577, %r28572;
+    shf.l.wrap.b32     %r28579, %r28578, %r28578, 25;
+    xor.b32     %r28580, %r28532, %r28563;
+    cvt.u64.u32     %rd342, %r28580;
+    xor.b32     %r28581, %r28577, %r28546;
+    and.b32     %r28582, %r28581, 255;
+    cvt.u64.u32     %rd1147, %r28582;
+    cvt.u64.u32     %rd1148, %r28581;
+    shl.b64     %rd1149, %rd1148, 32;
+    and.b64     %rd1150, %rd1149, 280375465082880;
+    and.b64     %rd1151, %rd1149, 71776119061217280;
+    shr.u32     %r28583, %r28581, 24;
+    cvt.u64.u32     %rd1152, %r28583;
+    shl.b64     %rd1153, %rd1152, 56;
+    bfi.b64     %rd1154, %rd1147, %rd342, 32, 32;
+    or.b64      %rd1155, %rd1154, %rd1150;
+    or.b64      %rd1156, %rd1155, %rd1151;
+    or.b64      %rd341, %rd1156, %rd1153;
+    xor.b32     %r28584, %r28535, %r28560;
+    cvt.u64.u32     %rd1157, %r28584;
+    xor.b32     %r28585, %r28574, %r28549;
+    and.b32     %r28586, %r28585, 255;
+    cvt.u64.u32     %rd1158, %r28586;
+    cvt.u64.u32     %rd1159, %r28585;
+    shl.b64     %rd1160, %rd1159, 32;
+    and.b64     %rd1161, %rd1160, 280375465082880;
+    and.b64     %rd1162, %rd1160, 71776119061217280;
+    shr.u32     %r28587, %r28585, 24;
+    cvt.u64.u32     %rd1163, %r28587;
+    shl.b64     %rd1164, %rd1163, 56;
+    bfi.b64     %rd1165, %rd1158, %rd1157, 32, 32;
+    or.b64      %rd1166, %rd1165, %rd1161;
+    or.b64      %rd1167, %rd1166, %rd1162;
+    or.b64      %rd345, %rd1167, %rd1164;
+    xor.b32     %r28588, %r28579, %r28548;
+    cvt.u64.u32     %rd1168, %r28588;
+    xor.b32     %r28589, %r28537, %r28562;
+    and.b32     %r28590, %r28589, 255;
+    cvt.u64.u32     %rd1169, %r28590;
+    cvt.u64.u32     %rd1170, %r28589;
+    shl.b64     %rd1171, %rd1170, 32;
+    and.b64     %rd1172, %rd1171, 280375465082880;
+    and.b64     %rd1173, %rd1171, 71776119061217280;
+    shr.u32     %r28591, %r28589, 24;
+    cvt.u64.u32     %rd1174, %r28591;
+    shl.b64     %rd1175, %rd1174, 56;
+    bfi.b64     %rd1176, %rd1169, %rd1168, 32, 32;
+    or.b64      %rd1177, %rd1176, %rd1172;
+    or.b64      %rd1178, %rd1177, %rd1173;
+    or.b64      %rd1280, %rd1178, %rd1175;
+    xor.b32     %r28592, %r28576, %r28551;
+    cvt.u64.u32     %rd1179, %r28592;
+    xor.b32     %r28593, %r28534, %r28565;
+    and.b32     %r28594, %r28593, 255;
+    cvt.u64.u32     %rd1180, %r28594;
+    cvt.u64.u32     %rd1181, %r28593;
+    shl.b64     %rd1182, %rd1181, 32;
+    and.b64     %rd1183, %rd1182, 280375465082880;
+    and.b64     %rd1184, %rd1182, 71776119061217280;
+    shr.u32     %r28595, %r28593, 24;
+    cvt.u64.u32     %rd1185, %r28595;
+    shl.b64     %rd1186, %rd1185, 56;
+    bfi.b64     %rd1187, %rd1180, %rd1179, 32, 32;
+    or.b64      %rd1188, %rd1187, %rd1183;
+    or.b64      %rd1189, %rd1188, %rd1184;
+    or.b64      %rd1279, %rd1189, %rd1186;
+
+$L__BB2_104:
+    ld.const.u64     %rd346, [target+24];
+    setp.eq.s64     %p59, %rd1279, %rd346;
+    @%p59 bra     $L__BB2_106;
+    bra.uni     $L__BB2_105;
+
+$L__BB2_106:
+    ld.const.u64     %rd347, [target+16];
+    setp.eq.s64     %p60, %rd1280, %rd347;
+    @%p60 bra     $L__BB2_108;
+    bra.uni     $L__BB2_107;
+
+$L__BB2_108:
+    ld.const.u64     %rd348, [target+8];
+    setp.eq.s64     %p61, %rd345, %rd348;
+    @%p61 bra     $L__BB2_110;
+    bra.uni     $L__BB2_109;
+
+$L__BB2_110:
+    and.b64     %rd1234, %rd342, 255;
+    and.b64     %rd1235, %rd341, -256;
+    or.b64      %rd1236, %rd1235, %rd1234;
+    ld.const.u64     %rd1237, [target];
+    setp.lt.u64     %p63, %rd1236, %rd1237;
+    bra.uni     $L__BB2_111;
+
+$L__BB2_105:
+    setp.lt.u64     %p63, %rd1279, %rd346;
+    bra.uni     $L__BB2_111;
+
+$L__BB2_107:
+    setp.lt.u64     %p63, %rd1280, %rd347;
+    bra.uni     $L__BB2_111;
+
+$L__BB2_109:
+    setp.lt.u64     %p63, %rd345, %rd348;
+
+$L__BB2_111:
+    not.pred     %p62, %p63;
+    @%p62 bra     $L__BB2_113;
+
+    ld.param.u64     %rd1247, [heavy_hash_param_0];
+    ld.param.u64     %rd1246, [heavy_hash_param_1];
+    and.b64     %rd1245, %rd1255, %rd1247;
+    or.b64      %rd1244, %rd1245, %rd1246;
+    ld.param.u64     %rd1243, [heavy_hash_param_5];
+    cvta.to.global.u64     %rd1238, %rd1243;
+    mov.u64     %rd1239, 0;
+    atom.global.cas.b64     %rd1240, [%rd1238], %rd1239, %rd1244;
+
+$L__BB2_113:
+    ret;
+
+}
diff --git a/plugins/cuda/src/worker.rs b/plugins/cuda/src/worker.rs
index 9f7d772..8d60b19 100644
--- a/plugins/cuda/src/worker.rs
+++ b/plugins/cuda/src/worker.rs
@@ -4,12 +4,15 @@ use cust::device::DeviceAttribute;
 use cust::function::Function;
 use cust::module::{ModuleJitOption, OptLevel};
 use cust::prelude::*;
+use cust::memory::DeviceCopy;
 use karlsen_miner::xoshiro256starstar::Xoshiro256StarStar;
 use karlsen_miner::Worker;
 use log::{error, info};
 use rand::{Fill, RngCore};
 use std::ffi::CString;
 use std::sync::{Arc, Weak};
+use tiny_keccak::Hasher;
+use std::ops::BitXor;
 
 static BPS: f32 = 1.;
 
@@ -51,6 +54,202 @@ impl<'kernel> Kernel<'kernel> {
     }
 }
 
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union hash256 {
+    pub word64s: [u64; 4usize],
+    pub word32s: [u32; 8usize],
+    pub bytes: [u8; 32usize],
+    pub str_: [::std::os::raw::c_char; 32usize],
+}
+
+/*
+impl hash256 {
+    pub fn new() -> hash256 {
+        // Initialize the union with the bytes field set to zero.
+        unsafe { std::mem::transmute([0u8; 32usize]) }
+    }
+}
+*/
+
+/*
+impl hash256 {
+    // A constant function to create a new hash256 from a byte array.
+    const fn new(bytes: [u8; 32]) -> hash256 {
+        let mut union = std::mem::MaybeUninit::<hash256>::uninit();
+        unsafe {
+            std::ptr::write(&mut union as *mut _ as *mut [u8; 32], bytes);
+            union.assume_init()
+        }
+    }
+}
+*/
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union hash512 {
+    pub word64s: [u64; 8usize],
+    pub word32s: [u32; 16usize],
+    pub bytes: [u8; 64usize],
+    pub str_: [::std::os::raw::c_char; 64usize],
+}
+
+impl hash512 {
+    pub fn new() -> hash512 {
+        // Initialize the union through one of its fields; here we choose bytes.
+        // Rust does not allow zero-initializing a union directly, so we transmute a zeroed array.
+        unsafe { std::mem::transmute([0u8; 64usize]) }
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union hash1024 {
+    pub hash512s: [hash512; 2usize],
+    pub word64s: [u64; 16usize],
+    pub word32s: [u32; 32usize],
+    pub bytes: [u8; 128usize],
+    pub str_: [::std::os::raw::c_char; 128usize],
+}
+
+impl hash1024 {
+    pub fn new() -> hash1024 {
+        // Initialize the union with the bytes field set to zero.
+        unsafe { std::mem::transmute([0u8; 128usize]) }
+    }
+}
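These `hash256`/`hash512`/`hash1024` unions mirror FishHash's C-style hash types: the same bytes can be viewed as 64-bit words, 32-bit words, or raw bytes. Every read of a union field is `unsafe`, since the compiler cannot track which view was last written. A minimal standalone sketch of that access pattern (illustrative only, using a trimmed two-view union, not a type from this patch):

    #[repr(C)]
    #[derive(Copy, Clone)]
    union TwoViews {
        word64s: [u64; 8],
        bytes: [u8; 64],
    }

    fn main() {
        // Initialize through the bytes view...
        let h = TwoViews { bytes: [0u8; 64] };
        // ...then read the word view; sound here because both views
        // cover the same 64 bytes and zero is a valid u64 pattern.
        assert_eq!(unsafe { h.word64s[0] }, 0);
    }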
+
+const SIZE_U32: usize = std::mem::size_of::<u32>();
+const SIZE_U64: usize = std::mem::size_of::<u64>();
+
+pub trait HashData {
+    fn new() -> Self;
+    fn as_bytes(&self) -> &[u8];
+    fn as_bytes_mut(&mut self) -> &mut [u8];
+
+    fn get_as_u32(&self, index: usize) -> u32 {
+        u32::from_le_bytes(
+            self.as_bytes()[index * SIZE_U32..index * SIZE_U32 + SIZE_U32]
+                .try_into()
+                .unwrap(),
+        )
+    }
+
+    fn set_as_u32(&mut self, index: usize, value: u32) {
+        self.as_bytes_mut()[index * SIZE_U32..index * SIZE_U32 + SIZE_U32]
+            .copy_from_slice(&value.to_le_bytes())
+    }
+
+    fn get_as_u64(&self, index: usize) -> u64 {
+        u64::from_le_bytes(
+            self.as_bytes()[index * SIZE_U64..index * SIZE_U64 + SIZE_U64]
+                .try_into()
+                .unwrap(),
+        )
+    }
+
+    fn set_as_u64(&mut self, index: usize, value: u64) {
+        self.as_bytes_mut()[index * SIZE_U64..index * SIZE_U64 + SIZE_U64]
+            .copy_from_slice(&value.to_le_bytes())
+    }
+}
+
+#[derive(Debug)]
+pub struct Hash256([u8; 32]);
+
+impl HashData for Hash256 {
+    fn new() -> Self {
+        Self([0; 32])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+#[repr(C)]
+#[derive(Clone, Copy, Debug, DeviceCopy)]
+pub struct Hash512([u8; 64]);
+
+impl HashData for Hash512 {
+    fn new() -> Self {
+        Self([0; 64])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+impl BitXor<&Hash512> for &Hash512 {
+    type Output = Hash512;
+
+    fn bitxor(self, rhs: &Hash512) -> Self::Output {
+        let mut hash = Hash512::new();
+
+        for i in 0..64 {
+            hash.0[i] = self.0[i] ^ rhs.0[i];
+        }
+
+        hash
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug, DeviceCopy)]
+pub struct Hash1024([u8; 128]);
+
+impl HashData for Hash1024 {
+    fn new() -> Self {
+        Self([0; 128])
+    }
+
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    fn as_bytes_mut(&mut self) -> &mut [u8] {
+        &mut self.0
+    }
+}
+
+impl Hash1024 {
+    fn from_512s(first: &Hash512, second: &Hash512) -> Self {
+        let mut hash = Self::new();
+        let (first_half, second_half) = hash.0.split_at_mut(first.0.len());
+        first_half.copy_from_slice(&first.0);
+        second_half.copy_from_slice(&second.0);
+
+        hash
+    }
+}
+
+const FNV_PRIME: u32 = 0x01000193;
+const FULL_DATASET_ITEM_PARENTS: u32 = 512;
+const NUM_DATASET_ACCESSES: i32 = 32;
+const LIGHT_CACHE_ROUNDS: i32 = 3;
+
+const LIGHT_CACHE_NUM_ITEMS: u32 = 1179641;
+const FULL_DATASET_NUM_ITEMS: u32 = 37748717;
+const SEED: Hash256 = Hash256([
+    0xeb, 0x01, 0x63, 0xae, 0xf2, 0xab, 0x1c, 0x5a, 0x66, 0x31, 0x0c, 0x1c, 0x14, 0xd6, 0x0f, 0x42,
+    0x55, 0xa9, 0xb3, 0x9b, 0x0e, 0xdf, 0x26, 0x53, 0x98, 0x44, 0xf1, 0x17, 0xad, 0x67, 0x21, 0x19,
+]);
+/*
+const SEED: hash256 = hash256::new([
+    0xeb, 0x01, 0x63, 0xae, 0xf2, 0xab, 0x1c, 0x5a, 0x66, 0x31, 0x0c, 0x1c, 0x14, 0xd6, 0x0f, 0x42,
+    0x55, 0xa9, 0xb3, 0x9b, 0x0e, 0xdf, 0x26, 0x53, 0x98, 0x44, 0xf1, 0x17, 0xad, 0x67, 0x21, 0x19,
+]);
+*/
+
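The `HashData` trait gives the plain-struct `Hash256`/`Hash512`/`Hash1024` types word-level accessors without a union: word `index` lives at byte offset `index * word_size`, packed little-endian. A standalone round-trip sketch of that packing (the buffer and values here are arbitrary, not from the patch):

    fn main() {
        let mut bytes = [0u8; 64];
        // Equivalent of set_as_u32(3, v): write v little-endian at byte offset 3 * 4.
        let v: u32 = 0xdead_beef;
        bytes[12..16].copy_from_slice(&v.to_le_bytes());
        // Equivalent of get_as_u32(3): read the same word back.
        let r = u32::from_le_bytes(bytes[12..16].try_into().unwrap());
        assert_eq!(r, v);
    }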
 pub struct CudaGPUWorker<'gpu> {
     // NOTE: The order is important! context must be closed last
     heavy_hash_kernel: Kernel<'gpu>,
@@ -62,33 +261,68 @@ pub struct CudaGPUWorker<'gpu> {
     rand_state: DeviceBuffer<u64>,
     final_nonce_buff: DeviceBuffer<u64>,
+    cache2: DeviceBuffer<Hash512>,
+    dataset2: DeviceBuffer<Hash1024>,
+    //cache2_ptr: DevicePointer<Hash512>,
+    //dataset2_ptr: DevicePointer<Hash1024>,
+
     device_id: u32,
     pub workload: usize,
     _context: Context,
     random: NonceGenEnum,
+
+    //pub full_dataset: *mut Hash1024,
+    //pub light_cache: *mut Hash512,
 }
 
 impl<'gpu> Worker for CudaGPUWorker<'gpu> {
+
     fn id(&self) -> String {
         let device = CurrentContext::get_device().unwrap();
         format!("#{} ({})", self.device_id, device.name().unwrap())
     }
 
     fn load_block_constants(&mut self, hash_header: &[u8; 72], matrix: &[[u16; 64]; 64], target: &[u64; 4]) {
+        //info!("load_block_constants: debug1 ");
         let u8matrix: Arc<[[u8; 64]; 64]> = Arc::new(matrix.map(|row| row.map(|v| v as u8)));
+        //info!("load_block_constants: debug2 ");
         let mut hash_header_gpu = self._module.get_global::<[u8; 72]>(&CString::new("hash_header").unwrap()).unwrap();
+        //info!("load_block_constants: debug3 ");
         hash_header_gpu.copy_from(hash_header).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug4 ");
         let mut matrix_gpu = self._module.get_global::<[[u8; 64]; 64]>(&CString::new("matrix").unwrap()).unwrap();
+        //info!("load_block_constants: debug5 ");
         matrix_gpu.copy_from(&u8matrix).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug6 ");
         let mut target_gpu = self._module.get_global::<[u64; 4]>(&CString::new("target").unwrap()).unwrap();
+        //info!("load_block_constants: debug7 ");
         target_gpu.copy_from(target).map_err(|e| e.to_string()).unwrap();
+        //info!("load_block_constants: debug8 ");
+
+        //let mut data = DeviceBuffer::from_slice(&vec![hash1024 { bytes: [0; 128] }; FULL_DATASET_NUM_ITEMS]);
+
+        //let u8cache: Arc<[u8; 10]> = Arc::new([0; 10]);
+        //let mut data = DeviceBuffer::from_slice(&vec![hash512 { bytes: [0; 64] }; LIGHT_CACHE_NUM_ITEMS]);
+        //self.cache = DeviceBuffer::from_slice(&vec![hash512 { bytes: [0; 64] }; LIGHT_CACHE_NUM_ITEMS]);
+        /*
+        info!("load_block_constants: debug8.1 ");
+        let mut cache_gpu = self._module.get_global::<[DeviceBuffer; LIGHT_CACHE_NUM_ITEMS]>(&CString::new("cache_test").unwrap()).unwrap();
+        info!("load_block_constants: debug9 ");
+        cache_gpu.copy_from(&data).map_err(|e| e.to_string()).unwrap();
+        info!("load_block_constants: debug10 ");
+        */
     }
 
     #[inline(always)]
     fn calculate_hash(&mut self, _nonces: Option<&Vec<u64>>, nonce_mask: u64, nonce_fixed: u64) {
+        //info!("calculate_hash: debug1 ");
         let func = &self.heavy_hash_kernel.func;
         let stream = &self.stream;
         let random: u8 = match self.random {
@@ -99,23 +333,39 @@ impl<'gpu> Worker for CudaGPUWorker<'gpu> {
             NonceGenEnum::Xoshiro => 1,
         };
 
+        //self.light_cache = vec![Hash512::new(); LIGHT_CACHE_NUM_ITEMS as usize].into_boxed_slice();
+        //self.full_dataset = vec![Hash1024::new(); FULL_DATASET_NUM_ITEMS as usize].into_boxed_slice();
+
+        //info!("calculate_hash: debug2 ");
         self.start_event.record(stream).unwrap();
+        //info!("calculate_hash: debug3 cache size : {}", self.cache2.len());
+        //info!("calculate_hash: debug3 dataset size : {}", self.dataset2.len());
+
+        //info!("calculate_hash: debug3 dataset[10] : {:?}", self.dataset.index(10));
         unsafe {
             launch!(
                 func<<<
                    self.heavy_hash_kernel.grid_size, self.heavy_hash_kernel.block_size,
                    0, stream
                 >>>(
-                    nonce_mask, nonce_fixed,
+                    nonce_mask,
+                    nonce_fixed,
                     self.workload,
                     random,
                     self.rand_state.as_device_ptr(),
-                    self.final_nonce_buff.as_device_ptr()
+                    self.final_nonce_buff.as_device_ptr(),
+                    self.cache2.as_device_ptr(),
+                    self.dataset2.as_device_ptr(),
+                    //self.cache2_ptr.as_raw(),
+                    //self.dataset2_ptr.as_raw(),
                 )
             )
            .unwrap(); // We see errors in sync
         }
+        //info!("calculate_hash: debug4 ");
         self.stop_event.record(stream).unwrap();
+        //info!("calculate_hash: debug5 ");
     }
 
     #[inline(always)]
@@ -139,7 +389,176 @@ impl<'gpu> CudaGPUWorker<'gpu> {
     }
 }
 
+
+pub fn keccak_in_place(data: &mut [u8]) {
+    let mut hasher = tiny_keccak::Keccak::v512();
+    hasher.update(data);
+    hasher.finalize(data);
+}
+
+pub fn keccak(out: &mut [u8], data: &[u8]) {
+    let mut hasher = tiny_keccak::Keccak::v512();
+    hasher.update(data);
+    hasher.finalize(out);
+}
+
+fn xor_hash512(a: hash512, b: hash512) -> hash512 {
+    unsafe {
+        let mut result = hash512 { word64s: [0u64; 8] };
+        for i in 0..8 {
+            result.word64s[i] = a.word64s[i] ^ b.word64s[i];
+        }
+        result
+    }
+}
+
+fn build_light_cache(cache: &mut [Hash512]) {
+    let mut item: Hash512 = Hash512::new();
+    keccak(&mut item.0, &SEED.0);
+    cache[0] = item;
+
+    for cache_item in cache
+        .iter_mut()
+        .take(LIGHT_CACHE_NUM_ITEMS as usize)
+        .skip(1)
+    {
+        keccak_in_place(&mut item.0);
+        *cache_item = item;
+    }
+
+    for _ in 0..LIGHT_CACHE_ROUNDS {
+        for i in 0..LIGHT_CACHE_NUM_ITEMS {
+            // First index: 4 first bytes of the item as a little-endian integer
+            let t: u32 = cache[i as usize].get_as_u32(0);
+            let v: u32 = t % LIGHT_CACHE_NUM_ITEMS;
+
+            // Second index
+            let w: u32 =
+                (LIGHT_CACHE_NUM_ITEMS.wrapping_add(i.wrapping_sub(1))) % LIGHT_CACHE_NUM_ITEMS;
+
+            let x = &cache[v as usize] ^ &cache[w as usize];
+            keccak(&mut cache[i as usize].0, &x.0);
+        }
+    }
+}
+
+fn prebuild_dataset(full_dataset: &mut Box<[Hash1024]>, light_cache: &[Hash512], num_threads: usize) {
+    //let full_dataset = full_dataset_opt.as_mut().unwrap();
+
+    if num_threads > 1 {
+        std::thread::scope(|scope| {
+            let mut threads = Vec::with_capacity(num_threads);
+
+            let light_cache_slice = &light_cache[0..];
+            let batch_size = full_dataset.len() / num_threads;
+            let chunks = full_dataset.chunks_mut(batch_size);
+
+            for (index, chunk) in chunks.enumerate() {
+                let start = index * batch_size;
+
+                let thread_handle =
+                    scope.spawn(move || build_dataset_segment(chunk, light_cache_slice, start));
+                threads.push(thread_handle);
+            }
+
+            for handle in threads {
+                handle.join().unwrap();
+            }
+        });
+    } else {
+        build_dataset_segment(&mut full_dataset[0..], light_cache, 0);
+    }
+}
+
+fn build_dataset_segment(dataset_slice: &mut [Hash1024], light_cache: &[Hash512], offset: usize) {
+    for (index, item) in dataset_slice.iter_mut().enumerate() {
+        *item = calculate_dataset_item_1024(light_cache, offset + index);
+    }
+}
+
+fn fnv1(u: u32, v: u32) -> u32 {
+    // FNV-1 is defined modulo 2^32; wrapping_mul makes that explicit
+    // (a plain `*` would panic on overflow in debug builds).
+    u.wrapping_mul(FNV_PRIME) ^ v
+}
+
+fn fnv1_512(u: Hash512, v: Hash512) -> Hash512 {
+    let mut r = Hash512::new();
+
+    for i in 0..r.0.len() / SIZE_U32 {
+        r.set_as_u32(i, fnv1(u.get_as_u32(i), v.get_as_u32(i)));
+    }
+
+    r
+}
+
+fn calculate_dataset_item_1024(light_cache: &[Hash512], index: usize) -> Hash1024 {
+    let seed0 = (index * 2) as u32;
+    let seed1 = seed0 + 1;
+
+    let mut mix0 = light_cache[(seed0 % LIGHT_CACHE_NUM_ITEMS) as usize];
+    let mut mix1 = light_cache[(seed1 % LIGHT_CACHE_NUM_ITEMS) as usize];
+
+    let mix0_seed = mix0.get_as_u32(0) ^ seed0;
+    let mix1_seed = mix1.get_as_u32(0) ^ seed1;
+
+    mix0.set_as_u32(0, mix0_seed);
+    mix1.set_as_u32(0, mix1_seed);
+
+    keccak_in_place(&mut mix0.0);
+    keccak_in_place(&mut mix1.0);
+
+    let num_words: u32 = (std::mem::size_of_val(&mix0) / SIZE_U32) as u32;
+
+    for j in 0..FULL_DATASET_ITEM_PARENTS {
+        let t0 = fnv1(seed0 ^ j, mix0.get_as_u32((j % num_words) as usize));
+        let t1 = fnv1(seed1 ^ j, mix1.get_as_u32((j % num_words) as usize));
+        mix0 = fnv1_512(mix0, light_cache[(t0 % LIGHT_CACHE_NUM_ITEMS) as usize]);
+        mix1 = fnv1_512(mix1, light_cache[(t1 % LIGHT_CACHE_NUM_ITEMS) as usize]);
+    }
+
+    keccak_in_place(&mut mix0.0);
+    keccak_in_place(&mut mix1.0);
+
+    Hash1024::from_512s(&mix0, &mix1)
+}
+
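`fnv1` above is the ethash-style FNV-1 combine and is meant to wrap modulo 2^32, hence the `wrapping_mul`: with a plain `*`, large inputs would abort a debug build with an overflow panic (release builds wrap silently). A quick self-contained check of that behavior:

    fn main() {
        const FNV_PRIME: u32 = 0x0100_0193;
        // fnv1 as defined above, inlined here for a standalone check.
        let fnv1 = |u: u32, v: u32| u.wrapping_mul(FNV_PRIME) ^ v;
        assert_eq!(fnv1(1, 0), FNV_PRIME);
        // Wraps instead of panicking on a large multiplicand.
        assert_eq!(fnv1(0xffff_ffff, 0), 0xffff_ffffu32.wrapping_mul(FNV_PRIME));
    }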
 
 impl<'gpu> CudaGPUWorker<'gpu> {
+
+    /*
+    pub fn build_light_cache(cache: &mut [hash512]) {
+        let mut item: hash512 = hash512::new();
+        keccak(&mut item.bytes, &SEED.bytes);
+        cache[0] = item;
+
+        for cache_item in cache
+            .iter_mut()
+            .take(LIGHT_CACHE_NUM_ITEMS as usize)
+            .skip(1)
+        {
+            keccak_in_place(&mut item.bytes);
+            *cache_item = item;
+        }
+
+        for _ in 0..LIGHT_CACHE_ROUNDS {
+            for i in 0..LIGHT_CACHE_NUM_ITEMS {
+                // First index: 4 first bytes of the item as a little-endian integer
+                let t: u32 = cache[i as usize].word32s[0];
+                let v: u32 = t % LIGHT_CACHE_NUM_ITEMS;
+
+                // Second index
+                let w: u32 =
+                    (LIGHT_CACHE_NUM_ITEMS.wrapping_add(i.wrapping_sub(1))) % LIGHT_CACHE_NUM_ITEMS;
+
+                //let x = &cache[v as usize].bytes ^ &cache[w as usize].bytes;
+                let x = xor_hash512(cache[v as usize], cache[w as usize]);
+                keccak(&mut cache[i as usize].bytes, &x.bytes);
+            }
+        }
+    }
+    */
+
     pub fn new(
         device_id: u32,
         workload: f32,
@@ -156,6 +575,40 @@ impl<'gpu> CudaGPUWorker<'gpu> {
         let _context = Context::new(device)?;
         _context.set_flags(sync_flag)?;
 
+        //let _cache = DeviceBuffer::::zeroed(FULL_DATASET_NUM_ITEMS)?;
+        //let mut cache = DeviceBuffer::from_slice(&vec![Hash512::new(); LIGHT_CACHE_NUM_ITEMS as usize])?;
+
+        //let cache = DeviceBuffer::from_slice(&vec![hash512 { bytes: [0; 64] }; LIGHT_CACHE_NUM_ITEMS as usize])?;
+        //info!(">>>>>>>>>>>>>>>>>>>>>>>>> cache size : {}", cache.len());
+        //let dataset = DeviceBuffer::from_slice(&vec![hash1024 { bytes: [0; 128] }; FULL_DATASET_NUM_ITEMS as usize])?;
+        //let mut dataset = DeviceBuffer::from_slice(&vec![Hash1024::new(); FULL_DATASET_NUM_ITEMS as usize])?;
+
+        //info!(">>>>>>>>>>>>>>>>>>>>>>>>> dataset size : {}", dataset.len());
+
+        let mut light_cache =
+            vec![Hash512::new(); LIGHT_CACHE_NUM_ITEMS as usize].into_boxed_slice();
+        build_light_cache(&mut light_cache);
+        //cache.copy_from(&light_cache)?;
+        let cache2 = DeviceBuffer::from_slice(&light_cache)?;
+        // Get the device pointer
+        //let cache2_ptr: DevicePointer<Hash512> = cache2.as_device_ptr();
+        // Convert the device pointer to a raw pointer for the FFI call
+        //let raw_device_ptr = device_ptr.as_raw();
+
+        let mut full_dataset =
+            Some(vec![Hash1024::new(); FULL_DATASET_NUM_ITEMS as usize].into_boxed_slice());
+        let full_dataset_unwrapped = full_dataset.as_mut().unwrap();
+        //build_dataset_segment(&mut full_dataset_unwrapped[0..], &light_cache, 0);
+        prebuild_dataset(full_dataset_unwrapped, &light_cache, 8);
+
+        info!("dataset[10] : {:?}", full_dataset_unwrapped[10].as_bytes());
+        info!("dataset[42] : {:?}", full_dataset_unwrapped[42].as_bytes());
+        info!("dataset[12345] : {:?}", full_dataset_unwrapped[12345].as_bytes());
+
+        //dataset.copy_from(&full_dataset_unwrapped)?;
+        let dataset2 = DeviceBuffer::from_slice(full_dataset_unwrapped)?;
+        //let dataset2_ptr: DevicePointer<Hash1024> = dataset2.as_device_ptr();
+
         let major = device.get_attribute(DeviceAttribute::ComputeCapabilityMajor)?;
         let minor = device.get_attribute(DeviceAttribute::ComputeCapabilityMinor)?;
         let _module: Arc<Module>;
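One sizing note on the host-side prebuild above: with the constants from this patch, the light cache is 1,179,641 × 64 B ≈ 75 MB and the full dataset is 37,748,717 × 128 B ≈ 4.8 GB, so both host RAM and GPU memory need roughly 4.8 GB free before `DeviceBuffer::from_slice` can succeed. The arithmetic, as a standalone sketch:

    fn main() {
        const LIGHT_CACHE_NUM_ITEMS: u64 = 1_179_641; // Hash512 items, 64 B each
        const FULL_DATASET_NUM_ITEMS: u64 = 37_748_717; // Hash1024 items, 128 B each
        let cache_bytes = LIGHT_CACHE_NUM_ITEMS * 64;
        let dataset_bytes = FULL_DATASET_NUM_ITEMS * 128;
        println!("light cache: {} bytes (~{} MB)", cache_bytes, cache_bytes / 1_000_000); // ~75 MB
        println!("full dataset: {} bytes (~{} MB)", dataset_bytes, dataset_bytes / 1_000_000); // ~4831 MB
    }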
@@ -244,8 +697,11 @@ impl<'gpu> CudaGPUWorker<'gpu> {
             stream,
             rand_state,
             final_nonce_buff,
+            cache2,
+            dataset2,
             heavy_hash_kernel,
             random,
+
         })
     }
 }
diff --git a/run_miner.sh b/run_miner.sh
new file mode 100644
index 0000000..2cc2656
--- /dev/null
+++ b/run_miner.sh
@@ -0,0 +1 @@
+./target/release/karlsen-miner.exe -s 149.202.82.76 --testnet --mining-address karlsentest:qrxuvenk483jj5k5zpwgdqyk27eacsgv9fj3kwu6puj38usnaj9uu55cz0y8q
diff --git a/src/cli.rs b/src/cli.rs
index 3257f30..8cb3066 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -16,7 +16,7 @@ pub struct Opt {
     #[clap(long = "devfund-percent", help = "The percentage of blocks to send to the devfund (minimum 0%)", default_value = "0", parse(try_from_str = parse_devfund_percent))]
     pub devfund_percent: u16,
 
-    #[clap(short, long, help = "karlsend port [default: Mainnet = 42110, Testnet = 16211]")]
+    #[clap(short, long, help = "karlsend port [default: Mainnet = 42110, Testnet = 42210]")]
     port: Option<u16>,
 
     #[clap(long, help = "Use testnet instead of mainnet [default: false]")]
@@ -100,7 +100,7 @@ impl Opt {
     }
 
     fn port(&mut self) -> u16 {
-        *self.port.get_or_insert(if self.testnet { 16211 } else { 42110 })
+        *self.port.get_or_insert(if self.testnet { 42210 } else { 42110 })
     }
 
     pub fn log_level(&self) -> LevelFilter {
diff --git a/src/pow.rs b/src/pow.rs
index f89c559..5c25344 100644
--- a/src/pow.rs
+++ b/src/pow.rs
@@ -164,11 +164,13 @@ impl State {
     }
 
     pub fn load_to_gpu(&self, gpu_work: &mut dyn Worker) {
+        info!("load_to_gpu: debug1 ");
         gpu_work.load_block_constants(&self.pow_hash_header, &self.matrix.0, &self.target.0);
     }
 
     #[inline(always)]
     pub fn pow_gpu(&self, gpu_work: &mut dyn Worker) {
+        info!("pow_gpu: debug1 ");
         gpu_work.calculate_hash(None, self.nonce_mask, self.nonce_fixed);
     }
 }
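For reference, the port-selection behavior after the cli.rs change above, condensed into a hypothetical free function mirroring `Opt::port` (names here are illustrative, not part of the patch):

    fn effective_port(explicit: Option<u16>, testnet: bool) -> u16 {
        // An explicit -p/--port always wins; otherwise use the new defaults.
        explicit.unwrap_or(if testnet { 42210 } else { 42110 })
    }

    fn main() {
        assert_eq!(effective_port(None, false), 42110); // mainnet default
        assert_eq!(effective_port(None, true), 42210); // testnet default
        assert_eq!(effective_port(Some(16110), false), 16110); // override kept
    }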