//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Wed Jul 10 12:41:20 2013 (1373485280)
// Cuda compilation tools, release 5.5, V5.5.0
//

.version 3.2
.target sm_30
.address_size 64

	.file	1 "D:/singlebarrel/releases/2014.03/shared/adobe/MediaCore/GPUFoundation/Src/ImageProcessing/Accumulate.cu", 1399785310, 3950
	.file	2 "D:\\singlebarrel\\releases\\2014.03\\shared\\adobe\\MediaCore\\GPUFoundation\\API\\Inc\\GPUFoundation/KernelSupport/PixelUtils.h", 1399785310, 5707
	.file	3 "d:\\singlebarrel\\releases\\2014.03\\shared\\adobe\\mediacore\\external\\3rdparty\\nvidia\\cuda\\win\\include\\device_functions.h", 1399785281, 191626
// Zero-terminated byte string in global memory; the values are the ASCII
// codes for "__CUDA_FTZ" (95='_', 67='C', 85='U', 68='D', 65='A', 70='F',
// 84='T', 90='Z'), followed by the terminating 0.
.global .align 1 .b8 $str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

//
// UnpremultiplyPixel(PixelRGB) -> PixelRGB
//
// Takes a 16-byte struct by value (four f32 components at byte offsets
// 0, 4, 8 and 12) and returns a struct of the same layout.
//
// The component at offset 12 is clamped to [0, 1] and serves as the divisor
// (presumably alpha, judging by the function name -- confirm against
// PixelUtils.h).
//   * When the clamped divisor is greater than 8e-6 (the constant
//     0fB70637BD == -8e-6 is added before the comparison with 0), the three
//     leading components are scaled by an approximate reciprocal of the
//     divisor and the clamped divisor itself is returned at offset 12.
//   * Otherwise all four returned components are 0.
//
.visible .func  (.param .align 16 .b8 func_retval0[16]) _Z18UnpremultiplyPixel8PixelRGB(
	.param .align 16 .b8 _Z18UnpremultiplyPixel8PixelRGB_param_0[16]
)
{
	.reg .pred 	%divisible;
	.reg .f32 	%in0, %in1, %in2, %in3;
	.reg .f32 	%wsat, %wbiased, %one, %recip;
	.reg .f32 	%out0, %out1, %out2, %out3;


	// Fetch the four input components in offset order.
	ld.param.f32 	%in0, [_Z18UnpremultiplyPixel8PixelRGB_param_0];
	ld.param.f32 	%in1, [_Z18UnpremultiplyPixel8PixelRGB_param_0+4];
	ld.param.f32 	%in2, [_Z18UnpremultiplyPixel8PixelRGB_param_0+8];
	ld.param.f32 	%in3, [_Z18UnpremultiplyPixel8PixelRGB_param_0+12];
	.loc 3 2820 10
	// Clamp the divisor to [0, 1]; .sat also turns NaN into +0.
	cvt.ftz.sat.f32.f32	%wsat, %in3;
	.loc 2 45 1
	// %divisible <=> (clamped divisor - 8e-6) > 0, or the difference is unordered.
	add.ftz.f32 	%wbiased, %wsat, 0fB70637BD;
	setp.gtu.ftz.f32	%divisible, %wbiased, 0f00000000;

	// Default result: every component is zero.
	mov.f32 	%out0, 0f00000000;
	mov.f32 	%out1, 0f00000000;
	mov.f32 	%out2, 0f00000000;
	mov.f32 	%out3, 0f00000000;
	@!%divisible bra 	UNPREMUL_WRITE;

	// Divisor is large enough: scale the first three components by 1/divisor.
	mov.f32 	%one, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%recip, %one, %wsat;
	.loc 2 45 1
	mul.ftz.f32 	%out0, %in0, %recip;
	mul.ftz.f32 	%out1, %in1, %recip;
	mul.ftz.f32 	%out2, %in2, %recip;
	mov.f32 	%out3, %wsat;

UNPREMUL_WRITE:
	st.param.f32	[func_retval0+0], %out0;
	st.param.f32	[func_retval0+4], %out1;
	st.param.f32	[func_retval0+8], %out2;
	st.param.f32	[func_retval0+12], %out3;
	.loc 2 45 1
	ret;
}

//
// AccumulatePixel -- the mangled name demangles to
//   AccumulatePixel(PixelRGB*, const float4*, int, int, float,
//                   DevicePixelFormat, int, int)
//
// Adds one weighted source pixel into a four-component f32 accumulator.
//
// Parameters, as they are used in the body below:
//   param_0  generic pointer to four consecutive f32 values (offsets 0..12)
//            that are loaded, updated and stored back.
//   param_1  generic pointer to the source pixels; when it is null the
//            function returns without touching param_0.
//   param_2  multiplied by param_7 when forming the pixel index
//            (presumably a row pitch in pixels -- TODO confirm).
//   param_3  extra offset added to the source address, scaled by 16 bytes
//            in both format paths.
//   param_4  f32 factor multiplied with the 4th source component.
//   param_5  format selector: 0 -> four f16 values per pixel (8 bytes),
//            non-zero -> four f32 values per pixel (16 bytes).
//   param_6  added to param_7 * param_2 (presumably the x coordinate).
//   param_7  multiplied by param_2 (presumably the y coordinate).
//
// Computation:
//   w         = saturate(src[3] * param_4)            // clamped to [0, 1]
//   g(x)      = x < 0 or NaN ? -ex2(2.2 * lg2(-x)) : ex2(2.2 * lg2(x))
//   dst[0..2] = g(src[0..2]) * w + dst[0..2]
//   dst[3]    = dst[3] + w
// ex2/lg2 are the approximate hardware instructions; 0f400CCCCD == 2.2.
//
.visible .func _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii(
	.param .b64 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_0,
	.param .b64 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_1,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_2,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_3,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_4,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_5,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_6,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_7
)
{
	.reg .pred 	%p<6>;
	.reg .s16 	%rs<9>;
	.reg .s32 	%r<8>;
	.reg .f32 	%f<65>;
	.reg .s64 	%rd<9>;


	// %rd1 = accumulator pointer, %rd2 = source pointer, %r2..%r6 = integer
	// arguments, %f30 = weight factor.
	ld.param.u64 	%rd1, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_0];
	ld.param.u64 	%rd2, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_1];
	ld.param.u32 	%r2, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_2];
	ld.param.u32 	%r3, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_3];
	ld.param.f32 	%f30, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_4];
	ld.param.u32 	%r4, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_5];
	ld.param.u32 	%r5, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_6];
	ld.param.u32 	%r6, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_7];
	.loc 1 31 1
	// Null source pointer: nothing to accumulate, jump straight to ret.
	setp.eq.s64	%p1, %rd2, 0;
	@%p1 bra 	BB1_14;

	.loc 1 31 1
	// %r1 = param_7 * param_2 + param_6 : linear pixel index.
	mad.lo.s32 	%r1, %r6, %r2, %r5;
	// Format selector 0 takes the f16 path at BB1_3.
	setp.eq.s32	%p2, %r4, 0;
	@%p2 bra 	BB1_3;

	// f32 path: address = src + (index + param_3) * 16, one 16-byte vector load.
	add.s32 	%r7, %r1, %r3;
	mul.wide.s32 	%rd3, %r7, 16;
	add.s64 	%rd4, %rd2, %rd3;
	ld.v4.f32 	{%f31, %f32, %f33, %f34}, [%rd4];
	mov.f32 	%f61, %f34;
	mov.f32 	%f60, %f33;
	mov.f32 	%f59, %f32;
	mov.f32 	%f58, %f31;
	bra.uni 	BB1_4;

BB1_3:
	// f16 path: address = src + param_3 * 16 + index * 8.
	// NOTE(review): param_3 is still scaled by 16 bytes here while the pixel
	// index is scaled by 8 -- presumably the offset is applied in float4
	// units before the pointer is reinterpreted; confirm against Accumulate.cu.
	mul.wide.s32 	%rd5, %r3, 16;
	add.s64 	%rd6, %rd2, %rd5;
	mul.wide.s32 	%rd7, %r1, 8;
	add.s64 	%rd8, %rd6, %rd7;
	.loc 1 31 1
	ld.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd8];
	.loc 3 3518 10
	// Widen each 16-bit value from f16 to f32 (%f58..%f61).
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f58, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f59, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f60, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f61, %temp;
	}

BB1_4:
	.loc 1 31 1
	// Both paths join here with the source pixel in %f58..%f61.
	// %f35 = 4th source component * param_4 (unclamped weight).
	mul.ftz.f32 	%f35, %f61, %f30;
	// Current accumulator contents; read only after the source load above.
	ld.f32 	%f18, [%rd1+12];
	ld.f32 	%f17, [%rd1+8];
	ld.f32 	%f16, [%rd1+4];
	ld.f32 	%f15, [%rd1];
	.loc 3 2820 10
	// %f19 = weight clamped to [0, 1].
	cvt.ftz.sat.f32.f32	%f19, %f35;
	.loc 1 31 189
	// Component 0: ltu is "less than or unordered", so negative values and
	// NaN take the mirrored branch at BB1_6.
	setp.ltu.ftz.f32	%p3, %f58, 0f00000000;
	@%p3 bra 	BB1_6;

	.loc 3 3600 10
	// %f62 = ex2(2.2 * lg2(x)), i.e. approximately x raised to the power 2.2.
	lg2.approx.ftz.f32 	%f36, %f58;
	mul.ftz.f32 	%f37, %f36, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f62, %f37;
	bra.uni 	BB1_7;

BB1_6:
	// %f62 = -ex2(2.2 * lg2(-x)): same curve applied to -x, sign restored.
	neg.ftz.f32 	%f38, %f58;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f39, %f38;
	mul.ftz.f32 	%f40, %f39, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f41, %f40;
	neg.ftz.f32 	%f62, %f41;

BB1_7:
	// Component 1: same treatment, result in %f63.
	setp.ltu.ftz.f32	%p4, %f59, 0f00000000;
	@%p4 bra 	BB1_9;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f42, %f59;
	mul.ftz.f32 	%f43, %f42, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f63, %f43;
	bra.uni 	BB1_10;

BB1_9:
	neg.ftz.f32 	%f44, %f59;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f45, %f44;
	mul.ftz.f32 	%f46, %f45, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f47, %f46;
	neg.ftz.f32 	%f63, %f47;

BB1_10:
	// Component 2: same treatment, result in %f64.
	setp.ltu.ftz.f32	%p5, %f60, 0f00000000;
	@%p5 bra 	BB1_12;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f48, %f60;
	mul.ftz.f32 	%f49, %f48, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f64, %f49;
	bra.uni 	BB1_13;

BB1_12:
	neg.ftz.f32 	%f50, %f60;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f51, %f50;
	mul.ftz.f32 	%f52, %f51, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f53, %f52;
	neg.ftz.f32 	%f64, %f53;

BB1_13:
	// Accumulate: first three components get converted * weight added,
	// the 4th component gets the clamped weight itself added.
	fma.rn.ftz.f32 	%f54, %f62, %f19, %f15;
	fma.rn.ftz.f32 	%f55, %f63, %f19, %f16;
	fma.rn.ftz.f32 	%f56, %f64, %f19, %f17;
	add.ftz.f32 	%f57, %f18, %f19;
	// Write the updated accumulator back through param_0.
	st.f32 	[%rd1+12], %f57;
	st.f32 	[%rd1+8], %f56;
	st.f32 	[%rd1+4], %f55;
	st.f32 	[%rd1], %f54;

BB1_14:
	.loc 1 31 2
	ret;
}

.visible .entry AccumulateKernel(
	.param .u64 AccumulateKernel_param_0,
	.param .f32 AccumulateKernel_param_1,
	.param .u64 AccumulateKernel_param_2,
	.param .f32 AccumulateKernel_param_3,
	.param .u64 AccumulateKernel_param_4,
	.param .f32 AccumulateKernel_param_5,
	.param .u64 AccumulateKernel_param_6,
	.param .f32 AccumulateKernel_param_7,
	.param .u64 AccumulateKernel_param_8,
	.param .f32 AccumulateKernel_param_9,
	.param .u64 AccumulateKernel_param_10,
	.param .f32 AccumulateKernel_param_11,
	.param .u64 AccumulateKernel_param_12,
	.param .f32 AccumulateKernel_param_13,
	.param .u64 AccumulateKernel_param_14,
	.param .f32 AccumulateKernel_param_15,
	.param .u64 AccumulateKernel_param_16,
	.param .f32 AccumulateKernel_param_17,
	.param .u64 AccumulateKernel_param_18,
	.param .f32 AccumulateKernel_param_19,
	.param .u64 AccumulateKernel_param_20,
	.param .u32 AccumulateKernel_param_21,
	.param .u32 AccumulateKernel_param_22,
	.param .u32 AccumulateKernel_param_23,
	.param .u32 AccumulateKernel_param_24,
	.param .u32 AccumulateKernel_param_25
)
{
	.reg .pred 	%p<59>;
	.reg .s16 	%rs<85>;
	.reg .s32 	%r<133>;
	.reg .f32 	%f<712>;
	.reg .s64 	%rd<100>;


	ld.param.u64 	%rd1, [AccumulateKernel_param_0];
	ld.param.u64 	%rd2, [AccumulateKernel_param_2];
	ld.param.u64 	%rd3, [AccumulateKernel_param_4];
	ld.param.u64 	%rd4, [AccumulateKernel_param_6];
	ld.param.u64 	%rd5, [AccumulateKernel_param_8];
	ld.param.u64 	%rd6, [AccumulateKernel_param_10];
	ld.param.u64 	%rd7, [AccumulateKernel_param_12];
	ld.param.u64 	%rd8, [AccumulateKernel_param_14];
	ld.param.u64 	%rd9, [AccumulateKernel_param_16];
	ld.param.u64 	%rd10, [AccumulateKernel_param_18];
	ld.param.u64 	%rd11, [AccumulateKernel_param_20];
	ld.param.u32 	%r11, [AccumulateKernel_param_21];
	ld.param.u32 	%r12, [AccumulateKernel_param_22];
	ld.param.u32 	%r13, [AccumulateKernel_param_23];
	ld.param.u32 	%r14, [AccumulateKernel_param_24];
	ld.param.u32 	%r15, [AccumulateKernel_param_25];
	.loc 1 31 1
	mov.u32 	%r16, %ntid.x;
	mov.u32 	%r17, %ctaid.x;
	mov.u32 	%r18, %tid.x;
	mad.lo.s32 	%r19, %r16, %r17, %r18;
	mov.u32 	%r20, %ntid.y;
	mov.u32 	%r21, %ctaid.y;
	mov.u32 	%r22, %tid.y;
	mad.lo.s32 	%r23, %r20, %r21, %r22;
	.loc 1 31 1
	setp.lt.s32	%p1, %r19, %r14;
	setp.lt.s32	%p2, %r23, %r15;
	and.pred  	%p3, %p1, %p2;
	.loc 1 31 1
	@!%p3 bra 	BB2_157;
	bra.uni 	BB2_1;

BB2_1:
	.loc 1 31 1
	setp.ne.s64	%p4, %rd1, 0;
	@%p4 bra 	BB2_3;

	mov.f32 	%f704, 0f00000000;
	mov.f32 	%f703, %f704;
	mov.f32 	%f702, %f704;
	mov.f32 	%f701, %f704;
	bra.uni 	BB2_16;

BB2_3:
	.loc 1 31 1
	setp.eq.s32	%p5, %r13, 0;
	@%p5 bra 	BB2_5;

	cvta.to.global.u64 	%rd12, %rd1;
	.loc 1 31 1
	mad.lo.s32 	%r32, %r23, %r11, %r19;
	add.s32 	%r33, %r32, %r12;
	mul.wide.s32 	%rd13, %r33, 16;
	add.s64 	%rd14, %rd12, %rd13;
	ld.global.v4.f32 	{%f362, %f363, %f364, %f365}, [%rd14];
	mov.f32 	%f634, %f365;
	mov.f32 	%f633, %f364;
	mov.f32 	%f632, %f363;
	mov.f32 	%f631, %f362;
	bra.uni 	BB2_6;

BB2_5:
	cvta.to.global.u64 	%rd15, %rd1;
	mul.wide.s32 	%rd16, %r12, 16;
	add.s64 	%rd17, %rd15, %rd16;
	.loc 1 31 1
	mad.lo.s32 	%r42, %r23, %r11, %r19;
	mul.wide.s32 	%rd18, %r42, 8;
	add.s64 	%rd19, %rd17, %rd18;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd19];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f631, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f632, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f633, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f634, %temp;
	}

BB2_6:
	ld.param.f32 	%f621, [AccumulateKernel_param_1];
	.loc 1 31 1
	mul.ftz.f32 	%f366, %f634, %f621;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f15, %f366;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p6, %f631, 0f00000000;
	@%p6 bra 	BB2_8;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f367, %f631;
	mul.ftz.f32 	%f368, %f367, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f635, %f368;
	bra.uni 	BB2_9;

BB2_8:
	neg.ftz.f32 	%f369, %f631;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f370, %f369;
	mul.ftz.f32 	%f371, %f370, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f372, %f371;
	neg.ftz.f32 	%f635, %f372;

BB2_9:
	setp.ltu.ftz.f32	%p7, %f632, 0f00000000;
	@%p7 bra 	BB2_11;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f373, %f632;
	mul.ftz.f32 	%f374, %f373, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f636, %f374;
	bra.uni 	BB2_12;

BB2_11:
	neg.ftz.f32 	%f375, %f632;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f376, %f375;
	mul.ftz.f32 	%f377, %f376, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f378, %f377;
	neg.ftz.f32 	%f636, %f378;

BB2_12:
	setp.ltu.ftz.f32	%p8, %f633, 0f00000000;
	@%p8 bra 	BB2_14;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f379, %f633;
	mul.ftz.f32 	%f380, %f379, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f637, %f380;
	bra.uni 	BB2_15;

BB2_14:
	neg.ftz.f32 	%f381, %f633;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f382, %f381;
	mul.ftz.f32 	%f383, %f382, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f384, %f383;
	neg.ftz.f32 	%f637, %f384;

BB2_15:
	fma.rn.ftz.f32 	%f701, %f635, %f15, 0f00000000;
	fma.rn.ftz.f32 	%f702, %f636, %f15, 0f00000000;
	fma.rn.ftz.f32 	%f703, %f637, %f15, 0f00000000;
	add.ftz.f32 	%f704, %f15, 0f00000000;

BB2_16:
	.loc 1 31 1
	setp.eq.s64	%p9, %rd2, 0;
	@%p9 bra 	BB2_30;

	.loc 1 31 1
	mad.lo.s32 	%r1, %r23, %r11, %r19;
	setp.eq.s32	%p10, %r13, 0;
	@%p10 bra 	BB2_19;

	cvta.to.global.u64 	%rd20, %rd2;
	add.s32 	%r51, %r1, %r12;
	mul.wide.s32 	%rd21, %r51, 16;
	add.s64 	%rd22, %rd20, %rd21;
	ld.global.v4.f32 	{%f389, %f390, %f391, %f392}, [%rd22];
	mov.f32 	%f641, %f392;
	mov.f32 	%f640, %f391;
	mov.f32 	%f639, %f390;
	mov.f32 	%f638, %f389;
	bra.uni 	BB2_20;

BB2_19:
	cvta.to.global.u64 	%rd23, %rd2;
	mul.wide.s32 	%rd24, %r12, 16;
	add.s64 	%rd25, %rd23, %rd24;
	mul.wide.s32 	%rd26, %r1, 8;
	add.s64 	%rd27, %rd25, %rd26;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs9, %rs10, %rs11, %rs12}, [%rd27];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs9;
	cvt.f32.f16 	%f638, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs10;
	cvt.f32.f16 	%f639, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs11;
	cvt.f32.f16 	%f640, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs12;
	cvt.f32.f16 	%f641, %temp;
	}

BB2_20:
	ld.param.f32 	%f622, [AccumulateKernel_param_3];
	.loc 1 31 1
	mul.ftz.f32 	%f393, %f641, %f622;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f48, %f393;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p11, %f638, 0f00000000;
	@%p11 bra 	BB2_22;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f394, %f638;
	mul.ftz.f32 	%f395, %f394, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f642, %f395;
	bra.uni 	BB2_23;

BB2_22:
	neg.ftz.f32 	%f396, %f638;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f397, %f396;
	mul.ftz.f32 	%f398, %f397, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f399, %f398;
	neg.ftz.f32 	%f642, %f399;

BB2_23:
	setp.ltu.ftz.f32	%p12, %f639, 0f00000000;
	@%p12 bra 	BB2_25;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f400, %f639;
	mul.ftz.f32 	%f401, %f400, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f643, %f401;
	bra.uni 	BB2_26;

BB2_25:
	neg.ftz.f32 	%f402, %f639;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f403, %f402;
	mul.ftz.f32 	%f404, %f403, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f405, %f404;
	neg.ftz.f32 	%f643, %f405;

BB2_26:
	setp.ltu.ftz.f32	%p13, %f640, 0f00000000;
	@%p13 bra 	BB2_28;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f406, %f640;
	mul.ftz.f32 	%f407, %f406, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f644, %f407;
	bra.uni 	BB2_29;

BB2_28:
	neg.ftz.f32 	%f408, %f640;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f409, %f408;
	mul.ftz.f32 	%f410, %f409, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f411, %f410;
	neg.ftz.f32 	%f644, %f411;

BB2_29:
	fma.rn.ftz.f32 	%f701, %f642, %f48, %f701;
	fma.rn.ftz.f32 	%f702, %f643, %f48, %f702;
	fma.rn.ftz.f32 	%f703, %f644, %f48, %f703;
	add.ftz.f32 	%f704, %f704, %f48;

BB2_30:
	.loc 1 31 1
	setp.eq.s64	%p14, %rd3, 0;
	@%p14 bra 	BB2_44;

	.loc 1 31 1
	mad.lo.s32 	%r2, %r23, %r11, %r19;
	setp.eq.s32	%p15, %r13, 0;
	@%p15 bra 	BB2_33;

	cvta.to.global.u64 	%rd28, %rd3;
	add.s32 	%r60, %r2, %r12;
	mul.wide.s32 	%rd29, %r60, 16;
	add.s64 	%rd30, %rd28, %rd29;
	ld.global.v4.f32 	{%f412, %f413, %f414, %f415}, [%rd30];
	mov.f32 	%f648, %f415;
	mov.f32 	%f647, %f414;
	mov.f32 	%f646, %f413;
	mov.f32 	%f645, %f412;
	bra.uni 	BB2_34;

BB2_33:
	cvta.to.global.u64 	%rd31, %rd3;
	mul.wide.s32 	%rd32, %r12, 16;
	add.s64 	%rd33, %rd31, %rd32;
	mul.wide.s32 	%rd34, %r2, 8;
	add.s64 	%rd35, %rd33, %rd34;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs17, %rs18, %rs19, %rs20}, [%rd35];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs17;
	cvt.f32.f16 	%f645, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs18;
	cvt.f32.f16 	%f646, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs19;
	cvt.f32.f16 	%f647, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs20;
	cvt.f32.f16 	%f648, %temp;
	}

BB2_34:
	ld.param.f32 	%f623, [AccumulateKernel_param_5];
	.loc 1 31 1
	mul.ftz.f32 	%f416, %f648, %f623;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f81, %f416;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p16, %f645, 0f00000000;
	@%p16 bra 	BB2_36;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f417, %f645;
	mul.ftz.f32 	%f418, %f417, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f649, %f418;
	bra.uni 	BB2_37;

BB2_36:
	neg.ftz.f32 	%f419, %f645;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f420, %f419;
	mul.ftz.f32 	%f421, %f420, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f422, %f421;
	neg.ftz.f32 	%f649, %f422;

BB2_37:
	setp.ltu.ftz.f32	%p17, %f646, 0f00000000;
	@%p17 bra 	BB2_39;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f423, %f646;
	mul.ftz.f32 	%f424, %f423, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f650, %f424;
	bra.uni 	BB2_40;

BB2_39:
	neg.ftz.f32 	%f425, %f646;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f426, %f425;
	mul.ftz.f32 	%f427, %f426, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f428, %f427;
	neg.ftz.f32 	%f650, %f428;

BB2_40:
	setp.ltu.ftz.f32	%p18, %f647, 0f00000000;
	@%p18 bra 	BB2_42;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f429, %f647;
	mul.ftz.f32 	%f430, %f429, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f651, %f430;
	bra.uni 	BB2_43;

BB2_42:
	neg.ftz.f32 	%f431, %f647;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f432, %f431;
	mul.ftz.f32 	%f433, %f432, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f434, %f433;
	neg.ftz.f32 	%f651, %f434;

BB2_43:
	fma.rn.ftz.f32 	%f701, %f649, %f81, %f701;
	fma.rn.ftz.f32 	%f702, %f650, %f81, %f702;
	fma.rn.ftz.f32 	%f703, %f651, %f81, %f703;
	add.ftz.f32 	%f704, %f704, %f81;

BB2_44:
	.loc 1 31 1
	setp.eq.s64	%p19, %rd4, 0;
	@%p19 bra 	BB2_58;

	.loc 1 31 1
	mad.lo.s32 	%r3, %r23, %r11, %r19;
	setp.eq.s32	%p20, %r13, 0;
	@%p20 bra 	BB2_47;

	cvta.to.global.u64 	%rd36, %rd4;
	add.s32 	%r69, %r3, %r12;
	mul.wide.s32 	%rd37, %r69, 16;
	add.s64 	%rd38, %rd36, %rd37;
	ld.global.v4.f32 	{%f435, %f436, %f437, %f438}, [%rd38];
	mov.f32 	%f655, %f438;
	mov.f32 	%f654, %f437;
	mov.f32 	%f653, %f436;
	mov.f32 	%f652, %f435;
	bra.uni 	BB2_48;

BB2_47:
	cvta.to.global.u64 	%rd39, %rd4;
	mul.wide.s32 	%rd40, %r12, 16;
	add.s64 	%rd41, %rd39, %rd40;
	mul.wide.s32 	%rd42, %r3, 8;
	add.s64 	%rd43, %rd41, %rd42;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs25, %rs26, %rs27, %rs28}, [%rd43];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs25;
	cvt.f32.f16 	%f652, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs26;
	cvt.f32.f16 	%f653, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs27;
	cvt.f32.f16 	%f654, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs28;
	cvt.f32.f16 	%f655, %temp;
	}

BB2_48:
	ld.param.f32 	%f624, [AccumulateKernel_param_7];
	.loc 1 31 1
	mul.ftz.f32 	%f439, %f655, %f624;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f114, %f439;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p21, %f652, 0f00000000;
	@%p21 bra 	BB2_50;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f440, %f652;
	mul.ftz.f32 	%f441, %f440, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f656, %f441;
	bra.uni 	BB2_51;

BB2_50:
	neg.ftz.f32 	%f442, %f652;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f443, %f442;
	mul.ftz.f32 	%f444, %f443, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f445, %f444;
	neg.ftz.f32 	%f656, %f445;

BB2_51:
	setp.ltu.ftz.f32	%p22, %f653, 0f00000000;
	@%p22 bra 	BB2_53;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f446, %f653;
	mul.ftz.f32 	%f447, %f446, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f657, %f447;
	bra.uni 	BB2_54;

BB2_53:
	neg.ftz.f32 	%f448, %f653;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f449, %f448;
	mul.ftz.f32 	%f450, %f449, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f451, %f450;
	neg.ftz.f32 	%f657, %f451;

BB2_54:
	setp.ltu.ftz.f32	%p23, %f654, 0f00000000;
	@%p23 bra 	BB2_56;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f452, %f654;
	mul.ftz.f32 	%f453, %f452, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f658, %f453;
	bra.uni 	BB2_57;

BB2_56:
	neg.ftz.f32 	%f454, %f654;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f455, %f454;
	mul.ftz.f32 	%f456, %f455, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f457, %f456;
	neg.ftz.f32 	%f658, %f457;

BB2_57:
	fma.rn.ftz.f32 	%f701, %f656, %f114, %f701;
	fma.rn.ftz.f32 	%f702, %f657, %f114, %f702;
	fma.rn.ftz.f32 	%f703, %f658, %f114, %f703;
	add.ftz.f32 	%f704, %f704, %f114;

BB2_58:
	.loc 1 31 1
	setp.eq.s64	%p24, %rd5, 0;
	@%p24 bra 	BB2_72;

	.loc 1 31 1
	mad.lo.s32 	%r4, %r23, %r11, %r19;
	setp.eq.s32	%p25, %r13, 0;
	@%p25 bra 	BB2_61;

	cvta.to.global.u64 	%rd44, %rd5;
	add.s32 	%r78, %r4, %r12;
	mul.wide.s32 	%rd45, %r78, 16;
	add.s64 	%rd46, %rd44, %rd45;
	ld.global.v4.f32 	{%f458, %f459, %f460, %f461}, [%rd46];
	mov.f32 	%f662, %f461;
	mov.f32 	%f661, %f460;
	mov.f32 	%f660, %f459;
	mov.f32 	%f659, %f458;
	bra.uni 	BB2_62;

BB2_61:
	cvta.to.global.u64 	%rd47, %rd5;
	mul.wide.s32 	%rd48, %r12, 16;
	add.s64 	%rd49, %rd47, %rd48;
	mul.wide.s32 	%rd50, %r4, 8;
	add.s64 	%rd51, %rd49, %rd50;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs33, %rs34, %rs35, %rs36}, [%rd51];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs33;
	cvt.f32.f16 	%f659, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs34;
	cvt.f32.f16 	%f660, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs35;
	cvt.f32.f16 	%f661, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs36;
	cvt.f32.f16 	%f662, %temp;
	}

BB2_62:
	ld.param.f32 	%f625, [AccumulateKernel_param_9];
	.loc 1 31 1
	mul.ftz.f32 	%f462, %f662, %f625;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f147, %f462;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p26, %f659, 0f00000000;
	@%p26 bra 	BB2_64;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f463, %f659;
	mul.ftz.f32 	%f464, %f463, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f663, %f464;
	bra.uni 	BB2_65;

BB2_64:
	neg.ftz.f32 	%f465, %f659;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f466, %f465;
	mul.ftz.f32 	%f467, %f466, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f468, %f467;
	neg.ftz.f32 	%f663, %f468;

BB2_65:
	setp.ltu.ftz.f32	%p27, %f660, 0f00000000;
	@%p27 bra 	BB2_67;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f469, %f660;
	mul.ftz.f32 	%f470, %f469, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f664, %f470;
	bra.uni 	BB2_68;

BB2_67:
	neg.ftz.f32 	%f471, %f660;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f472, %f471;
	mul.ftz.f32 	%f473, %f472, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f474, %f473;
	neg.ftz.f32 	%f664, %f474;

BB2_68:
	setp.ltu.ftz.f32	%p28, %f661, 0f00000000;
	@%p28 bra 	BB2_70;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f475, %f661;
	mul.ftz.f32 	%f476, %f475, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f665, %f476;
	bra.uni 	BB2_71;

BB2_70:
	neg.ftz.f32 	%f477, %f661;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f478, %f477;
	mul.ftz.f32 	%f479, %f478, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f480, %f479;
	neg.ftz.f32 	%f665, %f480;

BB2_71:
	fma.rn.ftz.f32 	%f701, %f663, %f147, %f701;
	fma.rn.ftz.f32 	%f702, %f664, %f147, %f702;
	fma.rn.ftz.f32 	%f703, %f665, %f147, %f703;
	add.ftz.f32 	%f704, %f704, %f147;

BB2_72:
	.loc 1 31 1
	setp.eq.s64	%p29, %rd6, 0;
	@%p29 bra 	BB2_86;

	.loc 1 31 1
	mad.lo.s32 	%r5, %r23, %r11, %r19;
	setp.eq.s32	%p30, %r13, 0;
	@%p30 bra 	BB2_75;

	cvta.to.global.u64 	%rd52, %rd6;
	add.s32 	%r87, %r5, %r12;
	mul.wide.s32 	%rd53, %r87, 16;
	add.s64 	%rd54, %rd52, %rd53;
	ld.global.v4.f32 	{%f481, %f482, %f483, %f484}, [%rd54];
	mov.f32 	%f669, %f484;
	mov.f32 	%f668, %f483;
	mov.f32 	%f667, %f482;
	mov.f32 	%f666, %f481;
	bra.uni 	BB2_76;

BB2_75:
	cvta.to.global.u64 	%rd55, %rd6;
	mul.wide.s32 	%rd56, %r12, 16;
	add.s64 	%rd57, %rd55, %rd56;
	mul.wide.s32 	%rd58, %r5, 8;
	add.s64 	%rd59, %rd57, %rd58;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs41, %rs42, %rs43, %rs44}, [%rd59];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs41;
	cvt.f32.f16 	%f666, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs42;
	cvt.f32.f16 	%f667, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs43;
	cvt.f32.f16 	%f668, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs44;
	cvt.f32.f16 	%f669, %temp;
	}

BB2_76:
	ld.param.f32 	%f626, [AccumulateKernel_param_11];
	.loc 1 31 1
	mul.ftz.f32 	%f485, %f669, %f626;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f180, %f485;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p31, %f666, 0f00000000;
	@%p31 bra 	BB2_78;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f486, %f666;
	mul.ftz.f32 	%f487, %f486, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f670, %f487;
	bra.uni 	BB2_79;

BB2_78:
	neg.ftz.f32 	%f488, %f666;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f489, %f488;
	mul.ftz.f32 	%f490, %f489, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f491, %f490;
	neg.ftz.f32 	%f670, %f491;

BB2_79:
	setp.ltu.ftz.f32	%p32, %f667, 0f00000000;
	@%p32 bra 	BB2_81;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f492, %f667;
	mul.ftz.f32 	%f493, %f492, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f671, %f493;
	bra.uni 	BB2_82;

BB2_81:
	neg.ftz.f32 	%f494, %f667;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f495, %f494;
	mul.ftz.f32 	%f496, %f495, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f497, %f496;
	neg.ftz.f32 	%f671, %f497;

BB2_82:
	setp.ltu.ftz.f32	%p33, %f668, 0f00000000;
	@%p33 bra 	BB2_84;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f498, %f668;
	mul.ftz.f32 	%f499, %f498, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f672, %f499;
	bra.uni 	BB2_85;

BB2_84:
	neg.ftz.f32 	%f500, %f668;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f501, %f500;
	mul.ftz.f32 	%f502, %f501, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f503, %f502;
	neg.ftz.f32 	%f672, %f503;

BB2_85:
	fma.rn.ftz.f32 	%f701, %f670, %f180, %f701;
	fma.rn.ftz.f32 	%f702, %f671, %f180, %f702;
	fma.rn.ftz.f32 	%f703, %f672, %f180, %f703;
	add.ftz.f32 	%f704, %f704, %f180;

BB2_86:
	.loc 1 31 1
	setp.eq.s64	%p34, %rd7, 0;
	@%p34 bra 	BB2_100;

	.loc 1 31 1
	mad.lo.s32 	%r6, %r23, %r11, %r19;
	setp.eq.s32	%p35, %r13, 0;
	@%p35 bra 	BB2_89;

	cvta.to.global.u64 	%rd60, %rd7;
	add.s32 	%r96, %r6, %r12;
	mul.wide.s32 	%rd61, %r96, 16;
	add.s64 	%rd62, %rd60, %rd61;
	ld.global.v4.f32 	{%f504, %f505, %f506, %f507}, [%rd62];
	mov.f32 	%f676, %f507;
	mov.f32 	%f675, %f506;
	mov.f32 	%f674, %f505;
	mov.f32 	%f673, %f504;
	bra.uni 	BB2_90;

BB2_89:
	cvta.to.global.u64 	%rd63, %rd7;
	mul.wide.s32 	%rd64, %r12, 16;
	add.s64 	%rd65, %rd63, %rd64;
	mul.wide.s32 	%rd66, %r6, 8;
	add.s64 	%rd67, %rd65, %rd66;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs49, %rs50, %rs51, %rs52}, [%rd67];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs49;
	cvt.f32.f16 	%f673, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs50;
	cvt.f32.f16 	%f674, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs51;
	cvt.f32.f16 	%f675, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs52;
	cvt.f32.f16 	%f676, %temp;
	}

BB2_90:
	ld.param.f32 	%f627, [AccumulateKernel_param_13];
	.loc 1 31 1
	mul.ftz.f32 	%f508, %f676, %f627;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f213, %f508;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p36, %f673, 0f00000000;
	@%p36 bra 	BB2_92;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f509, %f673;
	mul.ftz.f32 	%f510, %f509, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f677, %f510;
	bra.uni 	BB2_93;

BB2_92:
	neg.ftz.f32 	%f511, %f673;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f512, %f511;
	mul.ftz.f32 	%f513, %f512, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f514, %f513;
	neg.ftz.f32 	%f677, %f514;

BB2_93:
	setp.ltu.ftz.f32	%p37, %f674, 0f00000000;
	@%p37 bra 	BB2_95;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f515, %f674;
	mul.ftz.f32 	%f516, %f515, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f678, %f516;
	bra.uni 	BB2_96;

BB2_95:
	neg.ftz.f32 	%f517, %f674;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f518, %f517;
	mul.ftz.f32 	%f519, %f518, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f520, %f519;
	neg.ftz.f32 	%f678, %f520;

BB2_96:
	setp.ltu.ftz.f32	%p38, %f675, 0f00000000;
	@%p38 bra 	BB2_98;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f521, %f675;
	mul.ftz.f32 	%f522, %f521, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f679, %f522;
	bra.uni 	BB2_99;

BB2_98:
	neg.ftz.f32 	%f523, %f675;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f524, %f523;
	mul.ftz.f32 	%f525, %f524, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f526, %f525;
	neg.ftz.f32 	%f679, %f526;

BB2_99:
	fma.rn.ftz.f32 	%f701, %f677, %f213, %f701;
	fma.rn.ftz.f32 	%f702, %f678, %f213, %f702;
	fma.rn.ftz.f32 	%f703, %f679, %f213, %f703;
	add.ftz.f32 	%f704, %f704, %f213;

BB2_100:
	.loc 1 31 1
	setp.eq.s64	%p39, %rd8, 0;
	@%p39 bra 	BB2_114;

	.loc 1 31 1
	mad.lo.s32 	%r7, %r23, %r11, %r19;
	setp.eq.s32	%p40, %r13, 0;
	@%p40 bra 	BB2_103;

	cvta.to.global.u64 	%rd68, %rd8;
	add.s32 	%r105, %r7, %r12;
	mul.wide.s32 	%rd69, %r105, 16;
	add.s64 	%rd70, %rd68, %rd69;
	ld.global.v4.f32 	{%f527, %f528, %f529, %f530}, [%rd70];
	mov.f32 	%f683, %f530;
	mov.f32 	%f682, %f529;
	mov.f32 	%f681, %f528;
	mov.f32 	%f680, %f527;
	bra.uni 	BB2_104;

BB2_103:
	cvta.to.global.u64 	%rd71, %rd8;
	mul.wide.s32 	%rd72, %r12, 16;
	add.s64 	%rd73, %rd71, %rd72;
	mul.wide.s32 	%rd74, %r7, 8;
	add.s64 	%rd75, %rd73, %rd74;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs57, %rs58, %rs59, %rs60}, [%rd75];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs57;
	cvt.f32.f16 	%f680, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs58;
	cvt.f32.f16 	%f681, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs59;
	cvt.f32.f16 	%f682, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs60;
	cvt.f32.f16 	%f683, %temp;
	}

BB2_104:
	ld.param.f32 	%f628, [AccumulateKernel_param_15];
	.loc 1 31 1
	mul.ftz.f32 	%f531, %f683, %f628;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f246, %f531;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p41, %f680, 0f00000000;
	@%p41 bra 	BB2_106;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f532, %f680;
	mul.ftz.f32 	%f533, %f532, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f684, %f533;
	bra.uni 	BB2_107;

BB2_106:
	neg.ftz.f32 	%f534, %f680;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f535, %f534;
	mul.ftz.f32 	%f536, %f535, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f537, %f536;
	neg.ftz.f32 	%f684, %f537;

BB2_107:
	setp.ltu.ftz.f32	%p42, %f681, 0f00000000;
	@%p42 bra 	BB2_109;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f538, %f681;
	mul.ftz.f32 	%f539, %f538, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f685, %f539;
	bra.uni 	BB2_110;

BB2_109:
	neg.ftz.f32 	%f540, %f681;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f541, %f540;
	mul.ftz.f32 	%f542, %f541, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f543, %f542;
	neg.ftz.f32 	%f685, %f543;

BB2_110:
	setp.ltu.ftz.f32	%p43, %f682, 0f00000000;
	@%p43 bra 	BB2_112;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f544, %f682;
	mul.ftz.f32 	%f545, %f544, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f686, %f545;
	bra.uni 	BB2_113;

BB2_112:
	neg.ftz.f32 	%f546, %f682;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f547, %f546;
	mul.ftz.f32 	%f548, %f547, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f549, %f548;
	neg.ftz.f32 	%f686, %f549;

BB2_113:
	fma.rn.ftz.f32 	%f701, %f684, %f246, %f701;
	fma.rn.ftz.f32 	%f702, %f685, %f246, %f702;
	fma.rn.ftz.f32 	%f703, %f686, %f246, %f703;
	add.ftz.f32 	%f704, %f704, %f246;

BB2_114:
	.loc 1 31 1
	setp.eq.s64	%p44, %rd9, 0;
	@%p44 bra 	BB2_128;

	.loc 1 31 1
	mad.lo.s32 	%r8, %r23, %r11, %r19;
	setp.eq.s32	%p45, %r13, 0;
	@%p45 bra 	BB2_117;

	cvta.to.global.u64 	%rd76, %rd9;
	add.s32 	%r114, %r8, %r12;
	mul.wide.s32 	%rd77, %r114, 16;
	add.s64 	%rd78, %rd76, %rd77;
	ld.global.v4.f32 	{%f550, %f551, %f552, %f553}, [%rd78];
	mov.f32 	%f690, %f553;
	mov.f32 	%f689, %f552;
	mov.f32 	%f688, %f551;
	mov.f32 	%f687, %f550;
	bra.uni 	BB2_118;

BB2_117:
	cvta.to.global.u64 	%rd79, %rd9;
	mul.wide.s32 	%rd80, %r12, 16;
	add.s64 	%rd81, %rd79, %rd80;
	mul.wide.s32 	%rd82, %r8, 8;
	add.s64 	%rd83, %rd81, %rd82;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs65, %rs66, %rs67, %rs68}, [%rd83];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs65;
	cvt.f32.f16 	%f687, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs66;
	cvt.f32.f16 	%f688, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs67;
	cvt.f32.f16 	%f689, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs68;
	cvt.f32.f16 	%f690, %temp;
	}

BB2_118:
	ld.param.f32 	%f629, [AccumulateKernel_param_17];
	.loc 1 31 1
	mul.ftz.f32 	%f554, %f690, %f629;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f279, %f554;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p46, %f687, 0f00000000;
	@%p46 bra 	BB2_120;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f555, %f687;
	mul.ftz.f32 	%f556, %f555, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f691, %f556;
	bra.uni 	BB2_121;

BB2_120:
	neg.ftz.f32 	%f557, %f687;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f558, %f557;
	mul.ftz.f32 	%f559, %f558, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f560, %f559;
	neg.ftz.f32 	%f691, %f560;

BB2_121:
	setp.ltu.ftz.f32	%p47, %f688, 0f00000000;
	@%p47 bra 	BB2_123;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f561, %f688;
	mul.ftz.f32 	%f562, %f561, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f692, %f562;
	bra.uni 	BB2_124;

BB2_123:
	neg.ftz.f32 	%f563, %f688;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f564, %f563;
	mul.ftz.f32 	%f565, %f564, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f566, %f565;
	neg.ftz.f32 	%f692, %f566;

BB2_124:
	setp.ltu.ftz.f32	%p48, %f689, 0f00000000;
	@%p48 bra 	BB2_126;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f567, %f689;
	mul.ftz.f32 	%f568, %f567, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f693, %f568;
	bra.uni 	BB2_127;

BB2_126:
	neg.ftz.f32 	%f569, %f689;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f570, %f569;
	mul.ftz.f32 	%f571, %f570, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f572, %f571;
	neg.ftz.f32 	%f693, %f572;

BB2_127:
	fma.rn.ftz.f32 	%f701, %f691, %f279, %f701;
	fma.rn.ftz.f32 	%f702, %f692, %f279, %f702;
	fma.rn.ftz.f32 	%f703, %f693, %f279, %f703;
	add.ftz.f32 	%f704, %f704, %f279;

BB2_128:
	.loc 1 31 1
	setp.eq.s64	%p49, %rd10, 0;
	@%p49 bra 	BB2_142;

	.loc 1 31 1
	mad.lo.s32 	%r9, %r23, %r11, %r19;
	setp.eq.s32	%p50, %r13, 0;
	@%p50 bra 	BB2_131;

	cvta.to.global.u64 	%rd84, %rd10;
	add.s32 	%r123, %r9, %r12;
	mul.wide.s32 	%rd85, %r123, 16;
	add.s64 	%rd86, %rd84, %rd85;
	ld.global.v4.f32 	{%f573, %f574, %f575, %f576}, [%rd86];
	mov.f32 	%f697, %f576;
	mov.f32 	%f696, %f575;
	mov.f32 	%f695, %f574;
	mov.f32 	%f694, %f573;
	bra.uni 	BB2_132;

BB2_131:
	cvta.to.global.u64 	%rd87, %rd10;
	mul.wide.s32 	%rd88, %r12, 16;
	add.s64 	%rd89, %rd87, %rd88;
	mul.wide.s32 	%rd90, %r9, 8;
	add.s64 	%rd91, %rd89, %rd90;
	.loc 1 31 1
	ld.global.v4.u16 	{%rs73, %rs74, %rs75, %rs76}, [%rd91];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs73;
	cvt.f32.f16 	%f694, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs74;
	cvt.f32.f16 	%f695, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs75;
	cvt.f32.f16 	%f696, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs76;
	cvt.f32.f16 	%f697, %temp;
	}

BB2_132:
	ld.param.f32 	%f630, [AccumulateKernel_param_19];
	.loc 1 31 1
	mul.ftz.f32 	%f577, %f697, %f630;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f312, %f577;
	.loc 1 31 189
	setp.ltu.ftz.f32	%p51, %f694, 0f00000000;
	@%p51 bra 	BB2_134;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f578, %f694;
	mul.ftz.f32 	%f579, %f578, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f698, %f579;
	bra.uni 	BB2_135;

BB2_134:
	neg.ftz.f32 	%f580, %f694;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f581, %f580;
	mul.ftz.f32 	%f582, %f581, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f583, %f582;
	neg.ftz.f32 	%f698, %f583;

BB2_135:
	setp.ltu.ftz.f32	%p52, %f695, 0f00000000;
	@%p52 bra 	BB2_137;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f584, %f695;
	mul.ftz.f32 	%f585, %f584, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f699, %f585;
	bra.uni 	BB2_138;

BB2_137:
	neg.ftz.f32 	%f586, %f695;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f587, %f586;
	mul.ftz.f32 	%f588, %f587, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f589, %f588;
	neg.ftz.f32 	%f699, %f589;

BB2_138:
	setp.ltu.ftz.f32	%p53, %f696, 0f00000000;
	@%p53 bra 	BB2_140;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f590, %f696;
	mul.ftz.f32 	%f591, %f590, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f700, %f591;
	bra.uni 	BB2_141;

BB2_140:
	neg.ftz.f32 	%f592, %f696;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f593, %f592;
	mul.ftz.f32 	%f594, %f593, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f595, %f594;
	neg.ftz.f32 	%f700, %f595;

BB2_141:
	fma.rn.ftz.f32 	%f701, %f698, %f312, %f701;
	fma.rn.ftz.f32 	%f702, %f699, %f312, %f702;
	fma.rn.ftz.f32 	%f703, %f700, %f312, %f703;
	add.ftz.f32 	%f704, %f704, %f312;

BB2_142:
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f708, %f704;
	.loc 2 45 1
	add.ftz.f32 	%f596, %f708, 0fB70637BD;
	setp.gtu.ftz.f32	%p54, %f596, 0f00000000;
	@%p54 bra 	BB2_144;

	mov.f32 	%f708, 0f00000000;
	mov.f32 	%f707, %f708;
	mov.f32 	%f706, %f708;
	mov.f32 	%f705, %f708;
	bra.uni 	BB2_145;

BB2_144:
	mov.f32 	%f601, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f602, %f601, %f708;
	.loc 2 45 1
	mul.ftz.f32 	%f707, %f703, %f602;
	mul.ftz.f32 	%f706, %f702, %f602;
	mul.ftz.f32 	%f705, %f701, %f602;

BB2_145:
	.loc 1 31 85
	setp.ltu.ftz.f32	%p55, %f705, 0f00000000;
	@%p55 bra 	BB2_147;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f603, %f705;
	mul.ftz.f32 	%f604, %f603, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f709, %f604;
	bra.uni 	BB2_148;

BB2_147:
	.loc 1 31 198
	neg.ftz.f32 	%f605, %f705;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f606, %f605;
	mul.ftz.f32 	%f607, %f606, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f608, %f607;
	.loc 1 31 236
	neg.ftz.f32 	%f709, %f608;

BB2_148:
	setp.ltu.ftz.f32	%p56, %f706, 0f00000000;
	@%p56 bra 	BB2_150;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f609, %f706;
	mul.ftz.f32 	%f610, %f609, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f710, %f610;
	bra.uni 	BB2_151;

BB2_150:
	neg.ftz.f32 	%f611, %f706;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f612, %f611;
	mul.ftz.f32 	%f613, %f612, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f614, %f613;
	neg.ftz.f32 	%f710, %f614;

BB2_151:
	setp.ltu.ftz.f32	%p57, %f707, 0f00000000;
	@%p57 bra 	BB2_153;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f615, %f707;
	mul.ftz.f32 	%f616, %f615, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f711, %f616;
	bra.uni 	BB2_154;

BB2_153:
	neg.ftz.f32 	%f617, %f707;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f618, %f617;
	mul.ftz.f32 	%f619, %f618, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f620, %f619;
	neg.ftz.f32 	%f711, %f620;

BB2_154:
	.loc 1 31 1
	mad.lo.s32 	%r10, %r23, %r11, %r19;
	.loc 1 31 1
	setp.eq.s32	%p58, %r13, 0;
	@%p58 bra 	BB2_156;

	cvta.to.global.u64 	%rd92, %rd11;
	add.s32 	%r132, %r10, %r12;
	mul.wide.s32 	%rd93, %r132, 16;
	add.s64 	%rd94, %rd92, %rd93;
	.loc 1 31 1
	st.global.v4.f32 	[%rd94], {%f709, %f710, %f711, %f708};
	bra.uni 	BB2_157;

BB2_156:
	cvta.to.global.u64 	%rd95, %rd11;
	mul.wide.s32 	%rd96, %r12, 16;
	add.s64 	%rd97, %rd95, %rd96;
	mul.wide.s32 	%rd98, %r10, 8;
	add.s64 	%rd99, %rd97, %rd98;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f709;
	mov.b16 	%rs81, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f710;
	mov.b16 	%rs82, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f711;
	mov.b16 	%rs83, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f708;
	mov.b16 	%rs84, %temp;
}
	.loc 1 31 244
	st.global.v4.u16 	[%rd99], {%rs81, %rs82, %rs83, %rs84};

BB2_157:
	.loc 1 31 2
	ret;
}


