//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Fri Jul 25 04:36:16 2014 (1406288176)
// Cuda compilation tools, release 6.5, V6.5.13
//

.version 4.1
.target sm_30
.address_size 64


//
// PixelRGB UnpremultiplyPixel(PixelRGB p)
//
// In : param_0      16-byte struct read as four f32 at offsets 0, 4, 8, 12.
// Out: func_retval0 16-byte struct written as four f32 at offsets 0, 4, 8, 12.
//
// The fourth component is clamped to [0, 1] (NaN becomes 0 under .sat).
// When (clamped - 8e-6) is greater than zero, the first three components are
// multiplied by the approximate reciprocal of the clamped value and the
// clamped value itself is returned as the fourth component.  Otherwise all
// four outputs are zero.
// NOTE(review): the fourth component is presumably alpha - confirm against
// the PixelRGB declaration.
//
.visible .func  (.param .align 16 .b8 func_retval0[16]) _Z18UnpremultiplyPixel8PixelRGB(
	.param .align 16 .b8 _Z18UnpremultiplyPixel8PixelRGB_param_0[16]
)
{
	.reg .pred 	%divisible;
	.reg .f32 	%in_c0, %in_c1, %in_c2, %in_c3;
	.reg .f32 	%clamped, %biased, %unit, %recip;
	.reg .f32 	%out_c0, %out_c1, %out_c2, %out_c3;

	ld.param.f32 	%in_c0, [_Z18UnpremultiplyPixel8PixelRGB_param_0];
	ld.param.f32 	%in_c1, [_Z18UnpremultiplyPixel8PixelRGB_param_0+4];
	ld.param.f32 	%in_c2, [_Z18UnpremultiplyPixel8PixelRGB_param_0+8];
	ld.param.f32 	%in_c3, [_Z18UnpremultiplyPixel8PixelRGB_param_0+12];

	// 0fB70637BD is approximately -8.0e-6: the divide is taken only when the
	// clamped divisor exceeds that small threshold.
	cvt.ftz.sat.f32.f32	%clamped, %in_c3;
	add.ftz.f32 	%biased, %clamped, 0fB70637BD;
	setp.gtu.ftz.f32	%divisible, %biased, 0f00000000;
	@!%divisible bra 	$L_unpremul_zero;

	// Scale the first three components by 1 / clamped.
	mov.f32 	%unit, 0f3F800000;
	div.approx.ftz.f32 	%recip, %unit, %clamped;
	mul.ftz.f32 	%out_c0, %in_c0, %recip;
	mul.ftz.f32 	%out_c1, %in_c1, %recip;
	mul.ftz.f32 	%out_c2, %in_c2, %recip;
	mov.f32 	%out_c3, %clamped;
	bra.uni 	$L_unpremul_store;

$L_unpremul_zero:
	// Divisor too small: return an all-zero pixel.
	mov.f32 	%out_c0, 0f00000000;
	mov.f32 	%out_c1, 0f00000000;
	mov.f32 	%out_c2, 0f00000000;
	mov.f32 	%out_c3, 0f00000000;

$L_unpremul_store:
	st.param.f32	[func_retval0+0], %out_c0;
	st.param.f32	[func_retval0+4], %out_c1;
	st.param.f32	[func_retval0+8], %out_c2;
	st.param.f32	[func_retval0+12], %out_c3;
	ret;
}

//
// void AccumulatePixel(PixelRGB*, const float4*, int, int, float,
//                      DevicePixelFormat, int, int)
//
// param_0 (b64)  destination pixel: four f32 at byte offsets 0, 4, 8, 12
//                (generic address), read, updated and written back.
// param_1 (b64)  source buffer (generic address); a null pointer makes the
//                call a no-op.
// param_2 (b32)  row pitch in pixels (multiplied by param_7).
// param_3 (b32)  extra offset, always scaled by 16 bytes.
// param_4 (f32)  factor multiplied into the source pixel's fourth component.
// param_5 (b32)  format selector: non-zero = 4 x f32 per pixel (16 bytes),
//                zero = 4 x f16 per pixel (8 bytes).
// param_6 (b32)  x index.
// param_7 (b32)  y index.
//
// w = saturate(src.c3 * param_4).  Each of src.c0..c2 goes through
// g(v) = ex2(2.2 * lg2(v)) for v >= 0, and -ex2(2.2 * lg2(-v)) otherwise
// (0f400CCCCD is 2.2), then dst.c0..c2 += g(v) * w and dst.c3 += w.
// NOTE(review): the 2.2 exponent looks like a gamma decode and c3 like an
// alpha/coverage weight - confirm against the CUDA source.
//
.visible .func _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii(
	.param .b64 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_0,
	.param .b64 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_1,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_2,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_3,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_4,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_5,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_6,
	.param .b32 _Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_7
)
{
	.reg .pred 	%src_null, %fmt_half, %is_neg;
	.reg .b16 	%half0, %half1, %half2, %half3;
	.reg .s32 	%pitch, %base, %fmt, %px, %py, %idx;
	.reg .f32 	%factor, %raw_w, %w, %tmp;
	.reg .f32 	%src0, %src1, %src2, %src3;
	.reg .f32 	%pow0, %pow1, %pow2;
	.reg .f32 	%dst0, %dst1, %dst2, %dst3;
	.reg .s64 	%dst, %src, %addr;

	// Nothing to do without a source buffer.
	ld.param.u64 	%src, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_1];
	setp.eq.s64	%src_null, %src, 0;
	@%src_null bra 	$L_accum_exit;

	ld.param.u64 	%dst, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_0];
	ld.param.u32 	%pitch, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_2];
	ld.param.u32 	%base, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_3];
	ld.param.f32 	%factor, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_4];
	ld.param.u32 	%fmt, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_5];
	ld.param.u32 	%px, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_6];
	ld.param.u32 	%py, [_Z15AccumulatePixelP8PixelRGBPK6float4iif17DevicePixelFormatii_param_7];

	// idx = y * pitch + x (32-bit), then addr = src + base * 16 (sign-extended).
	mad.lo.s32 	%idx, %py, %pitch, %px;
	mad.wide.s32 	%addr, %base, 16, %src;
	setp.eq.s32	%fmt_half, %fmt, 0;
	@%fmt_half bra 	$L_accum_load_f16;

	// 4 x f32 pixel: addr += idx * 16.
	mad.wide.s32 	%addr, %idx, 16, %addr;
	ld.v4.f32 	{%src0, %src1, %src2, %src3}, [%addr];
	bra.uni 	$L_accum_loaded;

$L_accum_load_f16:
	// 4 x f16 pixel: addr += idx * 8, then widen every lane to f32.
	mad.wide.s32 	%addr, %idx, 8, %addr;
	ld.v4.u16 	{%half0, %half1, %half2, %half3}, [%addr];
	cvt.f32.f16 	%src0, %half0;
	cvt.f32.f16 	%src1, %half1;
	cvt.f32.f16 	%src2, %half2;
	cvt.f32.f16 	%src3, %half3;

$L_accum_loaded:
	// Fetch the running totals and derive the clamped weight.
	mul.ftz.f32 	%raw_w, %src3, %factor;
	ld.f32 	%dst0, [%dst];
	ld.f32 	%dst1, [%dst+4];
	ld.f32 	%dst2, [%dst+8];
	ld.f32 	%dst3, [%dst+12];
	cvt.ftz.sat.f32.f32	%w, %raw_w;

	// Component 0: sign-preserving power of 2.2.
	setp.ltu.ftz.f32	%is_neg, %src0, 0f00000000;
	@!%is_neg bra 	$L_accum_c0_nonneg;
	neg.ftz.f32 	%tmp, %src0;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%pow0, %tmp;
	bra.uni 	$L_accum_c0_done;
$L_accum_c0_nonneg:
	lg2.approx.ftz.f32 	%tmp, %src0;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%pow0, %tmp;
$L_accum_c0_done:

	// Component 1.
	setp.ltu.ftz.f32	%is_neg, %src1, 0f00000000;
	@!%is_neg bra 	$L_accum_c1_nonneg;
	neg.ftz.f32 	%tmp, %src1;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%pow1, %tmp;
	bra.uni 	$L_accum_c1_done;
$L_accum_c1_nonneg:
	lg2.approx.ftz.f32 	%tmp, %src1;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%pow1, %tmp;
$L_accum_c1_done:

	// Component 2.
	setp.ltu.ftz.f32	%is_neg, %src2, 0f00000000;
	@!%is_neg bra 	$L_accum_c2_nonneg;
	neg.ftz.f32 	%tmp, %src2;
	lg2.approx.ftz.f32 	%tmp, %tmp;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%tmp, %tmp;
	neg.ftz.f32 	%pow2, %tmp;
	bra.uni 	$L_accum_c2_done;
$L_accum_c2_nonneg:
	lg2.approx.ftz.f32 	%tmp, %src2;
	mul.ftz.f32 	%tmp, %tmp, 0f400CCCCD;
	ex2.approx.ftz.f32 	%pow2, %tmp;
$L_accum_c2_done:

	// dst.c0..c2 += pow * w ; dst.c3 += w.
	fma.rn.ftz.f32 	%dst0, %pow0, %w, %dst0;
	fma.rn.ftz.f32 	%dst1, %pow1, %w, %dst1;
	fma.rn.ftz.f32 	%dst2, %pow2, %w, %dst2;
	add.ftz.f32 	%dst3, %dst3, %w;
	st.f32 	[%dst], %dst0;
	st.f32 	[%dst+4], %dst1;
	st.f32 	[%dst+8], %dst2;
	st.f32 	[%dst+12], %dst3;

$L_accum_exit:
	ret;
}

//
// AccumulateKernel
//
// Blends up to ten input images into one output image, one pixel per thread.
//
// Parameters, as they are used below:
//   param_0, 2, 4, ... 18 (u64)  ten input buffer pointers; an input whose
//                                pointer is null is skipped.
//   param_1, 3, 5, ... 19 (f32)  one factor per input, multiplied into the
//                                fourth component of the pixel read from it.
//   param_20 (u64)               output buffer pointer (not null-checked).
//   param_21 (u32)               row pitch in pixels (multiplied by y).
//   param_22 (u32)               extra offset, always scaled by 16 bytes.
//   param_23 (u32)               format selector: non-zero = 4 x f32 pixels
//                                (16 bytes), zero = 4 x f16 pixels (8 bytes).
//   param_24, param_25 (u32)     exclusive upper bounds for x and y.
//
// Per thread:
//   x = ntid.x * ctaid.x + tid.x,  y = ntid.y * ctaid.y + tid.y
//   exit unless x < param_24 and y < param_25
//   for every non-null input i:
//       p  = pixel at index (y * pitch + x), widened to f32 if stored as f16
//       w  = saturate(p.c3 * factor_i)
//       acc.c0..c2 += g(p.c0..c2) * w ;  acc.c3 += w
//     where g(v) = ex2(2.2 * lg2(v)) for v >= 0 and -ex2(2.2 * lg2(-v))
//     otherwise (0f400CCCCD is 2.2)
//   a = saturate(acc.c3); if (a - 8e-6) > 0 then acc.c0..c2 /= a,
//   else the whole result is zero
//   out.c0..c2 = same sign-preserving power with exponent 1/2.2 (0f3EE8BA2E)
//   out.c3     = a
//
// NOTE(review): the 2.2 and 1/2.2 exponents look like a gamma decode/encode
// pair and c3 like alpha - confirm against the CUDA source.
//
.visible .entry AccumulateKernel(
	.param .u64 AccumulateKernel_param_0,
	.param .f32 AccumulateKernel_param_1,
	.param .u64 AccumulateKernel_param_2,
	.param .f32 AccumulateKernel_param_3,
	.param .u64 AccumulateKernel_param_4,
	.param .f32 AccumulateKernel_param_5,
	.param .u64 AccumulateKernel_param_6,
	.param .f32 AccumulateKernel_param_7,
	.param .u64 AccumulateKernel_param_8,
	.param .f32 AccumulateKernel_param_9,
	.param .u64 AccumulateKernel_param_10,
	.param .f32 AccumulateKernel_param_11,
	.param .u64 AccumulateKernel_param_12,
	.param .f32 AccumulateKernel_param_13,
	.param .u64 AccumulateKernel_param_14,
	.param .f32 AccumulateKernel_param_15,
	.param .u64 AccumulateKernel_param_16,
	.param .f32 AccumulateKernel_param_17,
	.param .u64 AccumulateKernel_param_18,
	.param .f32 AccumulateKernel_param_19,
	.param .u64 AccumulateKernel_param_20,
	.param .u32 AccumulateKernel_param_21,
	.param .u32 AccumulateKernel_param_22,
	.param .u32 AccumulateKernel_param_23,
	.param .u32 AccumulateKernel_param_24,
	.param .u32 AccumulateKernel_param_25
)
{
	.reg .pred 	%p<59>;
	.reg .s16 	%rs<85>;
	.reg .s32 	%r<122>;
	.reg .f32 	%f<708>;
	.reg .s64 	%rd<133>;


	// %rd21..%rd30 = the ten input pointers, %rd31 = output pointer.
	// %r1 = pitch, %r2 = extra offset, %r3 = format selector,
	// %r4 / %r5 = x / y bounds.
	ld.param.u64 	%rd21, [AccumulateKernel_param_0];
	ld.param.u64 	%rd22, [AccumulateKernel_param_2];
	ld.param.u64 	%rd23, [AccumulateKernel_param_4];
	ld.param.u64 	%rd24, [AccumulateKernel_param_6];
	ld.param.u64 	%rd25, [AccumulateKernel_param_8];
	ld.param.u64 	%rd26, [AccumulateKernel_param_10];
	ld.param.u64 	%rd27, [AccumulateKernel_param_12];
	ld.param.u64 	%rd28, [AccumulateKernel_param_14];
	ld.param.u64 	%rd29, [AccumulateKernel_param_16];
	ld.param.u64 	%rd30, [AccumulateKernel_param_18];
	ld.param.u64 	%rd31, [AccumulateKernel_param_20];
	ld.param.u32 	%r1, [AccumulateKernel_param_21];
	ld.param.u32 	%r2, [AccumulateKernel_param_22];
	ld.param.u32 	%r3, [AccumulateKernel_param_23];
	ld.param.u32 	%r4, [AccumulateKernel_param_24];
	ld.param.u32 	%r5, [AccumulateKernel_param_25];
	// %r9 = x = ntid.x * ctaid.x + tid.x ; %r13 = y likewise.
	mov.u32 	%r6, %ntid.x;
	mov.u32 	%r7, %ctaid.x;
	mov.u32 	%r8, %tid.x;
	mad.lo.s32 	%r9, %r6, %r7, %r8;
	mov.u32 	%r10, %ntid.y;
	mov.u32 	%r11, %ctaid.y;
	mov.u32 	%r12, %tid.y;
	mad.lo.s32 	%r13, %r10, %r11, %r12;
	// Threads outside the (param_24, param_25) rectangle do nothing.
	setp.lt.s32	%p1, %r9, %r4;
	setp.lt.s32	%p2, %r13, %r5;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB2_157;
	bra.uni 	BB2_1;

// ---- Input 0 (param_0, factor param_1) -------------------------------
// Accumulators: %f697 / %f698 / %f699 = components 0..2, %f700 = weight sum.
BB2_1:
	setp.ne.s64	%p4, %rd21, 0;
	@%p4 bra 	BB2_3;

	// Null input 0: start from zeroed accumulators.
	mov.f32 	%f700, 0f00000000;
	mov.f32 	%f699, %f700;
	mov.f32 	%f698, %f700;
	mov.f32 	%f697, %f700;
	bra.uni 	BB2_16;

BB2_3:
	setp.eq.s32	%p5, %r3, 0;
	@%p5 bra 	BB2_5;

	// f32 pixels: address = base + ((y * pitch + x) + offset) * 16.
	cvta.to.global.u64 	%rd32, %rd21;
	mad.lo.s32 	%r22, %r13, %r1, %r9;
	cvt.s64.s32	%rd33, %r22;
	cvt.s64.s32	%rd34, %r2;
	add.s64 	%rd35, %rd33, %rd34;
	shl.b64 	%rd36, %rd35, 4;
	add.s64 	%rd37, %rd32, %rd36;
	ld.global.v4.f32 	{%f358, %f359, %f360, %f361}, [%rd37];
	mov.f32 	%f630, %f361;
	mov.f32 	%f629, %f360;
	mov.f32 	%f628, %f359;
	mov.f32 	%f627, %f358;
	bra.uni 	BB2_6;

BB2_5:
	// f16 pixels: address = base + offset * 16 + (y * pitch + x) * 8.
	// The offset keeps its 16-byte scale although a pixel is 8 bytes here.
	cvta.to.global.u64 	%rd38, %rd21;
	mul.wide.s32 	%rd39, %r2, 16;
	add.s64 	%rd40, %rd38, %rd39;
	mad.lo.s32 	%r31, %r13, %r1, %r9;
	mul.wide.s32 	%rd41, %r31, 8;
	add.s64 	%rd42, %rd40, %rd41;
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd42];
	// Widen each f16 lane to f32; the scoped .b16 %temp gives cvt an
	// untyped 16-bit source register.
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f627, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f628, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f629, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f630, %temp;
	}

BB2_6:
	// %f15 = w = saturate(pixel.c3 * param_1).
	ld.param.f32 	%f625, [AccumulateKernel_param_1];
	mul.ftz.f32 	%f362, %f630, %f625;
	cvt.ftz.sat.f32.f32	%f15, %f362;
	// Component 0: sign-preserving power of 2.2. ltu also routes NaN to the
	// negated path.
	setp.ltu.ftz.f32	%p6, %f627, 0f00000000;
	@%p6 bra 	BB2_8;

	lg2.approx.ftz.f32 	%f363, %f627;
	mul.ftz.f32 	%f364, %f363, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f631, %f364;
	bra.uni 	BB2_9;

BB2_8:
	neg.ftz.f32 	%f365, %f627;
	lg2.approx.ftz.f32 	%f366, %f365;
	mul.ftz.f32 	%f367, %f366, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f368, %f367;
	neg.ftz.f32 	%f631, %f368;

BB2_9:
	// Component 1.
	setp.ltu.ftz.f32	%p7, %f628, 0f00000000;
	@%p7 bra 	BB2_11;

	lg2.approx.ftz.f32 	%f369, %f628;
	mul.ftz.f32 	%f370, %f369, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f632, %f370;
	bra.uni 	BB2_12;

BB2_11:
	neg.ftz.f32 	%f371, %f628;
	lg2.approx.ftz.f32 	%f372, %f371;
	mul.ftz.f32 	%f373, %f372, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f374, %f373;
	neg.ftz.f32 	%f632, %f374;

BB2_12:
	// Component 2.
	setp.ltu.ftz.f32	%p8, %f629, 0f00000000;
	@%p8 bra 	BB2_14;

	lg2.approx.ftz.f32 	%f375, %f629;
	mul.ftz.f32 	%f376, %f375, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f633, %f376;
	bra.uni 	BB2_15;

BB2_14:
	neg.ftz.f32 	%f377, %f629;
	lg2.approx.ftz.f32 	%f378, %f377;
	mul.ftz.f32 	%f379, %f378, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f380, %f379;
	neg.ftz.f32 	%f633, %f380;

BB2_15:
	// First contribution: acc = value * w + 0, weight sum = w + 0.
	fma.rn.ftz.f32 	%f697, %f631, %f15, 0f00000000;
	fma.rn.ftz.f32 	%f698, %f632, %f15, 0f00000000;
	fma.rn.ftz.f32 	%f699, %f633, %f15, 0f00000000;
	add.ftz.f32 	%f700, %f15, 0f00000000;

// ---- Input 1 (param_2, factor param_3) -------------------------------
// Same sequence as input 0, but adding onto the running accumulators.
BB2_16:
	setp.eq.s64	%p9, %rd22, 0;
	@%p9 bra 	BB2_30;

	mad.lo.s32 	%r40, %r13, %r1, %r9;
	cvt.s64.s32	%rd1, %r2;
	cvt.s64.s32	%rd2, %r40;
	setp.eq.s32	%p10, %r3, 0;
	@%p10 bra 	BB2_19;

	// f32 pixel load.
	cvta.to.global.u64 	%rd43, %rd22;
	add.s64 	%rd44, %rd2, %rd1;
	shl.b64 	%rd45, %rd44, 4;
	add.s64 	%rd46, %rd43, %rd45;
	ld.global.v4.f32 	{%f385, %f386, %f387, %f388}, [%rd46];
	mov.f32 	%f637, %f388;
	mov.f32 	%f636, %f387;
	mov.f32 	%f635, %f386;
	mov.f32 	%f634, %f385;
	bra.uni 	BB2_20;

BB2_19:
	// f16 pixel load.
	cvta.to.global.u64 	%rd47, %rd22;
	shl.b64 	%rd48, %rd1, 4;
	add.s64 	%rd49, %rd47, %rd48;
	shl.b64 	%rd50, %rd2, 3;
	add.s64 	%rd51, %rd49, %rd50;
	ld.global.v4.u16 	{%rs9, %rs10, %rs11, %rs12}, [%rd51];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs9;
	cvt.f32.f16 	%f634, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs10;
	cvt.f32.f16 	%f635, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs11;
	cvt.f32.f16 	%f636, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs12;
	cvt.f32.f16 	%f637, %temp;
	}

BB2_20:
	// %f48 = w = saturate(pixel.c3 * param_3).
	ld.param.f32 	%f617, [AccumulateKernel_param_3];
	mul.ftz.f32 	%f389, %f637, %f617;
	cvt.ftz.sat.f32.f32	%f48, %f389;
	setp.ltu.ftz.f32	%p11, %f634, 0f00000000;
	@%p11 bra 	BB2_22;

	lg2.approx.ftz.f32 	%f390, %f634;
	mul.ftz.f32 	%f391, %f390, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f638, %f391;
	bra.uni 	BB2_23;

BB2_22:
	neg.ftz.f32 	%f392, %f634;
	lg2.approx.ftz.f32 	%f393, %f392;
	mul.ftz.f32 	%f394, %f393, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f395, %f394;
	neg.ftz.f32 	%f638, %f395;

BB2_23:
	setp.ltu.ftz.f32	%p12, %f635, 0f00000000;
	@%p12 bra 	BB2_25;

	lg2.approx.ftz.f32 	%f396, %f635;
	mul.ftz.f32 	%f397, %f396, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f639, %f397;
	bra.uni 	BB2_26;

BB2_25:
	neg.ftz.f32 	%f398, %f635;
	lg2.approx.ftz.f32 	%f399, %f398;
	mul.ftz.f32 	%f400, %f399, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f401, %f400;
	neg.ftz.f32 	%f639, %f401;

BB2_26:
	setp.ltu.ftz.f32	%p13, %f636, 0f00000000;
	@%p13 bra 	BB2_28;

	lg2.approx.ftz.f32 	%f402, %f636;
	mul.ftz.f32 	%f403, %f402, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f640, %f403;
	bra.uni 	BB2_29;

BB2_28:
	neg.ftz.f32 	%f404, %f636;
	lg2.approx.ftz.f32 	%f405, %f404;
	mul.ftz.f32 	%f406, %f405, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f407, %f406;
	neg.ftz.f32 	%f640, %f407;

BB2_29:
	// acc.c0..c2 += value * w ; weight sum += w.
	fma.rn.ftz.f32 	%f697, %f638, %f48, %f697;
	fma.rn.ftz.f32 	%f698, %f639, %f48, %f698;
	fma.rn.ftz.f32 	%f699, %f640, %f48, %f699;
	add.ftz.f32 	%f700, %f700, %f48;

// ---- Input 2 (param_4, factor param_5) -------------------------------
BB2_30:
	setp.eq.s64	%p14, %rd23, 0;
	@%p14 bra 	BB2_44;

	mad.lo.s32 	%r49, %r13, %r1, %r9;
	cvt.s64.s32	%rd3, %r2;
	cvt.s64.s32	%rd4, %r49;
	setp.eq.s32	%p15, %r3, 0;
	@%p15 bra 	BB2_33;

	// f32 pixel load.
	cvta.to.global.u64 	%rd52, %rd23;
	add.s64 	%rd53, %rd4, %rd3;
	shl.b64 	%rd54, %rd53, 4;
	add.s64 	%rd55, %rd52, %rd54;
	ld.global.v4.f32 	{%f408, %f409, %f410, %f411}, [%rd55];
	mov.f32 	%f644, %f411;
	mov.f32 	%f643, %f410;
	mov.f32 	%f642, %f409;
	mov.f32 	%f641, %f408;
	bra.uni 	BB2_34;

BB2_33:
	// f16 pixel load.
	cvta.to.global.u64 	%rd56, %rd23;
	shl.b64 	%rd57, %rd3, 4;
	add.s64 	%rd58, %rd56, %rd57;
	shl.b64 	%rd59, %rd4, 3;
	add.s64 	%rd60, %rd58, %rd59;
	ld.global.v4.u16 	{%rs17, %rs18, %rs19, %rs20}, [%rd60];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs17;
	cvt.f32.f16 	%f641, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs18;
	cvt.f32.f16 	%f642, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs19;
	cvt.f32.f16 	%f643, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs20;
	cvt.f32.f16 	%f644, %temp;
	}

BB2_34:
	// %f81 = w = saturate(pixel.c3 * param_5).
	ld.param.f32 	%f618, [AccumulateKernel_param_5];
	mul.ftz.f32 	%f412, %f644, %f618;
	cvt.ftz.sat.f32.f32	%f81, %f412;
	setp.ltu.ftz.f32	%p16, %f641, 0f00000000;
	@%p16 bra 	BB2_36;

	lg2.approx.ftz.f32 	%f413, %f641;
	mul.ftz.f32 	%f414, %f413, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f645, %f414;
	bra.uni 	BB2_37;

BB2_36:
	neg.ftz.f32 	%f415, %f641;
	lg2.approx.ftz.f32 	%f416, %f415;
	mul.ftz.f32 	%f417, %f416, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f418, %f417;
	neg.ftz.f32 	%f645, %f418;

BB2_37:
	setp.ltu.ftz.f32	%p17, %f642, 0f00000000;
	@%p17 bra 	BB2_39;

	lg2.approx.ftz.f32 	%f419, %f642;
	mul.ftz.f32 	%f420, %f419, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f646, %f420;
	bra.uni 	BB2_40;

BB2_39:
	neg.ftz.f32 	%f421, %f642;
	lg2.approx.ftz.f32 	%f422, %f421;
	mul.ftz.f32 	%f423, %f422, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f424, %f423;
	neg.ftz.f32 	%f646, %f424;

BB2_40:
	setp.ltu.ftz.f32	%p18, %f643, 0f00000000;
	@%p18 bra 	BB2_42;

	lg2.approx.ftz.f32 	%f425, %f643;
	mul.ftz.f32 	%f426, %f425, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f647, %f426;
	bra.uni 	BB2_43;

BB2_42:
	neg.ftz.f32 	%f427, %f643;
	lg2.approx.ftz.f32 	%f428, %f427;
	mul.ftz.f32 	%f429, %f428, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f430, %f429;
	neg.ftz.f32 	%f647, %f430;

BB2_43:
	fma.rn.ftz.f32 	%f697, %f645, %f81, %f697;
	fma.rn.ftz.f32 	%f698, %f646, %f81, %f698;
	fma.rn.ftz.f32 	%f699, %f647, %f81, %f699;
	add.ftz.f32 	%f700, %f700, %f81;

// ---- Input 3 (param_6, factor param_7) -------------------------------
BB2_44:
	setp.eq.s64	%p19, %rd24, 0;
	@%p19 bra 	BB2_58;

	mad.lo.s32 	%r58, %r13, %r1, %r9;
	cvt.s64.s32	%rd5, %r2;
	cvt.s64.s32	%rd6, %r58;
	setp.eq.s32	%p20, %r3, 0;
	@%p20 bra 	BB2_47;

	// f32 pixel load.
	cvta.to.global.u64 	%rd61, %rd24;
	add.s64 	%rd62, %rd6, %rd5;
	shl.b64 	%rd63, %rd62, 4;
	add.s64 	%rd64, %rd61, %rd63;
	ld.global.v4.f32 	{%f431, %f432, %f433, %f434}, [%rd64];
	mov.f32 	%f651, %f434;
	mov.f32 	%f650, %f433;
	mov.f32 	%f649, %f432;
	mov.f32 	%f648, %f431;
	bra.uni 	BB2_48;

BB2_47:
	// f16 pixel load.
	cvta.to.global.u64 	%rd65, %rd24;
	shl.b64 	%rd66, %rd5, 4;
	add.s64 	%rd67, %rd65, %rd66;
	shl.b64 	%rd68, %rd6, 3;
	add.s64 	%rd69, %rd67, %rd68;
	ld.global.v4.u16 	{%rs25, %rs26, %rs27, %rs28}, [%rd69];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs25;
	cvt.f32.f16 	%f648, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs26;
	cvt.f32.f16 	%f649, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs27;
	cvt.f32.f16 	%f650, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs28;
	cvt.f32.f16 	%f651, %temp;
	}

BB2_48:
	// %f114 = w = saturate(pixel.c3 * param_7).
	ld.param.f32 	%f619, [AccumulateKernel_param_7];
	mul.ftz.f32 	%f435, %f651, %f619;
	cvt.ftz.sat.f32.f32	%f114, %f435;
	setp.ltu.ftz.f32	%p21, %f648, 0f00000000;
	@%p21 bra 	BB2_50;

	lg2.approx.ftz.f32 	%f436, %f648;
	mul.ftz.f32 	%f437, %f436, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f652, %f437;
	bra.uni 	BB2_51;

BB2_50:
	neg.ftz.f32 	%f438, %f648;
	lg2.approx.ftz.f32 	%f439, %f438;
	mul.ftz.f32 	%f440, %f439, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f441, %f440;
	neg.ftz.f32 	%f652, %f441;

BB2_51:
	setp.ltu.ftz.f32	%p22, %f649, 0f00000000;
	@%p22 bra 	BB2_53;

	lg2.approx.ftz.f32 	%f442, %f649;
	mul.ftz.f32 	%f443, %f442, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f653, %f443;
	bra.uni 	BB2_54;

BB2_53:
	neg.ftz.f32 	%f444, %f649;
	lg2.approx.ftz.f32 	%f445, %f444;
	mul.ftz.f32 	%f446, %f445, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f447, %f446;
	neg.ftz.f32 	%f653, %f447;

BB2_54:
	setp.ltu.ftz.f32	%p23, %f650, 0f00000000;
	@%p23 bra 	BB2_56;

	lg2.approx.ftz.f32 	%f448, %f650;
	mul.ftz.f32 	%f449, %f448, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f654, %f449;
	bra.uni 	BB2_57;

BB2_56:
	neg.ftz.f32 	%f450, %f650;
	lg2.approx.ftz.f32 	%f451, %f450;
	mul.ftz.f32 	%f452, %f451, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f453, %f452;
	neg.ftz.f32 	%f654, %f453;

BB2_57:
	fma.rn.ftz.f32 	%f697, %f652, %f114, %f697;
	fma.rn.ftz.f32 	%f698, %f653, %f114, %f698;
	fma.rn.ftz.f32 	%f699, %f654, %f114, %f699;
	add.ftz.f32 	%f700, %f700, %f114;

// ---- Input 4 (param_8, factor param_9) -------------------------------
BB2_58:
	setp.eq.s64	%p24, %rd25, 0;
	@%p24 bra 	BB2_72;

	mad.lo.s32 	%r67, %r13, %r1, %r9;
	cvt.s64.s32	%rd7, %r2;
	cvt.s64.s32	%rd8, %r67;
	setp.eq.s32	%p25, %r3, 0;
	@%p25 bra 	BB2_61;

	// f32 pixel load.
	cvta.to.global.u64 	%rd70, %rd25;
	add.s64 	%rd71, %rd8, %rd7;
	shl.b64 	%rd72, %rd71, 4;
	add.s64 	%rd73, %rd70, %rd72;
	ld.global.v4.f32 	{%f454, %f455, %f456, %f457}, [%rd73];
	mov.f32 	%f658, %f457;
	mov.f32 	%f657, %f456;
	mov.f32 	%f656, %f455;
	mov.f32 	%f655, %f454;
	bra.uni 	BB2_62;

BB2_61:
	// f16 pixel load.
	cvta.to.global.u64 	%rd74, %rd25;
	shl.b64 	%rd75, %rd7, 4;
	add.s64 	%rd76, %rd74, %rd75;
	shl.b64 	%rd77, %rd8, 3;
	add.s64 	%rd78, %rd76, %rd77;
	ld.global.v4.u16 	{%rs33, %rs34, %rs35, %rs36}, [%rd78];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs33;
	cvt.f32.f16 	%f655, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs34;
	cvt.f32.f16 	%f656, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs35;
	cvt.f32.f16 	%f657, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs36;
	cvt.f32.f16 	%f658, %temp;
	}

BB2_62:
	// %f147 = w = saturate(pixel.c3 * param_9).
	ld.param.f32 	%f620, [AccumulateKernel_param_9];
	mul.ftz.f32 	%f458, %f658, %f620;
	cvt.ftz.sat.f32.f32	%f147, %f458;
	setp.ltu.ftz.f32	%p26, %f655, 0f00000000;
	@%p26 bra 	BB2_64;

	lg2.approx.ftz.f32 	%f459, %f655;
	mul.ftz.f32 	%f460, %f459, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f659, %f460;
	bra.uni 	BB2_65;

BB2_64:
	neg.ftz.f32 	%f461, %f655;
	lg2.approx.ftz.f32 	%f462, %f461;
	mul.ftz.f32 	%f463, %f462, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f464, %f463;
	neg.ftz.f32 	%f659, %f464;

BB2_65:
	setp.ltu.ftz.f32	%p27, %f656, 0f00000000;
	@%p27 bra 	BB2_67;

	lg2.approx.ftz.f32 	%f465, %f656;
	mul.ftz.f32 	%f466, %f465, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f660, %f466;
	bra.uni 	BB2_68;

BB2_67:
	neg.ftz.f32 	%f467, %f656;
	lg2.approx.ftz.f32 	%f468, %f467;
	mul.ftz.f32 	%f469, %f468, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f470, %f469;
	neg.ftz.f32 	%f660, %f470;

BB2_68:
	setp.ltu.ftz.f32	%p28, %f657, 0f00000000;
	@%p28 bra 	BB2_70;

	lg2.approx.ftz.f32 	%f471, %f657;
	mul.ftz.f32 	%f472, %f471, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f661, %f472;
	bra.uni 	BB2_71;

BB2_70:
	neg.ftz.f32 	%f473, %f657;
	lg2.approx.ftz.f32 	%f474, %f473;
	mul.ftz.f32 	%f475, %f474, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f476, %f475;
	neg.ftz.f32 	%f661, %f476;

BB2_71:
	fma.rn.ftz.f32 	%f697, %f659, %f147, %f697;
	fma.rn.ftz.f32 	%f698, %f660, %f147, %f698;
	fma.rn.ftz.f32 	%f699, %f661, %f147, %f699;
	add.ftz.f32 	%f700, %f700, %f147;

// ---- Input 5 (param_10, factor param_11) -----------------------------
BB2_72:
	setp.eq.s64	%p29, %rd26, 0;
	@%p29 bra 	BB2_86;

	mad.lo.s32 	%r76, %r13, %r1, %r9;
	cvt.s64.s32	%rd9, %r2;
	cvt.s64.s32	%rd10, %r76;
	setp.eq.s32	%p30, %r3, 0;
	@%p30 bra 	BB2_75;

	// f32 pixel load.
	cvta.to.global.u64 	%rd79, %rd26;
	add.s64 	%rd80, %rd10, %rd9;
	shl.b64 	%rd81, %rd80, 4;
	add.s64 	%rd82, %rd79, %rd81;
	ld.global.v4.f32 	{%f477, %f478, %f479, %f480}, [%rd82];
	mov.f32 	%f665, %f480;
	mov.f32 	%f664, %f479;
	mov.f32 	%f663, %f478;
	mov.f32 	%f662, %f477;
	bra.uni 	BB2_76;

BB2_75:
	// f16 pixel load.
	cvta.to.global.u64 	%rd83, %rd26;
	shl.b64 	%rd84, %rd9, 4;
	add.s64 	%rd85, %rd83, %rd84;
	shl.b64 	%rd86, %rd10, 3;
	add.s64 	%rd87, %rd85, %rd86;
	ld.global.v4.u16 	{%rs41, %rs42, %rs43, %rs44}, [%rd87];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs41;
	cvt.f32.f16 	%f662, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs42;
	cvt.f32.f16 	%f663, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs43;
	cvt.f32.f16 	%f664, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs44;
	cvt.f32.f16 	%f665, %temp;
	}

BB2_76:
	// %f180 = w = saturate(pixel.c3 * param_11).
	ld.param.f32 	%f621, [AccumulateKernel_param_11];
	mul.ftz.f32 	%f481, %f665, %f621;
	cvt.ftz.sat.f32.f32	%f180, %f481;
	setp.ltu.ftz.f32	%p31, %f662, 0f00000000;
	@%p31 bra 	BB2_78;

	lg2.approx.ftz.f32 	%f482, %f662;
	mul.ftz.f32 	%f483, %f482, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f666, %f483;
	bra.uni 	BB2_79;

BB2_78:
	neg.ftz.f32 	%f484, %f662;
	lg2.approx.ftz.f32 	%f485, %f484;
	mul.ftz.f32 	%f486, %f485, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f487, %f486;
	neg.ftz.f32 	%f666, %f487;

BB2_79:
	setp.ltu.ftz.f32	%p32, %f663, 0f00000000;
	@%p32 bra 	BB2_81;

	lg2.approx.ftz.f32 	%f488, %f663;
	mul.ftz.f32 	%f489, %f488, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f667, %f489;
	bra.uni 	BB2_82;

BB2_81:
	neg.ftz.f32 	%f490, %f663;
	lg2.approx.ftz.f32 	%f491, %f490;
	mul.ftz.f32 	%f492, %f491, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f493, %f492;
	neg.ftz.f32 	%f667, %f493;

BB2_82:
	setp.ltu.ftz.f32	%p33, %f664, 0f00000000;
	@%p33 bra 	BB2_84;

	lg2.approx.ftz.f32 	%f494, %f664;
	mul.ftz.f32 	%f495, %f494, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f668, %f495;
	bra.uni 	BB2_85;

BB2_84:
	neg.ftz.f32 	%f496, %f664;
	lg2.approx.ftz.f32 	%f497, %f496;
	mul.ftz.f32 	%f498, %f497, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f499, %f498;
	neg.ftz.f32 	%f668, %f499;

BB2_85:
	fma.rn.ftz.f32 	%f697, %f666, %f180, %f697;
	fma.rn.ftz.f32 	%f698, %f667, %f180, %f698;
	fma.rn.ftz.f32 	%f699, %f668, %f180, %f699;
	add.ftz.f32 	%f700, %f700, %f180;

// ---- Input 6 (param_12, factor param_13) -----------------------------
BB2_86:
	setp.eq.s64	%p34, %rd27, 0;
	@%p34 bra 	BB2_100;

	mad.lo.s32 	%r85, %r13, %r1, %r9;
	cvt.s64.s32	%rd11, %r2;
	cvt.s64.s32	%rd12, %r85;
	setp.eq.s32	%p35, %r3, 0;
	@%p35 bra 	BB2_89;

	// f32 pixel load.
	cvta.to.global.u64 	%rd88, %rd27;
	add.s64 	%rd89, %rd12, %rd11;
	shl.b64 	%rd90, %rd89, 4;
	add.s64 	%rd91, %rd88, %rd90;
	ld.global.v4.f32 	{%f500, %f501, %f502, %f503}, [%rd91];
	mov.f32 	%f672, %f503;
	mov.f32 	%f671, %f502;
	mov.f32 	%f670, %f501;
	mov.f32 	%f669, %f500;
	bra.uni 	BB2_90;

BB2_89:
	// f16 pixel load.
	cvta.to.global.u64 	%rd92, %rd27;
	shl.b64 	%rd93, %rd11, 4;
	add.s64 	%rd94, %rd92, %rd93;
	shl.b64 	%rd95, %rd12, 3;
	add.s64 	%rd96, %rd94, %rd95;
	ld.global.v4.u16 	{%rs49, %rs50, %rs51, %rs52}, [%rd96];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs49;
	cvt.f32.f16 	%f669, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs50;
	cvt.f32.f16 	%f670, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs51;
	cvt.f32.f16 	%f671, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs52;
	cvt.f32.f16 	%f672, %temp;
	}

BB2_90:
	// %f213 = w = saturate(pixel.c3 * param_13).
	ld.param.f32 	%f622, [AccumulateKernel_param_13];
	mul.ftz.f32 	%f504, %f672, %f622;
	cvt.ftz.sat.f32.f32	%f213, %f504;
	setp.ltu.ftz.f32	%p36, %f669, 0f00000000;
	@%p36 bra 	BB2_92;

	lg2.approx.ftz.f32 	%f505, %f669;
	mul.ftz.f32 	%f506, %f505, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f673, %f506;
	bra.uni 	BB2_93;

BB2_92:
	neg.ftz.f32 	%f507, %f669;
	lg2.approx.ftz.f32 	%f508, %f507;
	mul.ftz.f32 	%f509, %f508, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f510, %f509;
	neg.ftz.f32 	%f673, %f510;

BB2_93:
	setp.ltu.ftz.f32	%p37, %f670, 0f00000000;
	@%p37 bra 	BB2_95;

	lg2.approx.ftz.f32 	%f511, %f670;
	mul.ftz.f32 	%f512, %f511, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f674, %f512;
	bra.uni 	BB2_96;

BB2_95:
	neg.ftz.f32 	%f513, %f670;
	lg2.approx.ftz.f32 	%f514, %f513;
	mul.ftz.f32 	%f515, %f514, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f516, %f515;
	neg.ftz.f32 	%f674, %f516;

BB2_96:
	setp.ltu.ftz.f32	%p38, %f671, 0f00000000;
	@%p38 bra 	BB2_98;

	lg2.approx.ftz.f32 	%f517, %f671;
	mul.ftz.f32 	%f518, %f517, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f675, %f518;
	bra.uni 	BB2_99;

BB2_98:
	neg.ftz.f32 	%f519, %f671;
	lg2.approx.ftz.f32 	%f520, %f519;
	mul.ftz.f32 	%f521, %f520, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f522, %f521;
	neg.ftz.f32 	%f675, %f522;

BB2_99:
	fma.rn.ftz.f32 	%f697, %f673, %f213, %f697;
	fma.rn.ftz.f32 	%f698, %f674, %f213, %f698;
	fma.rn.ftz.f32 	%f699, %f675, %f213, %f699;
	add.ftz.f32 	%f700, %f700, %f213;

// ---- Input 7 (param_14, factor param_15) -----------------------------
BB2_100:
	setp.eq.s64	%p39, %rd28, 0;
	@%p39 bra 	BB2_114;

	mad.lo.s32 	%r94, %r13, %r1, %r9;
	cvt.s64.s32	%rd13, %r2;
	cvt.s64.s32	%rd14, %r94;
	setp.eq.s32	%p40, %r3, 0;
	@%p40 bra 	BB2_103;

	// f32 pixel load.
	cvta.to.global.u64 	%rd97, %rd28;
	add.s64 	%rd98, %rd14, %rd13;
	shl.b64 	%rd99, %rd98, 4;
	add.s64 	%rd100, %rd97, %rd99;
	ld.global.v4.f32 	{%f523, %f524, %f525, %f526}, [%rd100];
	mov.f32 	%f679, %f526;
	mov.f32 	%f678, %f525;
	mov.f32 	%f677, %f524;
	mov.f32 	%f676, %f523;
	bra.uni 	BB2_104;

BB2_103:
	// f16 pixel load.
	cvta.to.global.u64 	%rd101, %rd28;
	shl.b64 	%rd102, %rd13, 4;
	add.s64 	%rd103, %rd101, %rd102;
	shl.b64 	%rd104, %rd14, 3;
	add.s64 	%rd105, %rd103, %rd104;
	ld.global.v4.u16 	{%rs57, %rs58, %rs59, %rs60}, [%rd105];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs57;
	cvt.f32.f16 	%f676, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs58;
	cvt.f32.f16 	%f677, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs59;
	cvt.f32.f16 	%f678, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs60;
	cvt.f32.f16 	%f679, %temp;
	}

BB2_104:
	// %f246 = w = saturate(pixel.c3 * param_15).
	ld.param.f32 	%f623, [AccumulateKernel_param_15];
	mul.ftz.f32 	%f527, %f679, %f623;
	cvt.ftz.sat.f32.f32	%f246, %f527;
	setp.ltu.ftz.f32	%p41, %f676, 0f00000000;
	@%p41 bra 	BB2_106;

	lg2.approx.ftz.f32 	%f528, %f676;
	mul.ftz.f32 	%f529, %f528, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f680, %f529;
	bra.uni 	BB2_107;

BB2_106:
	neg.ftz.f32 	%f530, %f676;
	lg2.approx.ftz.f32 	%f531, %f530;
	mul.ftz.f32 	%f532, %f531, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f533, %f532;
	neg.ftz.f32 	%f680, %f533;

BB2_107:
	setp.ltu.ftz.f32	%p42, %f677, 0f00000000;
	@%p42 bra 	BB2_109;

	lg2.approx.ftz.f32 	%f534, %f677;
	mul.ftz.f32 	%f535, %f534, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f681, %f535;
	bra.uni 	BB2_110;

BB2_109:
	neg.ftz.f32 	%f536, %f677;
	lg2.approx.ftz.f32 	%f537, %f536;
	mul.ftz.f32 	%f538, %f537, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f539, %f538;
	neg.ftz.f32 	%f681, %f539;

BB2_110:
	setp.ltu.ftz.f32	%p43, %f678, 0f00000000;
	@%p43 bra 	BB2_112;

	lg2.approx.ftz.f32 	%f540, %f678;
	mul.ftz.f32 	%f541, %f540, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f682, %f541;
	bra.uni 	BB2_113;

BB2_112:
	neg.ftz.f32 	%f542, %f678;
	lg2.approx.ftz.f32 	%f543, %f542;
	mul.ftz.f32 	%f544, %f543, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f545, %f544;
	neg.ftz.f32 	%f682, %f545;

BB2_113:
	fma.rn.ftz.f32 	%f697, %f680, %f246, %f697;
	fma.rn.ftz.f32 	%f698, %f681, %f246, %f698;
	fma.rn.ftz.f32 	%f699, %f682, %f246, %f699;
	add.ftz.f32 	%f700, %f700, %f246;

// ---- Input 8 (param_16, factor param_17) -----------------------------
BB2_114:
	setp.eq.s64	%p44, %rd29, 0;
	@%p44 bra 	BB2_128;

	mad.lo.s32 	%r103, %r13, %r1, %r9;
	cvt.s64.s32	%rd15, %r2;
	cvt.s64.s32	%rd16, %r103;
	setp.eq.s32	%p45, %r3, 0;
	@%p45 bra 	BB2_117;

	// f32 pixel load.
	cvta.to.global.u64 	%rd106, %rd29;
	add.s64 	%rd107, %rd16, %rd15;
	shl.b64 	%rd108, %rd107, 4;
	add.s64 	%rd109, %rd106, %rd108;
	ld.global.v4.f32 	{%f546, %f547, %f548, %f549}, [%rd109];
	mov.f32 	%f686, %f549;
	mov.f32 	%f685, %f548;
	mov.f32 	%f684, %f547;
	mov.f32 	%f683, %f546;
	bra.uni 	BB2_118;

BB2_117:
	// f16 pixel load.
	cvta.to.global.u64 	%rd110, %rd29;
	shl.b64 	%rd111, %rd15, 4;
	add.s64 	%rd112, %rd110, %rd111;
	shl.b64 	%rd113, %rd16, 3;
	add.s64 	%rd114, %rd112, %rd113;
	ld.global.v4.u16 	{%rs65, %rs66, %rs67, %rs68}, [%rd114];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs65;
	cvt.f32.f16 	%f683, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs66;
	cvt.f32.f16 	%f684, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs67;
	cvt.f32.f16 	%f685, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs68;
	cvt.f32.f16 	%f686, %temp;
	}

BB2_118:
	// %f279 = w = saturate(pixel.c3 * param_17).
	ld.param.f32 	%f624, [AccumulateKernel_param_17];
	mul.ftz.f32 	%f550, %f686, %f624;
	cvt.ftz.sat.f32.f32	%f279, %f550;
	setp.ltu.ftz.f32	%p46, %f683, 0f00000000;
	@%p46 bra 	BB2_120;

	lg2.approx.ftz.f32 	%f551, %f683;
	mul.ftz.f32 	%f552, %f551, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f687, %f552;
	bra.uni 	BB2_121;

BB2_120:
	neg.ftz.f32 	%f553, %f683;
	lg2.approx.ftz.f32 	%f554, %f553;
	mul.ftz.f32 	%f555, %f554, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f556, %f555;
	neg.ftz.f32 	%f687, %f556;

BB2_121:
	setp.ltu.ftz.f32	%p47, %f684, 0f00000000;
	@%p47 bra 	BB2_123;

	lg2.approx.ftz.f32 	%f557, %f684;
	mul.ftz.f32 	%f558, %f557, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f688, %f558;
	bra.uni 	BB2_124;

BB2_123:
	neg.ftz.f32 	%f559, %f684;
	lg2.approx.ftz.f32 	%f560, %f559;
	mul.ftz.f32 	%f561, %f560, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f562, %f561;
	neg.ftz.f32 	%f688, %f562;

BB2_124:
	setp.ltu.ftz.f32	%p48, %f685, 0f00000000;
	@%p48 bra 	BB2_126;

	lg2.approx.ftz.f32 	%f563, %f685;
	mul.ftz.f32 	%f564, %f563, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f689, %f564;
	bra.uni 	BB2_127;

BB2_126:
	neg.ftz.f32 	%f565, %f685;
	lg2.approx.ftz.f32 	%f566, %f565;
	mul.ftz.f32 	%f567, %f566, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f568, %f567;
	neg.ftz.f32 	%f689, %f568;

BB2_127:
	fma.rn.ftz.f32 	%f697, %f687, %f279, %f697;
	fma.rn.ftz.f32 	%f698, %f688, %f279, %f698;
	fma.rn.ftz.f32 	%f699, %f689, %f279, %f699;
	add.ftz.f32 	%f700, %f700, %f279;

// ---- Input 9 (param_18, factor param_19) -----------------------------
BB2_128:
	setp.eq.s64	%p49, %rd30, 0;
	@%p49 bra 	BB2_142;

	mad.lo.s32 	%r112, %r13, %r1, %r9;
	cvt.s64.s32	%rd17, %r2;
	cvt.s64.s32	%rd18, %r112;
	setp.eq.s32	%p50, %r3, 0;
	@%p50 bra 	BB2_131;

	// f32 pixel load.
	cvta.to.global.u64 	%rd115, %rd30;
	add.s64 	%rd116, %rd18, %rd17;
	shl.b64 	%rd117, %rd116, 4;
	add.s64 	%rd118, %rd115, %rd117;
	ld.global.v4.f32 	{%f569, %f570, %f571, %f572}, [%rd118];
	mov.f32 	%f693, %f572;
	mov.f32 	%f692, %f571;
	mov.f32 	%f691, %f570;
	mov.f32 	%f690, %f569;
	bra.uni 	BB2_132;

BB2_131:
	// f16 pixel load.
	cvta.to.global.u64 	%rd119, %rd30;
	shl.b64 	%rd120, %rd17, 4;
	add.s64 	%rd121, %rd119, %rd120;
	shl.b64 	%rd122, %rd18, 3;
	add.s64 	%rd123, %rd121, %rd122;
	ld.global.v4.u16 	{%rs73, %rs74, %rs75, %rs76}, [%rd123];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs73;
	cvt.f32.f16 	%f690, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs74;
	cvt.f32.f16 	%f691, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs75;
	cvt.f32.f16 	%f692, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs76;
	cvt.f32.f16 	%f693, %temp;
	}

BB2_132:
	// %f312 = w = saturate(pixel.c3 * param_19).
	ld.param.f32 	%f626, [AccumulateKernel_param_19];
	mul.ftz.f32 	%f573, %f693, %f626;
	cvt.ftz.sat.f32.f32	%f312, %f573;
	setp.ltu.ftz.f32	%p51, %f690, 0f00000000;
	@%p51 bra 	BB2_134;

	lg2.approx.ftz.f32 	%f574, %f690;
	mul.ftz.f32 	%f575, %f574, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f694, %f575;
	bra.uni 	BB2_135;

BB2_134:
	neg.ftz.f32 	%f576, %f690;
	lg2.approx.ftz.f32 	%f577, %f576;
	mul.ftz.f32 	%f578, %f577, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f579, %f578;
	neg.ftz.f32 	%f694, %f579;

BB2_135:
	setp.ltu.ftz.f32	%p52, %f691, 0f00000000;
	@%p52 bra 	BB2_137;

	lg2.approx.ftz.f32 	%f580, %f691;
	mul.ftz.f32 	%f581, %f580, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f695, %f581;
	bra.uni 	BB2_138;

BB2_137:
	neg.ftz.f32 	%f582, %f691;
	lg2.approx.ftz.f32 	%f583, %f582;
	mul.ftz.f32 	%f584, %f583, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f585, %f584;
	neg.ftz.f32 	%f695, %f585;

BB2_138:
	setp.ltu.ftz.f32	%p53, %f692, 0f00000000;
	@%p53 bra 	BB2_140;

	lg2.approx.ftz.f32 	%f586, %f692;
	mul.ftz.f32 	%f587, %f586, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f696, %f587;
	bra.uni 	BB2_141;

BB2_140:
	neg.ftz.f32 	%f588, %f692;
	lg2.approx.ftz.f32 	%f589, %f588;
	mul.ftz.f32 	%f590, %f589, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f591, %f590;
	neg.ftz.f32 	%f696, %f591;

BB2_141:
	fma.rn.ftz.f32 	%f697, %f694, %f312, %f697;
	fma.rn.ftz.f32 	%f698, %f695, %f312, %f698;
	fma.rn.ftz.f32 	%f699, %f696, %f312, %f699;
	add.ftz.f32 	%f700, %f700, %f312;

// ---- Normalise by the accumulated weight ------------------------------
// %f701 = saturate(weight sum). 0fB70637BD is approximately -8.0e-6, so the
// divide happens only when the clamped sum exceeds that threshold; otherwise
// all four results are zero.
BB2_142:
	cvt.ftz.sat.f32.f32	%f701, %f700;
	add.ftz.f32 	%f592, %f701, 0fB70637BD;
	setp.gtu.ftz.f32	%p54, %f592, 0f00000000;
	@%p54 bra 	BB2_144;

	mov.f32 	%f704, 0f00000000;
	mov.f32 	%f703, %f704;
	mov.f32 	%f702, %f704;
	mov.f32 	%f701, %f704;
	bra.uni 	BB2_145;

BB2_144:
	// %f704 / %f703 / %f702 = components 0 / 1 / 2 times (1 / %f701).
	mov.f32 	%f597, 0f3F800000;
	div.approx.ftz.f32 	%f598, %f597, %f701;
	mul.ftz.f32 	%f702, %f699, %f598;
	mul.ftz.f32 	%f703, %f698, %f598;
	mul.ftz.f32 	%f704, %f697, %f598;

// ---- Output transfer: sign-preserving power of 1/2.2 (0f3EE8BA2E) -----
BB2_145:
	// Component 0 -> %f705.
	setp.ltu.ftz.f32	%p55, %f704, 0f00000000;
	@%p55 bra 	BB2_147;

	lg2.approx.ftz.f32 	%f599, %f704;
	mul.ftz.f32 	%f600, %f599, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f705, %f600;
	bra.uni 	BB2_148;

BB2_147:
	neg.ftz.f32 	%f601, %f704;
	lg2.approx.ftz.f32 	%f602, %f601;
	mul.ftz.f32 	%f603, %f602, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f604, %f603;
	neg.ftz.f32 	%f705, %f604;

BB2_148:
	// Component 1 -> %f706.
	setp.ltu.ftz.f32	%p56, %f703, 0f00000000;
	@%p56 bra 	BB2_150;

	lg2.approx.ftz.f32 	%f605, %f703;
	mul.ftz.f32 	%f606, %f605, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f706, %f606;
	bra.uni 	BB2_151;

BB2_150:
	neg.ftz.f32 	%f607, %f703;
	lg2.approx.ftz.f32 	%f608, %f607;
	mul.ftz.f32 	%f609, %f608, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f610, %f609;
	neg.ftz.f32 	%f706, %f610;

BB2_151:
	// Component 2 -> %f707.
	setp.ltu.ftz.f32	%p57, %f702, 0f00000000;
	@%p57 bra 	BB2_153;

	lg2.approx.ftz.f32 	%f611, %f702;
	mul.ftz.f32 	%f612, %f611, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f707, %f612;
	bra.uni 	BB2_154;

BB2_153:
	neg.ftz.f32 	%f613, %f702;
	lg2.approx.ftz.f32 	%f614, %f613;
	mul.ftz.f32 	%f615, %f614, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f616, %f615;
	neg.ftz.f32 	%f707, %f616;

// ---- Store the result pixel -------------------------------------------
// Same addressing as the input reads, applied to the output pointer.
BB2_154:
	mad.lo.s32 	%r121, %r13, %r1, %r9;
	cvt.s64.s32	%rd19, %r2;
	cvt.s64.s32	%rd20, %r121;
	setp.eq.s32	%p58, %r3, 0;
	@%p58 bra 	BB2_156;

	// f32 output: one 16-byte vector store of {c0, c1, c2, clamped weight}.
	cvta.to.global.u64 	%rd124, %rd31;
	add.s64 	%rd125, %rd20, %rd19;
	shl.b64 	%rd126, %rd125, 4;
	add.s64 	%rd127, %rd124, %rd126;
	st.global.v4.f32 	[%rd127], {%f705, %f706, %f707, %f701};
	bra.uni 	BB2_157;

BB2_156:
	// f16 output: round each value to f16 (round-to-nearest), then one
	// 8-byte vector store in the same component order as the f32 path.
	cvta.to.global.u64 	%rd128, %rd31;
	shl.b64 	%rd129, %rd19, 4;
	add.s64 	%rd130, %rd128, %rd129;
	shl.b64 	%rd131, %rd20, 3;
	add.s64 	%rd132, %rd130, %rd131;
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f701;
	mov.b16 	%rs81, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f707;
	mov.b16 	%rs82, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f706;
	mov.b16 	%rs83, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f705;
	mov.b16 	%rs84, %temp;
}
	st.global.v4.u16 	[%rd132], {%rs84, %rs83, %rs82, %rs81};

BB2_157:
	ret;
}


