//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Fri Jul 25 04:36:16 2014 (1406288176)
// Cuda compilation tools, release 6.5, V6.5.13
//

.version 4.1
.target sm_30
.address_size 64

// Module-scope constant buffer, 164 bytes. The code in this file only ever
// reads it as 4-byte .f32 values at fixed byte offsets (0 .. 160); for
// example [inParams+140] and [inParams+144] are used as the multiplier and
// addend of an fma whose result is then saturated with cvt.ftz.sat.
.const .align 4 .b8 inParams[164];
// The two "demoted" variables named below are declared as .shared arrays
// inside the bodies of cuda_kernel_chokefilter_x / cuda_kernel_chokefilter_y.
// cuda_kernel_chokefilter_x$__cuda_local_var_180308_3088_non_const_data has been demoted
// cuda_kernel_chokefilter_y$__cuda_local_var_180308_4502_non_const_data has been demoted
// Shared-memory array whose size is not fixed in this file (.extern, empty
// bounds). Among the visible kernels it is referenced only by BoxFilterKernel.
.extern .shared .align 4 .b8 smem[];

// ReadAlpha(float4 const*, int, DevicePixelFormat)   [demangled from the symbol]
//
// Returns, as .f32, the alpha value of pixel number param_1 in the buffer at
// param_0 (generic address). param_2 selects the pixel layout:
//   non-zero : 16-byte pixels, alpha is the .f32 stored at byte 12
//   zero     :  8-byte pixels, alpha is the f16 stored at byte 6 (widened
//              to .f32 before it is returned)
.visible .func  (.param .b32 func_retval0) _Z9ReadAlphaPK6float4i17DevicePixelFormat(
	.param .b64 _Z9ReadAlphaPK6float4i17DevicePixelFormat_param_0,
	.param .b32 _Z9ReadAlphaPK6float4i17DevicePixelFormat_param_1,
	.param .b32 _Z9ReadAlphaPK6float4i17DevicePixelFormat_param_2
)
{
	.reg .pred 	%is_half;
	.reg .b16 	%alpha_bits;
	.reg .s32 	%index, %format;
	.reg .f32 	%alpha;
	.reg .s64 	%base, %offset, %pixel;

	ld.param.u64 	%base, [_Z9ReadAlphaPK6float4i17DevicePixelFormat_param_0];
	ld.param.s32 	%index, [_Z9ReadAlphaPK6float4i17DevicePixelFormat_param_1];
	ld.param.u32 	%format, [_Z9ReadAlphaPK6float4i17DevicePixelFormat_param_2];
	setp.eq.s32 	%is_half, %format, 0;
	@%is_half bra 	ReadAlpha_half;

	// 16-byte pixel: signed 32x32 -> 64 multiply gives the byte offset.
	mul.wide.s32 	%offset, %index, 16;
	add.s64 	%pixel, %base, %offset;
	ld.f32 	%alpha, [%pixel+12];
	bra.uni 	ReadAlpha_done;

ReadAlpha_half:
	// 8-byte pixel: fetch the raw 16 bits, then widen f16 -> f32.
	mul.wide.s32 	%offset, %index, 8;
	add.s64 	%pixel, %base, %offset;
	ld.u16 	%alpha_bits, [%pixel+6];
	cvt.f32.f16 	%alpha, %alpha_bits;

ReadAlpha_done:
	st.param.f32	[func_retval0+0], %alpha;
	ret;
}

// MinRow(int, float2*, float*)   [demangled from the symbol]
//
// param_0 = radius r, param_1 = address of the two-float result,
// param_2 = address c of the centre sample (generic addresses).
// Writes to *param_1:
//   .x = min of c[-r-1], c[r+1] and the .y value below
//   .y = min of c[-r] and every sample the pair loop visits
// The pair loop consumes two samples per trip, starting at c[-r+1], and runs
// while its counter (0, 2, 4, ...) is below 2*r; it is skipped when 2*r < 1.
.visible .func _Z6MinRowiP6float2Pf(
	.param .b32 _Z6MinRowiP6float2Pf_param_0,
	.param .b64 _Z6MinRowiP6float2Pf_param_1,
	.param .b64 _Z6MinRowiP6float2Pf_param_2
)
{
	.reg .pred 	%skip_loop, %more;
	.reg .s32 	%radius, %not_radius, %neg_radius, %span, %i;
	.reg .f32 	%outer_lo, %outer_hi, %inner_min, %a, %b, %t;
	.reg .s64 	%out, %center, %off, %lo_ptr, %cursor, %hi_ptr;

	ld.param.s32 	%radius, [_Z6MinRowiP6float2Pf_param_0];
	ld.param.u64 	%out, [_Z6MinRowiP6float2Pf_param_1];
	ld.param.u64 	%center, [_Z6MinRowiP6float2Pf_param_2];

	// lo_ptr = &c[-r-1]  (bitwise NOT of r equals -r-1)
	not.b32 	%not_radius, %radius;
	mul.wide.s32 	%off, %not_radius, 4;
	add.s64 	%lo_ptr, %center, %off;
	ld.f32 	%outer_lo, [%lo_ptr];
	ld.f32 	%inner_min, [%lo_ptr+4];

	shl.b32 	%span, %radius, 1;
	setp.lt.s32 	%skip_loop, %span, 1;
	@%skip_loop bra 	MinRow_finish;

	// cursor = &c[-r]; each trip reads the two floats that follow it.
	neg.s32 	%neg_radius, %radius;
	mul.wide.s32 	%off, %neg_radius, 4;
	add.s64 	%cursor, %center, %off;
	mov.u32 	%i, 0;

MinRow_pair:
	ld.f32 	%a, [%cursor+4];
	min.ftz.f32 	%t, %inner_min, %a;
	ld.f32 	%b, [%cursor+8];
	min.ftz.f32 	%inner_min, %t, %b;
	add.s64 	%cursor, %cursor, 8;
	add.s32 	%i, %i, 2;
	setp.lt.s32 	%more, %i, %span;
	@%more bra 	MinRow_pair;

MinRow_finish:
	// Fold in the sample one past the right end: c[r+1].
	mul.wide.s32 	%off, %radius, 4;
	add.s64 	%hi_ptr, %center, %off;
	ld.f32 	%outer_hi, [%hi_ptr+4];
	min.ftz.f32 	%t, %outer_lo, %outer_hi;
	min.ftz.f32 	%t, %t, %inner_min;
	st.v2.f32 	[%out], {%t, %inner_min};
	ret;
}

// MinColumn(int, float2*)   [demangled from the symbol]
//
// param_0 = radius r, param_1 = address c of the centre element; elements are
// 8 bytes (two .f32, called .x and .y here). Returns two floats:
//   retval+0 = min of c[-r-1].x, c[-r].x, the .x of every element the pair
//              loop visits, and c[r+1].x
//   retval+4 = min of c[-r].y and the .y of every element the pair loop visits
// The pair loop consumes two elements per trip, starting at c[-r+1], and runs
// while its counter (0, 2, 4, ...) is below 2*r; it is skipped when 2*r < 1.
.visible .func  (.param .align 8 .b8 func_retval0[8]) _Z9MinColumniP6float2(
	.param .b32 _Z9MinColumniP6float2_param_0,
	.param .b64 _Z9MinColumniP6float2_param_1
)
{
	.reg .pred 	%skip_loop, %more;
	.reg .s32 	%radius, %not_radius, %neg_radius, %next_radius, %span, %i;
	.reg .f32 	%edge_lo, %edge_hi, %wide_min, %narrow_min;
	.reg .f32 	%x0, %y0, %x1, %y1, %tx, %ty;
	.reg .s64 	%center, %off, %lo_ptr, %cursor, %hi_ptr;

	ld.param.s32 	%radius, [_Z9MinColumniP6float2_param_0];
	ld.param.u64 	%center, [_Z9MinColumniP6float2_param_1];

	// lo_ptr = &c[-r-1]  (bitwise NOT of r equals -r-1)
	not.b32 	%not_radius, %radius;
	mul.wide.s32 	%off, %not_radius, 8;
	add.s64 	%lo_ptr, %center, %off;
	ld.f32 	%edge_lo, [%lo_ptr];
	ld.v2.f32 	{%x0, %narrow_min}, [%lo_ptr+8];
	min.ftz.f32 	%wide_min, %edge_lo, %x0;

	shl.b32 	%span, %radius, 1;
	setp.lt.s32 	%skip_loop, %span, 1;
	@%skip_loop bra 	MinColumn_finish;

	// cursor = &c[-r]; each trip reads the two elements that follow it.
	neg.s32 	%neg_radius, %radius;
	mul.wide.s32 	%off, %neg_radius, 8;
	add.s64 	%cursor, %center, %off;
	mov.u32 	%i, 0;

MinColumn_pair:
	ld.v2.f32 	{%x0, %y0}, [%cursor+8];
	min.ftz.f32 	%tx, %wide_min, %x0;
	min.ftz.f32 	%ty, %narrow_min, %y0;
	ld.v2.f32 	{%x1, %y1}, [%cursor+16];
	min.ftz.f32 	%wide_min, %tx, %x1;
	min.ftz.f32 	%narrow_min, %ty, %y1;
	add.s64 	%cursor, %cursor, 16;
	add.s32 	%i, %i, 2;
	setp.lt.s32 	%more, %i, %span;
	@%more bra 	MinColumn_pair;

MinColumn_finish:
	// Fold in the .x of the element one past the far end: c[r+1].x
	add.s32 	%next_radius, %radius, 1;
	mul.wide.s32 	%off, %next_radius, 8;
	add.s64 	%hi_ptr, %center, %off;
	ld.f32 	%edge_hi, [%hi_ptr];
	min.ftz.f32 	%wide_min, %wide_min, %edge_hi;
	st.param.f32	[func_retval0+0], %wide_min;
	st.param.f32	[func_retval0+4], %narrow_min;
	ret;
}

// cuda_kernel_chokefilter_x
//
// Horizontal windowed minimum over one alpha value per pixel. Every in-range
// thread stages its own alpha in shared memory, threads at either end of the
// block additionally stage one halo value, and after a barrier each in-range
// thread writes two floats to the output:
//   first  = min over shared slots own-(r+1) .. own+(r+1)
//   second = min over shared slots own-r     .. own+r          (r = param_6)
//
// Parameters, as used by the code below:
//   param_0  source pixel base address (converted to a global address)
//   param_1  destination base address; 8 bytes (two .f32) per element
//   param_2  multiplied by %ctaid.y to form the source element index
//   param_3  multiplied by %ctaid.y to form the destination element index
//   param_4  exclusive upper bound on x = %ntid.x * %ctaid.x + %tid.x
//   param_5  never loaded
//   param_6  radius r (signed 32-bit)
//   param_7  source layout: non-zero -> 16-byte pixels, alpha = .f32 at byte 12;
//            zero -> 8-byte pixels, alpha = f16 at byte 6
//
// Shared staging buffer: 1152 bytes = 288 .f32 slots; thread t owns slot t+16.
// NOTE(review): r and %ntid.x are not checked against the 288-slot buffer
// (lowest slot touched is t+15-r, highest is t+17+r) — confirm the host
// guarantees the limits.
// NOTE(review): the right halo slot is written only by threads with
// %tid.x >= %ntid.x-(r+1) that are also in range (x < param_4). If param_4 is
// not a multiple of %ntid.x, the last in-range threads of the final block look
// like they read halo slots no thread wrote — TODO confirm against the launch.
.visible .entry cuda_kernel_chokefilter_x(
	.param .u64 cuda_kernel_chokefilter_x_param_0,
	.param .u64 cuda_kernel_chokefilter_x_param_1,
	.param .u32 cuda_kernel_chokefilter_x_param_2,
	.param .u32 cuda_kernel_chokefilter_x_param_3,
	.param .u32 cuda_kernel_chokefilter_x_param_4,
	.param .u32 cuda_kernel_chokefilter_x_param_5,
	.param .u32 cuda_kernel_chokefilter_x_param_6,
	.param .u32 cuda_kernel_chokefilter_x_param_7
)
{
	.reg .pred 	%p<15>;
	.reg .s16 	%rs<4>;
	.reg .s32 	%r<31>;
	.reg .f32 	%f<36>;
	.reg .s64 	%rd<56>;
	// demoted variable
	.shared .align 4 .b8 cuda_kernel_chokefilter_x$__cuda_local_var_180308_3088_non_const_data[1152];

	ld.param.u64 	%rd17, [cuda_kernel_chokefilter_x_param_0];
	ld.param.u64 	%rd16, [cuda_kernel_chokefilter_x_param_1];
	ld.param.u32 	%r15, [cuda_kernel_chokefilter_x_param_2];
	ld.param.u32 	%r11, [cuda_kernel_chokefilter_x_param_3];
	ld.param.u32 	%r12, [cuda_kernel_chokefilter_x_param_4];
	// r is loaded sign-extended to 64 bits (%rd18); %r13 is its 32-bit copy.
	ld.param.s32 	%rd18, [cuda_kernel_chokefilter_x_param_6];
	cvt.u32.u64	%r13, %rd18;
	ld.param.u32 	%r14, [cuda_kernel_chokefilter_x_param_7];
	cvta.to.global.u64 	%rd1, %rd17;
	mov.u32 	%r16, %ntid.x;
	mov.u32 	%r17, %ctaid.x;
	mov.u32 	%r1, %tid.x;
	// %r2 = x = %ntid.x * %ctaid.x + %tid.x
	mad.lo.s32 	%r2, %r16, %r17, %r1;
	// %r3 = r + 1 (halo distance)
	add.s32 	%r3, %r13, 1;
	mov.u32 	%r4, %ctaid.y;
	// %r5 = source element index = %ctaid.y * param_2 + x
	mad.lo.s32 	%r5, %r4, %r15, %r2;
	add.s32 	%r18, %r1, 16;
	cvt.s64.s32	%rd19, %r1;
	cvt.s64.s32	%rd20, %r18;
	mul.wide.s32 	%rd21, %r18, 4;
	mov.u64 	%rd22, cuda_kernel_chokefilter_x$__cuda_local_var_180308_3088_non_const_data;
	// %rd2 = address of this thread's own slot (tid + 16)
	add.s64 	%rd2, %rd22, %rd21;
	add.s64 	%rd23, %rd19, %rd18;
	shl.b64 	%rd24, %rd23, 2;
	// %rd3 = address of slot (tid + r); only ever used as [%rd3+68],
	// i.e. slot tid + r + 17 = own slot + (r+1)
	add.s64 	%rd3, %rd22, %rd24;
	not.b64 	%rd25, %rd18;
	add.s64 	%rd26, %rd20, %rd25;
	shl.b64 	%rd27, %rd26, 2;
	// %rd4 = address of slot (tid + 16) + ~r = own slot - (r+1)
	add.s64 	%rd4, %rd22, %rd27;
	// Out-of-range threads stage nothing but still reach the barrier.
	setp.ge.s32	%p2, %r2, %r12;
	@%p2 bra 	BB3_16;

	// %p3 = (param_7 == 0): selects the 8-byte / f16 pixel layout.
	setp.eq.s32	%p3, %r14, 0;
	cvt.s64.s32	%rd5, %r5;
	@%p3 bra 	BB3_3;

	// 16-byte pixels: alpha is the .f32 at byte 12.
	shl.b64 	%rd28, %rd5, 4;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.f32 	%f34, [%rd29+12];
	bra.uni 	BB3_4;

BB3_3:
	// 8-byte pixels: alpha is the f16 at byte 6, widened to .f32.
	shl.b64 	%rd30, %rd5, 3;
	add.s64 	%rd31, %rd1, %rd30;
	ld.global.u16 	%rs1, [%rd31+6];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f34, %temp;
	}

BB3_4:
	// Stage own alpha, then decide whether this thread also fills a halo slot.
	st.shared.f32 	[%rd2], %f34;
	// tid < r+1 -> left-halo duty (BB3_11)
	setp.lt.s32	%p4, %r1, %r3;
	@%p4 bra 	BB3_11;

	// tid < ntid.x - (r+1) -> interior thread, nothing more to stage
	sub.s32 	%r20, %r16, %r3;
	setp.lt.s32	%p5, %r1, %r20;
	@%p5 bra 	BB3_16;

	// Right-halo duty. If x + (r+1) would reach param_4, reuse this thread's
	// own alpha; otherwise fetch the pixel (r+1) elements to the right.
	sub.s32 	%r21, %r12, %r3;
	setp.lt.s32	%p6, %r2, %r21;
	@%p6 bra 	BB3_8;

	st.shared.f32 	[%rd3+68], %f34;
	bra.uni 	BB3_16;

BB3_8:
	add.s32 	%r22, %r5, %r3;
	cvt.s64.s32	%rd6, %r22;
	@%p3 bra 	BB3_10;

	shl.b64 	%rd32, %rd6, 4;
	add.s64 	%rd33, %rd1, %rd32;
	ld.global.f32 	%f4, [%rd33+12];
	st.shared.f32 	[%rd3+68], %f4;
	bra.uni 	BB3_16;

BB3_10:
	shl.b64 	%rd34, %rd6, 3;
	add.s64 	%rd35, %rd1, %rd34;
	ld.global.u16 	%rs2, [%rd35+6];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f5, %temp;
	}
	st.shared.f32 	[%rd3+68], %f5;
	bra.uni 	BB3_16;

BB3_11:
	// Left-halo duty. If x < r+1, reuse this thread's own alpha (BB3_15);
	// otherwise fetch the pixel (r+1) elements to the left.
	setp.lt.s32	%p8, %r2, %r3;
	@%p8 bra 	BB3_15;

	sub.s32 	%r23, %r5, %r3;
	cvt.s64.s32	%rd7, %r23;
	@%p3 bra 	BB3_14;

	shl.b64 	%rd36, %rd7, 4;
	add.s64 	%rd37, %rd1, %rd36;
	ld.global.f32 	%f7, [%rd37+12];
	st.shared.f32 	[%rd4], %f7;
	bra.uni 	BB3_16;

BB3_14:
	shl.b64 	%rd38, %rd7, 3;
	add.s64 	%rd39, %rd1, %rd38;
	ld.global.u16 	%rs3, [%rd39+6];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f8, %temp;
	}
	st.shared.f32 	[%rd4], %f8;
	bra.uni 	BB3_16;

BB3_15:
	st.shared.f32 	[%rd4], %f34;

BB3_16:
	// %rd8 = destination address = param_1 + 8 * (%ctaid.y * param_3 + x)
	cvta.to.global.u64 	%rd40, %rd16;
	mad.lo.s32 	%r24, %r4, %r11, %r2;
	mul.wide.s32 	%rd41, %r24, 8;
	add.s64 	%rd8, %rd40, %rd41;
	setp.lt.s32	%p1, %r2, %r12;
	// Every thread of the block reaches this barrier; out-of-range threads
	// exit only afterwards.
	bar.sync 	0;
	@!%p1 bra 	BB3_26;
	bra.uni 	BB3_17;

BB3_17:
	// %f10 = slot own-(r+1), %f35 = slot own-r (seed of the inner minimum).
	// The branch on (unsigned) r < 4 selects between two arms, BB3_18..BB3_21
	// and BB3_22..BB3_25, that contain the same instruction sequence.
	setp.lt.u32	%p10, %r13, 4;
	ld.shared.f32 	%f10, [%rd4];
	ld.shared.f32 	%f35, [%rd4+4];
	// %r6 = 2*r = number of samples the pair loop still has to fold in
	shl.b32 	%r6, %r13, 1;
	@%p10 bra 	BB3_22;

	setp.lt.s32	%p11, %r6, 1;
	@%p11 bra 	BB3_21;

	// %rd54 starts at slot (tid - r + 18); each trip reads [%rd54-4] and
	// [%rd54], i.e. slots own-r+1 and own-r+2 on the first trip.
	neg.s32 	%r26, %r13;
	mov.u32 	%r29, 0;
	cvt.s64.s32	%rd43, %r26;
	add.s64 	%rd44, %rd19, %rd43;
	shl.b64 	%rd45, %rd44, 2;
	add.s64 	%rd47, %rd45, %rd22;
	add.s64 	%rd54, %rd47, 72;

BB3_20:
	ld.shared.f32 	%f18, [%rd54+-4];
	min.ftz.f32 	%f19, %f35, %f18;
	ld.shared.f32 	%f20, [%rd54];
	min.ftz.f32 	%f35, %f19, %f20;
	add.s64 	%rd54, %rd54, 8;
	add.s32 	%r29, %r29, 2;
	setp.lt.s32	%p12, %r29, %r6;
	@%p12 bra 	BB3_20;

BB3_21:
	// Outer minimum adds slots own-(r+1) and own+(r+1) to the inner one.
	ld.shared.f32 	%f21, [%rd3+68];
	min.ftz.f32 	%f22, %f10, %f21;
	min.ftz.f32 	%f23, %f22, %f35;
	st.global.v2.f32 	[%rd8], {%f23, %f35};
	bra.uni 	BB3_26;

BB3_22:
	// Second arm (r < 4 unsigned): same sequence as BB3_18..BB3_21.
	setp.lt.s32	%p13, %r6, 1;
	@%p13 bra 	BB3_25;

	neg.s32 	%r28, %r13;
	mov.u32 	%r30, 0;
	cvt.s64.s32	%rd49, %r28;
	add.s64 	%rd50, %rd19, %rd49;
	shl.b64 	%rd51, %rd50, 2;
	add.s64 	%rd53, %rd51, %rd22;
	add.s64 	%rd55, %rd53, 72;

BB3_24:
	ld.shared.f32 	%f24, [%rd55+-4];
	min.ftz.f32 	%f25, %f35, %f24;
	ld.shared.f32 	%f26, [%rd55];
	min.ftz.f32 	%f35, %f25, %f26;
	add.s64 	%rd55, %rd55, 8;
	add.s32 	%r30, %r30, 2;
	setp.lt.s32	%p14, %r30, %r6;
	@%p14 bra 	BB3_24;

BB3_25:
	ld.shared.f32 	%f27, [%rd3+68];
	min.ftz.f32 	%f28, %f10, %f27;
	min.ftz.f32 	%f29, %f28, %f35;
	st.global.v2.f32 	[%rd8], {%f29, %f35};

BB3_26:
	ret;
}

// cuda_kernel_chokefilter_y
//
// Vertical windowed minimum over 8-byte elements (two .f32, called .x and .y
// here), followed by a weighted blend and a store into the alpha field of the
// destination pixel. With r = param_6 and "own" = this thread's shared slot:
//   wide   = min of .x over slots own-(r+1) .. own+(r+1)
//   narrow = min of .y over slots own-r     .. own+r
//   result = narrow * (1 - param_7) + wide * param_7
// If param_8 is non-zero the result is further mapped through
//   sat(result * [inParams+140] + [inParams+144]).
//
// Parameters, as used by the code below:
//   param_0   source base address (global), 8 bytes per element
//   param_1   destination pixel base address (global)
//   param_2   source elements per row   (index = y * param_2 + x)
//   param_3   destination pixels per row (index = y * param_3 + x)
//   param_4   exclusive bound on x = %ntid.x * %ctaid.x + %tid.x
//   param_5   exclusive bound on y = %ntid.y * %ctaid.y + %tid.y
//   param_6   radius r (signed 32-bit)
//   param_7   blend weight (.f32)
//   param_8   non-zero enables the inParams scale/offset/saturate step
//   param_9   destination layout: non-zero -> 16-byte pixels, .f32 stored at
//             byte 12; zero -> 8-byte pixels, f16 stored at byte 6
//   param_10  never loaded
//
// Shared staging buffer: 3200 bytes = 400 eight-byte slots; thread (tx, ty)
// owns slot tx*25 + ty + 8.
// NOTE(review): r, %ntid.x and %ntid.y are not checked against the 400-slot
// buffer or the 25-slot column stride — confirm the host guarantees them.
// NOTE(review): the far halo slot is written only by in-range threads with
// %tid.y >= %ntid.y-(r+1). If param_5 is not a multiple of %ntid.y, the last
// in-range rows of the final block look like they read slots no thread wrote
// — TODO confirm against the launch configuration.
.visible .entry cuda_kernel_chokefilter_y(
	.param .u64 cuda_kernel_chokefilter_y_param_0,
	.param .u64 cuda_kernel_chokefilter_y_param_1,
	.param .u32 cuda_kernel_chokefilter_y_param_2,
	.param .u32 cuda_kernel_chokefilter_y_param_3,
	.param .u32 cuda_kernel_chokefilter_y_param_4,
	.param .u32 cuda_kernel_chokefilter_y_param_5,
	.param .u32 cuda_kernel_chokefilter_y_param_6,
	.param .f32 cuda_kernel_chokefilter_y_param_7,
	.param .u32 cuda_kernel_chokefilter_y_param_8,
	.param .u32 cuda_kernel_chokefilter_y_param_9,
	.param .u64 cuda_kernel_chokefilter_y_param_10
)
{
	.reg .pred 	%p<18>;
	.reg .s16 	%rs<2>;
	.reg .s32 	%r<46>;
	.reg .f32 	%f<75>;
	.reg .s64 	%rd<58>;
	// demoted variable
	.shared .align 8 .b8 cuda_kernel_chokefilter_y$__cuda_local_var_180308_4502_non_const_data[3200];

	ld.param.u64 	%rd13, [cuda_kernel_chokefilter_y_param_0];
	ld.param.u64 	%rd14, [cuda_kernel_chokefilter_y_param_1];
	ld.param.u32 	%r9, [cuda_kernel_chokefilter_y_param_2];
	ld.param.u32 	%r10, [cuda_kernel_chokefilter_y_param_3];
	ld.param.u32 	%r11, [cuda_kernel_chokefilter_y_param_4];
	ld.param.u32 	%r12, [cuda_kernel_chokefilter_y_param_5];
	// r is loaded sign-extended to 64 bits (%rd15); %r13 is its 32-bit copy.
	ld.param.s32 	%rd15, [cuda_kernel_chokefilter_y_param_6];
	cvt.u32.u64	%r13, %rd15;
	ld.param.f32 	%f24, [cuda_kernel_chokefilter_y_param_7];
	ld.param.u32 	%r14, [cuda_kernel_chokefilter_y_param_8];
	ld.param.u32 	%r15, [cuda_kernel_chokefilter_y_param_9];
	mov.u32 	%r16, %ntid.x;
	mov.u32 	%r17, %ctaid.x;
	mov.u32 	%r18, %tid.x;
	// %r1 = x
	mad.lo.s32 	%r1, %r16, %r17, %r18;
	mov.u32 	%r19, %ntid.y;
	mov.u32 	%r20, %ctaid.y;
	mov.u32 	%r21, %tid.y;
	// %r2 = y
	mad.lo.s32 	%r2, %r19, %r20, %r21;
	// %r3 = r + 1 (halo distance)
	add.s32 	%r3, %r13, 1;
	// %rd1 = source element index = y * param_2 + x
	mad.lo.s32 	%r22, %r2, %r9, %r1;
	cvt.s64.s32	%rd1, %r22;
	// %r23 = tid.x * 25 + tid.y; own slot is %r23 + 8
	mad.lo.s32 	%r23, %r18, 25, %r21;
	add.s32 	%r24, %r23, 8;
	cvt.s64.s32	%rd2, %r23;
	cvt.s64.s32	%rd16, %r24;
	setp.lt.s32	%p1, %r1, %r11;
	setp.lt.s32	%p2, %r2, %r12;
	and.pred  	%p3, %p1, %p2;
	not.b64 	%rd17, %rd15;
	add.s64 	%rd18, %rd16, %rd17;
	shl.b64 	%rd19, %rd18, 3;
	mov.u64 	%rd20, cuda_kernel_chokefilter_y$__cuda_local_var_180308_4502_non_const_data;
	// %rd3 = address of slot own + ~r = own - (r+1)
	add.s64 	%rd3, %rd20, %rd19;
	add.s64 	%rd21, %rd2, %rd15;
	shl.b64 	%rd22, %rd21, 3;
	// %rd4 = address of slot (%r23 + r); only ever used as [%rd4+72],
	// i.e. slot %r23 + r + 9 = own + (r+1)
	add.s64 	%rd4, %rd20, %rd22;
	// Out-of-range threads stage nothing but still reach the barrier.
	@!%p3 bra 	BB4_9;
	bra.uni 	BB4_1;

BB4_1:
	// Stage this thread's own element ([%rd5+64] = own slot).
	cvta.to.global.u64 	%rd23, %rd13;
	shl.b64 	%rd24, %rd1, 3;
	add.s64 	%rd25, %rd23, %rd24;
	shl.b64 	%rd26, %rd2, 3;
	add.s64 	%rd5, %rd20, %rd26;
	ld.global.v2.f32 	{%f25, %f26}, [%rd25];
	st.shared.v2.f32 	[%rd5+64], {%f25, %f26};
	// Near halo: only threads with tid.y < r+1 fill slot own-(r+1).
	setp.ge.s32	%p4, %r21, %r3;
	@%p4 bra 	BB4_5;

	// If y < r+1, reuse this thread's own element (BB4_4); otherwise fetch
	// the element (r+1) rows earlier: ~r * param_2 = -(r+1) * param_2.
	setp.lt.s32	%p5, %r2, %r3;
	@%p5 bra 	BB4_4;

	not.b32 	%r26, %r13;
	mul.lo.s32 	%r27, %r26, %r9;
	cvt.s64.s32	%rd29, %r27;
	add.s64 	%rd30, %rd1, %rd29;
	shl.b64 	%rd31, %rd30, 3;
	add.s64 	%rd32, %rd23, %rd31;
	ld.global.v2.f32 	{%f27, %f28}, [%rd32];
	st.shared.v2.f32 	[%rd3], {%f27, %f28};
	bra.uni 	BB4_5;

BB4_4:
	st.shared.v2.f32 	[%rd3], {%f25, %f26};

BB4_5:
	// Far halo: only threads with tid.y >= ntid.y - (r+1) fill slot own+(r+1).
	sub.s32 	%r29, %r19, %r3;
	setp.lt.s32	%p6, %r21, %r29;
	@%p6 bra 	BB4_9;

	// If y + (r+1) would reach param_5, copy the own slot; otherwise fetch
	// the element (r+1) rows later (BB4_8).
	sub.s32 	%r31, %r12, %r3;
	setp.lt.s32	%p7, %r2, %r31;
	@%p7 bra 	BB4_8;

	ld.shared.v2.f32 	{%f31, %f32}, [%rd5+64];
	st.shared.v2.f32 	[%rd4+72], {%f31, %f32};
	bra.uni 	BB4_9;

BB4_8:
	mul.lo.s32 	%r32, %r3, %r9;
	cvt.s64.s32	%rd34, %r32;
	add.s64 	%rd35, %rd1, %rd34;
	shl.b64 	%rd36, %rd35, 3;
	add.s64 	%rd37, %rd23, %rd36;
	ld.global.v2.f32 	{%f35, %f36}, [%rd37];
	st.shared.v2.f32 	[%rd4+72], {%f35, %f36};

BB4_9:
	// Every thread of the block reaches this barrier; out-of-range threads
	// exit only afterwards.
	bar.sync 	0;
	setp.ge.s32	%p8, %r1, %r11;
	setp.ge.s32	%p9, %r2, %r12;
	or.pred  	%p10, %p8, %p9;
	@%p10 bra 	BB4_24;

	// Seed: %f71 (wide) = min(slot[own-(r+1)].x, slot[own-r].x),
	//       %f73 (narrow) = slot[own-r].y.
	// The branch on (unsigned) r < 4 selects between two arms, BB4_11..BB4_14
	// and BB4_15..BB4_18, that contain the same instruction sequence.
	setp.lt.u32	%p11, %r13, 4;
	ld.shared.f32 	%f39, [%rd3];
	ld.shared.v2.f32 	{%f40, %f41}, [%rd3+8];
	min.ftz.f32 	%f71, %f39, %f40;
	// %r4 = 2*r = number of elements the pair loop still has to fold in
	shl.b32 	%r4, %r13, 1;
	mov.f32 	%f73, %f41;
	@%p11 bra 	BB4_15;

	setp.lt.s32	%p12, %r4, 1;
	@%p12 bra 	BB4_14;

	// %rd56 = slot address (%r23 - r) + 84 bytes; each trip reads
	// [%rd56-12] and [%rd56-4], i.e. slots own-r+1 and own-r+2 on the first trip.
	neg.s32 	%r34, %r13;
	mov.u32 	%r44, 0;
	cvt.s64.s32	%rd38, %r34;
	add.s64 	%rd40, %rd38, %rd2;
	shl.b64 	%rd41, %rd40, 3;
	add.s64 	%rd43, %rd41, %rd20;
	add.s64 	%rd56, %rd43, 84;

BB4_13:
	ld.shared.v2.f32 	{%f43, %f44}, [%rd56+-12];
	min.ftz.f32 	%f46, %f71, %f43;
	min.ftz.f32 	%f48, %f73, %f44;
	ld.shared.v2.f32 	{%f49, %f50}, [%rd56+-4];
	min.ftz.f32 	%f71, %f46, %f49;
	min.ftz.f32 	%f73, %f48, %f50;
	add.s64 	%rd56, %rd56, 16;
	add.s32 	%r44, %r44, 2;
	setp.lt.s32	%p13, %r44, %r4;
	@%p13 bra 	BB4_13;

BB4_14:
	// Wide minimum also takes slot[own+(r+1)].x.
	ld.shared.f32 	%f53, [%rd4+72];
	min.ftz.f32 	%f72, %f71, %f53;
	bra.uni 	BB4_19;

BB4_15:
	// Second arm (r < 4 unsigned): same sequence as BB4_11..BB4_14.
	setp.lt.s32	%p14, %r4, 1;
	@%p14 bra 	BB4_18;

	neg.s32 	%r39, %r13;
	mov.u32 	%r45, 0;
	cvt.s64.s32	%rd44, %r39;
	add.s64 	%rd46, %rd44, %rd2;
	shl.b64 	%rd47, %rd46, 3;
	add.s64 	%rd49, %rd47, %rd20;
	add.s64 	%rd57, %rd49, 84;

BB4_17:
	ld.shared.v2.f32 	{%f54, %f55}, [%rd57+-12];
	min.ftz.f32 	%f57, %f71, %f54;
	min.ftz.f32 	%f59, %f73, %f55;
	ld.shared.v2.f32 	{%f60, %f61}, [%rd57+-4];
	min.ftz.f32 	%f71, %f57, %f60;
	min.ftz.f32 	%f73, %f59, %f61;
	add.s64 	%rd57, %rd57, 16;
	add.s32 	%r45, %r45, 2;
	setp.lt.s32	%p15, %r45, %r4;
	@%p15 bra 	BB4_17;

BB4_18:
	ld.shared.f32 	%f64, [%rd4+72];
	min.ftz.f32 	%f72, %f71, %f64;

BB4_19:
	// %f74 = narrow * (1 - param_7) + wide * param_7   (0f3F800000 = 1.0)
	mov.f32 	%f65, 0f3F800000;
	sub.ftz.f32 	%f66, %f65, %f24;
	mul.ftz.f32 	%f67, %f72, %f24;
	fma.rn.ftz.f32 	%f74, %f73, %f66, %f67;
	// param_8 == 0 skips the constant-buffer scale/offset/saturate step.
	setp.eq.s32	%p16, %r14, 0;
	@%p16 bra 	BB4_21;

	ld.const.f32 	%f68, [inParams+140];
	ld.const.f32 	%f69, [inParams+144];
	fma.rn.ftz.f32 	%f70, %f74, %f68, %f69;
	cvt.ftz.sat.f32.f32	%f74, %f70;

BB4_21:
	// %rd12 = destination pixel index = y * param_3 + x
	mad.lo.s32 	%r43, %r2, %r10, %r1;
	cvt.s64.s32	%rd12, %r43;
	setp.eq.s32	%p17, %r15, 0;
	@%p17 bra 	BB4_23;

	// 16-byte destination pixels: store the .f32 at byte 12.
	cvta.to.global.u64 	%rd50, %rd14;
	shl.b64 	%rd51, %rd12, 4;
	add.s64 	%rd52, %rd50, %rd51;
	st.global.f32 	[%rd52+12], %f74;
	bra.uni 	BB4_24;

BB4_23:
	// 8-byte destination pixels: round to f16 and store at byte 6.
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f74;
	mov.b16 	%rs1, %temp;
}
	cvta.to.global.u64 	%rd53, %rd14;
	shl.b64 	%rd54, %rd12, 3;
	add.s64 	%rd55, %rd53, %rd54;
	st.global.u16 	[%rd55+6], %rs1;

BB4_24:
	ret;
}

// BoxFilterKernel
//
// Box average of the alpha channel with a fractional radius, computed with
// running column sums. With R = param_5 (float), k = trunc(R), N = 2k+3:
//   * smem rows 0 .. N-1 form a ring of alpha rows, 160 floats (640 bytes) per
//     row; thread t owns column t, which holds image column
//     clamp(%ctaid.x*128 + t - 16, 0, param_3-1).
//   * smem row N   holds, per column, the sum of the middle 2k+1 ring rows.
//   * smem row N+1 holds, per column, the sum of all 2k+3 ring rows.
// Each CTA walks up to 100 output rows starting at row %ctaid.y*100. For each
// row, threads with %tid.x < 128 and x < param_3 sum 2k+1 columns of row N and
// 2k+3 columns of row N+1, divide by (2k+1)^2 and (2k+3)^2, and blend the two
// averages with the fractional part of R. The ring and both sums are then
// advanced by one image row.
//
// Parameters, as used by the code below:
//   param_0  pixel buffer (global); alpha is read from it AND written to it
//   param_1  pixels per row (index = row * param_1 + column)
//   param_2  layout: non-zero -> 16-byte pixels, alpha = .f32 at byte 12;
//            zero -> 8-byte pixels, alpha = f16 at byte 6
//   param_3  width  (column clamp limit and x bound for output threads)
//   param_4  height (row clamp limit and row loop bound)
//   param_5  radius R (.f32)
//   param_6  non-zero enables sat(v * [inParams+140] + [inParams+144])
//   param_7, param_8  never loaded
//
// NOTE(review): smem is .extern; this kernel touches rows 0 .. N+1 of 160
// floats — presumably the host sizes dynamic shared memory as (N+2)*640
// bytes and launches 160 threads in x; confirm against the launch code.
// NOTE(review): input rows and output alpha share param_0. A CTA reads rows
// k+1 above and below its 100-row band, which neighbouring CTAs write —
// confirm that this in-place update is intended and free of ordering hazards.
.visible .entry BoxFilterKernel(
	.param .u64 BoxFilterKernel_param_0,
	.param .u32 BoxFilterKernel_param_1,
	.param .u32 BoxFilterKernel_param_2,
	.param .u32 BoxFilterKernel_param_3,
	.param .u32 BoxFilterKernel_param_4,
	.param .f32 BoxFilterKernel_param_5,
	.param .u32 BoxFilterKernel_param_6,
	.param .u64 BoxFilterKernel_param_7,
	.param .u64 BoxFilterKernel_param_8
)
{
	.reg .pred 	%p<25>;
	.reg .s16 	%rs<18>;
	.reg .s32 	%r<111>;
	.reg .f32 	%f<95>;
	.reg .s64 	%rd<93>;


	ld.param.u64 	%rd17, [BoxFilterKernel_param_0];
	ld.param.u32 	%r31, [BoxFilterKernel_param_1];
	ld.param.u32 	%r32, [BoxFilterKernel_param_2];
	ld.param.u32 	%r33, [BoxFilterKernel_param_3];
	ld.param.u32 	%r34, [BoxFilterKernel_param_4];
	ld.param.f32 	%f40, [BoxFilterKernel_param_5];
	ld.param.u32 	%r35, [BoxFilterKernel_param_6];
	mov.u32 	%r36, %ctaid.x;
	shl.b32 	%r37, %r36, 7;
	mov.u32 	%r1, %tid.x;
	// %r38 = x = %ctaid.x * 128 + %tid.x
	add.s32 	%r38, %r37, %r1;
	mov.u32 	%r39, %ctaid.y;
	// %r2 = k = R truncated toward zero; %r3 = 2k; %r4 = N = 2k + 3
	cvt.rzi.ftz.s32.f32	%r2, %f40;
	shl.b32 	%r3, %r2, 1;
	add.s32 	%r4, %r3, 3;
	not.b32 	%r40, %r2;
	// %r108 = next image row to load = %ctaid.y*100 + ~k = %ctaid.y*100 - k - 1
	mad.lo.s32 	%r108, %r39, 100, %r40;
	add.s32 	%r41, %r38, -16;
	mov.u32 	%r107, 0;
	max.s32 	%r43, %r41, %r107;
	add.s32 	%r44, %r33, -1;
	// %r6 = source column for this thread = clamp(x - 16, 0, width - 1)
	min.s32 	%r6, %r43, %r44;
	setp.lt.s32	%p3, %r4, 1;
	@%p3 bra 	BB5_6;

	// ---- Fill ring rows 0 .. N-1 with alpha from N consecutive image rows.
	mul.wide.s32 	%rd18, %r1, 4;
	mov.u64 	%rd19, smem;
	add.s64 	%rd88, %rd19, %rd18;
	mov.u32 	%r45, 0;
	mov.u32 	%r103, %r45;

BB5_2:
	mov.u32 	%r7, %r103;
	// Row index is clamped to [0, height-1] before addressing.
	add.s32 	%r46, %r34, -1;
	max.s32 	%r48, %r108, %r45;
	min.s32 	%r49, %r48, %r46;
	mad.lo.s32 	%r50, %r49, %r31, %r6;
	cvt.s64.s32	%rd3, %r50;
	setp.eq.s32	%p4, %r32, 0;
	@%p4 bra 	BB5_4;

	// 16-byte pixels: whole pixel is loaded, only the 4th float is kept in %f89.
	cvta.to.global.u64 	%rd20, %rd17;
	shl.b64 	%rd21, %rd3, 4;
	add.s64 	%rd22, %rd20, %rd21;
	ld.global.v4.f32 	{%f41, %f42, %f43, %f44}, [%rd22];
	mov.f32 	%f89, %f44;
	mov.f32 	%f3, %f43;
	mov.f32 	%f2, %f42;
	mov.f32 	%f1, %f41;
	bra.uni 	BB5_5;

BB5_4:
	// 8-byte pixels: only the 4th 16-bit lane is widened (f16 -> f32).
	cvta.to.global.u64 	%rd23, %rd17;
	shl.b64 	%rd24, %rd3, 3;
	add.s64 	%rd25, %rd23, %rd24;
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd25];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f89, %temp;
	}

BB5_5:
	st.shared.f32 	[%rd88], %f89;
	add.s32 	%r108, %r108, 1;
	// advance to the same column of the next ring row (640 bytes = 160 floats)
	add.s64 	%rd88, %rd88, 640;
	add.s32 	%r10, %r7, 1;
	setp.lt.s32	%p5, %r10, %r4;
	mov.u32 	%r103, %r10;
	@%p5 bra 	BB5_2;

BB5_6:
	// ---- Initial column sums. %f90 accumulates ring rows 1 .. 2k+1 of this
	// thread's column (loop counter %r105 runs from 1 while < 2k+2).
	add.s32 	%r104, %r1, 160;
	add.s32 	%r13, %r3, 2;
	setp.gt.s32	%p6, %r13, 1;
	mul.wide.s32 	%rd26, %r104, 4;
	mov.u64 	%rd27, smem;
	add.s64 	%rd91, %rd27, %rd26;
	@%p6 bra 	BB5_8;

	mov.f32 	%f90, 0f00000000;
	bra.uni 	BB5_10;

BB5_8:
	mul.wide.s32 	%rd28, %r1, 4;
	add.s64 	%rd30, %rd28, %rd27;
	add.s64 	%rd89, %rd30, 640;
	mov.f32 	%f90, 0f00000000;
	mov.u32 	%r105, 1;
	mov.u64 	%rd92, %rd91;

BB5_9:
	// %rd92 (recomputed from %r104) and %rd91 (stepped by 640) address the
	// same element after every trip; %rd91 is what survives the loop.
	ld.shared.f32 	%f47, [%rd92];
	add.ftz.f32 	%f90, %f90, %f47;
	add.s32 	%r104, %r104, 160;
	mul.wide.s32 	%rd31, %r104, 4;
	add.s64 	%rd92, %rd27, %rd31;
	add.s64 	%rd91, %rd89, 640;
	add.s32 	%r105, %r105, 1;
	setp.lt.s32	%p7, %r105, %r13;
	mov.u64 	%rd89, %rd91;
	@%p7 bra 	BB5_9;

BB5_10:
	// %r55 = 2k*160 + 480 = N*160, so %rd38 = &smem[row N][tid].
	cvt.s64.s32	%rd33, %r1;
	mad.lo.s32 	%r55, %r3, 160, 480;
	cvt.s64.s32	%rd34, %r55;
	add.s64 	%rd35, %rd34, %rd33;
	shl.b64 	%rd36, %rd35, 2;
	add.s64 	%rd38, %rd27, %rd36;
	// row N   <- sum of the middle rows
	st.shared.f32 	[%rd38], %f90;
	// row N+1 <- that sum + the row at %rd91 + ring row 0
	ld.shared.f32 	%f48, [%rd91];
	add.ftz.f32 	%f49, %f90, %f48;
	mul.wide.s32 	%rd39, %r1, 4;
	add.s64 	%rd40, %rd27, %rd39;
	ld.shared.f32 	%f50, [%rd40];
	add.ftz.f32 	%f51, %f49, %f50;
	st.shared.f32 	[%rd38+640], %f51;
	bar.sync 	0;
	// %r109 = current output row, starting at %ctaid.y * 100
	mul.lo.s32 	%r109, %r39, 100;
	setp.ge.s32	%p8, %r109, %r34;
	@%p8 bra 	BB5_26;

	// ---- Loop invariants.
	// %f16 = (2k+1)^2 and %f17 = (2k+3)^2: the two box areas used as divisors.
	cvt.rn.f32.s32	%f52, %r3;
	add.ftz.f32 	%f53, %f52, 0f3F800000;
	add.ftz.f32 	%f54, %f53, 0f40000000;
	mul.ftz.f32 	%f16, %f53, %f53;
	mul.ftz.f32 	%f17, %f54, %f54;
	// %r18 .. %r19 = first .. last column of the inner window:
	// (tid + 16 - k) .. (tid + 16 + k)
	add.s32 	%r61, %r1, 16;
	sub.s32 	%r18, %r61, %r2;
	add.s32 	%r19, %r18, %r3;
	// %rd13 = &smem[row N+1][%r18 - 1], %rd14 = &smem[row N+1][%r19 + 1]:
	// the two extra columns of the wider window.
	add.s32 	%r62, %r18, -1;
	cvt.s64.s32	%rd41, %r62;
	add.s64 	%rd43, %rd41, %rd34;
	shl.b64 	%rd44, %rd43, 2;
	add.s64 	%rd46, %rd44, %rd27;
	add.s64 	%rd13, %rd46, 640;
	add.s32 	%r64, %r19, 1;
	cvt.s64.s32	%rd47, %r64;
	add.s64 	%rd48, %rd47, %rd34;
	shl.b64 	%rd49, %rd48, 2;
	add.s64 	%rd50, %rd49, %rd27;
	add.s64 	%rd14, %rd50, 640;
	// %f18 = R - k (fractional part of the radius) = blend weight
	cvt.rn.f32.s32	%f55, %r2;
	sub.ftz.f32 	%f18, %f40, %f55;
	// %r106 = rows processed by this CTA so far (%r107 is still 0 here)
	mov.u32 	%r106, %r107;

BB5_12:
	// %r22 = ring slot holding the oldest row (ring head).
	mov.u32 	%r22, %r107;
	// Only threads with tid < 128 and x < width produce output; all threads
	// keep taking part in the ring update below.
	setp.lt.s32	%p9, %r1, 128;
	setp.lt.s32	%p10, %r38, %r33;
	and.pred  	%p11, %p9, %p10;
	// %p24 = "pixels are 8 bytes"; it selects the reload path after BB5_22.
	setp.eq.s32	%p24, %r32, 0;
	@!%p11 bra 	BB5_22;
	bra.uni 	BB5_13;

BB5_13:
	mul.lo.s32 	%r70, %r4, 160;
	cvt.s64.s32	%rd15, %r70;
	mov.f32 	%f92, 0f00000000;
	// With 2k < 0 the horizontal loop is skipped and both sums stay 0.
	setp.gt.s32	%p12, %r3, -1;
	@%p12 bra 	BB5_15;

	mov.f32 	%f91, %f92;
	bra.uni 	BB5_17;

BB5_15:
	mov.f32 	%f91, %f92;
	mov.u32 	%r110, %r18;

BB5_16:
	// Horizontal pass over columns %r18 .. %r19:
	//   %f92 += row N   [col]   (narrow box)
	//   %f91 += row N+1 [col]   (wide box, minus its two outer columns)
	mov.u32 	%r25, %r110;
	cvt.s64.s32	%rd51, %r25;
	add.s64 	%rd52, %rd51, %rd15;
	shl.b64 	%rd53, %rd52, 2;
	add.s64 	%rd55, %rd27, %rd53;
	ld.shared.f32 	%f58, [%rd55];
	add.ftz.f32 	%f92, %f92, %f58;
	ld.shared.f32 	%f59, [%rd55+640];
	add.ftz.f32 	%f91, %f91, %f59;
	add.s32 	%r26, %r25, 1;
	setp.le.s32	%p13, %r26, %r19;
	mov.u32 	%r110, %r26;
	@%p13 bra 	BB5_16;

BB5_17:
	// Add the two outer columns to the wide sum, normalise both boxes, then
	// %f93 = frac * wide_avg + (1 - frac) * narrow_avg.
	ld.shared.f32 	%f62, [%rd14];
	ld.shared.f32 	%f63, [%rd13];
	add.ftz.f32 	%f64, %f63, %f62;
	add.ftz.f32 	%f65, %f91, %f64;
	div.approx.ftz.f32 	%f66, %f65, %f17;
	mov.f32 	%f67, 0f3F800000;
	sub.ftz.f32 	%f68, %f67, %f18;
	div.approx.ftz.f32 	%f69, %f92, %f16;
	mul.ftz.f32 	%f70, %f68, %f69;
	fma.rn.ftz.f32 	%f93, %f18, %f66, %f70;
	// param_6 == 0 skips the constant-buffer scale/offset/saturate step.
	setp.eq.s32	%p14, %r35, 0;
	@%p14 bra 	BB5_19;

	ld.const.f32 	%f71, [inParams+140];
	ld.const.f32 	%f72, [inParams+144];
	fma.rn.ftz.f32 	%f73, %f93, %f71, %f72;
	cvt.ftz.sat.f32.f32	%f93, %f73;

BB5_19:
	setp.eq.s32	%p15, %r32, 0;
	@%p15 bra 	BB5_21;

	// 16-byte pixels: overwrite the .f32 at byte 12 of pixel (row %r109, x).
	cvta.to.global.u64 	%rd56, %rd17;
	mad.lo.s32 	%r75, %r109, %r31, %r38;
	mul.wide.s32 	%rd57, %r75, 16;
	add.s64 	%rd58, %rd56, %rd57;
	st.global.f32 	[%rd58+12], %f93;
	mov.pred 	%p24, 0;
	bra.uni 	BB5_22;

BB5_21:
	// 8-byte pixels: round to f16 and overwrite the 16 bits at byte 6.
	cvta.to.global.u64 	%rd59, %rd17;
	mad.lo.s32 	%r80, %r109, %r31, %r38;
	mul.wide.s32 	%rd60, %r80, 8;
	add.s64 	%rd61, %rd59, %rd60;
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f93;
	mov.b16 	%rs9, %temp;
}
	st.global.u16 	[%rd61+6], %rs9;
	mov.pred 	%p24, -1;

BB5_22:
	// ---- Advance the ring by one image row (all threads).
	// The barrier separates the horizontal reads above from the updates of
	// rows N / N+1 and of the ring slot below.
	bar.sync 	0;
	// %r107 = (head + 1) wrapped into [0, N): the new head.
	add.s32 	%r81, %r22, 1;
	setp.lt.s32	%p18, %r81, %r4;
	selp.b32	%r82, 0, %r4, %p18;
	mov.u32 	%r83, 0;
	sub.s32 	%r107, %r81, %r82;
	mad.lo.s32 	%r86, %r107, 160, %r1;
	mul.wide.s32 	%rd62, %r86, 4;
	add.s64 	%rd64, %rd27, %rd62;
	mul.lo.s32 	%r87, %r4, 160;
	cvt.s64.s32	%rd65, %r87;
	add.s64 	%rd67, %rd65, %rd33;
	shl.b64 	%rd68, %rd67, 2;
	// %rd69 = &smem[row N][tid]; [%rd69+640] = &smem[row N+1][tid]
	add.s64 	%rd69, %rd27, %rd68;
	// narrow sum -= row at the new head (oldest row still inside it)
	ld.shared.f32 	%f74, [%rd69];
	ld.shared.f32 	%f75, [%rd64];
	sub.ftz.f32 	%f76, %f74, %f75;
	st.shared.f32 	[%rd69], %f76;
	// wide sum -= row at the old head (the row about to be replaced)
	mad.lo.s32 	%r88, %r22, 160, %r1;
	mul.wide.s32 	%rd70, %r88, 4;
	add.s64 	%rd71, %rd27, %rd70;
	ld.shared.f32 	%f77, [%rd69+640];
	ld.shared.f32 	%f78, [%rd71];
	sub.ftz.f32 	%f79, %f77, %f78;
	st.shared.f32 	[%rd69+640], %f79;
	// Fetch alpha of image row clamp(%r108, 0, height-1), column %r6.
	add.s32 	%r89, %r34, -1;
	max.s32 	%r90, %r108, %r83;
	min.s32 	%r91, %r90, %r89;
	mad.lo.s32 	%r92, %r91, %r31, %r6;
	cvt.s64.s32	%rd16, %r92;
	@%p24 bra 	BB5_24;

	cvta.to.global.u64 	%rd72, %rd17;
	shl.b64 	%rd73, %rd16, 4;
	add.s64 	%rd74, %rd72, %rd73;
	ld.global.v4.f32 	{%f80, %f81, %f82, %f83}, [%rd74];
	mov.f32 	%f94, %f83;
	mov.f32 	%f30, %f82;
	mov.f32 	%f29, %f81;
	mov.f32 	%f28, %f80;
	bra.uni 	BB5_25;

BB5_24:
	cvta.to.global.u64 	%rd75, %rd17;
	shl.b64 	%rd76, %rd16, 3;
	add.s64 	%rd77, %rd75, %rd76;
	ld.global.v4.u16 	{%rs10, %rs11, %rs12, %rs13}, [%rd77];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs13;
	cvt.f32.f16 	%f94, %temp;
	}

BB5_25:
	// New row overwrites the old head slot and joins the wide sum.
	mul.wide.s32 	%rd78, %r88, 4;
	add.s64 	%rd80, %rd27, %rd78;
	st.shared.f32 	[%rd80], %f94;
	add.s32 	%r108, %r108, 1;
	ld.shared.f32 	%f84, [%rd69+640];
	add.ftz.f32 	%f85, %f84, %f94;
	st.shared.f32 	[%rd69+640], %f85;
	// narrow sum += row in slot (old head + 2k + 2) wrapped into [0, N),
	// i.e. the slot just before the old head.
	add.s32 	%r96, %r3, %r22;
	add.s32 	%r97, %r96, 2;
	setp.lt.s32	%p19, %r97, %r4;
	selp.b32	%r98, 0, %r4, %p19;
	sub.s32 	%r99, %r97, %r98;
	mad.lo.s32 	%r100, %r99, 160, %r1;
	mul.wide.s32 	%rd86, %r100, 4;
	add.s64 	%rd87, %rd27, %rd86;
	ld.shared.f32 	%f86, [%rd69];
	ld.shared.f32 	%f87, [%rd87];
	add.ftz.f32 	%f88, %f86, %f87;
	st.shared.f32 	[%rd69], %f88;
	// Publish the updated sums before the next row's horizontal pass.
	bar.sync 	0;
	// Continue while fewer than 100 rows were done and the row is < height.
	add.s32 	%r106, %r106, 1;
	setp.lt.s32	%p20, %r106, 100;
	add.s32 	%r109, %r109, 1;
	setp.lt.s32	%p21, %r109, %r34;
	and.pred  	%p22, %p20, %p21;
	@%p22 bra 	BB5_12;

BB5_26:
	ret;
}

.visible .entry cuda_kernel_composite(
	.param .u64 cuda_kernel_composite_param_0,
	.param .u64 cuda_kernel_composite_param_1,
	.param .u32 cuda_kernel_composite_param_2,
	.param .u32 cuda_kernel_composite_param_3,
	.param .u32 cuda_kernel_composite_param_4,
	.param .u32 cuda_kernel_composite_param_5,
	.param .u32 cuda_kernel_composite_param_6,
	.param .u32 cuda_kernel_composite_param_7,
	.param .u64 cuda_kernel_composite_param_8
)
{
	.reg .pred 	%p<21>;
	.reg .s16 	%rs<13>;
	.reg .s32 	%r<17>;
	.reg .f32 	%f<247>;
	.reg .s64 	%rd<16>;


	ld.param.u64 	%rd5, [cuda_kernel_composite_param_0];
	ld.param.u64 	%rd4, [cuda_kernel_composite_param_1];
	ld.param.u32 	%r7, [cuda_kernel_composite_param_2];
	ld.param.u32 	%r8, [cuda_kernel_composite_param_3];
	ld.param.u32 	%r3, [cuda_kernel_composite_param_4];
	ld.param.u32 	%r4, [cuda_kernel_composite_param_5];
	ld.param.u32 	%r5, [cuda_kernel_composite_param_6];
	ld.param.u32 	%r6, [cuda_kernel_composite_param_7];
	cvta.to.global.u64 	%rd1, %rd5;
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %ctaid.x;
	mov.u32 	%r11, %tid.x;
	mad.lo.s32 	%r1, %r9, %r10, %r11;
	mov.u32 	%r12, %ntid.y;
	mov.u32 	%r13, %ctaid.y;
	mov.u32 	%r14, %tid.y;
	mad.lo.s32 	%r2, %r12, %r13, %r14;
	setp.ge.s32	%p1, %r2, %r8;
	setp.ge.s32	%p2, %r1, %r7;
	or.pred  	%p3, %p2, %p1;
	@%p3 bra 	BB6_39;

	mad.lo.s32 	%r15, %r2, %r3, %r1;
	cvt.s64.s32	%rd2, %r15;
	setp.eq.s32	%p4, %r6, 0;
	@%p4 bra 	BB6_3;

	shl.b64 	%rd6, %rd2, 4;
	add.s64 	%rd7, %rd1, %rd6;
	ld.global.v4.f32 	{%f66, %f67, %f68, %f69}, [%rd7];
	mov.f32 	%f239, %f69;
	mov.f32 	%f238, %f68;
	mov.f32 	%f237, %f67;
	mov.f32 	%f236, %f66;
	bra.uni 	BB6_4;

BB6_3:
	shl.b64 	%rd8, %rd2, 3;
	add.s64 	%rd9, %rd1, %rd8;
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd9];
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f236, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f237, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f238, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f239, %temp;
	}

BB6_4:
	ld.const.f32 	%f16, [inParams+148];
	ld.const.f32 	%f17, [inParams+152];
	ld.const.f32 	%f18, [inParams+156];
	ld.const.f32 	%f19, [inParams+160];
	cvt.ftz.sat.f32.f32	%f20, %f236;
	setp.gt.ftz.f32	%p5, %f20, 0f3D25AEE6;
	@%p5 bra 	BB6_6;

	mov.f32 	%f70, 0f414EB852;
	div.approx.ftz.f32 	%f240, %f20, %f70;
	bra.uni 	BB6_9;

BB6_6:
	add.ftz.f32 	%f71, %f20, 0f3D6147AE;
	mov.f32 	%f72, 0f3F870A3D;
	div.approx.ftz.f32 	%f22, %f71, %f72;
	setp.ltu.ftz.f32	%p6, %f22, 0f00000000;
	@%p6 bra 	BB6_8;

	lg2.approx.ftz.f32 	%f73, %f22;
	mul.ftz.f32 	%f74, %f73, 0f4019999A;
	ex2.approx.ftz.f32 	%f240, %f74;
	bra.uni 	BB6_9;

BB6_8:
	neg.ftz.f32 	%f75, %f22;
	lg2.approx.ftz.f32 	%f76, %f75;
	mul.ftz.f32 	%f77, %f76, 0f4019999A;
	ex2.approx.ftz.f32 	%f78, %f77;
	neg.ftz.f32 	%f240, %f78;

BB6_9:
	cvt.ftz.sat.f32.f32	%f26, %f237;
	setp.gt.ftz.f32	%p7, %f26, 0f3D25AEE6;
	@%p7 bra 	BB6_11;

	mov.f32 	%f79, 0f414EB852;
	div.approx.ftz.f32 	%f241, %f26, %f79;
	bra.uni 	BB6_14;

BB6_11:
	add.ftz.f32 	%f80, %f26, 0f3D6147AE;
	mov.f32 	%f81, 0f3F870A3D;
	div.approx.ftz.f32 	%f28, %f80, %f81;
	setp.ltu.ftz.f32	%p8, %f28, 0f00000000;
	@%p8 bra 	BB6_13;

	lg2.approx.ftz.f32 	%f82, %f28;
	mul.ftz.f32 	%f83, %f82, 0f4019999A;
	ex2.approx.ftz.f32 	%f241, %f83;
	bra.uni 	BB6_14;

BB6_13:
	neg.ftz.f32 	%f84, %f28;
	lg2.approx.ftz.f32 	%f85, %f84;
	mul.ftz.f32 	%f86, %f85, 0f4019999A;
	ex2.approx.ftz.f32 	%f87, %f86;
	neg.ftz.f32 	%f241, %f87;

BB6_14:
	cvt.ftz.sat.f32.f32	%f32, %f238;
	setp.gt.ftz.f32	%p9, %f32, 0f3D25AEE6;
	@%p9 bra 	BB6_16;

	mov.f32 	%f88, 0f414EB852;
	div.approx.ftz.f32 	%f242, %f32, %f88;
	bra.uni 	BB6_19;

BB6_16:
	add.ftz.f32 	%f89, %f32, 0f3D6147AE;
	mov.f32 	%f90, 0f3F870A3D;
	div.approx.ftz.f32 	%f34, %f89, %f90;
	setp.ltu.ftz.f32	%p10, %f34, 0f00000000;
	@%p10 bra 	BB6_18;

	lg2.approx.ftz.f32 	%f91, %f34;
	mul.ftz.f32 	%f92, %f91, 0f4019999A;
	ex2.approx.ftz.f32 	%f242, %f92;
	bra.uni 	BB6_19;

BB6_18:
	neg.ftz.f32 	%f93, %f34;
	lg2.approx.ftz.f32 	%f94, %f93;
	mul.ftz.f32 	%f95, %f94, 0f4019999A;
	ex2.approx.ftz.f32 	%f96, %f95;
	neg.ftz.f32 	%f242, %f96;

BB6_19:
	ld.const.f32 	%f97, [inParams+40];
	ld.const.f32 	%f98, [inParams+44];
	mul.ftz.f32 	%f99, %f241, %f98;
	fma.rn.ftz.f32 	%f100, %f242, %f97, %f99;
	ld.const.f32 	%f101, [inParams+48];
	fma.rn.ftz.f32 	%f102, %f240, %f101, %f100;
	ld.const.f32 	%f103, [inParams+52];
	ld.const.f32 	%f104, [inParams+56];
	mul.ftz.f32 	%f105, %f241, %f104;
	fma.rn.ftz.f32 	%f106, %f242, %f103, %f105;
	ld.const.f32 	%f107, [inParams+60];
	fma.rn.ftz.f32 	%f108, %f240, %f107, %f106;
	ld.const.f32 	%f109, [inParams+64];
	ld.const.f32 	%f110, [inParams+68];
	mul.ftz.f32 	%f111, %f241, %f110;
	fma.rn.ftz.f32 	%f112, %f242, %f109, %f111;
	ld.const.f32 	%f113, [inParams+72];
	fma.rn.ftz.f32 	%f114, %f240, %f113, %f112;
	mov.f32 	%f115, 0f02081CEA;
	max.ftz.f32 	%f116, %f114, %f115;
	div.approx.ftz.f32 	%f117, %f102, %f116;
	div.approx.ftz.f32 	%f118, %f108, %f116;
	sub.ftz.f32 	%f119, %f117, %f16;
	sub.ftz.f32 	%f120, %f118, %f17;
	sub.ftz.f32 	%f121, %f116, %f18;
	mul.ftz.f32 	%f122, %f120, %f120;
	fma.rn.ftz.f32 	%f123, %f119, %f119, %f122;
	max.ftz.f32 	%f124, %f123, %f115;
	rsqrt.approx.ftz.f32 	%f125, %f124;
	mul.ftz.f32 	%f126, %f120, %f17;
	fma.rn.ftz.f32 	%f127, %f119, %f16, %f126;
	mul.ftz.f32 	%f128, %f19, %f125;
	mul.ftz.f32 	%f129, %f127, %f128;
	setp.gt.ftz.f32	%p11, %f129, 0f00000000;
	ld.const.f32 	%f130, [inParams+8];
	mul.ftz.f32 	%f131, %f129, %f130;
	add.ftz.f32 	%f132, %f129, 0f3F800000;
	mov.f32 	%f133, 0f3F800000;
	mul.ftz.f32 	%f134, %f132, %f130;
	selp.f32	%f135, %f131, 0f00000000, %p11;
	selp.f32	%f136, %f130, %f134, %p11;
	sub.ftz.f32 	%f137, %f133, %f135;
	sub.ftz.f32 	%f138, %f133, %f136;
	div.approx.ftz.f32 	%f139, %f133, %f125;
	mul.ftz.f32 	%f140, %f139, %f19;
	mul.ftz.f32 	%f141, %f137, %f140;
	mul.ftz.f32 	%f142, %f138, %f140;
	ld.const.f32 	%f143, [inParams];
	mul.ftz.f32 	%f144, %f142, %f143;
	cvt.ftz.sat.f32.f32	%f145, %f144;
	max.ftz.f32 	%f146, %f145, %f115;
	div.approx.ftz.f32 	%f147, %f133, %f146;
	add.ftz.f32 	%f148, %f147, 0fBF800000;
	ld.const.f32 	%f149, [inParams+4];
	fma.rn.ftz.f32 	%f150, %f149, %f148, %f148;
	fma.rn.ftz.f32 	%f151, %f119, %f150, %f117;
	fma.rn.ftz.f32 	%f152, %f120, %f150, %f118;
	ld.const.f32 	%f153, [inParams+12];
	mul.ftz.f32 	%f154, %f142, %f153;
	cvt.ftz.sat.f32.f32	%f155, %f154;
	mul.ftz.f32 	%f156, %f151, %f155;
	mul.ftz.f32 	%f157, %f152, %f155;
	ld.const.f32 	%f158, [inParams+16];
	mul.ftz.f32 	%f159, %f141, %f158;
	cvt.ftz.sat.f32.f32	%f160, %f159;
	div.approx.ftz.f32 	%f161, %f121, %f18;
	setp.lt.ftz.f32	%p12, %f161, 0f00000000;
	ld.const.f32 	%f162, [inParams+24];
	ld.const.f32 	%f163, [inParams+20];
	selp.f32	%f164, %f163, %f162, %p12;
	mul.ftz.f32 	%f165, %f161, %f164;
	cvt.ftz.sat.f32.f32	%f166, %f165;
	add.ftz.f32 	%f167, %f160, %f166;
	mul.ftz.f32 	%f168, %f160, %f166;
	sub.ftz.f32 	%f169, %f167, %f168;
	max.ftz.f32 	%f170, %f169, %f115;
	div.approx.ftz.f32 	%f171, %f133, %f170;
	add.ftz.f32 	%f172, %f171, 0fBF800000;
	ld.const.f32 	%f173, [inParams+28];
	mul.ftz.f32 	%f174, %f172, %f173;
	fma.rn.ftz.f32 	%f38, %f121, %f174, %f116;
	ld.const.f32 	%f175, [inParams+32];
	mul.ftz.f32 	%f176, %f169, %f175;
	ld.const.f32 	%f177, [inParams+36];
	sub.ftz.f32 	%f178, %f176, %f177;
	cvt.ftz.sat.f32.f32	%f39, %f178;
	mul.ftz.f32 	%f40, %f156, %f38;
	mul.ftz.f32 	%f41, %f157, %f38;
	ld.const.f32 	%f179, [inParams+100];
	ld.const.f32 	%f180, [inParams+104];
	mul.ftz.f32 	%f181, %f41, %f180;
	fma.rn.ftz.f32 	%f182, %f40, %f179, %f181;
	ld.const.f32 	%f183, [inParams+108];
	fma.rn.ftz.f32 	%f184, %f38, %f183, %f182;
	ld.const.f32 	%f185, [inParams+112];
	add.ftz.f32 	%f186, %f184, %f185;
	cvt.ftz.sat.f32.f32	%f42, %f186;
	setp.lt.ftz.f32	%p13, %f42, 0f3B4D2E1C;
	@%p13 bra 	BB6_23;

	setp.ltu.ftz.f32	%p14, %f42, 0f00000000;
	@%p14 bra 	BB6_22;

	lg2.approx.ftz.f32 	%f187, %f42;
	mul.ftz.f32 	%f188, %f187, 0f3ED55476;
	ex2.approx.ftz.f32 	%f43, %f188;
	fma.rn.ftz.f32 	%f243, %f43, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_24;

BB6_22:
	neg.ftz.f32 	%f189, %f42;
	lg2.approx.ftz.f32 	%f190, %f189;
	mul.ftz.f32 	%f191, %f190, 0f3ED55476;
	ex2.approx.ftz.f32 	%f192, %f191;
	neg.ftz.f32 	%f44, %f192;
	fma.rn.ftz.f32 	%f243, %f44, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_24;

BB6_23:
	mul.ftz.f32 	%f243, %f42, 0f414EB852;

BB6_24:
	ld.const.f32 	%f193, [inParams+88];
	ld.const.f32 	%f194, [inParams+92];
	mul.ftz.f32 	%f195, %f41, %f194;
	fma.rn.ftz.f32 	%f196, %f40, %f193, %f195;
	ld.const.f32 	%f197, [inParams+96];
	fma.rn.ftz.f32 	%f198, %f38, %f197, %f196;
	ld.const.f32 	%f199, [inParams+116];
	add.ftz.f32 	%f200, %f198, %f199;
	cvt.ftz.sat.f32.f32	%f49, %f200;
	setp.lt.ftz.f32	%p15, %f49, 0f3B4D2E1C;
	@%p15 bra 	BB6_28;

	setp.ltu.ftz.f32	%p16, %f49, 0f00000000;
	@%p16 bra 	BB6_27;

	lg2.approx.ftz.f32 	%f201, %f49;
	mul.ftz.f32 	%f202, %f201, 0f3ED55476;
	ex2.approx.ftz.f32 	%f50, %f202;
	fma.rn.ftz.f32 	%f244, %f50, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_29;

BB6_27:
	neg.ftz.f32 	%f203, %f49;
	lg2.approx.ftz.f32 	%f204, %f203;
	mul.ftz.f32 	%f205, %f204, 0f3ED55476;
	ex2.approx.ftz.f32 	%f206, %f205;
	neg.ftz.f32 	%f51, %f206;
	fma.rn.ftz.f32 	%f244, %f51, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_29;

BB6_28:
	mul.ftz.f32 	%f244, %f49, 0f414EB852;

BB6_29:
	ld.const.f32 	%f207, [inParams+76];
	ld.const.f32 	%f208, [inParams+80];
	mul.ftz.f32 	%f209, %f41, %f208;
	fma.rn.ftz.f32 	%f210, %f40, %f207, %f209;
	ld.const.f32 	%f211, [inParams+84];
	fma.rn.ftz.f32 	%f212, %f38, %f211, %f210;
	ld.const.f32 	%f213, [inParams+120];
	add.ftz.f32 	%f214, %f212, %f213;
	cvt.ftz.sat.f32.f32	%f56, %f214;
	setp.lt.ftz.f32	%p17, %f56, 0f3B4D2E1C;
	@%p17 bra 	BB6_33;

	setp.ltu.ftz.f32	%p18, %f56, 0f00000000;
	@%p18 bra 	BB6_32;

	lg2.approx.ftz.f32 	%f215, %f56;
	mul.ftz.f32 	%f216, %f215, 0f3ED55476;
	ex2.approx.ftz.f32 	%f57, %f216;
	fma.rn.ftz.f32 	%f245, %f57, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_34;

BB6_32:
	neg.ftz.f32 	%f217, %f56;
	lg2.approx.ftz.f32 	%f218, %f217;
	mul.ftz.f32 	%f219, %f218, 0f3ED55476;
	ex2.approx.ftz.f32 	%f220, %f219;
	neg.ftz.f32 	%f58, %f220;
	fma.rn.ftz.f32 	%f245, %f58, 0f3F870A3D, 0fBD6147AE;
	bra.uni 	BB6_34;

BB6_33:
	mul.ftz.f32 	%f245, %f56, 0f414EB852;

BB6_34:
	mul.ftz.f32 	%f246, %f239, %f39;
	setp.eq.s32	%p19, %r5, 0;
	@%p19 bra 	BB6_36;

	ld.const.f32 	%f221, [inParams+140];
	ld.const.f32 	%f222, [inParams+144];
	fma.rn.ftz.f32 	%f223, %f246, %f221, %f222;
	cvt.ftz.sat.f32.f32	%f246, %f223;

BB6_36:
	mad.lo.s32 	%r16, %r2, %r4, %r1;
	cvt.s64.s32	%rd3, %r16;
	@%p4 bra 	BB6_38;

	cvta.to.global.u64 	%rd10, %rd4;
	shl.b64 	%rd11, %rd3, 4;
	add.s64 	%rd12, %rd10, %rd11;
	st.global.v4.f32 	[%rd12], {%f243, %f244, %f245, %f246};
	bra.uni 	BB6_39;

BB6_38:
	cvta.to.global.u64 	%rd13, %rd4;
	shl.b64 	%rd14, %rd3, 3;
	add.s64 	%rd15, %rd13, %rd14;
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f246;
	mov.b16 	%rs9, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f245;
	mov.b16 	%rs10, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f244;
	mov.b16 	%rs11, %temp;
}
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f243;
	mov.b16 	%rs12, %temp;
}
	st.global.v4.u16 	[%rd15], {%rs12, %rs11, %rs10, %rs9};

BB6_39:
	ret;
}

//
// cuda_kernel_showAlpha
//
// One thread per pixel. The pixel coordinate is
//     x = %ntid.x * %ctaid.x + %tid.x
//     y = %ntid.y * %ctaid.y + %tid.y
// The thread reads the 4th component of the source pixel, clamps it to
// [0, 1] and writes {a, a, a, 1.0} to the destination pixel.
//
// Parameters:
//   param_0  source pixel buffer (generic address, converted to global)
//   param_1  destination pixel buffer (generic address, converted to global)
//   param_2  exclusive upper bound for x; threads with x >= param_2 exit
//   param_3  exclusive upper bound for y; threads with y >= param_3 exit
//   param_4  source row stride, in pixels (multiplied by y)
//   param_5  destination row stride, in pixels (multiplied by y)
//   param_6  pixel format selector:
//              != 0 : 16-byte pixels, four f32 components
//              == 0 :  8-byte pixels, four f16 components
//
// Only the 4th component is needed, so just that lane is loaded
// (byte offset 12 for f32 pixels, 6 for f16 pixels).
//
// NOTE(review): pixel indices are formed with 32-bit mad.lo.s32 before
// being sign-extended, so y*stride + x wraps for indices beyond 2^31-1.
//
.visible .entry cuda_kernel_showAlpha(
	.param .u64 cuda_kernel_showAlpha_param_0,
	.param .u64 cuda_kernel_showAlpha_param_1,
	.param .u32 cuda_kernel_showAlpha_param_2,
	.param .u32 cuda_kernel_showAlpha_param_3,
	.param .u32 cuda_kernel_showAlpha_param_4,
	.param .u32 cuda_kernel_showAlpha_param_5,
	.param .u32 cuda_kernel_showAlpha_param_6
)
{
	.reg .pred 	%p_out_x;
	.reg .pred 	%p_out_y;
	.reg .pred 	%p_outside;
	.reg .pred 	%p_half;
	.reg .b16 	%h_alpha;
	.reg .b16 	%h_one;
	.reg .s32 	%x_limit;
	.reg .s32 	%y_limit;
	.reg .s32 	%src_stride;
	.reg .s32 	%dst_stride;
	.reg .s32 	%fmt;
	.reg .s32 	%bdim;
	.reg .s32 	%bidx;
	.reg .s32 	%tidx;
	.reg .s32 	%px;
	.reg .s32 	%py;
	.reg .s32 	%idx;
	.reg .f32 	%alpha;
	.reg .f32 	%one;
	.reg .s64 	%src;
	.reg .s64 	%dst;
	.reg .s64 	%off;
	.reg .s64 	%addr;


	ld.param.u64 	%src, [cuda_kernel_showAlpha_param_0];
	ld.param.u64 	%dst, [cuda_kernel_showAlpha_param_1];
	ld.param.u32 	%x_limit, [cuda_kernel_showAlpha_param_2];
	ld.param.u32 	%y_limit, [cuda_kernel_showAlpha_param_3];
	ld.param.u32 	%src_stride, [cuda_kernel_showAlpha_param_4];
	ld.param.u32 	%dst_stride, [cuda_kernel_showAlpha_param_5];
	ld.param.u32 	%fmt, [cuda_kernel_showAlpha_param_6];
	cvta.to.global.u64 	%dst, %dst;
	cvta.to.global.u64 	%src, %src;

	// x = ntid.x * ctaid.x + tid.x
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%bidx, %ctaid.x;
	mov.u32 	%tidx, %tid.x;
	mad.lo.s32 	%px, %bdim, %bidx, %tidx;

	// y = ntid.y * ctaid.y + tid.y
	mov.u32 	%bdim, %ntid.y;
	mov.u32 	%bidx, %ctaid.y;
	mov.u32 	%tidx, %tid.y;
	mad.lo.s32 	%py, %bdim, %bidx, %tidx;

	// Threads outside the x/y bounds do nothing.
	setp.ge.s32	%p_out_y, %py, %y_limit;
	setp.ge.s32	%p_out_x, %px, %x_limit;
	or.pred  	%p_outside, %p_out_x, %p_out_y;
	@%p_outside bra 	SHOWALPHA_DONE;

	// Source pixel index = y * src_stride + x (32-bit, then sign-extended).
	mad.lo.s32 	%idx, %py, %src_stride, %px;
	cvt.s64.s32	%off, %idx;
	setp.eq.s32	%p_half, %fmt, 0;
	@%p_half bra 	SHOWALPHA_LOAD_F16;

	// f32 pixels: 16 bytes each, 4th component at byte offset 12.
	shl.b64 	%off, %off, 4;
	add.s64 	%addr, %src, %off;
	ld.global.f32 	%alpha, [%addr+12];
	bra.uni 	SHOWALPHA_CLAMP;

SHOWALPHA_LOAD_F16:
	// f16 pixels: 8 bytes each, 4th component at byte offset 6.
	shl.b64 	%off, %off, 3;
	add.s64 	%addr, %src, %off;
	ld.global.u16 	%h_alpha, [%addr+6];
	cvt.f32.f16 	%alpha, %h_alpha;

SHOWALPHA_CLAMP:
	// Saturate to [0, 1] (subnormals flushed to zero).
	cvt.ftz.sat.f32.f32	%alpha, %alpha;

	// Destination pixel index = y * dst_stride + x.
	mad.lo.s32 	%idx, %py, %dst_stride, %px;
	cvt.s64.s32	%off, %idx;
	@%p_half bra 	SHOWALPHA_STORE_F16;

	shl.b64 	%off, %off, 4;
	add.s64 	%addr, %dst, %off;
	mov.f32 	%one, 0f3F800000;
	st.global.v4.f32 	[%addr], {%alpha, %alpha, %alpha, %one};
	bra.uni 	SHOWALPHA_DONE;

SHOWALPHA_STORE_F16:
	shl.b64 	%off, %off, 3;
	add.s64 	%addr, %dst, %off;
	mov.f32 	%one, 0f3F800000;
	cvt.rn.ftz.f16.f32 	%h_one, %one;
	cvt.rn.ftz.f16.f32 	%h_alpha, %alpha;
	st.global.v4.u16 	[%addr], {%h_alpha, %h_alpha, %h_alpha, %h_one};

SHOWALPHA_DONE:
	ret;
}

//
// cuda_kernel_showColor
//
// One thread per pixel, at
//     x = %ntid.x * %ctaid.x + %tid.x
//     y = %ntid.y * %ctaid.y + %tid.y
// The thread loads the four components {c0, c1, c2, c3} of the source
// pixel, clamps c3 to [0, 1] and stores {c0*a, c1*a, c2*a, 1.0}, where
// a is the clamped c3.
//
// Parameters:
//   param_0  source pixel buffer (generic address, converted to global)
//   param_1  destination pixel buffer (generic address, converted to global)
//   param_2  exclusive upper bound for x
//   param_3  exclusive upper bound for y
//   param_4  source row stride, in pixels
//   param_5  destination row stride, in pixels
//   param_6  pixel format selector:
//              != 0 : 16-byte pixels, four f32 components
//              == 0 :  8-byte pixels, four f16 components
//
.visible .entry cuda_kernel_showColor(
	.param .u64 cuda_kernel_showColor_param_0,
	.param .u64 cuda_kernel_showColor_param_1,
	.param .u32 cuda_kernel_showColor_param_2,
	.param .u32 cuda_kernel_showColor_param_3,
	.param .u32 cuda_kernel_showColor_param_4,
	.param .u32 cuda_kernel_showColor_param_5,
	.param .u32 cuda_kernel_showColor_param_6
)
{
	.reg .pred 	%out_x;
	.reg .pred 	%out_y;
	.reg .pred 	%outside;
	.reg .pred 	%wide;
	.reg .b16 	%h<4>;
	.reg .s32 	%x_limit;
	.reg .s32 	%y_limit;
	.reg .s32 	%src_stride;
	.reg .s32 	%dst_stride;
	.reg .s32 	%fmt;
	.reg .s32 	%bdim;
	.reg .s32 	%bidx;
	.reg .s32 	%tidx;
	.reg .s32 	%px;
	.reg .s32 	%py;
	.reg .s32 	%src_idx;
	.reg .s32 	%dst_idx;
	.reg .f32 	%c<4>;
	.reg .f32 	%a;
	.reg .f32 	%unit;
	.reg .s64 	%src;
	.reg .s64 	%dst;
	.reg .s64 	%src_off;
	.reg .s64 	%dst_off;
	.reg .s64 	%src_at;
	.reg .s64 	%dst_at;


	ld.param.u64 	%src, [cuda_kernel_showColor_param_0];
	ld.param.u64 	%dst, [cuda_kernel_showColor_param_1];
	ld.param.u32 	%x_limit, [cuda_kernel_showColor_param_2];
	ld.param.u32 	%y_limit, [cuda_kernel_showColor_param_3];
	ld.param.u32 	%src_stride, [cuda_kernel_showColor_param_4];
	ld.param.u32 	%dst_stride, [cuda_kernel_showColor_param_5];
	ld.param.u32 	%fmt, [cuda_kernel_showColor_param_6];
	cvta.to.global.u64 	%src, %src;
	cvta.to.global.u64 	%dst, %dst;

	// x = ntid.x * ctaid.x + tid.x
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%bidx, %ctaid.x;
	mov.u32 	%tidx, %tid.x;
	mad.lo.s32 	%px, %bdim, %bidx, %tidx;

	// y = ntid.y * ctaid.y + tid.y
	mov.u32 	%bdim, %ntid.y;
	mov.u32 	%bidx, %ctaid.y;
	mov.u32 	%tidx, %tid.y;
	mad.lo.s32 	%py, %bdim, %bidx, %tidx;

	// Nothing to do for threads outside the x/y bounds.
	setp.ge.s32	%out_x, %px, %x_limit;
	setp.ge.s32	%out_y, %py, %y_limit;
	or.pred  	%outside, %out_x, %out_y;
	@%outside bra 	SHOWCOLOR_EXIT;

	// Pixel indices: y * stride + x, formed in 32 bits then sign-extended.
	mad.lo.s32 	%src_idx, %py, %src_stride, %px;
	cvt.s64.s32	%src_off, %src_idx;
	mad.lo.s32 	%dst_idx, %py, %dst_stride, %px;
	cvt.s64.s32	%dst_off, %dst_idx;

	// %wide is true for the four-f32 pixel layout.
	setp.ne.s32	%wide, %fmt, 0;
	@%wide bra 	SHOWCOLOR_READ_F32;

	// f16 pixels: 8 bytes each; widen every component to f32.
	shl.b64 	%src_off, %src_off, 3;
	add.s64 	%src_at, %src, %src_off;
	ld.global.v4.u16 	{%h0, %h1, %h2, %h3}, [%src_at];
	cvt.f32.f16 	%c0, %h0;
	cvt.f32.f16 	%c1, %h1;
	cvt.f32.f16 	%c2, %h2;
	cvt.f32.f16 	%c3, %h3;
	bra.uni 	SHOWCOLOR_SCALE;

SHOWCOLOR_READ_F32:
	// f32 pixels: 16 bytes each.
	shl.b64 	%src_off, %src_off, 4;
	add.s64 	%src_at, %src, %src_off;
	ld.global.v4.f32 	{%c0, %c1, %c2, %c3}, [%src_at];

SHOWCOLOR_SCALE:
	// a = c3 saturated to [0, 1]; scale the first three components by it.
	cvt.ftz.sat.f32.f32	%a, %c3;
	mul.ftz.f32 	%c0, %c0, %a;
	mul.ftz.f32 	%c1, %c1, %a;
	mul.ftz.f32 	%c2, %c2, %a;
	mov.f32 	%unit, 0f3F800000;
	@%wide bra 	SHOWCOLOR_WRITE_F32;

	// f16 output: narrow each component, 4th component is 1.0.
	shl.b64 	%dst_off, %dst_off, 3;
	add.s64 	%dst_at, %dst, %dst_off;
	cvt.rn.ftz.f16.f32 	%h0, %c0;
	cvt.rn.ftz.f16.f32 	%h1, %c1;
	cvt.rn.ftz.f16.f32 	%h2, %c2;
	cvt.rn.ftz.f16.f32 	%h3, %unit;
	st.global.v4.u16 	[%dst_at], {%h0, %h1, %h2, %h3};
	bra.uni 	SHOWCOLOR_EXIT;

SHOWCOLOR_WRITE_F32:
	// f32 output: 4th component is 1.0.
	shl.b64 	%dst_off, %dst_off, 4;
	add.s64 	%dst_at, %dst, %dst_off;
	st.global.v4.f32 	[%dst_at], {%c0, %c1, %c2, %unit};

SHOWCOLOR_EXIT:
	ret;
}


