//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Fri Jul 25 04:36:16 2014 (1406288176)
// Cuda compilation tools, release 6.5, V6.5.13
//

.version 4.1
.target sm_30
.address_size 64

// _d_gauss_filter_kernel_shmem_float4_1$__cuda_local_var_180245_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_2$__cuda_local_var_180249_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_3$__cuda_local_var_180253_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_4$__cuda_local_var_180257_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_5$__cuda_local_var_180261_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_6$__cuda_local_var_180265_228_non_const_s_data has been demoted
// _d_gauss_filter_kernel_shmem_float4_7$__cuda_local_var_180269_228_non_const_s_data has been demoted
// _d_median_filter_kernel_2x2$__cuda_local_var_180273_184_non_const_s_data has been demoted

// _d_gauss_filter_kernel_float4
// One thread per output element. Each thread accumulates a weighted sum of
// 4-component f32 vectors (16 bytes each) read directly from global memory over
// a (2*param_8+1) x (2*param_9+1) window, then stores the sum divided by the
// total weight.
//   param_0  output base address (converted to a global address before the store)
//   param_1  output row pitch, counted in 16-byte elements
//   param_2  input base address (converted to a global address before the loads)
//   param_3  input row pitch, counted in 16-byte elements
//   param_4  extent in x: threads with x >= param_4 exit; taps clamp to [0, param_4-1]
//   param_5  extent in y: threads with y >= param_5 exit; taps clamp to [0, param_5-1]
//   param_6  scale of the x term: weight factor exp(-dx*dx / (2*param_6^2))
//   param_7  scale of the y term: weight factor exp(-dy*dy / (2*param_7^2))
//   param_8  window half-width in x (dx runs from -param_8 to +param_8)
//   param_9  window half-width in y (dy runs from -param_9 to +param_9)
// NOTE(review): a negative param_8 or param_9 leaves every accumulator at zero,
// so the final divisions are 0/0 - confirm the host never passes such values.
.visible .entry _d_gauss_filter_kernel_float4(
	.param .u64 _d_gauss_filter_kernel_float4_param_0,
	.param .u32 _d_gauss_filter_kernel_float4_param_1,
	.param .u64 _d_gauss_filter_kernel_float4_param_2,
	.param .u32 _d_gauss_filter_kernel_float4_param_3,
	.param .u32 _d_gauss_filter_kernel_float4_param_4,
	.param .u32 _d_gauss_filter_kernel_float4_param_5,
	.param .f32 _d_gauss_filter_kernel_float4_param_6,
	.param .f32 _d_gauss_filter_kernel_float4_param_7,
	.param .u32 _d_gauss_filter_kernel_float4_param_8,
	.param .u32 _d_gauss_filter_kernel_float4_param_9
)
{
	.reg .pred 	%p<8>;
	.reg .s32 	%r<59>;
	.reg .f32 	%f<70>;
	.reg .s64 	%rd<9>;


	ld.param.u64 	%rd2, [_d_gauss_filter_kernel_float4_param_0];
	ld.param.u32 	%r11, [_d_gauss_filter_kernel_float4_param_1];
	ld.param.u64 	%rd3, [_d_gauss_filter_kernel_float4_param_2];
	ld.param.u32 	%r12, [_d_gauss_filter_kernel_float4_param_3];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_float4_param_4];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_float4_param_5];
	ld.param.f32 	%f29, [_d_gauss_filter_kernel_float4_param_6];
	ld.param.f32 	%f30, [_d_gauss_filter_kernel_float4_param_7];
	ld.param.u32 	%r15, [_d_gauss_filter_kernel_float4_param_8];
	ld.param.u32 	%r16, [_d_gauss_filter_kernel_float4_param_9];
	// r20 = x = ntid.x * ctaid.x + tid.x
	mov.u32 	%r17, %ntid.x;
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %tid.x;
	mad.lo.s32 	%r20, %r17, %r18, %r19;
	// r24 = y = ntid.y * ctaid.y + tid.y
	mov.u32 	%r21, %ntid.y;
	mov.u32 	%r22, %ctaid.y;
	mov.u32 	%r23, %tid.y;
	mad.lo.s32 	%r24, %r21, %r22, %r23;
	// Threads outside the param_4 x param_5 extent return without writing.
	setp.lt.s32	%p1, %r20, %r13;
	setp.lt.s32	%p2, %r24, %r14;
	and.pred  	%p3, %p1, %p2;
	@!%p3 bra 	BB0_9;
	bra.uni 	BB0_1;

BB0_1:
	// r56 = dy, starting at -param_9. The outer loop is entered only when
	// -param_9 <= param_9.
	neg.s32 	%r56, %r16;
	setp.le.s32	%p4, %r56, %r16;
	@%p4 bra 	BB0_3;

	// Outer loop skipped: all sums and the weight total are zero.
	mov.f32 	%f69, 0f00000000;
	mov.f32 	%f68, %f69;
	mov.f32 	%f67, %f69;
	mov.f32 	%f66, %f69;
	mov.f32 	%f65, %f69;
	bra.uni 	BB0_8;

BB0_3:
	// f1 = 2 * param_6^2 (x denominator), f2 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f36, %f29, %f29;
	fma.rn.ftz.f32 	%f1, %f29, %f29, %f36;
	mul.ftz.f32 	%f37, %f30, %f30;
	fma.rn.ftz.f32 	%f2, %f30, %f30, %f37;
	// f66..f69 = per-component weighted sums, f65 = sum of weights.
	mov.f32 	%f69, 0f00000000;
	mov.f32 	%f68, %f69;
	mov.f32 	%f67, %f69;
	mov.f32 	%f66, %f69;
	mov.f32 	%f65, %f69;
	cvta.to.global.u64 	%rd1, %rd3;

BB0_4:
	// Outer loop over dy (r56). r58 = dx restarts at -param_8 for every row;
	// the inner loop is skipped when -param_8 > param_8.
	neg.s32 	%r58, %r15;
	setp.gt.s32	%p5, %r58, %r15;
	@%p5 bra 	BB0_7;

	// r35 = clamp(y + dy, 0, param_5 - 1)
	add.s32 	%r31, %r56, %r24;
	add.s32 	%r32, %r14, -1;
	min.s32 	%r33, %r32, %r31;
	mov.u32 	%r34, 0;
	max.s32 	%r35, %r34, %r33;
	// f8 = exp(-dy*dy / f2); 0f3FB8AA3B is log2(e), so ex2(v * log2(e)) = exp(v).
	mul.lo.s32 	%r36, %r56, %r56;
	cvt.rn.f32.s32	%f38, %r36;
	neg.ftz.f32 	%f39, %f38;
	div.approx.ftz.f32 	%f40, %f39, %f2;
	mul.ftz.f32 	%f41, %f40, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f8, %f41;
	// r3 = clamped row * input pitch; r57 = x + dx, starting at x - param_8.
	mul.lo.s32 	%r3, %r35, %r12;
	sub.s32 	%r57, %r20, %r15;

BB0_6:
	// Inner loop over dx (r58). r44 = clamp(x + dx, 0, param_4 - 1).
	add.s32 	%r41, %r13, -1;
	min.s32 	%r42, %r41, %r57;
	max.s32 	%r44, %r34, %r42;
	// f47 = exp(-dy*dy / f2) * exp(-dx*dx / f1): the weight of this tap.
	mul.lo.s32 	%r45, %r58, %r58;
	cvt.rn.f32.s32	%f42, %r45;
	neg.ftz.f32 	%f43, %f42;
	div.approx.ftz.f32 	%f44, %f43, %f1;
	mul.ftz.f32 	%f45, %f44, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f46, %f45;
	mul.ftz.f32 	%f47, %f8, %f46;
	// Load the 16-byte element at (clamped row, clamped column) and accumulate.
	add.s32 	%r46, %r44, %r3;
	mul.wide.s32 	%rd4, %r46, 16;
	add.s64 	%rd5, %rd1, %rd4;
	ld.global.v4.f32 	{%f48, %f49, %f50, %f51}, [%rd5];
	fma.rn.ftz.f32 	%f66, %f48, %f47, %f66;
	fma.rn.ftz.f32 	%f67, %f49, %f47, %f67;
	fma.rn.ftz.f32 	%f68, %f50, %f47, %f68;
	fma.rn.ftz.f32 	%f69, %f51, %f47, %f69;
	add.ftz.f32 	%f65, %f65, %f47;
	add.s32 	%r57, %r57, 1;
	add.s32 	%r58, %r58, 1;
	setp.le.s32	%p6, %r58, %r15;
	@%p6 bra 	BB0_6;

BB0_7:
	// Next dy; continue while dy <= param_9.
	add.s32 	%r56, %r56, 1;
	setp.le.s32	%p7, %r56, %r16;
	@%p7 bra 	BB0_4;

BB0_8:
	// Output element index = y * param_1 + x, 16 bytes per element.
	cvta.to.global.u64 	%rd6, %rd2;
	mad.lo.s32 	%r55, %r24, %r11, %r20;
	mul.wide.s32 	%rd7, %r55, 16;
	add.s64 	%rd8, %rd6, %rd7;
	// Normalize each component by the accumulated weight total (f65).
	div.approx.ftz.f32 	%f61, %f66, %f65;
	div.approx.ftz.f32 	%f62, %f67, %f65;
	div.approx.ftz.f32 	%f63, %f68, %f65;
	div.approx.ftz.f32 	%f64, %f69, %f65;
	st.global.v4.f32 	[%rd8], {%f61, %f62, %f63, %f64};

BB0_9:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_1
// 3x3 weighted filter on 4-component f32 vectors, staged through shared memory.
// Every thread copies its own (clamped) element into a shared tile, threads on
// the block edge also copy the neighbouring halo elements, and after a barrier
// each in-range thread sums its 3x3 neighbourhood from the tile and stores the
// sum divided by the total weight.
//   param_0  output base address
//   param_1  output row pitch, counted in 16-byte elements
//   param_2  input base address
//   param_3  input row pitch, counted in 16-byte elements
//   param_4  extent in x (loads clamp to [0, param_4-1]; only x < param_4 stores)
//   param_5  extent in y (loads clamp to [0, param_5-1]; only y < param_5 stores)
//   param_6  scale of the x term: weight factor exp(-dx*dx / (2*param_6^2))
//   param_7  scale of the y term: weight factor exp(-dy*dy / (2*param_7^2))
// Tile layout: 5184 bytes = 18 rows x 18 elements x 16 bytes, row stride 288.
// NOTE(review): the halo thresholds (tid == 0 / tid >= 15) and the 18x18 tile
// suggest a 16x16 thread block - confirm against the host launch code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_1(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_1_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_1_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_1_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_1_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_1_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_1_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_1_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_1_param_7
)
{
	.reg .pred 	%p<16>;
	.reg .s32 	%r<54>;
	.reg .f32 	%f<218>;
	.reg .s64 	%rd<30>;
	// demoted variable
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_1$__cuda_local_var_180245_228_non_const_s_data[5184];

	ld.param.u64 	%rd3, [_d_gauss_filter_kernel_shmem_float4_1_param_0];
	ld.param.u32 	%r9, [_d_gauss_filter_kernel_shmem_float4_1_param_1];
	ld.param.u64 	%rd4, [_d_gauss_filter_kernel_shmem_float4_1_param_2];
	ld.param.u32 	%r10, [_d_gauss_filter_kernel_shmem_float4_1_param_3];
	ld.param.u32 	%r11, [_d_gauss_filter_kernel_shmem_float4_1_param_4];
	ld.param.u32 	%r12, [_d_gauss_filter_kernel_shmem_float4_1_param_5];
	ld.param.f32 	%f1, [_d_gauss_filter_kernel_shmem_float4_1_param_6];
	ld.param.f32 	%f2, [_d_gauss_filter_kernel_shmem_float4_1_param_7];
	cvta.to.global.u64 	%rd1, %rd4;
	// r1 = x = ntid.x * ctaid.x + tid.x; r2 = min(x, param_4 - 1)
	mov.u32 	%r13, %ntid.x;
	mov.u32 	%r14, %ctaid.x;
	mov.u32 	%r15, %tid.x;
	mad.lo.s32 	%r1, %r13, %r14, %r15;
	add.s32 	%r16, %r11, -1;
	min.s32 	%r2, %r1, %r16;
	// r4 = y = ntid.y * ctaid.y + tid.y; r5 = min(y, param_5 - 1)
	mov.u32 	%r17, %ntid.y;
	mov.u32 	%r18, %ctaid.y;
	mov.u32 	%r3, %tid.y;
	mad.lo.s32 	%r4, %r17, %r18, %r3;
	add.s32 	%r19, %r12, -1;
	min.s32 	%r5, %r4, %r19;
	// rd2 = tile base + tid.y * 288 + tid.x * 16, i.e. tile cell (tid.y, tid.x),
	// which is the top-left corner of this thread's 3x3 window.
	mul.wide.s32 	%rd5, %r3, 288;
	mov.u64 	%rd6, _d_gauss_filter_kernel_shmem_float4_1$__cuda_local_var_180245_228_non_const_s_data;
	add.s64 	%rd7, %rd6, %rd5;
	mul.wide.s32 	%rd8, %r15, 16;
	add.s64 	%rd2, %rd7, %rd8;
	// Centre element: input (r5, r2) -> tile cell (tid.y + 1, tid.x + 1) at +304.
	mul.lo.s32 	%r6, %r5, %r10;
	add.s32 	%r20, %r6, %r2;
	mul.wide.s32 	%rd9, %r20, 16;
	add.s64 	%rd10, %rd1, %rd9;
	ld.global.v4.f32 	{%f3, %f4, %f5, %f6}, [%rd10];
	st.shared.v4.f32 	[%rd2+304], {%f3, %f4, %f5, %f6};
	// p1 = tid.y > 0: such threads skip every top-halo load.
	setp.gt.s32	%p1, %r3, 0;
	@%p1 bra 	BB1_2;

	// Top halo: input row max(0, r5 - 1) -> tile cell (tid.y, tid.x + 1) at +16.
	add.s32 	%r21, %r5, -1;
	mov.u32 	%r22, 0;
	max.s32 	%r23, %r22, %r21;
	mad.lo.s32 	%r24, %r23, %r10, %r2;
	mul.wide.s32 	%rd11, %r24, 16;
	add.s64 	%rd12, %rd1, %rd11;
	ld.global.v4.f32 	{%f11, %f12, %f13, %f14}, [%rd12];
	st.shared.v4.f32 	[%rd2+16], {%f11, %f12, %f13, %f14};

BB1_2:
	// p2 = tid.y < 15: such threads skip every bottom-halo load.
	setp.lt.s32	%p2, %r3, 15;
	@%p2 bra 	BB1_4;

	// Bottom halo: input row (r5 + 1 if < param_5, else param_5 - 1)
	// -> tile cell (tid.y + 2, tid.x + 1) at +592.
	add.s32 	%r25, %r5, 1;
	setp.lt.s32	%p3, %r25, %r12;
	selp.b32	%r27, %r25, %r19, %p3;
	mad.lo.s32 	%r28, %r27, %r10, %r2;
	mul.wide.s32 	%rd13, %r28, 16;
	add.s64 	%rd14, %rd1, %rd13;
	ld.global.v4.f32 	{%f19, %f20, %f21, %f22}, [%rd14];
	st.shared.v4.f32 	[%rd2+592], {%f19, %f20, %f21, %f22};

BB1_4:
	// Threads with tid.x > 0 skip the left halo (and its two corners).
	setp.gt.s32	%p4, %r15, 0;
	@%p4 bra 	BB1_9;

	// Left halo: input column r7 = max(0, r2 - 1) -> tile cell (tid.y + 1, tid.x) at +288.
	add.s32 	%r30, %r2, -1;
	mov.u32 	%r31, 0;
	max.s32 	%r7, %r31, %r30;
	add.s32 	%r32, %r6, %r7;
	mul.wide.s32 	%rd15, %r32, 16;
	add.s64 	%rd16, %rd1, %rd15;
	ld.global.v4.f32 	{%f27, %f28, %f29, %f30}, [%rd16];
	st.shared.v4.f32 	[%rd2+288], {%f27, %f28, %f29, %f30};
	@%p1 bra 	BB1_7;

	// Top-left corner: input (max(0, r5 - 1), r7) -> tile cell (tid.y, tid.x) at +0.
	add.s32 	%r33, %r5, -1;
	max.s32 	%r35, %r31, %r33;
	mad.lo.s32 	%r36, %r35, %r10, %r7;
	mul.wide.s32 	%rd17, %r36, 16;
	add.s64 	%rd18, %rd1, %rd17;
	ld.global.v4.f32 	{%f35, %f36, %f37, %f38}, [%rd18];
	st.shared.v4.f32 	[%rd2], {%f35, %f36, %f37, %f38};

BB1_7:
	@%p2 bra 	BB1_9;

	// Bottom-left corner: clamped row r5 + 1, column r7 -> tile cell (tid.y + 2, tid.x) at +576.
	add.s32 	%r37, %r5, 1;
	setp.lt.s32	%p7, %r37, %r12;
	selp.b32	%r39, %r37, %r19, %p7;
	mad.lo.s32 	%r40, %r39, %r10, %r7;
	mul.wide.s32 	%rd19, %r40, 16;
	add.s64 	%rd20, %rd1, %rd19;
	ld.global.v4.f32 	{%f43, %f44, %f45, %f46}, [%rd20];
	st.shared.v4.f32 	[%rd2+576], {%f43, %f44, %f45, %f46};

BB1_9:
	// Threads with tid.x < 15 skip the right halo (and its two corners).
	setp.lt.s32	%p8, %r15, 15;
	@%p8 bra 	BB1_14;

	// Right halo: input column r8 = (r2 + 1 if < param_4, else param_4 - 1)
	// -> tile cell (tid.y + 1, tid.x + 2) at +320.
	add.s32 	%r42, %r2, 1;
	setp.lt.s32	%p9, %r42, %r11;
	selp.b32	%r8, %r42, %r16, %p9;
	add.s32 	%r44, %r6, %r8;
	mul.wide.s32 	%rd21, %r44, 16;
	add.s64 	%rd22, %rd1, %rd21;
	ld.global.v4.f32 	{%f51, %f52, %f53, %f54}, [%rd22];
	st.shared.v4.f32 	[%rd2+320], {%f51, %f52, %f53, %f54};
	@%p1 bra 	BB1_12;

	// Top-right corner: input (max(0, r5 - 1), r8) -> tile cell (tid.y, tid.x + 2) at +32.
	add.s32 	%r45, %r5, -1;
	mov.u32 	%r46, 0;
	max.s32 	%r47, %r46, %r45;
	mad.lo.s32 	%r48, %r47, %r10, %r8;
	mul.wide.s32 	%rd23, %r48, 16;
	add.s64 	%rd24, %rd1, %rd23;
	ld.global.v4.f32 	{%f59, %f60, %f61, %f62}, [%rd24];
	st.shared.v4.f32 	[%rd2+32], {%f59, %f60, %f61, %f62};

BB1_12:
	@%p2 bra 	BB1_14;

	// Bottom-right corner: clamped row r5 + 1, column r8 -> tile cell (tid.y + 2, tid.x + 2) at +608.
	add.s32 	%r49, %r5, 1;
	setp.lt.s32	%p12, %r49, %r12;
	selp.b32	%r51, %r49, %r19, %p12;
	mad.lo.s32 	%r52, %r51, %r10, %r8;
	mul.wide.s32 	%rd25, %r52, 16;
	add.s64 	%rd26, %rd1, %rd25;
	ld.global.v4.f32 	{%f67, %f68, %f69, %f70}, [%rd26];
	st.shared.v4.f32 	[%rd2+608], {%f67, %f68, %f69, %f70};

BB1_14:
	// All threads reach this barrier (the range check comes after it), so the
	// tile is completely written before any thread reads it.
	bar.sync 	0;
	setp.lt.s32	%p13, %r4, %r12;
	setp.lt.s32	%p14, %r1, %r11;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB1_16;
	bra.uni 	BB1_15;

BB1_15:
	cvta.to.global.u64 	%rd27, %rd3;
	// f76 = 2 * param_6^2 (x denominator), f78 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f75, %f1, %f1;
	fma.rn.ftz.f32 	%f76, %f1, %f1, %f75;
	mul.ftz.f32 	%f77, %f2, %f2;
	fma.rn.ftz.f32 	%f78, %f2, %f2, %f77;
	// 0fBF800000 = -1.0. 0f3FB8AA3B = log2(e), so ex2(v * log2(e)) = exp(v).
	// f82 = exp(-1 / f78): y factor for dy = +/-1.
	mov.f32 	%f79, 0fBF800000;
	div.approx.ftz.f32 	%f80, %f79, %f78;
	mul.ftz.f32 	%f81, %f80, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f82, %f81;
	// f85 = exp(-1 / f76): x factor for dx = +/-1. f86 = f82 * f85: corner weight.
	div.approx.ftz.f32 	%f83, %f79, %f76;
	mul.ftz.f32 	%f84, %f83, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f85, %f84;
	mul.ftz.f32 	%f86, %f82, %f85;
	// Tap (dy = -1, dx = -1), weight f86. Sums start from 0.
	ld.shared.v4.f32 	{%f87, %f88, %f89, %f90}, [%rd2];
	fma.rn.ftz.f32 	%f92, %f87, %f86, 0f00000000;
	fma.rn.ftz.f32 	%f94, %f88, %f86, 0f00000000;
	fma.rn.ftz.f32 	%f96, %f89, %f86, 0f00000000;
	fma.rn.ftz.f32 	%f98, %f90, %f86, 0f00000000;
	add.ftz.f32 	%f99, %f86, 0f00000000;
	// 0f80000000 = -0.0. f103 = exp(-0 / f76): x factor for dx = 0.
	// f104 = f82 * f103: weight for (dy = +/-1, dx = 0).
	mov.f32 	%f100, 0f80000000;
	div.approx.ftz.f32 	%f101, %f100, %f76;
	mul.ftz.f32 	%f102, %f101, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f103, %f102;
	mul.ftz.f32 	%f104, %f82, %f103;
	// Tap (dy = -1, dx = 0), weight f104.
	ld.shared.v4.f32 	{%f105, %f106, %f107, %f108}, [%rd2+16];
	fma.rn.ftz.f32 	%f110, %f105, %f104, %f92;
	fma.rn.ftz.f32 	%f112, %f106, %f104, %f94;
	fma.rn.ftz.f32 	%f114, %f107, %f104, %f96;
	fma.rn.ftz.f32 	%f116, %f108, %f104, %f98;
	add.ftz.f32 	%f117, %f99, %f104;
	// Tap (dy = -1, dx = +1), weight f86.
	ld.shared.v4.f32 	{%f118, %f119, %f120, %f121}, [%rd2+32];
	fma.rn.ftz.f32 	%f123, %f118, %f86, %f110;
	fma.rn.ftz.f32 	%f125, %f119, %f86, %f112;
	fma.rn.ftz.f32 	%f127, %f120, %f86, %f114;
	fma.rn.ftz.f32 	%f129, %f121, %f86, %f116;
	add.ftz.f32 	%f130, %f117, %f86;
	// f133 = exp(-0 / f78): y factor for dy = 0.
	// f134 = f133 * f85: weight for (dy = 0, dx = +/-1).
	div.approx.ftz.f32 	%f131, %f100, %f78;
	mul.ftz.f32 	%f132, %f131, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f133, %f132;
	mul.ftz.f32 	%f134, %f133, %f85;
	// Tap (dy = 0, dx = -1), weight f134.
	ld.shared.v4.f32 	{%f135, %f136, %f137, %f138}, [%rd2+288];
	fma.rn.ftz.f32 	%f140, %f135, %f134, %f123;
	fma.rn.ftz.f32 	%f142, %f136, %f134, %f125;
	fma.rn.ftz.f32 	%f144, %f137, %f134, %f127;
	fma.rn.ftz.f32 	%f146, %f138, %f134, %f129;
	add.ftz.f32 	%f147, %f130, %f134;
	// Centre tap (dy = 0, dx = 0), weight f148 = f133 * f103.
	mul.ftz.f32 	%f148, %f133, %f103;
	ld.shared.v4.f32 	{%f149, %f150, %f151, %f152}, [%rd2+304];
	fma.rn.ftz.f32 	%f154, %f149, %f148, %f140;
	fma.rn.ftz.f32 	%f156, %f150, %f148, %f142;
	fma.rn.ftz.f32 	%f158, %f151, %f148, %f144;
	fma.rn.ftz.f32 	%f160, %f152, %f148, %f146;
	add.ftz.f32 	%f161, %f147, %f148;
	// Tap (dy = 0, dx = +1), weight f134.
	ld.shared.v4.f32 	{%f162, %f163, %f164, %f165}, [%rd2+320];
	fma.rn.ftz.f32 	%f167, %f162, %f134, %f154;
	fma.rn.ftz.f32 	%f169, %f163, %f134, %f156;
	fma.rn.ftz.f32 	%f171, %f164, %f134, %f158;
	fma.rn.ftz.f32 	%f173, %f165, %f134, %f160;
	add.ftz.f32 	%f174, %f161, %f134;
	// Tap (dy = +1, dx = -1), weight f86.
	ld.shared.v4.f32 	{%f175, %f176, %f177, %f178}, [%rd2+576];
	fma.rn.ftz.f32 	%f180, %f175, %f86, %f167;
	fma.rn.ftz.f32 	%f182, %f176, %f86, %f169;
	fma.rn.ftz.f32 	%f184, %f177, %f86, %f171;
	fma.rn.ftz.f32 	%f186, %f178, %f86, %f173;
	add.ftz.f32 	%f187, %f174, %f86;
	// Tap (dy = +1, dx = 0), weight f104.
	ld.shared.v4.f32 	{%f188, %f189, %f190, %f191}, [%rd2+592];
	fma.rn.ftz.f32 	%f193, %f188, %f104, %f180;
	fma.rn.ftz.f32 	%f195, %f189, %f104, %f182;
	fma.rn.ftz.f32 	%f197, %f190, %f104, %f184;
	fma.rn.ftz.f32 	%f199, %f191, %f104, %f186;
	add.ftz.f32 	%f200, %f187, %f104;
	// Tap (dy = +1, dx = +1), weight f86. f213 ends up as the total weight.
	ld.shared.v4.f32 	{%f201, %f202, %f203, %f204}, [%rd2+608];
	fma.rn.ftz.f32 	%f206, %f201, %f86, %f193;
	fma.rn.ftz.f32 	%f208, %f202, %f86, %f195;
	fma.rn.ftz.f32 	%f210, %f203, %f86, %f197;
	fma.rn.ftz.f32 	%f212, %f204, %f86, %f199;
	add.ftz.f32 	%f213, %f200, %f86;
	// Output element index = y * param_1 + x (unclamped), 16 bytes per element.
	mad.lo.s32 	%r53, %r4, %r9, %r1;
	mul.wide.s32 	%rd28, %r53, 16;
	add.s64 	%rd29, %rd27, %rd28;
	// Normalize; the store lists the components back in their load order.
	div.approx.ftz.f32 	%f214, %f212, %f213;
	div.approx.ftz.f32 	%f215, %f210, %f213;
	div.approx.ftz.f32 	%f216, %f208, %f213;
	div.approx.ftz.f32 	%f217, %f206, %f213;
	st.global.v4.f32 	[%rd29], {%f217, %f216, %f215, %f214};

BB1_16:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_2
// 5x5 weighted filter on 4-component f32 vectors, staged through shared memory.
// Every thread copies its own (clamped) element into a shared tile, threads near
// the block edge also copy halo elements two rows/columns away, and after a
// barrier each in-range thread sums its 5x5 neighbourhood from the tile and
// stores the sum divided by the total weight.
//   param_0  output base address
//   param_1  output row pitch, counted in 16-byte elements
//   param_2  input base address
//   param_3  input row pitch, counted in 16-byte elements
//   param_4  extent in x (loads clamp to [0, param_4-1]; only x < param_4 stores)
//   param_5  extent in y (loads clamp to [0, param_5-1]; only y < param_5 stores)
//   param_6  scale of the x term: weight factor exp(-dx*dx / (2*param_6^2))
//   param_7  scale of the y term: weight factor exp(-dy*dy / (2*param_7^2))
// Tile layout: 6400 bytes = 20 rows x 20 elements x 16 bytes, row stride 320.
// NOTE(review): the halo thresholds (tid <= 1 / tid >= 14) and the 20x20 tile
// suggest a 16x16 thread block - confirm against the host launch code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_2(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_2_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_2_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_2_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_2_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_2_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_2_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_2_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_2_param_7
)
{
	.reg .pred 	%p<17>;
	.reg .s32 	%r<60>;
	.reg .f32 	%f<187>;
	.reg .s64 	%rd<38>;
	// demoted variable
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_2$__cuda_local_var_180249_228_non_const_s_data[6400];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_2_param_0];
	ld.param.u32 	%r11, [_d_gauss_filter_kernel_shmem_float4_2_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_2_param_2];
	ld.param.u32 	%r12, [_d_gauss_filter_kernel_shmem_float4_2_param_3];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_shmem_float4_2_param_4];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_shmem_float4_2_param_5];
	ld.param.f32 	%f19, [_d_gauss_filter_kernel_shmem_float4_2_param_6];
	ld.param.f32 	%f20, [_d_gauss_filter_kernel_shmem_float4_2_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	// r1 = x = ntid.x * ctaid.x + tid.x; r2 = min(x, param_4 - 1)
	mov.u32 	%r15, %ntid.x;
	mov.u32 	%r16, %ctaid.x;
	mov.u32 	%r17, %tid.x;
	mad.lo.s32 	%r1, %r15, %r16, %r17;
	add.s32 	%r18, %r13, -1;
	min.s32 	%r2, %r1, %r18;
	// r4 = y = ntid.y * ctaid.y + tid.y; r5 = min(y, param_5 - 1)
	mov.u32 	%r19, %ntid.y;
	mov.u32 	%r20, %ctaid.y;
	mov.u32 	%r3, %tid.y;
	mad.lo.s32 	%r4, %r19, %r20, %r3;
	add.s32 	%r21, %r14, -1;
	min.s32 	%r5, %r4, %r21;
	// rd2 = tile base + tid.y * 320 + tid.x * 16, i.e. tile cell (tid.y, tid.x),
	// which is the top-left corner of this thread's 5x5 window.
	mul.wide.s32 	%rd8, %r3, 320;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_2$__cuda_local_var_180249_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r17, 16;
	add.s64 	%rd2, %rd10, %rd11;
	// Centre element: input (r5, r2) -> tile cell (tid.y + 2, tid.x + 2) at +672.
	mul.lo.s32 	%r6, %r5, %r12;
	add.s32 	%r22, %r6, %r2;
	mul.wide.s32 	%rd12, %r22, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f21, %f22, %f23, %f24}, [%rd13];
	st.shared.v4.f32 	[%rd2+672], {%f21, %f22, %f23, %f24};
	// p1 = tid.y > 1: such threads skip every top-halo load.
	setp.gt.s32	%p1, %r3, 1;
	@%p1 bra 	BB2_2;

	// Top halo: input row max(0, r5 - 2) -> tile cell (tid.y, tid.x + 2) at +32.
	add.s32 	%r23, %r5, -2;
	mov.u32 	%r24, 0;
	max.s32 	%r25, %r24, %r23;
	mad.lo.s32 	%r26, %r25, %r12, %r2;
	mul.wide.s32 	%rd14, %r26, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f29, %f30, %f31, %f32}, [%rd15];
	st.shared.v4.f32 	[%rd2+32], {%f29, %f30, %f31, %f32};

BB2_2:
	// p2 = tid.y < 14: such threads skip every bottom-halo load.
	setp.lt.s32	%p2, %r3, 14;
	@%p2 bra 	BB2_4;

	// Bottom halo: input row (r5 + 2 if < param_5, else param_5 - 1)
	// -> tile cell (tid.y + 4, tid.x + 2) at +1312.
	add.s32 	%r27, %r5, 2;
	setp.lt.s32	%p3, %r27, %r14;
	selp.b32	%r29, %r27, %r21, %p3;
	mad.lo.s32 	%r30, %r29, %r12, %r2;
	mul.wide.s32 	%rd16, %r30, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f37, %f38, %f39, %f40}, [%rd17];
	st.shared.v4.f32 	[%rd2+1312], {%f37, %f38, %f39, %f40};

BB2_4:
	// Threads with tid.x > 1 skip the left halo (and its corner cells).
	setp.gt.s32	%p4, %r17, 1;
	@%p4 bra 	BB2_9;

	// Left halo: input column r7 = max(0, r2 - 2) -> tile cell (tid.y + 2, tid.x) at +640.
	add.s32 	%r32, %r2, -2;
	mov.u32 	%r33, 0;
	max.s32 	%r7, %r33, %r32;
	add.s32 	%r34, %r6, %r7;
	mul.wide.s32 	%rd18, %r34, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f45, %f46, %f47, %f48}, [%rd19];
	st.shared.v4.f32 	[%rd2+640], {%f45, %f46, %f47, %f48};
	@%p1 bra 	BB2_7;

	// Top-left corner: input (max(0, r5 - 2), r7) -> tile cell (tid.y, tid.x) at +0.
	add.s32 	%r35, %r5, -2;
	max.s32 	%r37, %r33, %r35;
	mad.lo.s32 	%r38, %r37, %r12, %r7;
	mul.wide.s32 	%rd20, %r38, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f53, %f54, %f55, %f56}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f53, %f54, %f55, %f56};

BB2_7:
	@%p2 bra 	BB2_9;

	// Bottom-left corner: clamped row r5 + 2, column r7 -> tile cell (tid.y + 4, tid.x) at +1280.
	add.s32 	%r39, %r5, 2;
	setp.lt.s32	%p7, %r39, %r14;
	selp.b32	%r41, %r39, %r21, %p7;
	mad.lo.s32 	%r42, %r41, %r12, %r7;
	mul.wide.s32 	%rd22, %r42, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f61, %f62, %f63, %f64}, [%rd23];
	st.shared.v4.f32 	[%rd2+1280], {%f61, %f62, %f63, %f64};

BB2_9:
	// Threads with tid.x < 14 skip the right halo (and its corner cells).
	setp.lt.s32	%p8, %r17, 14;
	@%p8 bra 	BB2_14;

	// Right halo: input column r8 = (r2 + 2 if < param_4, else param_4 - 1)
	// -> tile cell (tid.y + 2, tid.x + 4) at +704.
	add.s32 	%r44, %r2, 2;
	setp.lt.s32	%p9, %r44, %r13;
	selp.b32	%r8, %r44, %r18, %p9;
	add.s32 	%r46, %r6, %r8;
	mul.wide.s32 	%rd24, %r46, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f69, %f70, %f71, %f72}, [%rd25];
	st.shared.v4.f32 	[%rd2+704], {%f69, %f70, %f71, %f72};
	@%p1 bra 	BB2_12;

	// Top-right corner: input (max(0, r5 - 2), r8) -> tile cell (tid.y, tid.x + 4) at +64.
	add.s32 	%r47, %r5, -2;
	mov.u32 	%r48, 0;
	max.s32 	%r49, %r48, %r47;
	mad.lo.s32 	%r50, %r49, %r12, %r8;
	mul.wide.s32 	%rd26, %r50, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f77, %f78, %f79, %f80}, [%rd27];
	st.shared.v4.f32 	[%rd2+64], {%f77, %f78, %f79, %f80};

BB2_12:
	@%p2 bra 	BB2_14;

	// Bottom-right corner: clamped row r5 + 2, column r8 -> tile cell (tid.y + 4, tid.x + 4) at +1344.
	add.s32 	%r51, %r5, 2;
	setp.lt.s32	%p12, %r51, %r14;
	selp.b32	%r53, %r51, %r21, %p12;
	mad.lo.s32 	%r54, %r53, %r12, %r8;
	mul.wide.s32 	%rd28, %r54, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f85, %f86, %f87, %f88}, [%rd29];
	st.shared.v4.f32 	[%rd2+1344], {%f85, %f86, %f87, %f88};

BB2_14:
	// All threads reach this barrier (the range check comes after it), so the
	// tile is completely written before any thread reads it.
	bar.sync 	0;
	setp.lt.s32	%p13, %r4, %r14;
	setp.lt.s32	%p14, %r1, %r13;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB2_18;
	bra.uni 	BB2_15;

BB2_15:
	// f99 = 2 * param_6^2 (x denominator), f1 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f98, %f19, %f19;
	fma.rn.ftz.f32 	%f99, %f19, %f19, %f98;
	mul.ftz.f32 	%f100, %f20, %f20;
	fma.rn.ftz.f32 	%f1, %f20, %f20, %f100;
	// Precompute the x factors once. 0f3FB8AA3B = log2(e), so
	// ex2(v * log2(e)) = exp(v).
	// f2 = exp(-4 / f99) for dx = +/-2 (0fC0800000 = -4.0).
	mov.f32 	%f101, 0fC0800000;
	div.approx.ftz.f32 	%f102, %f101, %f99;
	mul.ftz.f32 	%f103, %f102, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f2, %f103;
	// f3 = exp(-1 / f99) for dx = +/-1 (0fBF800000 = -1.0).
	mov.f32 	%f104, 0fBF800000;
	div.approx.ftz.f32 	%f105, %f104, %f99;
	mul.ftz.f32 	%f106, %f105, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f3, %f106;
	// f4 = exp(-0 / f99) for dx = 0 (0f80000000 = -0.0).
	mov.f32 	%f107, 0f80000000;
	div.approx.ftz.f32 	%f108, %f107, %f99;
	mul.ftz.f32 	%f109, %f108, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f4, %f109;
	// rd37 = tile cell (tid.y, tid.x): the same address as rd2, recomputed here
	// as the row pointer that the loop advances.
	mul.wide.s32 	%rd30, %r3, 320;
	add.s64 	%rd32, %rd9, %rd30;
	mul.wide.s32 	%rd33, %r17, 16;
	add.s64 	%rd37, %rd32, %rd33;
	// f183..f186 = per-component weighted sums, f182 = sum of weights.
	mov.f32 	%f186, 0f00000000;
	mov.f32 	%f185, %f186;
	mov.f32 	%f184, %f186;
	mov.f32 	%f183, %f186;
	mov.f32 	%f182, %f186;
	// r59 = dy, iterating -2 .. 2.
	mov.u32 	%r59, -2;

BB2_16:
	// f114 = exp(-dy*dy / f1): the y factor for this tile row.
	mul.lo.s32 	%r57, %r59, %r59;
	cvt.rn.f32.s32	%f110, %r57;
	neg.ftz.f32 	%f111, %f110;
	div.approx.ftz.f32 	%f112, %f111, %f1;
	mul.ftz.f32 	%f113, %f112, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f114, %f113;
	// Tap dx = -2, weight f115 = f114 * f2.
	mul.ftz.f32 	%f115, %f114, %f2;
	ld.shared.v4.f32 	{%f116, %f117, %f118, %f119}, [%rd37];
	fma.rn.ftz.f32 	%f121, %f116, %f115, %f183;
	fma.rn.ftz.f32 	%f123, %f117, %f115, %f184;
	fma.rn.ftz.f32 	%f125, %f118, %f115, %f185;
	fma.rn.ftz.f32 	%f127, %f119, %f115, %f186;
	add.ftz.f32 	%f128, %f182, %f115;
	// Tap dx = -1, weight f129 = f114 * f3.
	mul.ftz.f32 	%f129, %f114, %f3;
	ld.shared.v4.f32 	{%f130, %f131, %f132, %f133}, [%rd37+16];
	fma.rn.ftz.f32 	%f135, %f130, %f129, %f121;
	fma.rn.ftz.f32 	%f137, %f131, %f129, %f123;
	fma.rn.ftz.f32 	%f139, %f132, %f129, %f125;
	fma.rn.ftz.f32 	%f141, %f133, %f129, %f127;
	add.ftz.f32 	%f142, %f128, %f129;
	// Tap dx = 0, weight f143 = f114 * f4.
	mul.ftz.f32 	%f143, %f114, %f4;
	ld.shared.v4.f32 	{%f144, %f145, %f146, %f147}, [%rd37+32];
	fma.rn.ftz.f32 	%f149, %f144, %f143, %f135;
	fma.rn.ftz.f32 	%f151, %f145, %f143, %f137;
	fma.rn.ftz.f32 	%f153, %f146, %f143, %f139;
	fma.rn.ftz.f32 	%f155, %f147, %f143, %f141;
	add.ftz.f32 	%f156, %f142, %f143;
	// Tap dx = +1, weight f129.
	ld.shared.v4.f32 	{%f157, %f158, %f159, %f160}, [%rd37+48];
	fma.rn.ftz.f32 	%f162, %f157, %f129, %f149;
	fma.rn.ftz.f32 	%f164, %f158, %f129, %f151;
	fma.rn.ftz.f32 	%f166, %f159, %f129, %f153;
	fma.rn.ftz.f32 	%f168, %f160, %f129, %f155;
	add.ftz.f32 	%f169, %f156, %f129;
	// Tap dx = +2, weight f115.
	ld.shared.v4.f32 	{%f170, %f171, %f172, %f173}, [%rd37+64];
	fma.rn.ftz.f32 	%f10, %f170, %f115, %f162;
	fma.rn.ftz.f32 	%f11, %f171, %f115, %f164;
	fma.rn.ftz.f32 	%f12, %f172, %f115, %f166;
	fma.rn.ftz.f32 	%f13, %f173, %f115, %f168;
	add.ftz.f32 	%f182, %f169, %f115;
	// Advance to the next tile row and carry the sums into the next iteration.
	add.s64 	%rd37, %rd37, 320;
	add.s32 	%r59, %r59, 1;
	setp.lt.s32	%p16, %r59, 3;
	mov.f32 	%f186, %f13;
	mov.f32 	%f185, %f12;
	mov.f32 	%f184, %f11;
	mov.f32 	%f183, %f10;
	@%p16 bra 	BB2_16;

	// Output element index = y * param_1 + x (unclamped), 16 bytes per element.
	cvta.to.global.u64 	%rd34, %rd6;
	mad.lo.s32 	%r58, %r4, %r11, %r1;
	mul.wide.s32 	%rd35, %r58, 16;
	add.s64 	%rd36, %rd34, %rd35;
	// Normalize; the store lists the components back in their load order.
	div.approx.ftz.f32 	%f178, %f13, %f182;
	div.approx.ftz.f32 	%f179, %f12, %f182;
	div.approx.ftz.f32 	%f180, %f11, %f182;
	div.approx.ftz.f32 	%f181, %f10, %f182;
	st.global.v4.f32 	[%rd36], {%f181, %f180, %f179, %f178};

BB2_18:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_3
// 7x7 weighted filter on 4-component f32 vectors, staged through shared memory.
// Every thread copies its own (clamped) element into a shared tile, threads near
// the block edge also copy halo elements three rows/columns away, and after a
// barrier each in-range thread sums its 7x7 neighbourhood from the tile and
// stores the sum divided by the total weight.
//   param_0  output base address
//   param_1  output row pitch, counted in 16-byte elements
//   param_2  input base address
//   param_3  input row pitch, counted in 16-byte elements
//   param_4  extent in x (loads clamp to [0, param_4-1]; only x < param_4 stores)
//   param_5  extent in y (loads clamp to [0, param_5-1]; only y < param_5 stores)
//   param_6  scale of the x term: weight factor exp(-dx*dx / (2*param_6^2))
//   param_7  scale of the y term: weight factor exp(-dy*dy / (2*param_7^2))
// Tile layout: 7744 bytes = 22 rows x 22 elements x 16 bytes, row stride 352.
// NOTE(review): the halo thresholds (tid <= 2 / tid >= 13) and the 22x22 tile
// suggest a 16x16 thread block - confirm against the host launch code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_3(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_3_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_3_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_3_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_3_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_3_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_3_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_3_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_3_param_7
)
{
	.reg .pred 	%p<17>;
	.reg .s32 	%r<60>;
	.reg .f32 	%f<218>;
	.reg .s64 	%rd<38>;
	// demoted variable
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_3$__cuda_local_var_180253_228_non_const_s_data[7744];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_3_param_0];
	ld.param.u32 	%r11, [_d_gauss_filter_kernel_shmem_float4_3_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_3_param_2];
	ld.param.u32 	%r12, [_d_gauss_filter_kernel_shmem_float4_3_param_3];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_shmem_float4_3_param_4];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_shmem_float4_3_param_5];
	ld.param.f32 	%f20, [_d_gauss_filter_kernel_shmem_float4_3_param_6];
	ld.param.f32 	%f21, [_d_gauss_filter_kernel_shmem_float4_3_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	// r1 = x = ntid.x * ctaid.x + tid.x; r2 = min(x, param_4 - 1)
	mov.u32 	%r15, %ntid.x;
	mov.u32 	%r16, %ctaid.x;
	mov.u32 	%r17, %tid.x;
	mad.lo.s32 	%r1, %r15, %r16, %r17;
	add.s32 	%r18, %r13, -1;
	min.s32 	%r2, %r1, %r18;
	// r4 = y = ntid.y * ctaid.y + tid.y; r5 = min(y, param_5 - 1)
	mov.u32 	%r19, %ntid.y;
	mov.u32 	%r20, %ctaid.y;
	mov.u32 	%r3, %tid.y;
	mad.lo.s32 	%r4, %r19, %r20, %r3;
	add.s32 	%r21, %r14, -1;
	min.s32 	%r5, %r4, %r21;
	// rd2 = tile base + tid.y * 352 + tid.x * 16, i.e. tile cell (tid.y, tid.x),
	// which is the top-left corner of this thread's 7x7 window.
	mul.wide.s32 	%rd8, %r3, 352;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_3$__cuda_local_var_180253_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r17, 16;
	add.s64 	%rd2, %rd10, %rd11;
	// Centre element: input (r5, r2) -> tile cell (tid.y + 3, tid.x + 3) at +1104.
	mul.lo.s32 	%r6, %r5, %r12;
	add.s32 	%r22, %r6, %r2;
	mul.wide.s32 	%rd12, %r22, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f22, %f23, %f24, %f25}, [%rd13];
	st.shared.v4.f32 	[%rd2+1104], {%f22, %f23, %f24, %f25};
	// p1 = tid.y > 2: such threads skip every top-halo load.
	setp.gt.s32	%p1, %r3, 2;
	@%p1 bra 	BB3_2;

	// Top halo: input row max(0, r5 - 3) -> tile cell (tid.y, tid.x + 3) at +48.
	add.s32 	%r23, %r5, -3;
	mov.u32 	%r24, 0;
	max.s32 	%r25, %r24, %r23;
	mad.lo.s32 	%r26, %r25, %r12, %r2;
	mul.wide.s32 	%rd14, %r26, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f30, %f31, %f32, %f33}, [%rd15];
	st.shared.v4.f32 	[%rd2+48], {%f30, %f31, %f32, %f33};

BB3_2:
	// p2 = tid.y < 13: such threads skip every bottom-halo load.
	setp.lt.s32	%p2, %r3, 13;
	@%p2 bra 	BB3_4;

	// Bottom halo: input row (r5 + 3 if < param_5, else param_5 - 1)
	// -> tile cell (tid.y + 6, tid.x + 3) at +2160.
	add.s32 	%r27, %r5, 3;
	setp.lt.s32	%p3, %r27, %r14;
	selp.b32	%r29, %r27, %r21, %p3;
	mad.lo.s32 	%r30, %r29, %r12, %r2;
	mul.wide.s32 	%rd16, %r30, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f38, %f39, %f40, %f41}, [%rd17];
	st.shared.v4.f32 	[%rd2+2160], {%f38, %f39, %f40, %f41};

BB3_4:
	// Threads with tid.x > 2 skip the left halo (and its corner cells).
	setp.gt.s32	%p4, %r17, 2;
	@%p4 bra 	BB3_9;

	// Left halo: input column r7 = max(0, r2 - 3) -> tile cell (tid.y + 3, tid.x) at +1056.
	add.s32 	%r32, %r2, -3;
	mov.u32 	%r33, 0;
	max.s32 	%r7, %r33, %r32;
	add.s32 	%r34, %r6, %r7;
	mul.wide.s32 	%rd18, %r34, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f46, %f47, %f48, %f49}, [%rd19];
	st.shared.v4.f32 	[%rd2+1056], {%f46, %f47, %f48, %f49};
	@%p1 bra 	BB3_7;

	// Top-left corner: input (max(0, r5 - 3), r7) -> tile cell (tid.y, tid.x) at +0.
	add.s32 	%r35, %r5, -3;
	max.s32 	%r37, %r33, %r35;
	mad.lo.s32 	%r38, %r37, %r12, %r7;
	mul.wide.s32 	%rd20, %r38, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f54, %f55, %f56, %f57}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f54, %f55, %f56, %f57};

BB3_7:
	@%p2 bra 	BB3_9;

	// Bottom-left corner: clamped row r5 + 3, column r7 -> tile cell (tid.y + 6, tid.x) at +2112.
	add.s32 	%r39, %r5, 3;
	setp.lt.s32	%p7, %r39, %r14;
	selp.b32	%r41, %r39, %r21, %p7;
	mad.lo.s32 	%r42, %r41, %r12, %r7;
	mul.wide.s32 	%rd22, %r42, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f62, %f63, %f64, %f65}, [%rd23];
	st.shared.v4.f32 	[%rd2+2112], {%f62, %f63, %f64, %f65};

BB3_9:
	// Threads with tid.x < 13 skip the right halo (and its corner cells).
	setp.lt.s32	%p8, %r17, 13;
	@%p8 bra 	BB3_14;

	// Right halo: input column r8 = (r2 + 3 if < param_4, else param_4 - 1)
	// -> tile cell (tid.y + 3, tid.x + 6) at +1152.
	add.s32 	%r44, %r2, 3;
	setp.lt.s32	%p9, %r44, %r13;
	selp.b32	%r8, %r44, %r18, %p9;
	add.s32 	%r46, %r6, %r8;
	mul.wide.s32 	%rd24, %r46, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f70, %f71, %f72, %f73}, [%rd25];
	st.shared.v4.f32 	[%rd2+1152], {%f70, %f71, %f72, %f73};
	@%p1 bra 	BB3_12;

	// Top-right corner: input (max(0, r5 - 3), r8) -> tile cell (tid.y, tid.x + 6) at +96.
	add.s32 	%r47, %r5, -3;
	mov.u32 	%r48, 0;
	max.s32 	%r49, %r48, %r47;
	mad.lo.s32 	%r50, %r49, %r12, %r8;
	mul.wide.s32 	%rd26, %r50, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f78, %f79, %f80, %f81}, [%rd27];
	st.shared.v4.f32 	[%rd2+96], {%f78, %f79, %f80, %f81};

BB3_12:
	@%p2 bra 	BB3_14;

	// Bottom-right corner: clamped row r5 + 3, column r8 -> tile cell (tid.y + 6, tid.x + 6) at +2208.
	add.s32 	%r51, %r5, 3;
	setp.lt.s32	%p12, %r51, %r14;
	selp.b32	%r53, %r51, %r21, %p12;
	mad.lo.s32 	%r54, %r53, %r12, %r8;
	mul.wide.s32 	%rd28, %r54, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f86, %f87, %f88, %f89}, [%rd29];
	st.shared.v4.f32 	[%rd2+2208], {%f86, %f87, %f88, %f89};

BB3_14:
	// All threads reach this barrier (the range check comes after it), so the
	// tile is completely written before any thread reads it.
	bar.sync 	0;
	setp.lt.s32	%p13, %r4, %r14;
	setp.lt.s32	%p14, %r1, %r13;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB3_18;
	bra.uni 	BB3_15;

BB3_15:
	// f100 = 2 * param_6^2 (x denominator), f1 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f99, %f20, %f20;
	fma.rn.ftz.f32 	%f100, %f20, %f20, %f99;
	mul.ftz.f32 	%f101, %f21, %f21;
	fma.rn.ftz.f32 	%f1, %f21, %f21, %f101;
	// Precompute the x factors once. 0f3FB8AA3B = log2(e), so
	// ex2(v * log2(e)) = exp(v).
	// f2 = exp(-9 / f100) for dx = +/-3 (0fC1100000 = -9.0).
	mov.f32 	%f102, 0fC1100000;
	div.approx.ftz.f32 	%f103, %f102, %f100;
	mul.ftz.f32 	%f104, %f103, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f2, %f104;
	// f3 = exp(-4 / f100) for dx = +/-2 (0fC0800000 = -4.0).
	mov.f32 	%f105, 0fC0800000;
	div.approx.ftz.f32 	%f106, %f105, %f100;
	mul.ftz.f32 	%f107, %f106, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f3, %f107;
	// f4 = exp(-1 / f100) for dx = +/-1 (0fBF800000 = -1.0).
	mov.f32 	%f108, 0fBF800000;
	div.approx.ftz.f32 	%f109, %f108, %f100;
	mul.ftz.f32 	%f110, %f109, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f4, %f110;
	// f5 = exp(-0 / f100) for dx = 0 (0f80000000 = -0.0).
	mov.f32 	%f111, 0f80000000;
	div.approx.ftz.f32 	%f112, %f111, %f100;
	mul.ftz.f32 	%f113, %f112, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f5, %f113;
	// rd37 = tile cell (tid.y, tid.x): the same address as rd2, recomputed here
	// as the row pointer that the loop advances.
	mul.wide.s32 	%rd30, %r3, 352;
	add.s64 	%rd32, %rd9, %rd30;
	mul.wide.s32 	%rd33, %r17, 16;
	add.s64 	%rd37, %rd32, %rd33;
	// f214..f217 = per-component weighted sums, f213 = sum of weights.
	mov.f32 	%f217, 0f00000000;
	mov.f32 	%f216, %f217;
	mov.f32 	%f215, %f217;
	mov.f32 	%f214, %f217;
	mov.f32 	%f213, %f217;
	// r59 = dy, iterating -3 .. 3.
	mov.u32 	%r59, -3;

BB3_16:
	// f118 = exp(-dy*dy / f1): the y factor for this tile row.
	mul.lo.s32 	%r57, %r59, %r59;
	cvt.rn.f32.s32	%f114, %r57;
	neg.ftz.f32 	%f115, %f114;
	div.approx.ftz.f32 	%f116, %f115, %f1;
	mul.ftz.f32 	%f117, %f116, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f118, %f117;
	// Tap dx = -3, weight f119 = f118 * f2.
	mul.ftz.f32 	%f119, %f118, %f2;
	ld.shared.v4.f32 	{%f120, %f121, %f122, %f123}, [%rd37];
	fma.rn.ftz.f32 	%f125, %f120, %f119, %f214;
	fma.rn.ftz.f32 	%f127, %f121, %f119, %f215;
	fma.rn.ftz.f32 	%f129, %f122, %f119, %f216;
	fma.rn.ftz.f32 	%f131, %f123, %f119, %f217;
	add.ftz.f32 	%f132, %f213, %f119;
	// Tap dx = -2, weight f133 = f118 * f3.
	mul.ftz.f32 	%f133, %f118, %f3;
	ld.shared.v4.f32 	{%f134, %f135, %f136, %f137}, [%rd37+16];
	fma.rn.ftz.f32 	%f139, %f134, %f133, %f125;
	fma.rn.ftz.f32 	%f141, %f135, %f133, %f127;
	fma.rn.ftz.f32 	%f143, %f136, %f133, %f129;
	fma.rn.ftz.f32 	%f145, %f137, %f133, %f131;
	add.ftz.f32 	%f146, %f132, %f133;
	// Tap dx = -1, weight f147 = f118 * f4.
	mul.ftz.f32 	%f147, %f118, %f4;
	ld.shared.v4.f32 	{%f148, %f149, %f150, %f151}, [%rd37+32];
	fma.rn.ftz.f32 	%f153, %f148, %f147, %f139;
	fma.rn.ftz.f32 	%f155, %f149, %f147, %f141;
	fma.rn.ftz.f32 	%f157, %f150, %f147, %f143;
	fma.rn.ftz.f32 	%f159, %f151, %f147, %f145;
	add.ftz.f32 	%f160, %f146, %f147;
	// Tap dx = 0, weight f161 = f118 * f5.
	mul.ftz.f32 	%f161, %f118, %f5;
	ld.shared.v4.f32 	{%f162, %f163, %f164, %f165}, [%rd37+48];
	fma.rn.ftz.f32 	%f167, %f162, %f161, %f153;
	fma.rn.ftz.f32 	%f169, %f163, %f161, %f155;
	fma.rn.ftz.f32 	%f171, %f164, %f161, %f157;
	fma.rn.ftz.f32 	%f173, %f165, %f161, %f159;
	add.ftz.f32 	%f174, %f160, %f161;
	// Tap dx = +1, weight f147.
	ld.shared.v4.f32 	{%f175, %f176, %f177, %f178}, [%rd37+64];
	fma.rn.ftz.f32 	%f180, %f175, %f147, %f167;
	fma.rn.ftz.f32 	%f182, %f176, %f147, %f169;
	fma.rn.ftz.f32 	%f184, %f177, %f147, %f171;
	fma.rn.ftz.f32 	%f186, %f178, %f147, %f173;
	add.ftz.f32 	%f187, %f174, %f147;
	// Tap dx = +2, weight f133.
	ld.shared.v4.f32 	{%f188, %f189, %f190, %f191}, [%rd37+80];
	fma.rn.ftz.f32 	%f193, %f188, %f133, %f180;
	fma.rn.ftz.f32 	%f195, %f189, %f133, %f182;
	fma.rn.ftz.f32 	%f197, %f190, %f133, %f184;
	fma.rn.ftz.f32 	%f199, %f191, %f133, %f186;
	add.ftz.f32 	%f200, %f187, %f133;
	// Tap dx = +3, weight f119.
	ld.shared.v4.f32 	{%f201, %f202, %f203, %f204}, [%rd37+96];
	fma.rn.ftz.f32 	%f11, %f201, %f119, %f193;
	fma.rn.ftz.f32 	%f12, %f202, %f119, %f195;
	fma.rn.ftz.f32 	%f13, %f203, %f119, %f197;
	fma.rn.ftz.f32 	%f14, %f204, %f119, %f199;
	add.ftz.f32 	%f213, %f200, %f119;
	// Advance to the next tile row and carry the sums into the next iteration.
	add.s64 	%rd37, %rd37, 352;
	add.s32 	%r59, %r59, 1;
	setp.lt.s32	%p16, %r59, 4;
	mov.f32 	%f217, %f14;
	mov.f32 	%f216, %f13;
	mov.f32 	%f215, %f12;
	mov.f32 	%f214, %f11;
	@%p16 bra 	BB3_16;

	// Output element index = y * param_1 + x (unclamped), 16 bytes per element.
	cvta.to.global.u64 	%rd34, %rd6;
	mad.lo.s32 	%r58, %r4, %r11, %r1;
	mul.wide.s32 	%rd35, %r58, 16;
	add.s64 	%rd36, %rd34, %rd35;
	// Normalize; the store lists the components back in their load order.
	div.approx.ftz.f32 	%f209, %f14, %f213;
	div.approx.ftz.f32 	%f210, %f13, %f213;
	div.approx.ftz.f32 	%f211, %f12, %f213;
	div.approx.ftz.f32 	%f212, %f11, %f213;
	st.global.v4.f32 	[%rd36], {%f212, %f211, %f210, %f209};

BB3_18:
	ret;
}

.visible .entry _d_gauss_filter_kernel_shmem_float4_4(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_4_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_4_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_4_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_4_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_4_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_4_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_4_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_4_param_7
)
{
	.reg .pred 	%p<17>;
	.reg .s32 	%r<60>;
	.reg .f32 	%f<249>;
	.reg .s64 	%rd<39>;
	// demoted variable
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_4$__cuda_local_var_180257_228_non_const_s_data[9216];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_4_param_0];
	ld.param.u32 	%r11, [_d_gauss_filter_kernel_shmem_float4_4_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_4_param_2];
	ld.param.u32 	%r12, [_d_gauss_filter_kernel_shmem_float4_4_param_3];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_shmem_float4_4_param_4];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_shmem_float4_4_param_5];
	ld.param.f32 	%f21, [_d_gauss_filter_kernel_shmem_float4_4_param_6];
	ld.param.f32 	%f22, [_d_gauss_filter_kernel_shmem_float4_4_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	mov.u32 	%r15, %ntid.x;
	mov.u32 	%r16, %ctaid.x;
	mov.u32 	%r17, %tid.x;
	mad.lo.s32 	%r1, %r15, %r16, %r17;
	add.s32 	%r18, %r13, -1;
	min.s32 	%r2, %r1, %r18;
	mov.u32 	%r19, %ntid.y;
	mov.u32 	%r20, %ctaid.y;
	mov.u32 	%r3, %tid.y;
	mad.lo.s32 	%r4, %r19, %r20, %r3;
	add.s32 	%r21, %r14, -1;
	min.s32 	%r5, %r4, %r21;
	mul.wide.s32 	%rd8, %r3, 384;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_4$__cuda_local_var_180257_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r17, 16;
	add.s64 	%rd2, %rd10, %rd11;
	mul.lo.s32 	%r6, %r5, %r12;
	add.s32 	%r22, %r6, %r2;
	mul.wide.s32 	%rd12, %r22, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f23, %f24, %f25, %f26}, [%rd13];
	st.shared.v4.f32 	[%rd2+1600], {%f23, %f24, %f25, %f26};
	setp.gt.s32	%p1, %r3, 3;
	@%p1 bra 	BB4_2;

	add.s32 	%r23, %r5, -4;
	mov.u32 	%r24, 0;
	max.s32 	%r25, %r24, %r23;
	mad.lo.s32 	%r26, %r25, %r12, %r2;
	mul.wide.s32 	%rd14, %r26, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f31, %f32, %f33, %f34}, [%rd15];
	st.shared.v4.f32 	[%rd2+64], {%f31, %f32, %f33, %f34};

BB4_2:
	setp.lt.s32	%p2, %r3, 12;
	@%p2 bra 	BB4_4;

	add.s32 	%r27, %r5, 4;
	setp.lt.s32	%p3, %r27, %r14;
	selp.b32	%r29, %r27, %r21, %p3;
	mad.lo.s32 	%r30, %r29, %r12, %r2;
	mul.wide.s32 	%rd16, %r30, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f39, %f40, %f41, %f42}, [%rd17];
	st.shared.v4.f32 	[%rd2+3136], {%f39, %f40, %f41, %f42};

BB4_4:
	setp.gt.s32	%p4, %r17, 3;
	@%p4 bra 	BB4_9;

	add.s32 	%r32, %r2, -4;
	mov.u32 	%r33, 0;
	max.s32 	%r7, %r33, %r32;
	add.s32 	%r34, %r6, %r7;
	mul.wide.s32 	%rd18, %r34, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f47, %f48, %f49, %f50}, [%rd19];
	st.shared.v4.f32 	[%rd2+1536], {%f47, %f48, %f49, %f50};
	@%p1 bra 	BB4_7;

	add.s32 	%r35, %r5, -4;
	max.s32 	%r37, %r33, %r35;
	mad.lo.s32 	%r38, %r37, %r12, %r7;
	mul.wide.s32 	%rd20, %r38, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f55, %f56, %f57, %f58}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f55, %f56, %f57, %f58};

BB4_7:
	@%p2 bra 	BB4_9;

	add.s32 	%r39, %r5, 4;
	setp.lt.s32	%p7, %r39, %r14;
	selp.b32	%r41, %r39, %r21, %p7;
	mad.lo.s32 	%r42, %r41, %r12, %r7;
	mul.wide.s32 	%rd22, %r42, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f63, %f64, %f65, %f66}, [%rd23];
	st.shared.v4.f32 	[%rd2+3072], {%f63, %f64, %f65, %f66};

BB4_9:
	setp.lt.s32	%p8, %r17, 12;
	@%p8 bra 	BB4_14;

	add.s32 	%r44, %r2, 4;
	setp.lt.s32	%p9, %r44, %r13;
	selp.b32	%r8, %r44, %r18, %p9;
	add.s32 	%r46, %r6, %r8;
	mul.wide.s32 	%rd24, %r46, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f71, %f72, %f73, %f74}, [%rd25];
	st.shared.v4.f32 	[%rd2+1664], {%f71, %f72, %f73, %f74};
	@%p1 bra 	BB4_12;

	add.s32 	%r47, %r5, -4;
	mov.u32 	%r48, 0;
	max.s32 	%r49, %r48, %r47;
	mad.lo.s32 	%r50, %r49, %r12, %r8;
	mul.wide.s32 	%rd26, %r50, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f79, %f80, %f81, %f82}, [%rd27];
	st.shared.v4.f32 	[%rd2+128], {%f79, %f80, %f81, %f82};

BB4_12:
	@%p2 bra 	BB4_14;

	add.s32 	%r51, %r5, 4;
	setp.lt.s32	%p12, %r51, %r14;
	selp.b32	%r53, %r51, %r21, %p12;
	mad.lo.s32 	%r54, %r53, %r12, %r8;
	mul.wide.s32 	%rd28, %r54, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f87, %f88, %f89, %f90}, [%rd29];
	st.shared.v4.f32 	[%rd2+3200], {%f87, %f88, %f89, %f90};

BB4_14:
	bar.sync 	0;
	setp.lt.s32	%p13, %r4, %r14;
	setp.lt.s32	%p14, %r1, %r13;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB4_18;
	bra.uni 	BB4_15;

BB4_15:
	mul.ftz.f32 	%f100, %f21, %f21;
	fma.rn.ftz.f32 	%f101, %f21, %f21, %f100;
	mul.ftz.f32 	%f102, %f22, %f22;
	fma.rn.ftz.f32 	%f1, %f22, %f22, %f102;
	mov.f32 	%f103, 0fC1800000;
	div.approx.ftz.f32 	%f104, %f103, %f101;
	mul.ftz.f32 	%f105, %f104, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f2, %f105;
	mov.f32 	%f106, 0fC1100000;
	div.approx.ftz.f32 	%f107, %f106, %f101;
	mul.ftz.f32 	%f108, %f107, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f3, %f108;
	mov.f32 	%f109, 0fC0800000;
	div.approx.ftz.f32 	%f110, %f109, %f101;
	mul.ftz.f32 	%f111, %f110, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f4, %f111;
	mov.f32 	%f112, 0fBF800000;
	div.approx.ftz.f32 	%f113, %f112, %f101;
	mul.ftz.f32 	%f114, %f113, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f5, %f114;
	mov.f32 	%f115, 0f80000000;
	div.approx.ftz.f32 	%f116, %f115, %f101;
	mul.ftz.f32 	%f117, %f116, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f6, %f117;
	mul.wide.s32 	%rd30, %r3, 384;
	add.s64 	%rd32, %rd9, %rd30;
	mul.wide.s32 	%rd33, %r17, 16;
	add.s64 	%rd34, %rd33, %rd32;
	add.s64 	%rd38, %rd34, 64;
	mov.f32 	%f248, 0f00000000;
	mov.f32 	%f247, %f248;
	mov.f32 	%f246, %f248;
	mov.f32 	%f245, %f248;
	mov.f32 	%f244, %f248;
	mov.u32 	%r59, -4;

BB4_16:
	mul.lo.s32 	%r57, %r59, %r59;
	cvt.rn.f32.s32	%f118, %r57;
	neg.ftz.f32 	%f119, %f118;
	div.approx.ftz.f32 	%f120, %f119, %f1;
	mul.ftz.f32 	%f121, %f120, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f122, %f121;
	mul.ftz.f32 	%f123, %f122, %f2;
	ld.shared.v4.f32 	{%f124, %f125, %f126, %f127}, [%rd38+-64];
	fma.rn.ftz.f32 	%f129, %f124, %f123, %f245;
	fma.rn.ftz.f32 	%f131, %f125, %f123, %f246;
	fma.rn.ftz.f32 	%f133, %f126, %f123, %f247;
	fma.rn.ftz.f32 	%f135, %f127, %f123, %f248;
	add.ftz.f32 	%f136, %f244, %f123;
	mul.ftz.f32 	%f137, %f122, %f3;
	ld.shared.v4.f32 	{%f138, %f139, %f140, %f141}, [%rd38+-48];
	fma.rn.ftz.f32 	%f143, %f138, %f137, %f129;
	fma.rn.ftz.f32 	%f145, %f139, %f137, %f131;
	fma.rn.ftz.f32 	%f147, %f140, %f137, %f133;
	fma.rn.ftz.f32 	%f149, %f141, %f137, %f135;
	add.ftz.f32 	%f150, %f136, %f137;
	mul.ftz.f32 	%f151, %f122, %f4;
	ld.shared.v4.f32 	{%f152, %f153, %f154, %f155}, [%rd38+-32];
	fma.rn.ftz.f32 	%f157, %f152, %f151, %f143;
	fma.rn.ftz.f32 	%f159, %f153, %f151, %f145;
	fma.rn.ftz.f32 	%f161, %f154, %f151, %f147;
	fma.rn.ftz.f32 	%f163, %f155, %f151, %f149;
	add.ftz.f32 	%f164, %f150, %f151;
	mul.ftz.f32 	%f165, %f122, %f5;
	ld.shared.v4.f32 	{%f166, %f167, %f168, %f169}, [%rd38+-16];
	fma.rn.ftz.f32 	%f171, %f166, %f165, %f157;
	fma.rn.ftz.f32 	%f173, %f167, %f165, %f159;
	fma.rn.ftz.f32 	%f175, %f168, %f165, %f161;
	fma.rn.ftz.f32 	%f177, %f169, %f165, %f163;
	add.ftz.f32 	%f178, %f164, %f165;
	mul.ftz.f32 	%f179, %f122, %f6;
	ld.shared.v4.f32 	{%f180, %f181, %f182, %f183}, [%rd38];
	fma.rn.ftz.f32 	%f185, %f180, %f179, %f171;
	fma.rn.ftz.f32 	%f187, %f181, %f179, %f173;
	fma.rn.ftz.f32 	%f189, %f182, %f179, %f175;
	fma.rn.ftz.f32 	%f191, %f183, %f179, %f177;
	add.ftz.f32 	%f192, %f178, %f179;
	ld.shared.v4.f32 	{%f193, %f194, %f195, %f196}, [%rd38+16];
	fma.rn.ftz.f32 	%f198, %f193, %f165, %f185;
	fma.rn.ftz.f32 	%f200, %f194, %f165, %f187;
	fma.rn.ftz.f32 	%f202, %f195, %f165, %f189;
	fma.rn.ftz.f32 	%f204, %f196, %f165, %f191;
	add.ftz.f32 	%f205, %f192, %f165;
	ld.shared.v4.f32 	{%f206, %f207, %f208, %f209}, [%rd38+32];
	fma.rn.ftz.f32 	%f211, %f206, %f151, %f198;
	fma.rn.ftz.f32 	%f213, %f207, %f151, %f200;
	fma.rn.ftz.f32 	%f215, %f208, %f151, %f202;
	fma.rn.ftz.f32 	%f217, %f209, %f151, %f204;
	add.ftz.f32 	%f218, %f205, %f151;
	ld.shared.v4.f32 	{%f219, %f220, %f221, %f222}, [%rd38+48];
	fma.rn.ftz.f32 	%f224, %f219, %f137, %f211;
	fma.rn.ftz.f32 	%f226, %f220, %f137, %f213;
	fma.rn.ftz.f32 	%f228, %f221, %f137, %f215;
	fma.rn.ftz.f32 	%f230, %f222, %f137, %f217;
	add.ftz.f32 	%f231, %f218, %f137;
	ld.shared.v4.f32 	{%f232, %f233, %f234, %f235}, [%rd38+64];
	fma.rn.ftz.f32 	%f12, %f232, %f123, %f224;
	fma.rn.ftz.f32 	%f13, %f233, %f123, %f226;
	fma.rn.ftz.f32 	%f14, %f234, %f123, %f228;
	fma.rn.ftz.f32 	%f15, %f235, %f123, %f230;
	add.ftz.f32 	%f244, %f231, %f123;
	add.s64 	%rd38, %rd38, 384;
	add.s32 	%r59, %r59, 1;
	setp.lt.s32	%p16, %r59, 5;
	mov.f32 	%f248, %f15;
	mov.f32 	%f247, %f14;
	mov.f32 	%f246, %f13;
	mov.f32 	%f245, %f12;
	@%p16 bra 	BB4_16;

	cvta.to.global.u64 	%rd35, %rd6;
	mad.lo.s32 	%r58, %r4, %r11, %r1;
	mul.wide.s32 	%rd36, %r58, 16;
	add.s64 	%rd37, %rd35, %rd36;
	div.approx.ftz.f32 	%f240, %f15, %f244;
	div.approx.ftz.f32 	%f241, %f14, %f244;
	div.approx.ftz.f32 	%f242, %f13, %f244;
	div.approx.ftz.f32 	%f243, %f12, %f244;
	st.global.v4.f32 	[%rd37], {%f243, %f242, %f241, %f240};

BB4_18:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_5
//
// Weighted 11 x 11 (radius 5) filter over 16-byte elements (4 x f32), staged
// through a shared-memory tile. Each thread copies its own element plus any
// halo elements it is responsible for from the input into the tile, waits at a
// CTA barrier, then accumulates weight * element over the 11 x 11 window and
// stores the accumulated vector divided by the sum of the weights.
//
// Parameters, as they are used in the body:
//   param_0 : u64 output base address (converted with cvta.to.global).
//   param_1 : u32 multiplied by the thread's y coordinate to form the output
//             element index, i.e. the output row pitch in 16-byte elements.
//   param_2 : u64 input base address (converted with cvta.to.global).
//   param_3 : u32 input row pitch in 16-byte elements.
//   param_4 : u32 x extent; x coordinates are clamped to param_4 - 1 for loads
//             and threads with x >= param_4 produce no output.
//   param_5 : u32 y extent; same role as param_4 for the y coordinate.
//   param_6 : f32 s_x; the x weight is exp(-dx^2 / (2 * s_x^2)).
//   param_7 : f32 s_y; the y weight is exp(-dy^2 / (2 * s_y^2)).
//             (presumably the Gaussian sigmas - confirm against the CUDA source)
//
// NOTE(review): the halo thresholds (tid <= 4 / tid >= 11) and the 26-element
// tile rows only line up for a 16 x 16 thread block. The launch configuration
// is not visible in this file - confirm against the host code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_5(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_5_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_5_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_5_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_5_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_5_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_5_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_5_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_5_param_7
)
{
	.reg .pred 	%p<18>;
	.reg .s32 	%r<85>;
	.reg .f32 	%f<131>;
	.reg .s64 	%rd<39>;
	// demoted variable
	// Shared tile: 10816 bytes = 26 rows x 26 elements x 16 bytes.
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_5$__cuda_local_var_180261_228_non_const_s_data[10816];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_5_param_0];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_shmem_float4_5_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_5_param_2];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_shmem_float4_5_param_3];
	ld.param.u32 	%r15, [_d_gauss_filter_kernel_shmem_float4_5_param_4];
	ld.param.u32 	%r16, [_d_gauss_filter_kernel_shmem_float4_5_param_5];
	ld.param.f32 	%f23, [_d_gauss_filter_kernel_shmem_float4_5_param_6];
	ld.param.f32 	%f24, [_d_gauss_filter_kernel_shmem_float4_5_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	// %r1 = x = ntid.x * ctaid.x + tid.x; %r2 = min(x, param_4 - 1).
	mov.u32 	%r17, %ntid.x;
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %tid.x;
	mad.lo.s32 	%r1, %r17, %r18, %r19;
	add.s32 	%r20, %r15, -1;
	min.s32 	%r2, %r1, %r20;
	// %r3 = y = ntid.y * ctaid.y + tid.y; %r4 = min(y, param_5 - 1).
	mov.u32 	%r21, %ntid.y;
	mov.u32 	%r22, %ctaid.y;
	mov.u32 	%r23, %tid.y;
	mad.lo.s32 	%r3, %r21, %r22, %r23;
	add.s32 	%r24, %r16, -1;
	min.s32 	%r4, %r3, %r24;
	// %rd2 = tile address of (row tid.y, column tid.x); a tile row is 416 bytes
	// (26 elements of 16 bytes).
	mul.wide.s32 	%rd8, %r23, 416;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_5$__cuda_local_var_180261_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r19, 16;
	add.s64 	%rd2, %rd10, %rd11;
	// Centre element: input (clamped y, clamped x) goes to tile
	// (tid.y + 5, tid.x + 5); 2160 = 5 * 416 + 5 * 16.
	mad.lo.s32 	%r25, %r4, %r14, %r2;
	mul.wide.s32 	%rd12, %r25, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f25, %f26, %f27, %f28}, [%rd13];
	st.shared.v4.f32 	[%rd2+2160], {%f25, %f26, %f27, %f28};
	// %p1 is set when tid.y > 4, i.e. the thread has no top-halo work.
	setp.gt.s32	%p1, %r23, 4;
	@%p1 bra 	BB5_2;

	// Top halo: input row max(0, y - 5) goes to tile (tid.y, tid.x + 5).
	add.s32 	%r26, %r4, -5;
	mov.u32 	%r27, 0;
	max.s32 	%r28, %r27, %r26;
	mad.lo.s32 	%r29, %r28, %r14, %r2;
	mul.wide.s32 	%rd14, %r29, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f33, %f34, %f35, %f36}, [%rd15];
	st.shared.v4.f32 	[%rd2+80], {%f33, %f34, %f35, %f36};

BB5_2:
	// %p2 is set when tid.y < 11, i.e. the thread has no bottom-halo work.
	setp.lt.s32	%p2, %r23, 11;
	@%p2 bra 	BB5_4;

	// Bottom halo: input row (y + 5, or param_5 - 1 when that is out of range)
	// goes to tile (tid.y + 10, tid.x + 5); 4240 = 10 * 416 + 5 * 16.
	add.s32 	%r31, %r4, 5;
	setp.lt.s32	%p3, %r31, %r16;
	selp.b32	%r33, %r31, %r24, %p3;
	mad.lo.s32 	%r34, %r33, %r14, %r2;
	mul.wide.s32 	%rd16, %r34, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f41, %f42, %f43, %f44}, [%rd17];
	st.shared.v4.f32 	[%rd2+4240], {%f41, %f42, %f43, %f44};

BB5_4:
	// Threads with tid.x > 4 have no left-halo work.
	setp.gt.s32	%p4, %r19, 4;
	@%p4 bra 	BB5_9;

	// Left halo: input column %r5 = max(0, x - 5) goes to tile
	// (tid.y + 5, tid.x); 2080 = 5 * 416.
	add.s32 	%r37, %r2, -5;
	mov.u32 	%r38, 0;
	max.s32 	%r5, %r38, %r37;
	mad.lo.s32 	%r39, %r4, %r14, %r5;
	mul.wide.s32 	%rd18, %r39, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f49, %f50, %f51, %f52}, [%rd19];
	st.shared.v4.f32 	[%rd2+2080], {%f49, %f50, %f51, %f52};
	@%p1 bra 	BB5_7;

	// Top-left corner: tile (tid.y, tid.x).
	add.s32 	%r40, %r4, -5;
	max.s32 	%r42, %r38, %r40;
	mad.lo.s32 	%r43, %r42, %r14, %r5;
	mul.wide.s32 	%rd20, %r43, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f57, %f58, %f59, %f60}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f57, %f58, %f59, %f60};

BB5_7:
	@%p2 bra 	BB5_9;

	// Bottom-left corner: tile (tid.y + 10, tid.x); 4160 = 10 * 416.
	add.s32 	%r45, %r4, 5;
	setp.lt.s32	%p7, %r45, %r16;
	selp.b32	%r47, %r45, %r24, %p7;
	mad.lo.s32 	%r48, %r47, %r14, %r5;
	mul.wide.s32 	%rd22, %r48, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f65, %f66, %f67, %f68}, [%rd23];
	st.shared.v4.f32 	[%rd2+4160], {%f65, %f66, %f67, %f68};

BB5_9:
	// Threads with tid.x < 11 have no right-halo work.
	setp.lt.s32	%p8, %r19, 11;
	@%p8 bra 	BB5_14;

	// Right halo: input column %r6 = (x + 5, or param_4 - 1 when that is out
	// of range) goes to tile (tid.y + 5, tid.x + 10); 2240 = 5 * 416 + 10 * 16.
	add.s32 	%r51, %r2, 5;
	setp.lt.s32	%p9, %r51, %r15;
	selp.b32	%r6, %r51, %r20, %p9;
	mad.lo.s32 	%r53, %r4, %r14, %r6;
	mul.wide.s32 	%rd24, %r53, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f73, %f74, %f75, %f76}, [%rd25];
	st.shared.v4.f32 	[%rd2+2240], {%f73, %f74, %f75, %f76};
	@%p1 bra 	BB5_12;

	// Top-right corner: tile (tid.y, tid.x + 10).
	add.s32 	%r54, %r4, -5;
	mov.u32 	%r55, 0;
	max.s32 	%r56, %r55, %r54;
	mad.lo.s32 	%r57, %r56, %r14, %r6;
	mul.wide.s32 	%rd26, %r57, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f81, %f82, %f83, %f84}, [%rd27];
	st.shared.v4.f32 	[%rd2+160], {%f81, %f82, %f83, %f84};

BB5_12:
	@%p2 bra 	BB5_14;

	// Bottom-right corner: tile (tid.y + 10, tid.x + 10);
	// 4320 = 10 * 416 + 10 * 16.
	add.s32 	%r59, %r4, 5;
	setp.lt.s32	%p12, %r59, %r16;
	selp.b32	%r61, %r59, %r24, %p12;
	mad.lo.s32 	%r62, %r61, %r14, %r6;
	mul.wide.s32 	%rd28, %r62, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f89, %f90, %f91, %f92}, [%rd29];
	st.shared.v4.f32 	[%rd2+4320], {%f89, %f90, %f91, %f92};

BB5_14:
	// The barrier comes before the extent test below, so threads that will
	// not produce output still take part in filling the tile and in the barrier.
	bar.sync 	0;
	// Only threads with x < param_4 and y < param_5 compute and store.
	setp.lt.s32	%p13, %r3, %r16;
	setp.lt.s32	%p14, %r1, %r15;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB5_20;
	bra.uni 	BB5_15;

BB5_15:
	// %f1 = 2 * param_6^2 (x denominator), %f2 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f102, %f23, %f23;
	fma.rn.ftz.f32 	%f1, %f23, %f23, %f102;
	mul.ftz.f32 	%f103, %f24, %f24;
	fma.rn.ftz.f32 	%f2, %f24, %f24, %f103;
	// %f127..%f130 accumulate the four components, %f126 the weight sum.
	mov.f32 	%f130, 0f00000000;
	mov.f32 	%f129, %f130;
	mov.f32 	%f128, %f130;
	mov.f32 	%f127, %f130;
	mov.f32 	%f126, %f130;
	// %r81 = dy, running -5..5; %r84 = window row, running 0..10.
	mov.u32 	%r81, -5;
	mov.u32 	%r63, 0;
	mov.u32 	%r84, %r63;

BB5_16:
	// %rd38 = tile address of (tid.y + %r84, tid.x): the first element of the
	// current window row (26 elements per row, shifted left by 4 for 16 bytes).
	add.s32 	%r67, %r23, %r84;
	mul.wide.s32 	%rd30, %r67, 26;
	mul.wide.s32 	%rd31, %r19, 16;
	add.s64 	%rd33, %rd9, %rd31;
	shl.b64 	%rd34, %rd30, 4;
	add.s64 	%rd38, %rd33, %rd34;
	// %f8 = exp(-dy^2 / %f2). 0f3FB8AA3B is ~1.442695 (log2 e), so
	// ex2(v * log2 e) evaluates e^v.
	mul.lo.s32 	%r69, %r81, %r81;
	cvt.rn.f32.s32	%f104, %r69;
	neg.ftz.f32 	%f105, %f104;
	div.approx.ftz.f32 	%f106, %f105, %f2;
	mul.ftz.f32 	%f107, %f106, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f8, %f107;
	mov.u32 	%r83, %r63;

BB5_17:
	// Inner loop over the 11 columns of the row: %r9 runs 0..10, dx = %r9 - 5.
	mov.u32 	%r9, %r83;
	add.s32 	%r70, %r9, -5;
	mul.lo.s32 	%r71, %r70, %r70;
	cvt.rn.f32.s32	%f108, %r71;
	neg.ftz.f32 	%f109, %f108;
	div.approx.ftz.f32 	%f110, %f109, %f1;
	mul.ftz.f32 	%f111, %f110, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f112, %f111;
	// %f113 = combined weight = y term * x term.
	mul.ftz.f32 	%f113, %f8, %f112;
	ld.shared.v4.f32 	{%f114, %f115, %f116, %f117}, [%rd38];
	fma.rn.ftz.f32 	%f14, %f114, %f113, %f127;
	fma.rn.ftz.f32 	%f15, %f115, %f113, %f128;
	fma.rn.ftz.f32 	%f16, %f116, %f113, %f129;
	fma.rn.ftz.f32 	%f17, %f117, %f113, %f130;
	add.ftz.f32 	%f126, %f126, %f113;
	add.s32 	%r10, %r9, 1;
	// Step to the next element in the tile row.
	add.s64 	%rd38, %rd38, 16;
	// The test uses the pre-increment index, so the loop exits after index 10.
	setp.ne.s32	%p16, %r9, 10;
	mov.f32 	%f130, %f17;
	mov.f32 	%f129, %f16;
	mov.f32 	%f128, %f15;
	mov.f32 	%f127, %f14;
	mov.u32 	%r83, %r10;
	@%p16 bra 	BB5_17;

	// Next window row; the outer loop continues while dy < 6.
	add.s32 	%r81, %r81, 1;
	setp.lt.s32	%p17, %r81, 6;
	add.s32 	%r84, %r84, 1;
	@%p17 bra 	BB5_16;

	// Output element index = y * param_1 + x (unclamped coordinates), 16 bytes
	// per element. Each accumulated component is divided by the weight sum.
	cvta.to.global.u64 	%rd35, %rd6;
	mad.lo.s32 	%r80, %r3, %r13, %r1;
	mul.wide.s32 	%rd36, %r80, 16;
	add.s64 	%rd37, %rd35, %rd36;
	div.approx.ftz.f32 	%f122, %f17, %f126;
	div.approx.ftz.f32 	%f123, %f16, %f126;
	div.approx.ftz.f32 	%f124, %f15, %f126;
	div.approx.ftz.f32 	%f125, %f14, %f126;
	st.global.v4.f32 	[%rd37], {%f125, %f124, %f123, %f122};

BB5_20:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_6
//
// Weighted 13 x 13 (radius 6) filter over 16-byte elements (4 x f32), staged
// through a shared-memory tile. Each thread copies its own element plus any
// halo elements it is responsible for from the input into the tile, waits at a
// CTA barrier, then accumulates weight * element over the 13 x 13 window and
// stores the accumulated vector divided by the sum of the weights.
//
// Parameters, as they are used in the body:
//   param_0 : u64 output base address (converted with cvta.to.global).
//   param_1 : u32 multiplied by the thread's y coordinate to form the output
//             element index, i.e. the output row pitch in 16-byte elements.
//   param_2 : u64 input base address (converted with cvta.to.global).
//   param_3 : u32 input row pitch in 16-byte elements.
//   param_4 : u32 x extent; x coordinates are clamped to param_4 - 1 for loads
//             and threads with x >= param_4 produce no output.
//   param_5 : u32 y extent; same role as param_4 for the y coordinate.
//   param_6 : f32 s_x; the x weight is exp(-dx^2 / (2 * s_x^2)).
//   param_7 : f32 s_y; the y weight is exp(-dy^2 / (2 * s_y^2)).
//             (presumably the Gaussian sigmas - confirm against the CUDA source)
//
// NOTE(review): the halo thresholds (tid <= 5 / tid >= 10) and the 28-element
// tile rows only line up for a 16 x 16 thread block. The launch configuration
// is not visible in this file - confirm against the host code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_6(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_6_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_6_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_6_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_6_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_6_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_6_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_6_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_6_param_7
)
{
	.reg .pred 	%p<18>;
	.reg .s32 	%r<85>;
	.reg .f32 	%f<131>;
	.reg .s64 	%rd<39>;
	// demoted variable
	// Shared tile: 12544 bytes = 28 rows x 28 elements x 16 bytes.
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_6$__cuda_local_var_180265_228_non_const_s_data[12544];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_6_param_0];
	ld.param.u32 	%r13, [_d_gauss_filter_kernel_shmem_float4_6_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_6_param_2];
	ld.param.u32 	%r14, [_d_gauss_filter_kernel_shmem_float4_6_param_3];
	ld.param.u32 	%r15, [_d_gauss_filter_kernel_shmem_float4_6_param_4];
	ld.param.u32 	%r16, [_d_gauss_filter_kernel_shmem_float4_6_param_5];
	ld.param.f32 	%f23, [_d_gauss_filter_kernel_shmem_float4_6_param_6];
	ld.param.f32 	%f24, [_d_gauss_filter_kernel_shmem_float4_6_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	// %r1 = x = ntid.x * ctaid.x + tid.x; %r2 = min(x, param_4 - 1).
	mov.u32 	%r17, %ntid.x;
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %tid.x;
	mad.lo.s32 	%r1, %r17, %r18, %r19;
	add.s32 	%r20, %r15, -1;
	min.s32 	%r2, %r1, %r20;
	// %r3 = y = ntid.y * ctaid.y + tid.y; %r4 = min(y, param_5 - 1).
	mov.u32 	%r21, %ntid.y;
	mov.u32 	%r22, %ctaid.y;
	mov.u32 	%r23, %tid.y;
	mad.lo.s32 	%r3, %r21, %r22, %r23;
	add.s32 	%r24, %r16, -1;
	min.s32 	%r4, %r3, %r24;
	// %rd2 = tile address of (row tid.y, column tid.x); a tile row is 448 bytes
	// (28 elements of 16 bytes).
	mul.wide.s32 	%rd8, %r23, 448;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_6$__cuda_local_var_180265_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r19, 16;
	add.s64 	%rd2, %rd10, %rd11;
	// Centre element: input (clamped y, clamped x) goes to tile
	// (tid.y + 6, tid.x + 6); 2784 = 6 * 448 + 6 * 16.
	mad.lo.s32 	%r25, %r4, %r14, %r2;
	mul.wide.s32 	%rd12, %r25, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f25, %f26, %f27, %f28}, [%rd13];
	st.shared.v4.f32 	[%rd2+2784], {%f25, %f26, %f27, %f28};
	// %p1 is set when tid.y > 5, i.e. the thread has no top-halo work.
	setp.gt.s32	%p1, %r23, 5;
	@%p1 bra 	BB6_2;

	// Top halo: input row max(0, y - 6) goes to tile (tid.y, tid.x + 6).
	add.s32 	%r26, %r4, -6;
	mov.u32 	%r27, 0;
	max.s32 	%r28, %r27, %r26;
	mad.lo.s32 	%r29, %r28, %r14, %r2;
	mul.wide.s32 	%rd14, %r29, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f33, %f34, %f35, %f36}, [%rd15];
	st.shared.v4.f32 	[%rd2+96], {%f33, %f34, %f35, %f36};

BB6_2:
	// %p2 is set when tid.y < 10, i.e. the thread has no bottom-halo work.
	setp.lt.s32	%p2, %r23, 10;
	@%p2 bra 	BB6_4;

	// Bottom halo: input row (y + 6, or param_5 - 1 when that is out of range)
	// goes to tile (tid.y + 12, tid.x + 6); 5472 = 12 * 448 + 6 * 16.
	add.s32 	%r31, %r4, 6;
	setp.lt.s32	%p3, %r31, %r16;
	selp.b32	%r33, %r31, %r24, %p3;
	mad.lo.s32 	%r34, %r33, %r14, %r2;
	mul.wide.s32 	%rd16, %r34, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f41, %f42, %f43, %f44}, [%rd17];
	st.shared.v4.f32 	[%rd2+5472], {%f41, %f42, %f43, %f44};

BB6_4:
	// Threads with tid.x > 5 have no left-halo work.
	setp.gt.s32	%p4, %r19, 5;
	@%p4 bra 	BB6_9;

	// Left halo: input column %r5 = max(0, x - 6) goes to tile
	// (tid.y + 6, tid.x); 2688 = 6 * 448.
	add.s32 	%r37, %r2, -6;
	mov.u32 	%r38, 0;
	max.s32 	%r5, %r38, %r37;
	mad.lo.s32 	%r39, %r4, %r14, %r5;
	mul.wide.s32 	%rd18, %r39, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f49, %f50, %f51, %f52}, [%rd19];
	st.shared.v4.f32 	[%rd2+2688], {%f49, %f50, %f51, %f52};
	@%p1 bra 	BB6_7;

	// Top-left corner: tile (tid.y, tid.x).
	add.s32 	%r40, %r4, -6;
	max.s32 	%r42, %r38, %r40;
	mad.lo.s32 	%r43, %r42, %r14, %r5;
	mul.wide.s32 	%rd20, %r43, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f57, %f58, %f59, %f60}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f57, %f58, %f59, %f60};

BB6_7:
	@%p2 bra 	BB6_9;

	// Bottom-left corner: tile (tid.y + 12, tid.x); 5376 = 12 * 448.
	add.s32 	%r45, %r4, 6;
	setp.lt.s32	%p7, %r45, %r16;
	selp.b32	%r47, %r45, %r24, %p7;
	mad.lo.s32 	%r48, %r47, %r14, %r5;
	mul.wide.s32 	%rd22, %r48, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f65, %f66, %f67, %f68}, [%rd23];
	st.shared.v4.f32 	[%rd2+5376], {%f65, %f66, %f67, %f68};

BB6_9:
	// Threads with tid.x < 10 have no right-halo work.
	setp.lt.s32	%p8, %r19, 10;
	@%p8 bra 	BB6_14;

	// Right halo: input column %r6 = (x + 6, or param_4 - 1 when that is out
	// of range) goes to tile (tid.y + 6, tid.x + 12); 2880 = 6 * 448 + 12 * 16.
	add.s32 	%r51, %r2, 6;
	setp.lt.s32	%p9, %r51, %r15;
	selp.b32	%r6, %r51, %r20, %p9;
	mad.lo.s32 	%r53, %r4, %r14, %r6;
	mul.wide.s32 	%rd24, %r53, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f73, %f74, %f75, %f76}, [%rd25];
	st.shared.v4.f32 	[%rd2+2880], {%f73, %f74, %f75, %f76};
	@%p1 bra 	BB6_12;

	// Top-right corner: tile (tid.y, tid.x + 12).
	add.s32 	%r54, %r4, -6;
	mov.u32 	%r55, 0;
	max.s32 	%r56, %r55, %r54;
	mad.lo.s32 	%r57, %r56, %r14, %r6;
	mul.wide.s32 	%rd26, %r57, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f81, %f82, %f83, %f84}, [%rd27];
	st.shared.v4.f32 	[%rd2+192], {%f81, %f82, %f83, %f84};

BB6_12:
	@%p2 bra 	BB6_14;

	// Bottom-right corner: tile (tid.y + 12, tid.x + 12);
	// 5568 = 12 * 448 + 12 * 16.
	add.s32 	%r59, %r4, 6;
	setp.lt.s32	%p12, %r59, %r16;
	selp.b32	%r61, %r59, %r24, %p12;
	mad.lo.s32 	%r62, %r61, %r14, %r6;
	mul.wide.s32 	%rd28, %r62, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f89, %f90, %f91, %f92}, [%rd29];
	st.shared.v4.f32 	[%rd2+5568], {%f89, %f90, %f91, %f92};

BB6_14:
	// The barrier comes before the extent test below, so threads that will
	// not produce output still take part in filling the tile and in the barrier.
	bar.sync 	0;
	// Only threads with x < param_4 and y < param_5 compute and store.
	setp.lt.s32	%p13, %r3, %r16;
	setp.lt.s32	%p14, %r1, %r15;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB6_20;
	bra.uni 	BB6_15;

BB6_15:
	// %f1 = 2 * param_6^2 (x denominator), %f2 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f102, %f23, %f23;
	fma.rn.ftz.f32 	%f1, %f23, %f23, %f102;
	mul.ftz.f32 	%f103, %f24, %f24;
	fma.rn.ftz.f32 	%f2, %f24, %f24, %f103;
	// %f127..%f130 accumulate the four components, %f126 the weight sum.
	mov.f32 	%f130, 0f00000000;
	mov.f32 	%f129, %f130;
	mov.f32 	%f128, %f130;
	mov.f32 	%f127, %f130;
	mov.f32 	%f126, %f130;
	// %r81 = dy, running -6..6; %r84 = window row, running 0..12.
	mov.u32 	%r81, -6;
	mov.u32 	%r63, 0;
	mov.u32 	%r84, %r63;

BB6_16:
	// %rd38 = tile address of (tid.y + %r84, tid.x): the first element of the
	// current window row (28 elements per row, shifted left by 4 for 16 bytes).
	add.s32 	%r67, %r23, %r84;
	mul.wide.s32 	%rd30, %r67, 28;
	mul.wide.s32 	%rd31, %r19, 16;
	add.s64 	%rd33, %rd9, %rd31;
	shl.b64 	%rd34, %rd30, 4;
	add.s64 	%rd38, %rd33, %rd34;
	// %f8 = exp(-dy^2 / %f2). 0f3FB8AA3B is ~1.442695 (log2 e), so
	// ex2(v * log2 e) evaluates e^v.
	mul.lo.s32 	%r69, %r81, %r81;
	cvt.rn.f32.s32	%f104, %r69;
	neg.ftz.f32 	%f105, %f104;
	div.approx.ftz.f32 	%f106, %f105, %f2;
	mul.ftz.f32 	%f107, %f106, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f8, %f107;
	mov.u32 	%r83, %r63;

BB6_17:
	// Inner loop over the 13 columns of the row: %r9 runs 0..12, dx = %r9 - 6.
	mov.u32 	%r9, %r83;
	add.s32 	%r70, %r9, -6;
	mul.lo.s32 	%r71, %r70, %r70;
	cvt.rn.f32.s32	%f108, %r71;
	neg.ftz.f32 	%f109, %f108;
	div.approx.ftz.f32 	%f110, %f109, %f1;
	mul.ftz.f32 	%f111, %f110, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f112, %f111;
	// %f113 = combined weight = y term * x term.
	mul.ftz.f32 	%f113, %f8, %f112;
	ld.shared.v4.f32 	{%f114, %f115, %f116, %f117}, [%rd38];
	fma.rn.ftz.f32 	%f14, %f114, %f113, %f127;
	fma.rn.ftz.f32 	%f15, %f115, %f113, %f128;
	fma.rn.ftz.f32 	%f16, %f116, %f113, %f129;
	fma.rn.ftz.f32 	%f17, %f117, %f113, %f130;
	add.ftz.f32 	%f126, %f126, %f113;
	add.s32 	%r10, %r9, 1;
	// Step to the next element in the tile row.
	add.s64 	%rd38, %rd38, 16;
	// The test uses the pre-increment index, so the loop exits after index 12.
	setp.ne.s32	%p16, %r9, 12;
	mov.f32 	%f130, %f17;
	mov.f32 	%f129, %f16;
	mov.f32 	%f128, %f15;
	mov.f32 	%f127, %f14;
	mov.u32 	%r83, %r10;
	@%p16 bra 	BB6_17;

	// Next window row; the outer loop continues while dy < 7.
	add.s32 	%r81, %r81, 1;
	setp.lt.s32	%p17, %r81, 7;
	add.s32 	%r84, %r84, 1;
	@%p17 bra 	BB6_16;

	// Output element index = y * param_1 + x (unclamped coordinates), 16 bytes
	// per element. Each accumulated component is divided by the weight sum.
	cvta.to.global.u64 	%rd35, %rd6;
	mad.lo.s32 	%r80, %r3, %r13, %r1;
	mul.wide.s32 	%rd36, %r80, 16;
	add.s64 	%rd37, %rd35, %rd36;
	div.approx.ftz.f32 	%f122, %f17, %f126;
	div.approx.ftz.f32 	%f123, %f16, %f126;
	div.approx.ftz.f32 	%f124, %f15, %f126;
	div.approx.ftz.f32 	%f125, %f14, %f126;
	st.global.v4.f32 	[%rd37], {%f125, %f124, %f123, %f122};

BB6_20:
	ret;
}

// _d_gauss_filter_kernel_shmem_float4_7
//
// Weighted 15 x 15 (radius 7) filter over 16-byte elements (4 x f32), staged
// through a shared-memory tile. Each thread copies its own element plus any
// halo elements it is responsible for from the input into the tile, waits at a
// CTA barrier, then accumulates weight * element over the 15 x 15 window and
// stores the accumulated vector divided by the sum of the weights. The inner
// (column) loop is unrolled five taps per trip.
//
// Parameters, as they are used in the body:
//   param_0 : u64 output base address (converted with cvta.to.global).
//   param_1 : u32 multiplied by the thread's y coordinate to form the output
//             element index, i.e. the output row pitch in 16-byte elements.
//   param_2 : u64 input base address (converted with cvta.to.global).
//   param_3 : u32 input row pitch in 16-byte elements.
//   param_4 : u32 x extent; x coordinates are clamped to param_4 - 1 for loads
//             and threads with x >= param_4 produce no output.
//   param_5 : u32 y extent; same role as param_4 for the y coordinate.
//   param_6 : f32 s_x; the x weight is exp(-dx^2 / (2 * s_x^2)).
//   param_7 : f32 s_y; the y weight is exp(-dy^2 / (2 * s_y^2)).
//             (presumably the Gaussian sigmas - confirm against the CUDA source)
//
// NOTE(review): the halo thresholds (tid <= 6 / tid >= 9) and the 30-element
// tile rows only line up for a 16 x 16 thread block. The launch configuration
// is not visible in this file - confirm against the host code.
.visible .entry _d_gauss_filter_kernel_shmem_float4_7(
	.param .u64 _d_gauss_filter_kernel_shmem_float4_7_param_0,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_7_param_1,
	.param .u64 _d_gauss_filter_kernel_shmem_float4_7_param_2,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_7_param_3,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_7_param_4,
	.param .u32 _d_gauss_filter_kernel_shmem_float4_7_param_5,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_7_param_6,
	.param .f32 _d_gauss_filter_kernel_shmem_float4_7_param_7
)
{
	.reg .pred 	%p<18>;
	.reg .s32 	%r<95>;
	.reg .f32 	%f<207>;
	.reg .s64 	%rd<39>;
	// demoted variable
	// Shared tile: 14400 bytes = 30 rows x 30 elements x 16 bytes.
	.shared .align 16 .b8 _d_gauss_filter_kernel_shmem_float4_7$__cuda_local_var_180269_228_non_const_s_data[14400];

	ld.param.u64 	%rd6, [_d_gauss_filter_kernel_shmem_float4_7_param_0];
	ld.param.u32 	%r15, [_d_gauss_filter_kernel_shmem_float4_7_param_1];
	ld.param.u64 	%rd7, [_d_gauss_filter_kernel_shmem_float4_7_param_2];
	ld.param.u32 	%r16, [_d_gauss_filter_kernel_shmem_float4_7_param_3];
	ld.param.u32 	%r17, [_d_gauss_filter_kernel_shmem_float4_7_param_4];
	ld.param.u32 	%r18, [_d_gauss_filter_kernel_shmem_float4_7_param_5];
	ld.param.f32 	%f23, [_d_gauss_filter_kernel_shmem_float4_7_param_6];
	ld.param.f32 	%f24, [_d_gauss_filter_kernel_shmem_float4_7_param_7];
	cvta.to.global.u64 	%rd1, %rd7;
	// %r1 = x = ntid.x * ctaid.x + tid.x; %r2 = min(x, param_4 - 1).
	mov.u32 	%r19, %ntid.x;
	mov.u32 	%r20, %ctaid.x;
	mov.u32 	%r21, %tid.x;
	mad.lo.s32 	%r1, %r19, %r20, %r21;
	add.s32 	%r22, %r17, -1;
	min.s32 	%r2, %r1, %r22;
	// %r3 = y = ntid.y * ctaid.y + tid.y; %r4 = min(y, param_5 - 1).
	mov.u32 	%r23, %ntid.y;
	mov.u32 	%r24, %ctaid.y;
	mov.u32 	%r25, %tid.y;
	mad.lo.s32 	%r3, %r23, %r24, %r25;
	add.s32 	%r26, %r18, -1;
	min.s32 	%r4, %r3, %r26;
	// %rd2 = tile address of (row tid.y, column tid.x); a tile row is 480 bytes
	// (30 elements of 16 bytes).
	mul.wide.s32 	%rd8, %r25, 480;
	mov.u64 	%rd9, _d_gauss_filter_kernel_shmem_float4_7$__cuda_local_var_180269_228_non_const_s_data;
	add.s64 	%rd10, %rd9, %rd8;
	mul.wide.s32 	%rd11, %r21, 16;
	add.s64 	%rd2, %rd10, %rd11;
	// Centre element: input (clamped y, clamped x) goes to tile
	// (tid.y + 7, tid.x + 7); 3472 = 7 * 480 + 7 * 16.
	mad.lo.s32 	%r27, %r4, %r16, %r2;
	mul.wide.s32 	%rd12, %r27, 16;
	add.s64 	%rd13, %rd1, %rd12;
	ld.global.v4.f32 	{%f25, %f26, %f27, %f28}, [%rd13];
	st.shared.v4.f32 	[%rd2+3472], {%f25, %f26, %f27, %f28};
	// %p1 is set when tid.y > 6, i.e. the thread has no top-halo work.
	setp.gt.s32	%p1, %r25, 6;
	@%p1 bra 	BB7_2;

	// Top halo: input row max(0, y - 7) goes to tile (tid.y, tid.x + 7).
	add.s32 	%r28, %r4, -7;
	mov.u32 	%r29, 0;
	max.s32 	%r30, %r29, %r28;
	mad.lo.s32 	%r31, %r30, %r16, %r2;
	mul.wide.s32 	%rd14, %r31, 16;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.v4.f32 	{%f33, %f34, %f35, %f36}, [%rd15];
	st.shared.v4.f32 	[%rd2+112], {%f33, %f34, %f35, %f36};

BB7_2:
	// %p2 is set when tid.y < 9, i.e. the thread has no bottom-halo work.
	setp.lt.s32	%p2, %r25, 9;
	@%p2 bra 	BB7_4;

	// Bottom halo: input row (y + 7, or param_5 - 1 when that is out of range)
	// goes to tile (tid.y + 14, tid.x + 7); 6832 = 14 * 480 + 7 * 16.
	add.s32 	%r33, %r4, 7;
	setp.lt.s32	%p3, %r33, %r18;
	selp.b32	%r35, %r33, %r26, %p3;
	mad.lo.s32 	%r36, %r35, %r16, %r2;
	mul.wide.s32 	%rd16, %r36, 16;
	add.s64 	%rd17, %rd1, %rd16;
	ld.global.v4.f32 	{%f41, %f42, %f43, %f44}, [%rd17];
	st.shared.v4.f32 	[%rd2+6832], {%f41, %f42, %f43, %f44};

BB7_4:
	// Threads with tid.x > 6 have no left-halo work.
	setp.gt.s32	%p4, %r21, 6;
	@%p4 bra 	BB7_9;

	// Left halo: input column %r5 = max(0, x - 7) goes to tile
	// (tid.y + 7, tid.x); 3360 = 7 * 480.
	add.s32 	%r39, %r2, -7;
	mov.u32 	%r40, 0;
	max.s32 	%r5, %r40, %r39;
	mad.lo.s32 	%r41, %r4, %r16, %r5;
	mul.wide.s32 	%rd18, %r41, 16;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.v4.f32 	{%f49, %f50, %f51, %f52}, [%rd19];
	st.shared.v4.f32 	[%rd2+3360], {%f49, %f50, %f51, %f52};
	@%p1 bra 	BB7_7;

	// Top-left corner: tile (tid.y, tid.x).
	add.s32 	%r42, %r4, -7;
	max.s32 	%r44, %r40, %r42;
	mad.lo.s32 	%r45, %r44, %r16, %r5;
	mul.wide.s32 	%rd20, %r45, 16;
	add.s64 	%rd21, %rd1, %rd20;
	ld.global.v4.f32 	{%f57, %f58, %f59, %f60}, [%rd21];
	st.shared.v4.f32 	[%rd2], {%f57, %f58, %f59, %f60};

BB7_7:
	@%p2 bra 	BB7_9;

	// Bottom-left corner: tile (tid.y + 14, tid.x); 6720 = 14 * 480.
	add.s32 	%r47, %r4, 7;
	setp.lt.s32	%p7, %r47, %r18;
	selp.b32	%r49, %r47, %r26, %p7;
	mad.lo.s32 	%r50, %r49, %r16, %r5;
	mul.wide.s32 	%rd22, %r50, 16;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.v4.f32 	{%f65, %f66, %f67, %f68}, [%rd23];
	st.shared.v4.f32 	[%rd2+6720], {%f65, %f66, %f67, %f68};

BB7_9:
	// Threads with tid.x < 9 have no right-halo work.
	setp.lt.s32	%p8, %r21, 9;
	@%p8 bra 	BB7_14;

	// Right halo: input column %r6 = (x + 7, or param_4 - 1 when that is out
	// of range) goes to tile (tid.y + 7, tid.x + 14); 3584 = 7 * 480 + 14 * 16.
	add.s32 	%r53, %r2, 7;
	setp.lt.s32	%p9, %r53, %r17;
	selp.b32	%r6, %r53, %r22, %p9;
	mad.lo.s32 	%r55, %r4, %r16, %r6;
	mul.wide.s32 	%rd24, %r55, 16;
	add.s64 	%rd25, %rd1, %rd24;
	ld.global.v4.f32 	{%f73, %f74, %f75, %f76}, [%rd25];
	st.shared.v4.f32 	[%rd2+3584], {%f73, %f74, %f75, %f76};
	@%p1 bra 	BB7_12;

	// Top-right corner: tile (tid.y, tid.x + 14).
	add.s32 	%r56, %r4, -7;
	mov.u32 	%r57, 0;
	max.s32 	%r58, %r57, %r56;
	mad.lo.s32 	%r59, %r58, %r16, %r6;
	mul.wide.s32 	%rd26, %r59, 16;
	add.s64 	%rd27, %rd1, %rd26;
	ld.global.v4.f32 	{%f81, %f82, %f83, %f84}, [%rd27];
	st.shared.v4.f32 	[%rd2+224], {%f81, %f82, %f83, %f84};

BB7_12:
	@%p2 bra 	BB7_14;

	// Bottom-right corner: tile (tid.y + 14, tid.x + 14);
	// 6944 = 14 * 480 + 14 * 16.
	add.s32 	%r61, %r4, 7;
	setp.lt.s32	%p12, %r61, %r18;
	selp.b32	%r63, %r61, %r26, %p12;
	mad.lo.s32 	%r64, %r63, %r16, %r6;
	mul.wide.s32 	%rd28, %r64, 16;
	add.s64 	%rd29, %rd1, %rd28;
	ld.global.v4.f32 	{%f89, %f90, %f91, %f92}, [%rd29];
	st.shared.v4.f32 	[%rd2+6944], {%f89, %f90, %f91, %f92};

BB7_14:
	// The barrier comes before the extent test below, so threads that will
	// not produce output still take part in filling the tile and in the barrier.
	bar.sync 	0;
	// Only threads with x < param_4 and y < param_5 compute and store.
	setp.lt.s32	%p13, %r3, %r18;
	setp.lt.s32	%p14, %r1, %r17;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB7_20;
	bra.uni 	BB7_15;

BB7_15:
	// %f1 = 2 * param_6^2 (x denominator), %f2 = 2 * param_7^2 (y denominator).
	mul.ftz.f32 	%f102, %f23, %f23;
	fma.rn.ftz.f32 	%f1, %f23, %f23, %f102;
	mul.ftz.f32 	%f103, %f24, %f24;
	fma.rn.ftz.f32 	%f2, %f24, %f24, %f103;
	// %f203..%f206 accumulate the four components, %f202 the weight sum.
	mov.f32 	%f206, 0f00000000;
	mov.f32 	%f205, %f206;
	mov.f32 	%f204, %f206;
	mov.f32 	%f203, %f206;
	mov.f32 	%f202, %f206;
	// %r66 holds the constant -7, reused as the start value of both loops.
	// %r94 = dy, running -7..7; %r90 = window row, running 0..14.
	mov.u32 	%r66, -7;
	mov.u32 	%r90, 0;
	mov.u32 	%r94, %r66;

BB7_16:
	// %rd38 = tile address of (tid.y + %r90, tid.x): the first element of the
	// current window row (30 elements per row, shifted left by 4 for 16 bytes).
	add.s32 	%r70, %r25, %r90;
	mul.wide.s32 	%rd30, %r70, 30;
	mul.wide.s32 	%rd31, %r21, 16;
	add.s64 	%rd33, %rd9, %rd31;
	shl.b64 	%rd34, %rd30, 4;
	add.s64 	%rd38, %rd33, %rd34;
	// %f8 = exp(-dy^2 / %f2). 0f3FB8AA3B is ~1.442695 (log2 e), so
	// ex2(v * log2 e) evaluates e^v.
	mul.lo.s32 	%r72, %r94, %r94;
	cvt.rn.f32.s32	%f104, %r72;
	neg.ftz.f32 	%f105, %f104;
	div.approx.ftz.f32 	%f106, %f105, %f2;
	mul.ftz.f32 	%f107, %f106, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f8, %f107;
	// Inner-loop counters: %r93 = dx of the first tap of a trip (starts at
	// -7), %r91 = dx of the second tap (starts at -6). Both advance by 5.
	mov.u32 	%r91, -6;
	mov.u32 	%r93, %r66;

BB7_17:
	// One trip handles five consecutive columns dx = %r10 .. %r10 + 4; the
	// loop runs for %r10 = -7, -2, 3 (15 taps) and exits when %r10 + 5 == 8.
	mov.u32 	%r10, %r93;
	// Tap 0: dx = %r10, element at [%rd38].
	mul.lo.s32 	%r73, %r10, %r10;
	cvt.rn.f32.s32	%f108, %r73;
	neg.ftz.f32 	%f109, %f108;
	div.approx.ftz.f32 	%f110, %f109, %f1;
	mul.ftz.f32 	%f111, %f110, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f112, %f111;
	mul.ftz.f32 	%f113, %f8, %f112;
	ld.shared.v4.f32 	{%f114, %f115, %f116, %f117}, [%rd38];
	fma.rn.ftz.f32 	%f119, %f114, %f113, %f203;
	fma.rn.ftz.f32 	%f121, %f115, %f113, %f204;
	fma.rn.ftz.f32 	%f123, %f116, %f113, %f205;
	fma.rn.ftz.f32 	%f125, %f117, %f113, %f206;
	add.ftz.f32 	%f126, %f202, %f113;
	// Tap 1: dx = %r91 (= %r10 + 1), element at [%rd38+16].
	mul.lo.s32 	%r74, %r91, %r91;
	cvt.rn.f32.s32	%f127, %r74;
	neg.ftz.f32 	%f128, %f127;
	div.approx.ftz.f32 	%f129, %f128, %f1;
	mul.ftz.f32 	%f130, %f129, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f131, %f130;
	mul.ftz.f32 	%f132, %f8, %f131;
	ld.shared.v4.f32 	{%f133, %f134, %f135, %f136}, [%rd38+16];
	fma.rn.ftz.f32 	%f138, %f133, %f132, %f119;
	fma.rn.ftz.f32 	%f140, %f134, %f132, %f121;
	fma.rn.ftz.f32 	%f142, %f135, %f132, %f123;
	fma.rn.ftz.f32 	%f144, %f136, %f132, %f125;
	add.ftz.f32 	%f145, %f126, %f132;
	// Tap 2: dx = %r10 + 2, element at [%rd38+32].
	add.s32 	%r75, %r10, 2;
	mul.lo.s32 	%r76, %r75, %r75;
	cvt.rn.f32.s32	%f146, %r76;
	neg.ftz.f32 	%f147, %f146;
	div.approx.ftz.f32 	%f148, %f147, %f1;
	mul.ftz.f32 	%f149, %f148, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f150, %f149;
	mul.ftz.f32 	%f151, %f8, %f150;
	ld.shared.v4.f32 	{%f152, %f153, %f154, %f155}, [%rd38+32];
	fma.rn.ftz.f32 	%f157, %f152, %f151, %f138;
	fma.rn.ftz.f32 	%f159, %f153, %f151, %f140;
	fma.rn.ftz.f32 	%f161, %f154, %f151, %f142;
	fma.rn.ftz.f32 	%f163, %f155, %f151, %f144;
	add.ftz.f32 	%f164, %f145, %f151;
	// Tap 3: dx = %r10 + 3, element at [%rd38+48].
	add.s32 	%r77, %r10, 3;
	mul.lo.s32 	%r78, %r77, %r77;
	cvt.rn.f32.s32	%f165, %r78;
	neg.ftz.f32 	%f166, %f165;
	div.approx.ftz.f32 	%f167, %f166, %f1;
	mul.ftz.f32 	%f168, %f167, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f169, %f168;
	mul.ftz.f32 	%f170, %f8, %f169;
	ld.shared.v4.f32 	{%f171, %f172, %f173, %f174}, [%rd38+48];
	fma.rn.ftz.f32 	%f176, %f171, %f170, %f157;
	fma.rn.ftz.f32 	%f178, %f172, %f170, %f159;
	fma.rn.ftz.f32 	%f180, %f173, %f170, %f161;
	fma.rn.ftz.f32 	%f182, %f174, %f170, %f163;
	add.ftz.f32 	%f183, %f164, %f170;
	// Tap 4: dx = %r10 + 4, element at [%rd38+64].
	add.s32 	%r79, %r10, 4;
	mul.lo.s32 	%r80, %r79, %r79;
	cvt.rn.f32.s32	%f184, %r80;
	neg.ftz.f32 	%f185, %f184;
	div.approx.ftz.f32 	%f186, %f185, %f1;
	mul.ftz.f32 	%f187, %f186, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f188, %f187;
	mul.ftz.f32 	%f189, %f8, %f188;
	ld.shared.v4.f32 	{%f190, %f191, %f192, %f193}, [%rd38+64];
	fma.rn.ftz.f32 	%f14, %f190, %f189, %f176;
	fma.rn.ftz.f32 	%f15, %f191, %f189, %f178;
	fma.rn.ftz.f32 	%f16, %f192, %f189, %f180;
	fma.rn.ftz.f32 	%f17, %f193, %f189, %f182;
	add.ftz.f32 	%f202, %f183, %f189;
	// Advance both dx counters by 5 and the tile pointer by 5 elements.
	add.s32 	%r91, %r91, 5;
	add.s64 	%rd38, %rd38, 80;
	add.s32 	%r12, %r10, 5;
	setp.ne.s32	%p16, %r12, 8;
	mov.f32 	%f206, %f17;
	mov.f32 	%f205, %f16;
	mov.f32 	%f204, %f15;
	mov.f32 	%f203, %f14;
	mov.u32 	%r93, %r12;
	@%p16 bra 	BB7_17;

	// Next window row; the outer loop continues while dy < 8.
	add.s32 	%r94, %r94, 1;
	setp.lt.s32	%p17, %r94, 8;
	add.s32 	%r90, %r90, 1;
	@%p17 bra 	BB7_16;

	// Output element index = y * param_1 + x (unclamped coordinates), 16 bytes
	// per element. Each accumulated component is divided by the weight sum.
	cvta.to.global.u64 	%rd35, %rd6;
	mad.lo.s32 	%r89, %r3, %r15, %r1;
	mul.wide.s32 	%rd36, %r89, 16;
	add.s64 	%rd37, %rd35, %rd36;
	div.approx.ftz.f32 	%f198, %f17, %f202;
	div.approx.ftz.f32 	%f199, %f16, %f202;
	div.approx.ftz.f32 	%f200, %f15, %f202;
	div.approx.ftz.f32 	%f201, %f14, %f202;
	st.global.v4.f32 	[%rd37], {%f201, %f200, %f199, %f198};

BB7_20:
	ret;
}

// ---------------------------------------------------------------------------
// _d_median_filter_kernel_2x2
//
// Each thread copies pixels of the input image (param_2) into a shared-memory
// tile, waits on a barrier, then reduces the 5x5 window of tile entries that
// is centred on its own pixel through a fixed network of min/max operations
// and stores one float into the output image (param_0).
//
// Parameters (as used by the code below):
//   param_0 (u64)  output base address; one 4-byte float is stored per thread
//   param_1 (u32)  output row stride in elements (store index = y*param_1 + x)
//   param_2 (u64)  input base address; read as 4-byte floats
//   param_3 (u32)  input row stride in elements
//   param_4 (u32)  exclusive upper bound for x (presumably the image width)
//   param_5 (u32)  exclusive upper bound for y (presumably the image height)
//
// Shared tile: 1600 bytes addressed with an 80-byte row pitch and 4-byte
// elements, i.e. 20 rows of 20 floats. A thread's own pixel lives at
// tile[tid.y + 2][tid.x + 2]; the border of width 2 is filled by the threads
// with tid <= 1 or tid >= 14 in the respective dimension.
// NOTE(review): the hard-coded 1/14 thresholds only fill the whole border for
// a 16x16 thread block -- confirm against the launch configuration.
//
// Source coordinates are clamped to [0, param_4-1] x [0, param_5-1] before
// every global load, so out-of-range threads still load valid pixels and
// still reach the barrier; only the final compute/store is skipped for them.
//
// All comparisons are performed on signed 32-bit integers produced by a
// round-toward-zero conversion (cvt.rzi), so the value written out is an
// integer-valued float.
// NOTE(review): presumably the truncation comes from integer min/max helpers
// in the original CUDA source -- confirm that dropping fractions is intended.
// ---------------------------------------------------------------------------
.visible .entry _d_median_filter_kernel_2x2(
	.param .u64 _d_median_filter_kernel_2x2_param_0,
	.param .u32 _d_median_filter_kernel_2x2_param_1,
	.param .u64 _d_median_filter_kernel_2x2_param_2,
	.param .u32 _d_median_filter_kernel_2x2_param_3,
	.param .u32 _d_median_filter_kernel_2x2_param_4,
	.param .u32 _d_median_filter_kernel_2x2_param_5
)
{
	.reg .pred 	%p<16>;
	.reg .s32 	%r<406>;
	.reg .f32 	%f<199>;
	.reg .s64 	%rd<30>;
	// demoted variable
	// Shared tile: 1600 bytes = 20 rows x 80 bytes (20 floats per row).
	.shared .align 4 .b8 _d_median_filter_kernel_2x2$__cuda_local_var_180273_184_non_const_s_data[1600];

	// Register roles established in this prologue:
	//   %rd3 = output base (param_0)      %rd1 = input base, global address
	//   %r9  = output row stride          %r10 = input row stride
	//   %r11 = x bound, %r16 = %r11 - 1   %r12 = y bound, %r19 = %r12 - 1
	//   %r1  = x = ntid.x*ctaid.x + tid.x (unclamped), %r2 = min(x, %r16)
	//   %r4  = y = ntid.y*ctaid.y + tid.y (unclamped), %r5 = min(y, %r19)
	//   %r6  = %r5 * %r10 (element offset of the clamped input row)
	//   %rd2 = &tile[tid.y][tid.x] (tile base + tid.y*80 + tid.x*4)
	ld.param.u64 	%rd3, [_d_median_filter_kernel_2x2_param_0];
	ld.param.u32 	%r9, [_d_median_filter_kernel_2x2_param_1];
	ld.param.u64 	%rd4, [_d_median_filter_kernel_2x2_param_2];
	ld.param.u32 	%r10, [_d_median_filter_kernel_2x2_param_3];
	ld.param.u32 	%r11, [_d_median_filter_kernel_2x2_param_4];
	ld.param.u32 	%r12, [_d_median_filter_kernel_2x2_param_5];
	cvta.to.global.u64 	%rd1, %rd4;
	mov.u32 	%r13, %ntid.x;
	mov.u32 	%r14, %ctaid.x;
	mov.u32 	%r15, %tid.x;
	mad.lo.s32 	%r1, %r13, %r14, %r15;
	add.s32 	%r16, %r11, -1;
	min.s32 	%r2, %r1, %r16;
	mov.u32 	%r17, %ntid.y;
	mov.u32 	%r18, %ctaid.y;
	mov.u32 	%r3, %tid.y;
	mad.lo.s32 	%r4, %r17, %r18, %r3;
	add.s32 	%r19, %r12, -1;
	min.s32 	%r5, %r4, %r19;
	mul.lo.s32 	%r6, %r5, %r10;
	add.s32 	%r20, %r6, %r2;
	mul.wide.s32 	%rd5, %r20, 4;
	add.s64 	%rd6, %rd1, %rd5;
	mul.wide.s32 	%rd7, %r3, 80;
	mov.u64 	%rd8, _d_median_filter_kernel_2x2$__cuda_local_var_180273_184_non_const_s_data;
	add.s64 	%rd9, %rd8, %rd7;
	mul.wide.s32 	%rd10, %r15, 4;
	add.s64 	%rd2, %rd9, %rd10;
	// Own pixel -> tile[tid.y+2][tid.x+2] (168 = 2*80 + 2*4).
	ld.global.f32 	%f1, [%rd6];
	st.shared.f32 	[%rd2+168], %f1;
	// %p1 = tid.y > 1: skip the "two rows up" border load.
	setp.gt.s32	%p1, %r3, 1;
	@%p1 bra 	BB8_2;

	// tid.y <= 1: pixel (x, max(0, y-2)) -> tile[tid.y][tid.x+2].
	add.s32 	%r21, %r5, -2;
	mov.u32 	%r22, 0;
	max.s32 	%r23, %r22, %r21;
	mad.lo.s32 	%r24, %r23, %r10, %r2;
	mul.wide.s32 	%rd11, %r24, 4;
	add.s64 	%rd12, %rd1, %rd11;
	ld.global.f32 	%f2, [%rd12];
	st.shared.f32 	[%rd2+8], %f2;

BB8_2:
	// %p2 = tid.y < 14: skip the "two rows down" border load.
	setp.lt.s32	%p2, %r3, 14;
	@%p2 bra 	BB8_4;

	// tid.y >= 14: pixel (x, y+2 if y+2 < param_5 else param_5-1)
	// -> tile[tid.y+4][tid.x+2] (328 = 4*80 + 2*4).
	add.s32 	%r25, %r5, 2;
	setp.lt.s32	%p3, %r25, %r12;
	selp.b32	%r27, %r25, %r19, %p3;
	mad.lo.s32 	%r28, %r27, %r10, %r2;
	mul.wide.s32 	%rd13, %r28, 4;
	add.s64 	%rd14, %rd1, %rd13;
	ld.global.f32 	%f3, [%rd14];
	st.shared.f32 	[%rd2+328], %f3;

BB8_4:
	// tid.x > 1: skip the whole left-border section (BB8_4 .. BB8_9).
	setp.gt.s32	%p4, %r15, 1;
	@%p4 bra 	BB8_9;

	// tid.x <= 1: %r7 = max(0, x-2); pixel (%r7, y) -> tile[tid.y+2][tid.x].
	add.s32 	%r30, %r2, -2;
	mov.u32 	%r31, 0;
	max.s32 	%r7, %r31, %r30;
	add.s32 	%r32, %r6, %r7;
	mul.wide.s32 	%rd15, %r32, 4;
	add.s64 	%rd16, %rd1, %rd15;
	ld.global.f32 	%f4, [%rd16];
	st.shared.f32 	[%rd2+160], %f4;
	@%p1 bra 	BB8_7;

	// Upper-left corner (tid.x <= 1 and tid.y <= 1) -> tile[tid.y][tid.x].
	add.s32 	%r33, %r5, -2;
	max.s32 	%r35, %r31, %r33;
	mad.lo.s32 	%r36, %r35, %r10, %r7;
	mul.wide.s32 	%rd17, %r36, 4;
	add.s64 	%rd18, %rd1, %rd17;
	ld.global.f32 	%f5, [%rd18];
	st.shared.f32 	[%rd2], %f5;

BB8_7:
	@%p2 bra 	BB8_9;

	// Lower-left corner (tid.x <= 1 and tid.y >= 14) -> tile[tid.y+4][tid.x].
	add.s32 	%r37, %r5, 2;
	setp.lt.s32	%p7, %r37, %r12;
	selp.b32	%r39, %r37, %r19, %p7;
	mad.lo.s32 	%r40, %r39, %r10, %r7;
	mul.wide.s32 	%rd19, %r40, 4;
	add.s64 	%rd20, %rd1, %rd19;
	ld.global.f32 	%f6, [%rd20];
	st.shared.f32 	[%rd2+320], %f6;

BB8_9:
	// tid.x < 14: skip the whole right-border section (BB8_9 .. BB8_14).
	setp.lt.s32	%p8, %r15, 14;
	@%p8 bra 	BB8_14;

	// tid.x >= 14: %r8 = x+2 if x+2 < param_4 else param_4-1;
	// pixel (%r8, y) -> tile[tid.y+2][tid.x+4] (176 = 2*80 + 4*4).
	add.s32 	%r42, %r2, 2;
	setp.lt.s32	%p9, %r42, %r11;
	selp.b32	%r8, %r42, %r16, %p9;
	add.s32 	%r44, %r6, %r8;
	mul.wide.s32 	%rd21, %r44, 4;
	add.s64 	%rd22, %rd1, %rd21;
	ld.global.f32 	%f7, [%rd22];
	st.shared.f32 	[%rd2+176], %f7;
	@%p1 bra 	BB8_12;

	// Upper-right corner (tid.x >= 14 and tid.y <= 1) -> tile[tid.y][tid.x+4].
	add.s32 	%r45, %r5, -2;
	mov.u32 	%r46, 0;
	max.s32 	%r47, %r46, %r45;
	mad.lo.s32 	%r48, %r47, %r10, %r8;
	mul.wide.s32 	%rd23, %r48, 4;
	add.s64 	%rd24, %rd1, %rd23;
	ld.global.f32 	%f8, [%rd24];
	st.shared.f32 	[%rd2+16], %f8;

BB8_12:
	@%p2 bra 	BB8_14;

	// Lower-right corner (tid.x >= 14 and tid.y >= 14) -> tile[tid.y+4][tid.x+4].
	add.s32 	%r49, %r5, 2;
	setp.lt.s32	%p12, %r49, %r12;
	selp.b32	%r51, %r49, %r19, %p12;
	mad.lo.s32 	%r52, %r51, %r10, %r8;
	mul.wide.s32 	%rd25, %r52, 4;
	add.s64 	%rd26, %rd1, %rd25;
	ld.global.f32 	%f9, [%rd26];
	st.shared.f32 	[%rd2+336], %f9;

BB8_14:
	// All tile stores above must be visible before any thread reads its window.
	bar.sync 	0;
	// Only threads whose unclamped (x, y) lies inside param_4 x param_5
	// compute and store a result; the others return.
	setp.lt.s32	%p13, %r4, %r12;
	setp.lt.s32	%p14, %r1, %r11;
	and.pred  	%p15, %p14, %p13;
	@!%p15 bra 	BB8_16;
	// Compiler-emitted jump to the block that immediately follows.
	bra.uni 	BB8_15;

BB8_15:
	// Min/max network over the 25 tile entries at [%rd2 + row*80 + col*4],
	// row, col in 0..4 (the thread's own pixel is row 2, col 2 = +168).
	// Every step has the same shape: convert the two f32 operands to s32
	// with round-toward-zero (cvt.rzi), take min and/or max as signed ints,
	// and convert the result back to f32 (cvt.rn). Where only one of min/max
	// appears, the other output of that step is not used further.
	// NOTE(review): by the kernel name the network selects the median of the
	// 25 values; that property has not been verified here.
	ld.shared.f32 	%f10, [%rd2];
	cvt.rzi.ftz.s32.f32	%r53, %f10;
	ld.shared.f32 	%f11, [%rd2+4];
	cvt.rzi.ftz.s32.f32	%r54, %f11;
	min.s32 	%r55, %r53, %r54;
	cvt.rn.f32.s32	%f12, %r55;
	max.s32 	%r56, %r53, %r54;
	cvt.rn.f32.s32	%f13, %r56;
	ld.shared.f32 	%f14, [%rd2+12];
	cvt.rzi.ftz.s32.f32	%r57, %f14;
	ld.shared.f32 	%f15, [%rd2+16];
	cvt.rzi.ftz.s32.f32	%r58, %f15;
	min.s32 	%r59, %r57, %r58;
	cvt.rn.f32.s32	%f16, %r59;
	max.s32 	%r60, %r57, %r58;
	cvt.rn.f32.s32	%f17, %r60;
	ld.shared.f32 	%f18, [%rd2+8];
	cvt.rzi.ftz.s32.f32	%r61, %f18;
	cvt.rzi.ftz.s32.f32	%r62, %f17;
	min.s32 	%r63, %r61, %r62;
	cvt.rn.f32.s32	%f19, %r63;
	max.s32 	%r64, %r61, %r62;
	cvt.rn.f32.s32	%f20, %r64;
	cvt.rzi.ftz.s32.f32	%r65, %f19;
	cvt.rzi.ftz.s32.f32	%r66, %f16;
	min.s32 	%r67, %r65, %r66;
	cvt.rn.f32.s32	%f21, %r67;
	max.s32 	%r68, %r65, %r66;
	cvt.rn.f32.s32	%f22, %r68;
	// Window row 1 (offsets 80..96); column 3 (+92) is loaded further down.
	ld.shared.f32 	%f23, [%rd2+84];
	cvt.rzi.ftz.s32.f32	%r69, %f23;
	ld.shared.f32 	%f24, [%rd2+88];
	cvt.rzi.ftz.s32.f32	%r70, %f24;
	min.s32 	%r71, %r69, %r70;
	cvt.rn.f32.s32	%f25, %r71;
	max.s32 	%r72, %r69, %r70;
	cvt.rn.f32.s32	%f26, %r72;
	ld.shared.f32 	%f27, [%rd2+80];
	cvt.rzi.ftz.s32.f32	%r73, %f27;
	cvt.rzi.ftz.s32.f32	%r74, %f26;
	min.s32 	%r75, %r73, %r74;
	cvt.rn.f32.s32	%f28, %r75;
	max.s32 	%r76, %r73, %r74;
	cvt.rn.f32.s32	%f29, %r76;
	cvt.rzi.ftz.s32.f32	%r77, %f28;
	cvt.rzi.ftz.s32.f32	%r78, %f25;
	min.s32 	%r79, %r77, %r78;
	cvt.rn.f32.s32	%f30, %r79;
	max.s32 	%r80, %r77, %r78;
	cvt.rn.f32.s32	%f31, %r80;
	ld.shared.f32 	%f32, [%rd2+96];
	cvt.rzi.ftz.s32.f32	%r81, %f32;
	cvt.rzi.ftz.s32.f32	%r82, %f29;
	min.s32 	%r83, %r81, %r82;
	cvt.rn.f32.s32	%f33, %r83;
	max.s32 	%r84, %r81, %r82;
	cvt.rn.f32.s32	%f34, %r84;
	cvt.rzi.ftz.s32.f32	%r85, %f13;
	cvt.rzi.ftz.s32.f32	%r86, %f34;
	min.s32 	%r87, %r85, %r86;
	cvt.rn.f32.s32	%f35, %r87;
	max.s32 	%r88, %r85, %r86;
	cvt.rn.f32.s32	%f36, %r88;
	cvt.rzi.ftz.s32.f32	%r89, %f35;
	cvt.rzi.ftz.s32.f32	%r90, %f20;
	min.s32 	%r91, %r89, %r90;
	cvt.rn.f32.s32	%f37, %r91;
	max.s32 	%r92, %r89, %r90;
	cvt.rn.f32.s32	%f38, %r92;
	// Window row 2 (offsets 160..176); +168 is the thread's own pixel.
	// Column 0 (+160) is loaded further down.
	ld.shared.f32 	%f39, [%rd2+168];
	cvt.rzi.ftz.s32.f32	%r93, %f39;
	ld.shared.f32 	%f40, [%rd2+172];
	cvt.rzi.ftz.s32.f32	%r94, %f40;
	min.s32 	%r95, %r93, %r94;
	cvt.rn.f32.s32	%f41, %r95;
	max.s32 	%r96, %r93, %r94;
	cvt.rn.f32.s32	%f42, %r96;
	ld.shared.f32 	%f43, [%rd2+164];
	cvt.rzi.ftz.s32.f32	%r97, %f43;
	cvt.rzi.ftz.s32.f32	%r98, %f42;
	min.s32 	%r99, %r97, %r98;
	cvt.rn.f32.s32	%f44, %r99;
	max.s32 	%r100, %r97, %r98;
	cvt.rn.f32.s32	%f45, %r100;
	cvt.rzi.ftz.s32.f32	%r101, %f44;
	cvt.rzi.ftz.s32.f32	%r102, %f41;
	min.s32 	%r103, %r101, %r102;
	cvt.rn.f32.s32	%f46, %r103;
	max.s32 	%r104, %r101, %r102;
	cvt.rn.f32.s32	%f47, %r104;
	// Window row 3 (offsets 240..256), interleaved with row 2 column 4 (+176).
	ld.shared.f32 	%f48, [%rd2+240];
	cvt.rzi.ftz.s32.f32	%r105, %f48;
	ld.shared.f32 	%f49, [%rd2+244];
	cvt.rzi.ftz.s32.f32	%r106, %f49;
	min.s32 	%r107, %r105, %r106;
	cvt.rn.f32.s32	%f50, %r107;
	max.s32 	%r108, %r105, %r106;
	cvt.rn.f32.s32	%f51, %r108;
	ld.shared.f32 	%f52, [%rd2+176];
	cvt.rzi.ftz.s32.f32	%r109, %f52;
	cvt.rzi.ftz.s32.f32	%r110, %f51;
	min.s32 	%r111, %r109, %r110;
	cvt.rn.f32.s32	%f53, %r111;
	max.s32 	%r112, %r109, %r110;
	cvt.rn.f32.s32	%f54, %r112;
	cvt.rzi.ftz.s32.f32	%r113, %f53;
	cvt.rzi.ftz.s32.f32	%r114, %f50;
	min.s32 	%r115, %r113, %r114;
	cvt.rn.f32.s32	%f55, %r115;
	max.s32 	%r116, %r113, %r114;
	cvt.rn.f32.s32	%f56, %r116;
	ld.shared.f32 	%f57, [%rd2+252];
	cvt.rzi.ftz.s32.f32	%r117, %f57;
	ld.shared.f32 	%f58, [%rd2+256];
	cvt.rzi.ftz.s32.f32	%r118, %f58;
	min.s32 	%r119, %r117, %r118;
	cvt.rn.f32.s32	%f59, %r119;
	max.s32 	%r120, %r117, %r118;
	cvt.rn.f32.s32	%f60, %r120;
	ld.shared.f32 	%f61, [%rd2+248];
	cvt.rzi.ftz.s32.f32	%r121, %f61;
	cvt.rzi.ftz.s32.f32	%r122, %f60;
	min.s32 	%r123, %r121, %r122;
	cvt.rn.f32.s32	%f62, %r123;
	max.s32 	%r124, %r121, %r122;
	cvt.rn.f32.s32	%f63, %r124;
	cvt.rzi.ftz.s32.f32	%r125, %f62;
	cvt.rzi.ftz.s32.f32	%r126, %f59;
	min.s32 	%r127, %r125, %r126;
	cvt.rn.f32.s32	%f64, %r127;
	max.s32 	%r128, %r125, %r126;
	cvt.rn.f32.s32	%f65, %r128;
	// Window row 4 (offsets 320..336).
	ld.shared.f32 	%f66, [%rd2+324];
	cvt.rzi.ftz.s32.f32	%r129, %f66;
	ld.shared.f32 	%f67, [%rd2+328];
	cvt.rzi.ftz.s32.f32	%r130, %f67;
	min.s32 	%r131, %r129, %r130;
	cvt.rn.f32.s32	%f68, %r131;
	max.s32 	%r132, %r129, %r130;
	cvt.rn.f32.s32	%f69, %r132;
	ld.shared.f32 	%f70, [%rd2+320];
	cvt.rzi.ftz.s32.f32	%r133, %f70;
	cvt.rzi.ftz.s32.f32	%r134, %f69;
	min.s32 	%r135, %r133, %r134;
	cvt.rn.f32.s32	%f71, %r135;
	max.s32 	%r136, %r133, %r134;
	cvt.rn.f32.s32	%f72, %r136;
	cvt.rzi.ftz.s32.f32	%r137, %f71;
	cvt.rzi.ftz.s32.f32	%r138, %f68;
	min.s32 	%r139, %r137, %r138;
	cvt.rn.f32.s32	%f73, %r139;
	max.s32 	%r140, %r137, %r138;
	cvt.rn.f32.s32	%f74, %r140;
	ld.shared.f32 	%f75, [%rd2+332];
	cvt.rzi.ftz.s32.f32	%r141, %f75;
	ld.shared.f32 	%f76, [%rd2+336];
	cvt.rzi.ftz.s32.f32	%r142, %f76;
	min.s32 	%r143, %r141, %r142;
	cvt.rn.f32.s32	%f77, %r143;
	max.s32 	%r144, %r141, %r142;
	cvt.rn.f32.s32	%f78, %r144;
	// From here on the steps combine intermediate results of the groups above.
	cvt.rzi.ftz.s32.f32	%r145, %f21;
	cvt.rzi.ftz.s32.f32	%r146, %f30;
	min.s32 	%r147, %r145, %r146;
	cvt.rn.f32.s32	%f79, %r147;
	max.s32 	%r148, %r145, %r146;
	cvt.rn.f32.s32	%f80, %r148;
	cvt.rzi.ftz.s32.f32	%r149, %f22;
	cvt.rzi.ftz.s32.f32	%r150, %f31;
	min.s32 	%r151, %r149, %r150;
	cvt.rn.f32.s32	%f81, %r151;
	max.s32 	%r152, %r149, %r150;
	cvt.rn.f32.s32	%f82, %r152;
	cvt.rzi.ftz.s32.f32	%r153, %f12;
	cvt.rzi.ftz.s32.f32	%r154, %f82;
	min.s32 	%r155, %r153, %r154;
	cvt.rn.f32.s32	%f83, %r155;
	max.s32 	%r156, %r153, %r154;
	cvt.rn.f32.s32	%f84, %r156;
	cvt.rzi.ftz.s32.f32	%r157, %f83;
	cvt.rzi.ftz.s32.f32	%r158, %f81;
	min.s32 	%r159, %r157, %r158;
	cvt.rn.f32.s32	%f85, %r159;
	max.s32 	%r160, %r157, %r158;
	cvt.rn.f32.s32	%f86, %r160;
	cvt.rzi.ftz.s32.f32	%r161, %f38;
	cvt.rzi.ftz.s32.f32	%r162, %f36;
	min.s32 	%r163, %r161, %r162;
	cvt.rn.f32.s32	%f87, %r163;
	max.s32 	%r164, %r161, %r162;
	cvt.rn.f32.s32	%f88, %r164;
	cvt.rzi.ftz.s32.f32	%r165, %f37;
	cvt.rzi.ftz.s32.f32	%r166, %f88;
	min.s32 	%r167, %r165, %r166;
	cvt.rn.f32.s32	%f89, %r167;
	max.s32 	%r168, %r165, %r166;
	cvt.rn.f32.s32	%f90, %r168;
	cvt.rzi.ftz.s32.f32	%r169, %f89;
	cvt.rzi.ftz.s32.f32	%r170, %f87;
	min.s32 	%r171, %r169, %r170;
	cvt.rn.f32.s32	%f91, %r171;
	max.s32 	%r172, %r169, %r170;
	cvt.rn.f32.s32	%f92, %r172;
	cvt.rzi.ftz.s32.f32	%r173, %f46;
	cvt.rzi.ftz.s32.f32	%r174, %f55;
	min.s32 	%r175, %r173, %r174;
	cvt.rn.f32.s32	%f93, %r175;
	max.s32 	%r176, %r173, %r174;
	cvt.rn.f32.s32	%f94, %r176;
	// Window row 1, column 3 (92 = 1*80 + 3*4).
	ld.shared.f32 	%f95, [%rd2+92];
	cvt.rzi.ftz.s32.f32	%r177, %f95;
	cvt.rzi.ftz.s32.f32	%r178, %f94;
	min.s32 	%r179, %r177, %r178;
	cvt.rn.f32.s32	%f96, %r179;
	max.s32 	%r180, %r177, %r178;
	cvt.rn.f32.s32	%f97, %r180;
	cvt.rzi.ftz.s32.f32	%r181, %f96;
	cvt.rzi.ftz.s32.f32	%r182, %f93;
	min.s32 	%r183, %r181, %r182;
	cvt.rn.f32.s32	%f98, %r183;
	max.s32 	%r184, %r181, %r182;
	cvt.rn.f32.s32	%f99, %r184;
	cvt.rzi.ftz.s32.f32	%r185, %f47;
	cvt.rzi.ftz.s32.f32	%r186, %f56;
	min.s32 	%r187, %r185, %r186;
	cvt.rn.f32.s32	%f100, %r187;
	max.s32 	%r188, %r185, %r186;
	cvt.rn.f32.s32	%f101, %r188;
	cvt.rzi.ftz.s32.f32	%r189, %f33;
	cvt.rzi.ftz.s32.f32	%r190, %f101;
	min.s32 	%r191, %r189, %r190;
	cvt.rn.f32.s32	%f102, %r191;
	max.s32 	%r192, %r189, %r190;
	cvt.rn.f32.s32	%f103, %r192;
	cvt.rzi.ftz.s32.f32	%r193, %f102;
	cvt.rzi.ftz.s32.f32	%r194, %f100;
	min.s32 	%r195, %r193, %r194;
	cvt.rn.f32.s32	%f104, %r195;
	max.s32 	%r196, %r193, %r194;
	cvt.rn.f32.s32	%f105, %r196;
	cvt.rzi.ftz.s32.f32	%r197, %f45;
	cvt.rzi.ftz.s32.f32	%r198, %f54;
	min.s32 	%r199, %r197, %r198;
	cvt.rn.f32.s32	%f106, %r199;
	max.s32 	%r200, %r197, %r198;
	cvt.rn.f32.s32	%f107, %r200;
	// Window row 2, column 0 (160 = 2*80). This is the last shared-memory
	// read; everything after it operates on registers only.
	ld.shared.f32 	%f108, [%rd2+160];
	cvt.rzi.ftz.s32.f32	%r201, %f108;
	cvt.rzi.ftz.s32.f32	%r202, %f107;
	min.s32 	%r203, %r201, %r202;
	cvt.rn.f32.s32	%f109, %r203;
	max.s32 	%r204, %r201, %r202;
	cvt.rn.f32.s32	%f110, %r204;
	cvt.rzi.ftz.s32.f32	%r205, %f109;
	cvt.rzi.ftz.s32.f32	%r206, %f106;
	min.s32 	%r207, %r205, %r206;
	cvt.rn.f32.s32	%f111, %r207;
	max.s32 	%r208, %r205, %r206;
	cvt.rn.f32.s32	%f112, %r208;
	cvt.rzi.ftz.s32.f32	%r209, %f73;
	cvt.rzi.ftz.s32.f32	%r210, %f77;
	min.s32 	%r211, %r209, %r210;
	cvt.rn.f32.s32	%f113, %r211;
	max.s32 	%r212, %r209, %r210;
	cvt.rn.f32.s32	%f114, %r212;
	cvt.rzi.ftz.s32.f32	%r213, %f64;
	cvt.rzi.ftz.s32.f32	%r214, %f114;
	min.s32 	%r215, %r213, %r214;
	cvt.rn.f32.s32	%f115, %r215;
	max.s32 	%r216, %r213, %r214;
	cvt.rn.f32.s32	%f116, %r216;
	cvt.rzi.ftz.s32.f32	%r217, %f115;
	cvt.rzi.ftz.s32.f32	%r218, %f113;
	min.s32 	%r219, %r217, %r218;
	cvt.rn.f32.s32	%f117, %r219;
	max.s32 	%r220, %r217, %r218;
	cvt.rn.f32.s32	%f118, %r220;
	cvt.rzi.ftz.s32.f32	%r221, %f74;
	cvt.rzi.ftz.s32.f32	%r222, %f78;
	min.s32 	%r223, %r221, %r222;
	cvt.rn.f32.s32	%f119, %r223;
	max.s32 	%r224, %r221, %r222;
	cvt.rn.f32.s32	%f120, %r224;
	cvt.rzi.ftz.s32.f32	%r225, %f65;
	cvt.rzi.ftz.s32.f32	%r226, %f120;
	min.s32 	%r227, %r225, %r226;
	cvt.rn.f32.s32	%f121, %r227;
	max.s32 	%r228, %r225, %r226;
	cvt.rn.f32.s32	%f122, %r228;
	cvt.rzi.ftz.s32.f32	%r229, %f121;
	cvt.rzi.ftz.s32.f32	%r230, %f119;
	min.s32 	%r231, %r229, %r230;
	cvt.rn.f32.s32	%f123, %r231;
	max.s32 	%r232, %r229, %r230;
	cvt.rn.f32.s32	%f124, %r232;
	cvt.rzi.ftz.s32.f32	%r233, %f63;
	cvt.rzi.ftz.s32.f32	%r234, %f72;
	min.s32 	%r235, %r233, %r234;
	cvt.rn.f32.s32	%f125, %r235;
	max.s32 	%r236, %r233, %r234;
	cvt.rn.f32.s32	%f126, %r236;
	cvt.rzi.ftz.s32.f32	%r237, %f98;
	cvt.rzi.ftz.s32.f32	%r238, %f117;
	max.s32 	%r239, %r237, %r238;
	cvt.rn.f32.s32	%f127, %r239;
	cvt.rzi.ftz.s32.f32	%r240, %f104;
	cvt.rzi.ftz.s32.f32	%r241, %f123;
	min.s32 	%r242, %r240, %r241;
	cvt.rn.f32.s32	%f128, %r242;
	max.s32 	%r243, %r240, %r241;
	cvt.rn.f32.s32	%f129, %r243;
	cvt.rzi.ftz.s32.f32	%r244, %f85;
	cvt.rzi.ftz.s32.f32	%r245, %f129;
	min.s32 	%r246, %r244, %r245;
	cvt.rn.f32.s32	%f130, %r246;
	max.s32 	%r247, %r244, %r245;
	cvt.rn.f32.s32	%f131, %r247;
	cvt.rzi.ftz.s32.f32	%r248, %f130;
	cvt.rzi.ftz.s32.f32	%r249, %f128;
	max.s32 	%r250, %r248, %r249;
	cvt.rn.f32.s32	%f132, %r250;
	cvt.rzi.ftz.s32.f32	%r251, %f111;
	cvt.rzi.ftz.s32.f32	%r252, %f125;
	min.s32 	%r253, %r251, %r252;
	cvt.rn.f32.s32	%f133, %r253;
	max.s32 	%r254, %r251, %r252;
	cvt.rn.f32.s32	%f134, %r254;
	cvt.rzi.ftz.s32.f32	%r255, %f91;
	cvt.rzi.ftz.s32.f32	%r256, %f134;
	min.s32 	%r257, %r255, %r256;
	cvt.rn.f32.s32	%f135, %r257;
	max.s32 	%r258, %r255, %r256;
	cvt.rn.f32.s32	%f136, %r258;
	cvt.rzi.ftz.s32.f32	%r259, %f135;
	cvt.rzi.ftz.s32.f32	%r260, %f133;
	max.s32 	%r261, %r259, %r260;
	cvt.rn.f32.s32	%f137, %r261;
	cvt.rzi.ftz.s32.f32	%r262, %f99;
	cvt.rzi.ftz.s32.f32	%r263, %f118;
	min.s32 	%r264, %r262, %r263;
	cvt.rn.f32.s32	%f138, %r264;
	max.s32 	%r265, %r262, %r263;
	cvt.rn.f32.s32	%f139, %r265;
	cvt.rzi.ftz.s32.f32	%r266, %f79;
	cvt.rzi.ftz.s32.f32	%r267, %f139;
	min.s32 	%r268, %r266, %r267;
	cvt.rn.f32.s32	%f140, %r268;
	max.s32 	%r269, %r266, %r267;
	cvt.rn.f32.s32	%f141, %r269;
	cvt.rzi.ftz.s32.f32	%r270, %f140;
	cvt.rzi.ftz.s32.f32	%r271, %f138;
	max.s32 	%r272, %r270, %r271;
	cvt.rn.f32.s32	%f142, %r272;
	cvt.rzi.ftz.s32.f32	%r273, %f105;
	cvt.rzi.ftz.s32.f32	%r274, %f124;
	min.s32 	%r275, %r273, %r274;
	cvt.rn.f32.s32	%f143, %r275;
	max.s32 	%r276, %r273, %r274;
	cvt.rn.f32.s32	%f144, %r276;
	cvt.rzi.ftz.s32.f32	%r277, %f86;
	cvt.rzi.ftz.s32.f32	%r278, %f144;
	min.s32 	%r279, %r277, %r278;
	cvt.rn.f32.s32	%f145, %r279;
	cvt.rzi.ftz.s32.f32	%r280, %f145;
	cvt.rzi.ftz.s32.f32	%r281, %f143;
	min.s32 	%r282, %r280, %r281;
	cvt.rn.f32.s32	%f146, %r282;
	max.s32 	%r283, %r280, %r281;
	cvt.rn.f32.s32	%f147, %r283;
	cvt.rzi.ftz.s32.f32	%r284, %f112;
	cvt.rzi.ftz.s32.f32	%r285, %f126;
	min.s32 	%r286, %r284, %r285;
	cvt.rn.f32.s32	%f148, %r286;
	max.s32 	%r287, %r284, %r285;
	cvt.rn.f32.s32	%f149, %r287;
	cvt.rzi.ftz.s32.f32	%r288, %f92;
	cvt.rzi.ftz.s32.f32	%r289, %f149;
	min.s32 	%r290, %r288, %r289;
	cvt.rn.f32.s32	%f150, %r290;
	cvt.rzi.ftz.s32.f32	%r291, %f150;
	cvt.rzi.ftz.s32.f32	%r292, %f148;
	min.s32 	%r293, %r291, %r292;
	cvt.rn.f32.s32	%f151, %r293;
	cvt.rzi.ftz.s32.f32	%r294, %f97;
	cvt.rzi.ftz.s32.f32	%r295, %f116;
	min.s32 	%r296, %r294, %r295;
	cvt.rn.f32.s32	%f152, %r296;
	max.s32 	%r297, %r294, %r295;
	cvt.rn.f32.s32	%f153, %r297;
	cvt.rzi.ftz.s32.f32	%r298, %f80;
	cvt.rzi.ftz.s32.f32	%r299, %f153;
	min.s32 	%r300, %r298, %r299;
	cvt.rn.f32.s32	%f154, %r300;
	cvt.rzi.ftz.s32.f32	%r301, %f154;
	cvt.rzi.ftz.s32.f32	%r302, %f152;
	min.s32 	%r303, %r301, %r302;
	cvt.rn.f32.s32	%f155, %r303;
	max.s32 	%r304, %r301, %r302;
	cvt.rn.f32.s32	%f156, %r304;
	cvt.rzi.ftz.s32.f32	%r305, %f103;
	cvt.rzi.ftz.s32.f32	%r306, %f122;
	min.s32 	%r307, %r305, %r306;
	cvt.rn.f32.s32	%f157, %r307;
	max.s32 	%r308, %r305, %r306;
	cvt.rn.f32.s32	%f158, %r308;
	cvt.rzi.ftz.s32.f32	%r309, %f84;
	cvt.rzi.ftz.s32.f32	%r310, %f158;
	min.s32 	%r311, %r309, %r310;
	cvt.rn.f32.s32	%f159, %r311;
	cvt.rzi.ftz.s32.f32	%r312, %f159;
	cvt.rzi.ftz.s32.f32	%r313, %f157;
	min.s32 	%r314, %r312, %r313;
	cvt.rn.f32.s32	%f160, %r314;
	cvt.rzi.ftz.s32.f32	%r315, %f90;
	cvt.rzi.ftz.s32.f32	%r316, %f110;
	min.s32 	%r317, %r315, %r316;
	cvt.rn.f32.s32	%f161, %r317;
	cvt.rzi.ftz.s32.f32	%r318, %f161;
	cvt.rzi.ftz.s32.f32	%r319, %f136;
	min.s32 	%r320, %r318, %r319;
	cvt.rn.f32.s32	%f162, %r320;
	cvt.rzi.ftz.s32.f32	%r321, %f146;
	cvt.rzi.ftz.s32.f32	%r322, %f142;
	max.s32 	%r323, %r321, %r322;
	cvt.rn.f32.s32	%f163, %r323;
	cvt.rzi.ftz.s32.f32	%r324, %f155;
	cvt.rzi.ftz.s32.f32	%r325, %f127;
	max.s32 	%r326, %r324, %r325;
	cvt.rn.f32.s32	%f164, %r326;
	cvt.rzi.ftz.s32.f32	%r327, %f163;
	cvt.rzi.ftz.s32.f32	%r328, %f164;
	max.s32 	%r329, %r327, %r328;
	cvt.rn.f32.s32	%f165, %r329;
	cvt.rzi.ftz.s32.f32	%r330, %f132;
	cvt.rzi.ftz.s32.f32	%r331, %f165;
	max.s32 	%r332, %r330, %r331;
	cvt.rn.f32.s32	%f166, %r332;
	cvt.rzi.ftz.s32.f32	%r333, %f151;
	cvt.rzi.ftz.s32.f32	%r334, %f137;
	min.s32 	%r335, %r333, %r334;
	cvt.rn.f32.s32	%f167, %r335;
	max.s32 	%r336, %r333, %r334;
	cvt.rn.f32.s32	%f168, %r336;
	cvt.rzi.ftz.s32.f32	%r337, %f160;
	cvt.rzi.ftz.s32.f32	%r338, %f147;
	min.s32 	%r339, %r337, %r338;
	cvt.rn.f32.s32	%f169, %r339;
	max.s32 	%r340, %r337, %r338;
	cvt.rn.f32.s32	%f170, %r340;
	cvt.rzi.ftz.s32.f32	%r341, %f162;
	cvt.rzi.ftz.s32.f32	%r342, %f156;
	min.s32 	%r343, %r341, %r342;
	cvt.rn.f32.s32	%f171, %r343;
	max.s32 	%r344, %r341, %r342;
	cvt.rn.f32.s32	%f172, %r344;
	cvt.rzi.ftz.s32.f32	%r345, %f167;
	cvt.rzi.ftz.s32.f32	%r346, %f169;
	min.s32 	%r347, %r345, %r346;
	cvt.rn.f32.s32	%f173, %r347;
	max.s32 	%r348, %r345, %r346;
	cvt.rn.f32.s32	%f174, %r348;
	cvt.rzi.ftz.s32.f32	%r349, %f173;
	cvt.rzi.ftz.s32.f32	%r350, %f171;
	max.s32 	%r351, %r349, %r350;
	cvt.rn.f32.s32	%f175, %r351;
	cvt.rzi.ftz.s32.f32	%r352, %f170;
	cvt.rzi.ftz.s32.f32	%r353, %f172;
	min.s32 	%r354, %r352, %r353;
	cvt.rn.f32.s32	%f176, %r354;
	max.s32 	%r355, %r352, %r353;
	cvt.rn.f32.s32	%f177, %r355;
	cvt.rzi.ftz.s32.f32	%r356, %f168;
	cvt.rzi.ftz.s32.f32	%r357, %f177;
	min.s32 	%r358, %r356, %r357;
	cvt.rn.f32.s32	%f178, %r358;
	cvt.rzi.ftz.s32.f32	%r359, %f174;
	cvt.rzi.ftz.s32.f32	%r360, %f175;
	min.s32 	%r361, %r359, %r360;
	cvt.rn.f32.s32	%f179, %r361;
	max.s32 	%r362, %r359, %r360;
	cvt.rn.f32.s32	%f180, %r362;
	cvt.rzi.ftz.s32.f32	%r363, %f178;
	cvt.rzi.ftz.s32.f32	%r364, %f176;
	min.s32 	%r365, %r363, %r364;
	cvt.rn.f32.s32	%f181, %r365;
	max.s32 	%r366, %r363, %r364;
	cvt.rn.f32.s32	%f182, %r366;
	cvt.rzi.ftz.s32.f32	%r367, %f179;
	cvt.rzi.ftz.s32.f32	%r368, %f181;
	min.s32 	%r369, %r367, %r368;
	cvt.rn.f32.s32	%f183, %r369;
	max.s32 	%r370, %r367, %r368;
	cvt.rn.f32.s32	%f184, %r370;
	cvt.rzi.ftz.s32.f32	%r371, %f183;
	cvt.rzi.ftz.s32.f32	%r372, %f166;
	max.s32 	%r373, %r371, %r372;
	cvt.rn.f32.s32	%f185, %r373;
	cvt.rzi.ftz.s32.f32	%r374, %f182;
	cvt.rzi.ftz.s32.f32	%r375, %f185;
	min.s32 	%r376, %r374, %r375;
	cvt.rn.f32.s32	%f186, %r376;
	max.s32 	%r377, %r374, %r375;
	cvt.rn.f32.s32	%f187, %r377;
	cvt.rzi.ftz.s32.f32	%r378, %f180;
	cvt.rzi.ftz.s32.f32	%r379, %f187;
	min.s32 	%r380, %r378, %r379;
	cvt.rn.f32.s32	%f188, %r380;
	cvt.rzi.ftz.s32.f32	%r381, %f188;
	cvt.rzi.ftz.s32.f32	%r382, %f184;
	min.s32 	%r383, %r381, %r382;
	cvt.rn.f32.s32	%f189, %r383;
	max.s32 	%r384, %r381, %r382;
	cvt.rn.f32.s32	%f190, %r384;
	cvt.rzi.ftz.s32.f32	%r385, %f186;
	cvt.rzi.ftz.s32.f32	%r386, %f131;
	min.s32 	%r387, %r385, %r386;
	cvt.rn.f32.s32	%f191, %r387;
	max.s32 	%r388, %r385, %r386;
	cvt.rn.f32.s32	%f192, %r388;
	cvt.rzi.ftz.s32.f32	%r389, %f189;
	cvt.rzi.ftz.s32.f32	%r390, %f191;
	max.s32 	%r391, %r389, %r390;
	cvt.rn.f32.s32	%f193, %r391;
	cvt.rzi.ftz.s32.f32	%r392, %f190;
	cvt.rzi.ftz.s32.f32	%r393, %f192;
	min.s32 	%r394, %r392, %r393;
	cvt.rn.f32.s32	%f194, %r394;
	cvt.rzi.ftz.s32.f32	%r395, %f193;
	cvt.rzi.ftz.s32.f32	%r396, %f141;
	min.s32 	%r397, %r395, %r396;
	cvt.rn.f32.s32	%f195, %r397;
	max.s32 	%r398, %r395, %r396;
	cvt.rn.f32.s32	%f196, %r398;
	cvt.rzi.ftz.s32.f32	%r399, %f194;
	cvt.rzi.ftz.s32.f32	%r400, %f196;
	min.s32 	%r401, %r399, %r400;
	cvt.rn.f32.s32	%f197, %r401;
	cvt.rzi.ftz.s32.f32	%r402, %f197;
	cvt.rzi.ftz.s32.f32	%r403, %f195;
	max.s32 	%r404, %r402, %r403;
	// %f198 is the final output of the network.
	cvt.rn.f32.s32	%f198, %r404;
	// Store at output[y * param_1 + x] using the unclamped coordinates,
	// which the %p15 guard above has already bounded by param_4 / param_5.
	mad.lo.s32 	%r405, %r4, %r9, %r1;
	cvta.to.global.u64 	%rd27, %rd3;
	mul.wide.s32 	%rd28, %r405, 4;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.f32 	[%rd29], %f198;

BB8_16:
	ret;
}


