//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Wed Jul 10 12:41:20 2013 (1373485280)
// Cuda compilation tools, release 5.5, V5.5.0
//

.version 3.2
.target sm_30
.address_size 64

	.file	1 "D:/singlebarrel/releases/2014.03/shared/adobe/MediaCore/GPUFoundation/Src/ImageProcessing/GaussianBlur.cu", 1399785311, 21911
	.file	2 "D:\\singlebarrel\\releases\\2014.03\\shared\\adobe\\MediaCore\\GPUFoundation\\API\\Inc\\GPUFoundation/KernelSupport/PixelUtils.h", 1399785310, 5707
	.file	3 "d:\\singlebarrel\\releases\\2014.03\\shared\\adobe\\mediacore\\external\\3rdparty\\nvidia\\cuda\\win\\include\\device_functions.h", 1399785281, 191626
// VerticalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_8773_non_const_smem has been demoted
// VerticalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_9460_non_const_smem has been demoted
// HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem has been demoted
// HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem has been demoted
// NUL-terminated ASCII byte string; the 11 values spell "__CUDA_FTZ" followed by 0.
// NOTE(review): no code in the visible part of this file references $str.
.global .align 1 .b8 $str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

// UnpremultiplyPixel(PixelRGB) -> 16-byte pixel
//
// Input : param_0, four f32 lanes at byte offsets 0, 4, 8, 12.
//         Lane 3 (offset 12) is the divisor lane; lanes 0..2 are the values
//         that get divided by it.
// Output: func_retval0, same four-lane layout.
//
// Behaviour:
//   d = saturate(lane3)                      (clamped to [0, 1], flush-to-zero)
//   if (d + 0fB70637BD) > 0   (0fB70637BD is roughly -8.0e-6):
//        result = { lane0/d, lane1/d, lane2/d, d }   (approximate reciprocal)
//   else result = { 0, 0, 0, 0 }
//
// The result lanes start out as zero and are overwritten under the predicate
// %divide, so no branch is needed.
.visible .func  (.param .align 16 .b8 func_retval0[16]) _Z18UnpremultiplyPixel8PixelRGB(
	.param .align 16 .b8 _Z18UnpremultiplyPixel8PixelRGB_param_0[16]
)
{
	.reg .pred 	%divide;
	.reg .f32 	%lane0;
	.reg .f32 	%lane1;
	.reg .f32 	%lane2;
	.reg .f32 	%lane3;
	.reg .f32 	%dsat;
	.reg .f32 	%dbias;
	.reg .f32 	%unit;
	.reg .f32 	%recip;
	.reg .f32 	%out0;
	.reg .f32 	%out1;
	.reg .f32 	%out2;
	.reg .f32 	%out3;

	// Fetch the four input lanes.
	ld.param.f32 	%lane0, [_Z18UnpremultiplyPixel8PixelRGB_param_0];
	ld.param.f32 	%lane1, [_Z18UnpremultiplyPixel8PixelRGB_param_0+4];
	ld.param.f32 	%lane2, [_Z18UnpremultiplyPixel8PixelRGB_param_0+8];
	ld.param.f32 	%lane3, [_Z18UnpremultiplyPixel8PixelRGB_param_0+12];

	// Clamp the divisor lane to [0, 1].
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%dsat, %lane3;

	// Only divide when the clamped divisor exceeds the small threshold.
	.loc 2 45 1
	add.ftz.f32 	%dbias, %dsat, 0fB70637BD;
	setp.gtu.ftz.f32	%divide, %dbias, 0f00000000;

	// Default result: all four lanes zero.
	mov.f32 	%out0, 0f00000000;
	mov.f32 	%out1, 0f00000000;
	mov.f32 	%out2, 0f00000000;
	mov.f32 	%out3, 0f00000000;

	// Predicated path: scale lanes 0..2 by 1/divisor and pass the divisor through.
	mov.f32 	%unit, 0f3F800000;
	.loc 3 3606 10
	@%divide div.approx.ftz.f32 	%recip, %unit, %dsat;
	.loc 2 45 1
	@%divide mul.ftz.f32 	%out0, %lane0, %recip;
	@%divide mul.ftz.f32 	%out1, %lane1, %recip;
	@%divide mul.ftz.f32 	%out2, %lane2, %recip;
	@%divide mov.f32 	%out3, %dsat;

	// Write the result pixel back to the caller.
	st.param.f32	[func_retval0+0], %out0;
	st.param.f32	[func_retval0+4], %out1;
	st.param.f32	[func_retval0+8], %out2;
	st.param.f32	[func_retval0+12], %out3;
	.loc 2 45 1
	ret;
}

// VerticalRecursiveGaussian — device function. The mangled name decodes to
//   (float*, const float4*, int, int, int, float4*, int, int, int,
//    DevicePixelFormat, int, float x 8).
//
// Parameters, described by how the code below uses them:
//   param_0      (%rd3)  scratch float buffer, accessed with generic ld/st
//                        (presumably the calling kernel's shared-memory array — TODO confirm)
//   param_1      (%rd4)  source pixel base address
//   param_2      (%r39)  source row stride, in pixels
//   param_3      (%r40)  source width  (upper x bound for source reads)
//   param_4      (%r41)  source height (upper y bound for source reads)
//   param_5      (%rd5)  destination pixel base address
//   param_6      (%r42)  destination row stride, in pixels
//   param_7      (%r43)  destination width  (threads with x >= this return at once)
//   param_8      (%r44)  destination height (loop bound and write bound)
//   param_9      (%r45)  pixel format: 0 -> 4 x f16 (8 bytes/pixel), non-zero -> 4 x f32 (16 bytes/pixel)
//   param_10     (%r46)  non-zero: seed the recursion history with 0.5 * the edge sample
//   param_11..14 (%f120..%f123)  coefficients of the first (increasing-row) recursion
//   param_15..18 (%f124..%f127)  coefficients of the second (decreasing-row) recursion
//
// Overall shape:
//   Pass 1 (BB1_3 loop): walk rows upward in steps of 4. Each step loads one
//     source pixel per thread (zero when outside the source), raises lanes 0..2
//     to the power 2.2 with the sign kept, multiplies them by the saturated
//     lane 3, stages the four lanes in the scratch buffer, runs a 4-tap
//     recursion down 4 scratch rows, and writes the result to the destination.
//   Pass 2 (BB1_25 loop): walk the same rows downward in steps of 4, repeat the
//     load/stage step, run the second recursion in reverse row order, add the
//     result to what pass 1 left in the destination, divide lanes 0..2 by the
//     saturated lane 3, raise them to the power 1/2.2 with the sign kept, and
//     store the final pixel.
//
// Scratch layout: a pixel is staged at float index tid.y*32 + tid.x, with the four
// lanes 512 bytes apart. The recursion then reads float index tid.y*128 + tid.x
// at +0/+128/+256/+384 bytes, i.e. thread row tid.y filters lane tid.y over 4 rows.
// NOTE(review): this indexing appears to assume a 32 x 4 thread block — confirm
// against the launch configuration of the calling kernels.
.visible .func _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff(
	.param .b64 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_0,
	.param .b64 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_1,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_2,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_3,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_4,
	.param .b64 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_5,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_6,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_7,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_8,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_9,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_10,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_11,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_12,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_13,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_14,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_15,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_16,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_17,
	.param .b32 _Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_18
)
{
	.reg .pred 	%p<49>;
	.reg .s16 	%rs<33>;
	.reg .s32 	%r<114>;
	.reg .f32 	%f<309>;
	.reg .s64 	%rd<32>;


	// ---- Load all parameters into registers (see the header for their roles). ----
	ld.param.u64 	%rd3, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_0];
	ld.param.u64 	%rd4, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_1];
	ld.param.u32 	%r39, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_2];
	ld.param.u32 	%r40, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_3];
	ld.param.u32 	%r41, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_4];
	ld.param.u64 	%rd5, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_5];
	ld.param.u32 	%r42, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_6];
	ld.param.u32 	%r43, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_7];
	ld.param.u32 	%r44, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_8];
	ld.param.u32 	%r45, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_9];
	ld.param.u32 	%r46, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_10];
	ld.param.f32 	%f120, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_11];
	ld.param.f32 	%f121, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_12];
	ld.param.f32 	%f122, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_13];
	ld.param.f32 	%f123, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_14];
	ld.param.f32 	%f124, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_15];
	ld.param.f32 	%f125, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_16];
	ld.param.f32 	%f126, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_17];
	ld.param.f32 	%f127, [_Z25VerticalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_18];
	// ---- Thread coordinates ----
	// %r1   = ctaid.x * ntid.x + tid.x              destination x
	// %r2   = %r1 - ((dstWidth  - srcWidth ) >> 1)  source x
	// %r113 = tid.y                                  destination y (advanced by 4 per loop step)
	// %r111 = tid.y - ((dstHeight - srcHeight) >> 1) source y (advanced in lock-step with %r113)
	.loc 1 77 1
	mov.u32 	%r47, %ntid.x;
	mov.u32 	%r48, %ctaid.x;
	mov.u32 	%r49, %tid.x;
	mad.lo.s32 	%r1, %r48, %r47, %r49;
	sub.s32 	%r50, %r43, %r40;
	shr.s32 	%r51, %r50, 1;
	sub.s32 	%r2, %r1, %r51;
	sub.s32 	%r52, %r44, %r41;
	shr.s32 	%r53, %r52, 1;
	mov.u32 	%r113, %tid.y;
	sub.s32 	%r111, %r113, %r53;
	// Threads whose destination x is at or beyond the destination width return immediately.
	.loc 1 77 1
	setp.ge.s32	%p1, %r1, %r43;
	@%p1 bra 	BB1_60;

	// %r109 = source pixel index (srcY * srcStride + srcX)
	// %r103 = destination pixel index (dstY * dstStride + dstX)
	// %r112 = rows processed so far (loop counter, step 4)
	.loc 1 77 1
	mad.lo.s32 	%r109, %r111, %r39, %r2;
	mad.lo.s32 	%r103, %r113, %r42, %r1;
	mov.u32 	%r112, 0;
	// Skip pass 1 entirely when the destination height is not positive.
	.loc 1 77 1
	setp.gt.s32	%p2, %r44, 0;
	@%p2 bra 	BB1_2;
	bra.uni 	BB1_23;

BB1_2:
	// Pass-1 loop-carried state, all starting at zero:
	//   %f271 = last input sample of the previous 4-row chunk
	//   %f270 = last output of the previous chunk, %f269 = the output before that
	mov.f32 	%f271, 0f00000000;
	mov.f32 	%f270, %f271;
	mov.f32 	%f269, %f271;
	mov.u32 	%r110, %r109;

	// ================= Pass 1: rows in increasing order, 4 per iteration =================
BB1_3:
	// %p9 = source coordinate is inside [0, srcWidth) x [0, srcHeight).
	setp.lt.s32	%p3, %r2, %r40;
	setp.gt.s32	%p4, %r2, -1;
	.loc 1 77 1
	setp.lt.s32	%p5, %r111, %r41;
	setp.gt.s32	%p6, %r111, -1;
	and.pred  	%p7, %p6, %p5;
	and.pred  	%p8, %p7, %p4;
	and.pred  	%p9, %p8, %p3;
	.loc 1 77 1
	@%p9 bra 	BB1_5;

	// Outside the source: the staged pixel is all zeros.
	mov.f32 	%f282, 0f00000000;
	mov.f32 	%f281, %f282;
	mov.f32 	%f280, %f282;
	mov.f32 	%f279, %f282;
	bra.uni 	BB1_18;

BB1_5:
	// Format select: %r45 == 0 takes the 4 x f16 path, otherwise 4 x f32.
	setp.eq.s32	%p10, %r45, 0;
	.loc 1 77 1
	@%p10 bra 	BB1_7;

	// 4 x f32 source pixel: 16 bytes per pixel.
	mul.wide.s32 	%rd6, %r110, 16;
	add.s64 	%rd7, %rd4, %rd6;
	ld.v4.f32 	{%f135, %f136, %f137, %f138}, [%rd7];
	mov.f32 	%f275, %f138;
	mov.f32 	%f274, %f137;
	mov.f32 	%f273, %f136;
	mov.f32 	%f272, %f135;
	bra.uni 	BB1_8;

BB1_7:
	// 4 x f16 source pixel: 8 bytes per pixel, each lane widened to f32.
	mul.wide.s32 	%rd8, %r110, 8;
	add.s64 	%rd9, %rd4, %rd8;
	.loc 1 77 1
	ld.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd9];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f272, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f273, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f274, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f275, %temp;
	}

BB1_8:
	// %f282 = lane 3 clamped to [0, 1].
	// Lanes 0..2 are each mapped to sign(c) * |c|^2.2, computed as ex2(2.2 * lg2(|c|))
	// (0f400CCCCD is 2.2). setp.ltu sends negative or NaN inputs to the negated branch.
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f282, %f275;
	setp.ltu.ftz.f32	%p11, %f272, 0f00000000;
	@%p11 bra 	BB1_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f139, %f272;
	mul.ftz.f32 	%f140, %f139, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f276, %f140;
	bra.uni 	BB1_11;

BB1_10:
	neg.ftz.f32 	%f141, %f272;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f142, %f141;
	mul.ftz.f32 	%f143, %f142, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f144, %f143;
	neg.ftz.f32 	%f276, %f144;

BB1_11:
	// Same power mapping for lane 1.
	setp.ltu.ftz.f32	%p12, %f273, 0f00000000;
	@%p12 bra 	BB1_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f145, %f273;
	mul.ftz.f32 	%f146, %f145, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f277, %f146;
	bra.uni 	BB1_14;

BB1_13:
	neg.ftz.f32 	%f147, %f273;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f148, %f147;
	mul.ftz.f32 	%f149, %f148, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f150, %f149;
	neg.ftz.f32 	%f277, %f150;

BB1_14:
	// Same power mapping for lane 2.
	setp.ltu.ftz.f32	%p13, %f274, 0f00000000;
	@%p13 bra 	BB1_16;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f151, %f274;
	mul.ftz.f32 	%f152, %f151, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f278, %f152;
	bra.uni 	BB1_17;

BB1_16:
	neg.ftz.f32 	%f153, %f274;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f154, %f153;
	mul.ftz.f32 	%f155, %f154, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f156, %f155;
	neg.ftz.f32 	%f278, %f156;

BB1_17:
	// Multiply lanes 0..2 by the clamped lane 3.
	mul.ftz.f32 	%f281, %f278, %f282;
	mul.ftz.f32 	%f280, %f277, %f282;
	mul.ftz.f32 	%f279, %f276, %f282;

BB1_18:
	// Stage the pixel in scratch at float index tid.y*32 + tid.x;
	// the four lanes land 512 bytes apart (+0, +512, +1024, +1536).
	.loc 1 77 1
	mov.u32 	%r12, %tid.y;
	.loc 1 77 1
	shl.b32 	%r63, %r12, 5;
	add.s32 	%r64, %r63, %r49;
	mul.wide.s32 	%rd10, %r64, 4;
	add.s64 	%rd11, %rd3, %rd10;
	.loc 1 77 1
	st.f32 	[%rd11], %f279;
	st.f32 	[%rd11+512], %f280;
	st.f32 	[%rd11+1024], %f281;
	st.f32 	[%rd11+1536], %f282;
	// Wait until every thread of the block has staged its pixel.
	.loc 1 77 1
	bar.sync 	0;
	// Filter address: float index tid.y*128 + tid.x, so this thread now owns the
	// 512-byte plane selected by tid.y and walks 4 rows spaced 128 bytes apart.
	.loc 1 77 1
	shl.b32 	%r65, %r12, 7;
	add.s32 	%r66, %r49, %r65;
	mul.wide.s32 	%rd12, %r66, 4;
	add.s64 	%rd13, %rd3, %rd12;
	// %p16 = first chunk (%r112 == 0) and the seed flag (%r46) is non-zero.
	.loc 1 77 1
	setp.eq.s32	%p14, %r112, 0;
	setp.ne.s32	%p15, %r46, 0;
	and.pred  	%p16, %p14, %p15;
	.loc 1 77 1
	ld.f32 	%f157, [%rd13];
	// Seed value = 0.5 * first input sample (0f3F000000 is 0.5).
	.loc 1 77 1
	mul.ftz.f32 	%f158, %f157, 0f3F000000;
	// Previous two outputs: the seed when %p16 holds, otherwise the carried history.
	.loc 1 77 1
	selp.f32	%f159, %f158, %f270, %p16;
	selp.f32	%f160, %f158, %f269, %p16;
	// Recursion applied to rows 0..3 of the chunk, results stored in place:
	//   y[n] = x[n]*%f120 + x[n-1]*%f121 + y[n-1]*%f122 + y[n-2]*%f123
	.loc 1 77 1
	mul.ftz.f32 	%f161, %f271, %f121;
	fma.rn.ftz.f32 	%f162, %f157, %f120, %f161;
	fma.rn.ftz.f32 	%f163, %f159, %f122, %f162;
	fma.rn.ftz.f32 	%f164, %f160, %f123, %f163;
	st.f32 	[%rd13], %f164;
	ld.f32 	%f165, [%rd13+128];
	mul.ftz.f32 	%f166, %f157, %f121;
	fma.rn.ftz.f32 	%f167, %f165, %f120, %f166;
	fma.rn.ftz.f32 	%f168, %f164, %f122, %f167;
	fma.rn.ftz.f32 	%f169, %f159, %f123, %f168;
	st.f32 	[%rd13+128], %f169;
	ld.f32 	%f170, [%rd13+256];
	mul.ftz.f32 	%f171, %f165, %f121;
	fma.rn.ftz.f32 	%f172, %f170, %f120, %f171;
	fma.rn.ftz.f32 	%f173, %f169, %f122, %f172;
	fma.rn.ftz.f32 	%f269, %f164, %f123, %f173;
	st.f32 	[%rd13+256], %f269;
	ld.f32 	%f271, [%rd13+384];
	mul.ftz.f32 	%f174, %f170, %f121;
	fma.rn.ftz.f32 	%f175, %f271, %f120, %f174;
	fma.rn.ftz.f32 	%f176, %f269, %f122, %f175;
	fma.rn.ftz.f32 	%f270, %f169, %f123, %f176;
	st.f32 	[%rd13+384], %f270;
	// Wait until all planes of the chunk have been filtered.
	.loc 1 77 1
	bar.sync 	0;
	// Only rows inside the destination height are written out.
	.loc 1 77 1
	setp.ge.s32	%p17, %r113, %r44;
	@%p17 bra 	BB1_22;

	// Re-read this thread's own pixel (four lanes) from the staging layout.
	setp.eq.s32	%p18, %r45, 0;
	mul.wide.s32 	%rd14, %r64, 4;
	add.s64 	%rd15, %rd3, %rd14;
	.loc 1 77 1
	ld.f32 	%f36, [%rd15];
	ld.f32 	%f37, [%rd15+512];
	ld.f32 	%f38, [%rd15+1024];
	ld.f32 	%f39, [%rd15+1536];
	.loc 1 77 1
	@%p18 bra 	BB1_21;

	// 4 x f32 destination store (16 bytes per pixel).
	mul.wide.s32 	%rd16, %r103, 16;
	add.s64 	%rd17, %rd5, %rd16;
	.loc 1 77 1
	st.v4.f32 	[%rd17], {%f36, %f37, %f38, %f39};
	bra.uni 	BB1_22;

BB1_21:
	// 4 x f16 destination store (8 bytes per pixel), round-to-nearest conversion.
	mul.wide.s32 	%rd18, %r103, 8;
	add.s64 	%rd19, %rd5, %rd18;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f36;
	mov.b16 	%rs9, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f37;
	mov.b16 	%rs10, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f38;
	mov.b16 	%rs11, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f39;
	mov.b16 	%rs12, %temp;
}
	st.v4.u16 	[%rd19], {%rs9, %rs10, %rs11, %rs12};

BB1_22:
	// Barrier before the scratch buffer is overwritten by the next chunk.
	.loc 1 77 1
	bar.sync 	0;
	// Advance every row cursor by 4: source y, destination y, source index,
	// destination index and the processed-row counter.
	.loc 1 77 1
	add.s32 	%r111, %r111, 4;
	add.s32 	%r113, %r113, 4;
	shl.b32 	%r71, %r39, 2;
	add.s32 	%r110, %r110, %r71;
	shl.b32 	%r72, %r42, 2;
	add.s32 	%r103, %r103, %r72;
	add.s32 	%r112, %r112, 4;
	// Continue while the counter is below the destination height.
	.loc 1 77 1
	setp.lt.s32	%p19, %r112, %r44;
	mov.u32 	%r109, %r110;
	@%p19 bra 	BB1_3;

BB1_23:
	// Pass 2 starts from where pass 1 stopped; nothing to do if no rows were processed.
	.loc 1 77 1
	mov.u32 	%r108, %r109;
	setp.lt.s32	%p20, %r112, 1;
	@%p20 bra 	BB1_60;

	// %r23  = rows covered by pass 1, minus 1.
	// Pass-2 loop-carried state, all starting at zero:
	//   %f286 / %f285 = inputs of the nearest / second-nearest previously visited row
	//   %f284 / %f283 = history values fed into the %f126 / %f127 taps
	// %r104 = pass-2 iteration count.
	add.s32 	%r23, %r112, -1;
	mov.f32 	%f286, 0f00000000;
	mov.f32 	%f285, %f286;
	mov.f32 	%f284, %f286;
	mov.f32 	%f283, %f286;
	mov.u32 	%r104, 0;

	// ================= Pass 2: rows in decreasing order, 4 per iteration =================
BB1_25:
	.loc 1 77 1
	mov.u32 	%r25, %r108;
	setp.lt.s32	%p21, %r2, %r40;
	setp.gt.s32	%p22, %r2, -1;
	// %r29 = %r23 - 4*iteration: row number of the chunk's last scratch row (+384).
	// All row cursors step back by 4; %r32 = source index for this thread in this chunk.
	.loc 1 77 1
	mad.lo.s32 	%r29, %r104, -4, %r23;
	add.s32 	%r112, %r112, -4;
	add.s32 	%r113, %r113, -4;
	shl.b32 	%r81, %r39, 2;
	sub.s32 	%r32, %r25, %r81;
	add.s32 	%r111, %r111, -4;
	// %p27 = source coordinate is inside the source image.
	.loc 1 77 1
	setp.gt.s32	%p23, %r111, -1;
	setp.lt.s32	%p24, %r111, %r41;
	and.pred  	%p25, %p23, %p24;
	and.pred  	%p26, %p25, %p22;
	and.pred  	%p27, %p26, %p21;
	.loc 1 77 1
	@%p27 bra 	BB1_27;

	// Outside the source: the staged pixel is all zeros.
	mov.f32 	%f297, 0f00000000;
	mov.f32 	%f296, %f297;
	mov.f32 	%f295, %f297;
	mov.f32 	%f294, %f297;
	bra.uni 	BB1_40;

BB1_27:
	// Format select, as in pass 1.
	setp.eq.s32	%p28, %r45, 0;
	.loc 1 77 1
	@%p28 bra 	BB1_29;

	// 4 x f32 source pixel.
	mul.wide.s32 	%rd20, %r32, 16;
	add.s64 	%rd21, %rd4, %rd20;
	ld.v4.f32 	{%f185, %f186, %f187, %f188}, [%rd21];
	mov.f32 	%f290, %f188;
	mov.f32 	%f289, %f187;
	mov.f32 	%f288, %f186;
	mov.f32 	%f287, %f185;
	bra.uni 	BB1_30;

BB1_29:
	// 4 x f16 source pixel, widened to f32.
	mul.wide.s32 	%rd22, %r32, 8;
	add.s64 	%rd23, %rd4, %rd22;
	.loc 1 77 1
	ld.v4.u16 	{%rs13, %rs14, %rs15, %rs16}, [%rd23];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs13;
	cvt.f32.f16 	%f287, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs14;
	cvt.f32.f16 	%f288, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs15;
	cvt.f32.f16 	%f289, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs16;
	cvt.f32.f16 	%f290, %temp;
	}

BB1_30:
	// Same input conditioning as pass 1: clamp lane 3, then sign(c) * |c|^2.2 on lanes 0..2.
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f297, %f290;
	setp.ltu.ftz.f32	%p29, %f287, 0f00000000;
	@%p29 bra 	BB1_32;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f189, %f287;
	mul.ftz.f32 	%f190, %f189, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f291, %f190;
	bra.uni 	BB1_33;

BB1_32:
	neg.ftz.f32 	%f191, %f287;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f192, %f191;
	mul.ftz.f32 	%f193, %f192, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f194, %f193;
	neg.ftz.f32 	%f291, %f194;

BB1_33:
	setp.ltu.ftz.f32	%p30, %f288, 0f00000000;
	@%p30 bra 	BB1_35;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f195, %f288;
	mul.ftz.f32 	%f196, %f195, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f292, %f196;
	bra.uni 	BB1_36;

BB1_35:
	neg.ftz.f32 	%f197, %f288;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f198, %f197;
	mul.ftz.f32 	%f199, %f198, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f200, %f199;
	neg.ftz.f32 	%f292, %f200;

BB1_36:
	setp.ltu.ftz.f32	%p31, %f289, 0f00000000;
	@%p31 bra 	BB1_38;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f201, %f289;
	mul.ftz.f32 	%f202, %f201, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f293, %f202;
	bra.uni 	BB1_39;

BB1_38:
	neg.ftz.f32 	%f203, %f289;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f204, %f203;
	mul.ftz.f32 	%f205, %f204, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f206, %f205;
	neg.ftz.f32 	%f293, %f206;

BB1_39:
	// Multiply lanes 0..2 by the clamped lane 3.
	mul.ftz.f32 	%f296, %f293, %f297;
	mul.ftz.f32 	%f295, %f292, %f297;
	mul.ftz.f32 	%f294, %f291, %f297;

BB1_40:
	// Stage the pixel in scratch (same layout as pass 1).
	.loc 1 77 1
	mov.u32 	%r34, %tid.y;
	.loc 1 77 1
	shl.b32 	%r82, %r34, 5;
	add.s32 	%r83, %r82, %r49;
	mul.wide.s32 	%rd24, %r83, 4;
	add.s64 	%rd25, %rd3, %rd24;
	.loc 1 77 1
	st.f32 	[%rd25], %f294;
	st.f32 	[%rd25+512], %f295;
	st.f32 	[%rd25+1024], %f296;
	st.f32 	[%rd25+1536], %f297;
	.loc 1 77 1
	bar.sync 	0;
	// Filter address: plane tid.y, column tid.x (as in pass 1).
	.loc 1 77 1
	shl.b32 	%r84, %r34, 7;
	add.s32 	%r85, %r49, %r84;
	mul.wide.s32 	%rd26, %r85, 4;
	add.s64 	%rd27, %rd3, %rd26;
	// Reverse recursion over scratch rows +384, +256, +128, +0. For each row:
	//   y[n] = x[n+1]*%f124 + x[n+2]*%f125 + h1*%f126 + h2*%f127
	// where h1/h2 are the carried history values, or 0.5 * x[n] when that row is
	// destination row (height - 1) and the seed flag (%r46) is non-zero.
	// Row +384: seed test is %r29 == height - 1.
	.loc 1 77 1
	add.s32 	%r86, %r44, -1;
	setp.eq.s32	%p32, %r29, %r86;
	setp.ne.s32	%p33, %r46, 0;
	and.pred  	%p34, %p32, %p33;
	.loc 1 77 1
	ld.f32 	%f207, [%rd27+384];
	.loc 1 77 1
	mul.ftz.f32 	%f208, %f207, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f209, %f208, %f284, %p34;
	selp.f32	%f210, %f208, %f283, %p34;
	.loc 1 77 1
	mul.ftz.f32 	%f211, %f285, %f125;
	fma.rn.ftz.f32 	%f212, %f286, %f124, %f211;
	fma.rn.ftz.f32 	%f213, %f209, %f126, %f212;
	fma.rn.ftz.f32 	%f214, %f210, %f127, %f213;
	st.f32 	[%rd27+384], %f214;
	// Row +256: seed test %r29 == height is the same as (%r29 - 1) == height - 1.
	.loc 1 77 1
	setp.eq.s32	%p35, %r29, %r44;
	and.pred  	%p36, %p35, %p33;
	.loc 1 77 1
	ld.f32 	%f215, [%rd27+256];
	.loc 1 77 1
	mul.ftz.f32 	%f216, %f215, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f217, %f216, %f214, %p36;
	selp.f32	%f218, %f216, %f209, %p36;
	.loc 1 77 1
	mul.ftz.f32 	%f219, %f286, %f125;
	fma.rn.ftz.f32 	%f220, %f207, %f124, %f219;
	fma.rn.ftz.f32 	%f221, %f217, %f126, %f220;
	fma.rn.ftz.f32 	%f222, %f218, %f127, %f221;
	st.f32 	[%rd27+256], %f222;
	// Row +128: seed test is (%r29 - 2) == height - 1.
	.loc 1 77 1
	add.s32 	%r87, %r29, -2;
	setp.eq.s32	%p37, %r87, %r86;
	and.pred  	%p38, %p37, %p33;
	.loc 1 77 1
	ld.f32 	%f285, [%rd27+128];
	.loc 1 77 1
	mul.ftz.f32 	%f223, %f285, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f224, %f223, %f222, %p38;
	selp.f32	%f225, %f223, %f217, %p38;
	.loc 1 77 1
	mul.ftz.f32 	%f226, %f207, %f125;
	fma.rn.ftz.f32 	%f227, %f215, %f124, %f226;
	fma.rn.ftz.f32 	%f228, %f224, %f126, %f227;
	fma.rn.ftz.f32 	%f229, %f225, %f127, %f228;
	st.f32 	[%rd27+128], %f229;
	// Row +0: seed test is (%r29 - 3) == height - 1. This row also refreshes the
	// carried state (%f286, %f285, %f284, %f283) for the next chunk.
	.loc 1 77 1
	add.s32 	%r88, %r29, -3;
	setp.eq.s32	%p39, %r88, %r86;
	and.pred  	%p40, %p39, %p33;
	.loc 1 77 1
	ld.f32 	%f286, [%rd27];
	.loc 1 77 1
	mul.ftz.f32 	%f230, %f286, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f283, %f230, %f229, %p40;
	selp.f32	%f231, %f230, %f224, %p40;
	.loc 1 77 1
	mul.ftz.f32 	%f232, %f215, %f125;
	fma.rn.ftz.f32 	%f233, %f285, %f124, %f232;
	fma.rn.ftz.f32 	%f234, %f283, %f126, %f233;
	fma.rn.ftz.f32 	%f284, %f231, %f127, %f234;
	st.f32 	[%rd27], %f284;
	// Wait until all planes of the chunk have been filtered.
	.loc 1 77 1
	bar.sync 	0;
	// %r36 = destination pixel index for this thread's row in this chunk.
	.loc 1 77 1
	mad.lo.s32 	%r36, %r113, %r42, %r1;
	// Rows at or beyond the destination height are not combined or written.
	.loc 1 77 1
	setp.ge.s32	%p41, %r113, %r44;
	@%p41 bra 	BB1_59;

	// Both candidate destination addresses are formed up front:
	// %rd1 for the 16-byte (f32) layout, %rd2 for the 8-byte (f16) layout.
	setp.eq.s32	%p42, %r45, 0;
	mul.wide.s32 	%rd28, %r36, 16;
	add.s64 	%rd1, %rd5, %rd28;
	mul.wide.s32 	%rd29, %r36, 8;
	add.s64 	%rd2, %rd5, %rd29;
	.loc 1 77 1
	@%p42 bra 	BB1_43;

	// Read back the pass-1 result from the destination (4 x f32).
	ld.v4.f32 	{%f235, %f236, %f237, %f238}, [%rd1];
	mov.f32 	%f301, %f238;
	mov.f32 	%f300, %f237;
	mov.f32 	%f299, %f236;
	mov.f32 	%f298, %f235;
	bra.uni 	BB1_44;

BB1_43:
	// Read back the pass-1 result from the destination (4 x f16, widened to f32).
	.loc 1 77 1
	ld.v4.u16 	{%rs21, %rs22, %rs23, %rs24}, [%rd2];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs21;
	cvt.f32.f16 	%f298, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs22;
	cvt.f32.f16 	%f299, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs23;
	cvt.f32.f16 	%f300, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs24;
	cvt.f32.f16 	%f301, %temp;
	}

BB1_44:
	// Add this thread's pass-2 lanes (from scratch) to the pass-1 values.
	mul.wide.s32 	%rd30, %r83, 4;
	add.s64 	%rd31, %rd3, %rd30;
	.loc 1 77 1
	ld.f32 	%f239, [%rd31];
	add.ftz.f32 	%f96, %f298, %f239;
	ld.f32 	%f240, [%rd31+512];
	add.ftz.f32 	%f97, %f299, %f240;
	ld.f32 	%f241, [%rd31+1024];
	add.ftz.f32 	%f98, %f300, %f241;
	ld.f32 	%f242, [%rd31+1536];
	add.ftz.f32 	%f243, %f301, %f242;
	// %f302 = summed lane 3 clamped to [0, 1].
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f302, %f243;
	// Divide lanes 0..2 by lane 3 only when it exceeds the threshold
	// (0fB70637BD is roughly -8.0e-6); otherwise the whole pixel becomes zero.
	.loc 2 45 1
	add.ftz.f32 	%f244, %f302, 0fB70637BD;
	setp.gtu.ftz.f32	%p43, %f244, 0f00000000;
	@%p43 bra 	BB1_46;

	mov.f32 	%f305, 0f00000000;
	mov.f32 	%f304, %f305;
	mov.f32 	%f303, %f305;
	mov.f32 	%f302, %f305;
	bra.uni 	BB1_47;

BB1_46:
	// Approximate reciprocal of lane 3, applied to lanes 0..2.
	mov.f32 	%f249, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f250, %f249, %f302;
	.loc 2 45 1
	mul.ftz.f32 	%f303, %f98, %f250;
	mul.ftz.f32 	%f304, %f97, %f250;
	mul.ftz.f32 	%f305, %f96, %f250;

BB1_47:
	// Lanes 0..2 are each mapped to sign(c) * |c|^(1/2.2)
	// (0f3EE8BA2E is about 0.4545), the inverse of the 2.2 power applied on load.
	.loc 1 77 164
	setp.ltu.ftz.f32	%p44, %f305, 0f00000000;
	@%p44 bra 	BB1_49;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f251, %f305;
	mul.ftz.f32 	%f252, %f251, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f306, %f252;
	bra.uni 	BB1_50;

BB1_49:
	neg.ftz.f32 	%f253, %f305;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f254, %f253;
	mul.ftz.f32 	%f255, %f254, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f256, %f255;
	neg.ftz.f32 	%f306, %f256;

BB1_50:
	setp.ltu.ftz.f32	%p45, %f304, 0f00000000;
	@%p45 bra 	BB1_52;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f257, %f304;
	mul.ftz.f32 	%f258, %f257, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f307, %f258;
	bra.uni 	BB1_53;

BB1_52:
	neg.ftz.f32 	%f259, %f304;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f260, %f259;
	mul.ftz.f32 	%f261, %f260, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f262, %f261;
	neg.ftz.f32 	%f307, %f262;

BB1_53:
	setp.ltu.ftz.f32	%p46, %f303, 0f00000000;
	@%p46 bra 	BB1_55;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f263, %f303;
	mul.ftz.f32 	%f264, %f263, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f308, %f264;
	bra.uni 	BB1_56;

BB1_55:
	neg.ftz.f32 	%f265, %f303;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f266, %f265;
	mul.ftz.f32 	%f267, %f266, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f268, %f267;
	neg.ftz.f32 	%f308, %f268;

BB1_56:
	// Store the final pixel in the destination's format (%p42: format == 0 -> f16).
	.loc 1 77 1
	@%p42 bra 	BB1_58;

	.loc 1 77 1
	st.v4.f32 	[%rd1], {%f306, %f307, %f308, %f302};
	bra.uni 	BB1_59;

BB1_58:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f306;
	mov.b16 	%rs29, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f307;
	mov.b16 	%rs30, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f308;
	mov.b16 	%rs31, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f302;
	mov.b16 	%rs32, %temp;
}
	.loc 1 77 238
	st.v4.u16 	[%rd2], {%rs29, %rs30, %rs31, %rs32};

BB1_59:
	// Barrier before the scratch buffer is overwritten by the next chunk.
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r104, %r104, 1;
	// Recompute the source index from the (already decremented) source row.
	.loc 1 77 1
	mad.lo.s32 	%r108, %r111, %r39, %r2;
	// Continue while rows remain (%r112 was reduced by 4 at the top of the loop).
	.loc 1 77 1
	setp.gt.s32	%p48, %r112, 0;
	@%p48 bra 	BB1_25;

BB1_60:
	.loc 1 77 2
	ret;
}

.visible .func _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff(
	.param .b64 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_0,
	.param .b64 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_1,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_2,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_3,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_4,
	.param .b64 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_5,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_6,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_7,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_8,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_9,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_10,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_11,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_12,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_13,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_14,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_15,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_16,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_17,
	.param .b32 _Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_18
)
{
	.reg .pred 	%p<94>;
	.reg .s16 	%rs<65>;
	.reg .s32 	%r<216>;
	.reg .f32 	%f<603>;
	.reg .s64 	%rd<68>;


	ld.param.u64 	%rd9, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_0];
	ld.param.u64 	%rd10, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_1];
	ld.param.u32 	%r29, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_2];
	ld.param.u32 	%r30, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_3];
	ld.param.u32 	%r31, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_4];
	ld.param.u64 	%rd11, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_5];
	ld.param.u32 	%r32, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_6];
	ld.param.u32 	%r33, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_7];
	ld.param.u32 	%r34, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_8];
	ld.param.u32 	%r35, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_9];
	ld.param.u32 	%r36, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_10];
	ld.param.f32 	%f243, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_11];
	ld.param.f32 	%f244, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_12];
	ld.param.f32 	%f245, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_13];
	ld.param.f32 	%f246, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_14];
	ld.param.f32 	%f247, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_15];
	ld.param.f32 	%f248, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_16];
	ld.param.f32 	%f249, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_17];
	ld.param.f32 	%f250, [_Z27HorizontalRecursiveGaussianPfPK6float4iiiPS0_iii17DevicePixelFormatiffffffff_param_18];
	mov.u32 	%r213, 0;
	.loc 1 77 1
	setp.gt.s32	%p7, %r33, 0;
	@%p7 bra 	BB2_1;
	bra.uni 	BB2_50;

BB2_1:
	mov.f32 	%f554, 0f00000000;
	mov.f32 	%f553, %f554;
	mov.f32 	%f552, %f554;
	mov.u32 	%r211, %r213;

BB2_2:
	.loc 1 77 1
	mov.u32 	%r209, %r211;
	mov.u32 	%r1, %r209;
	.loc 1 77 1
	mov.u32 	%r40, %ctaid.y;
	shl.b32 	%r41, %r40, 3;
	mov.u32 	%r42, %tid.y;
	add.s32 	%r43, %r41, %r42;
	sub.s32 	%r44, %r34, %r31;
	shr.s32 	%r45, %r44, 1;
	sub.s32 	%r46, %r43, %r45;
	setp.lt.s32	%p8, %r46, %r31;
	.loc 1 77 1
	mov.u32 	%r47, %tid.x;
	.loc 1 77 1
	add.s32 	%r3, %r47, %r213;
	sub.s32 	%r48, %r33, %r30;
	shr.s32 	%r49, %r48, 1;
	sub.s32 	%r50, %r3, %r49;
	mad.lo.s32 	%r4, %r46, %r29, %r50;
	.loc 1 77 1
	setp.gt.s32	%p9, %r50, -1;
	setp.gt.s32	%p10, %r46, -1;
	and.pred  	%p11, %p10, %p8;
	and.pred  	%p12, %p11, %p9;
	.loc 1 77 1
	setp.lt.s32	%p13, %r50, %r30;
	and.pred  	%p1, %p12, %p13;
	.loc 1 77 1
	setp.ge.s32	%p14, %r46, %r31;
	@%p14 bra 	BB2_19;

	.loc 1 77 1
	@%p1 bra 	BB2_5;

	mov.f32 	%f540, 0f00000000;
	mov.f32 	%f539, %f540;
	mov.f32 	%f538, %f540;
	mov.f32 	%f537, %f540;
	bra.uni 	BB2_18;

BB2_5:
	setp.eq.s32	%p15, %r35, 0;
	.loc 1 77 1
	@%p15 bra 	BB2_7;

	mul.wide.s32 	%rd12, %r4, 16;
	add.s64 	%rd13, %rd10, %rd12;
	ld.v4.f32 	{%f258, %f259, %f260, %f261}, [%rd13];
	mov.f32 	%f533, %f261;
	mov.f32 	%f532, %f260;
	mov.f32 	%f531, %f259;
	mov.f32 	%f530, %f258;
	bra.uni 	BB2_8;

BB2_7:
	mul.wide.s32 	%rd14, %r4, 8;
	add.s64 	%rd15, %rd10, %rd14;
	.loc 1 77 1
	ld.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd15];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f530, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f531, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f532, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f533, %temp;
	}

BB2_8:
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f540, %f533;
	setp.ltu.ftz.f32	%p16, %f530, 0f00000000;
	@%p16 bra 	BB2_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f262, %f530;
	mul.ftz.f32 	%f263, %f262, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f534, %f263;
	bra.uni 	BB2_11;

BB2_10:
	neg.ftz.f32 	%f264, %f530;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f265, %f264;
	mul.ftz.f32 	%f266, %f265, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f267, %f266;
	neg.ftz.f32 	%f534, %f267;

BB2_11:
	setp.ltu.ftz.f32	%p17, %f531, 0f00000000;
	@%p17 bra 	BB2_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f268, %f531;
	mul.ftz.f32 	%f269, %f268, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f535, %f269;
	bra.uni 	BB2_14;

BB2_13:
	neg.ftz.f32 	%f270, %f531;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f271, %f270;
	mul.ftz.f32 	%f272, %f271, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f273, %f272;
	neg.ftz.f32 	%f535, %f273;

BB2_14:
	setp.ltu.ftz.f32	%p18, %f532, 0f00000000;
	@%p18 bra 	BB2_16;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f274, %f532;
	mul.ftz.f32 	%f275, %f274, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f536, %f275;
	bra.uni 	BB2_17;

BB2_16:
	neg.ftz.f32 	%f276, %f532;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f277, %f276;
	mul.ftz.f32 	%f278, %f277, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f279, %f278;
	neg.ftz.f32 	%f536, %f279;

BB2_17:
	mul.ftz.f32 	%f539, %f536, %f540;
	mul.ftz.f32 	%f538, %f535, %f540;
	mul.ftz.f32 	%f537, %f534, %f540;

BB2_18:
	.loc 1 77 1
	mad.lo.s32 	%r53, %r42, 33, %r47;
	mul.wide.s32 	%rd16, %r53, 4;
	add.s64 	%rd17, %rd9, %rd16;
	.loc 1 77 1
	st.f32 	[%rd17], %f537;
	st.f32 	[%rd17+1056], %f538;
	st.f32 	[%rd17+2112], %f539;
	st.f32 	[%rd17+3168], %f540;

BB2_19:
	.loc 1 77 1
	add.s32 	%r61, %r46, 4;
	.loc 1 77 1
	shl.b32 	%r62, %r29, 2;
	add.s32 	%r5, %r4, %r62;
	.loc 1 77 1
	setp.ge.s32	%p19, %r61, %r31;
	@%p19 bra 	BB2_36;

	.loc 1 77 1
	@%p1 bra 	BB2_22;

	mov.f32 	%f551, 0f00000000;
	mov.f32 	%f550, %f551;
	mov.f32 	%f549, %f551;
	mov.f32 	%f548, %f551;
	bra.uni 	BB2_35;

BB2_22:
	setp.eq.s32	%p20, %r35, 0;
	.loc 1 77 1
	@%p20 bra 	BB2_24;

	mul.wide.s32 	%rd18, %r5, 16;
	add.s64 	%rd19, %rd10, %rd18;
	ld.v4.f32 	{%f284, %f285, %f286, %f287}, [%rd19];
	mov.f32 	%f544, %f287;
	mov.f32 	%f543, %f286;
	mov.f32 	%f542, %f285;
	mov.f32 	%f541, %f284;
	bra.uni 	BB2_25;

BB2_24:
	mul.wide.s32 	%rd20, %r5, 8;
	add.s64 	%rd21, %rd10, %rd20;
	.loc 1 77 1
	ld.v4.u16 	{%rs9, %rs10, %rs11, %rs12}, [%rd21];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs9;
	cvt.f32.f16 	%f541, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs10;
	cvt.f32.f16 	%f542, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs11;
	cvt.f32.f16 	%f543, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs12;
	cvt.f32.f16 	%f544, %temp;
	}

BB2_25:
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f551, %f544;
	setp.ltu.ftz.f32	%p21, %f541, 0f00000000;
	@%p21 bra 	BB2_27;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f288, %f541;
	mul.ftz.f32 	%f289, %f288, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f545, %f289;
	bra.uni 	BB2_28;

BB2_27:
	neg.ftz.f32 	%f290, %f541;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f291, %f290;
	mul.ftz.f32 	%f292, %f291, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f293, %f292;
	neg.ftz.f32 	%f545, %f293;

BB2_28:
	setp.ltu.ftz.f32	%p22, %f542, 0f00000000;
	@%p22 bra 	BB2_30;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f294, %f542;
	mul.ftz.f32 	%f295, %f294, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f546, %f295;
	bra.uni 	BB2_31;

BB2_30:
	neg.ftz.f32 	%f296, %f542;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f297, %f296;
	mul.ftz.f32 	%f298, %f297, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f299, %f298;
	neg.ftz.f32 	%f546, %f299;

BB2_31:
	setp.ltu.ftz.f32	%p23, %f543, 0f00000000;
	@%p23 bra 	BB2_33;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f300, %f543;
	mul.ftz.f32 	%f301, %f300, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f547, %f301;
	bra.uni 	BB2_34;

BB2_33:
	neg.ftz.f32 	%f302, %f543;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f303, %f302;
	mul.ftz.f32 	%f304, %f303, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f305, %f304;
	neg.ftz.f32 	%f547, %f305;

BB2_34:
	mul.ftz.f32 	%f550, %f547, %f551;
	mul.ftz.f32 	%f549, %f546, %f551;
	mul.ftz.f32 	%f548, %f545, %f551;

BB2_35:
	.loc 1 77 1
	mad.lo.s32 	%r65, %r42, 33, %r47;
	mul.wide.s32 	%rd22, %r65, 4;
	add.s64 	%rd23, %rd9, %rd22;
	.loc 1 77 1
	st.f32 	[%rd23+528], %f548;
	st.f32 	[%rd23+1584], %f549;
	st.f32 	[%rd23+2640], %f550;
	st.f32 	[%rd23+3696], %f551;

BB2_36:
	.loc 1 77 1
	shl.b32 	%r66, %r42, 5;
	add.s32 	%r67, %r66, %r47;
	setp.lt.s32	%p2, %r67, 32;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	mad.lo.s32 	%r68, %r42, 16, %r47;
	mul.lo.s32 	%r69, %r68, 33;
	mul.wide.s32 	%rd24, %r69, 4;
	add.s64 	%rd66, %rd9, %rd24;
	.loc 1 77 1
	mul.lo.s32 	%r70, %r47, 33;
	mad.lo.s32 	%r71, %r42, 528, %r70;
	add.s32 	%r72, %r71, 4;
	mul.wide.s32 	%rd25, %r72, 4;
	add.s64 	%rd67, %rd9, %rd25;
	.loc 1 77 1
	@!%p2 bra 	BB2_39;
	bra.uni 	BB2_37;

BB2_37:
	mov.u32 	%r212, 0;
	mov.u32 	%r210, %r1;

BB2_38:
	.loc 1 77 1
	mov.u32 	%r8, %r210;
	.loc 1 77 1
	setp.eq.s32	%p24, %r8, 0;
	setp.ne.s32	%p25, %r36, 0;
	and.pred  	%p26, %p24, %p25;
	.loc 1 77 1
	ld.f32 	%f306, [%rd66];
	.loc 1 77 1
	mul.ftz.f32 	%f307, %f306, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f308, %f307, %f553, %p26;
	selp.f32	%f309, %f307, %f552, %p26;
	.loc 1 77 1
	mul.ftz.f32 	%f310, %f554, %f244;
	fma.rn.ftz.f32 	%f311, %f306, %f243, %f310;
	fma.rn.ftz.f32 	%f312, %f308, %f245, %f311;
	fma.rn.ftz.f32 	%f313, %f309, %f246, %f312;
	st.f32 	[%rd66], %f313;
	ld.f32 	%f314, [%rd67+-12];
	mul.ftz.f32 	%f315, %f306, %f244;
	fma.rn.ftz.f32 	%f316, %f314, %f243, %f315;
	fma.rn.ftz.f32 	%f317, %f313, %f245, %f316;
	fma.rn.ftz.f32 	%f318, %f308, %f246, %f317;
	ld.f32 	%f319, [%rd67+-8];
	ld.f32 	%f320, [%rd67+-4];
	ld.f32 	%f321, [%rd67];
	st.f32 	[%rd67+-12], %f318;
	mul.ftz.f32 	%f322, %f314, %f244;
	fma.rn.ftz.f32 	%f323, %f319, %f243, %f322;
	fma.rn.ftz.f32 	%f324, %f318, %f245, %f323;
	fma.rn.ftz.f32 	%f325, %f313, %f246, %f324;
	st.f32 	[%rd67+-8], %f325;
	mul.ftz.f32 	%f326, %f319, %f244;
	fma.rn.ftz.f32 	%f327, %f320, %f243, %f326;
	fma.rn.ftz.f32 	%f328, %f325, %f245, %f327;
	fma.rn.ftz.f32 	%f329, %f318, %f246, %f328;
	st.f32 	[%rd67+-4], %f329;
	mul.ftz.f32 	%f330, %f320, %f244;
	fma.rn.ftz.f32 	%f331, %f321, %f243, %f330;
	fma.rn.ftz.f32 	%f332, %f329, %f245, %f331;
	fma.rn.ftz.f32 	%f333, %f325, %f246, %f332;
	st.f32 	[%rd67], %f333;
	ld.f32 	%f334, [%rd67+4];
	mul.ftz.f32 	%f335, %f321, %f244;
	fma.rn.ftz.f32 	%f336, %f334, %f243, %f335;
	fma.rn.ftz.f32 	%f337, %f333, %f245, %f336;
	fma.rn.ftz.f32 	%f338, %f329, %f246, %f337;
	ld.f32 	%f339, [%rd67+8];
	ld.f32 	%f554, [%rd67+12];
	st.f32 	[%rd67+4], %f338;
	mul.ftz.f32 	%f340, %f334, %f244;
	fma.rn.ftz.f32 	%f341, %f339, %f243, %f340;
	fma.rn.ftz.f32 	%f342, %f338, %f245, %f341;
	fma.rn.ftz.f32 	%f552, %f333, %f246, %f342;
	st.f32 	[%rd67+8], %f552;
	mul.ftz.f32 	%f343, %f339, %f244;
	fma.rn.ftz.f32 	%f344, %f554, %f243, %f343;
	fma.rn.ftz.f32 	%f345, %f552, %f245, %f344;
	fma.rn.ftz.f32 	%f553, %f338, %f246, %f345;
	st.f32 	[%rd67+12], %f553;
	add.s64 	%rd67, %rd67, 32;
	.loc 1 77 1
	add.s32 	%r10, %r8, -8;
	add.s64 	%rd66, %rd66, 32;
	.loc 1 77 1
	add.s32 	%r212, %r212, 32;
	setp.ne.s32	%p27, %r212, 128;
	mov.u32 	%r210, %r10;
	@%p27 bra 	BB2_38;

BB2_39:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r206, %r47, %r213;
	.loc 1 77 1
	mov.u32 	%r205, %ctaid.y;
	shl.b32 	%r204, %r205, 3;
	add.s32 	%r203, %r204, %r42;
	.loc 1 77 1
	mad.lo.s32 	%r13, %r203, %r32, %r206;
	.loc 1 77 1
	@!%p8 bra 	BB2_44;
	bra.uni 	BB2_40;

BB2_40:
	.loc 1 77 1
	add.s32 	%r208, %r47, %r213;
	.loc 1 77 1
	mad.lo.s32 	%r82, %r42, 33, %r47;
	mul.wide.s32 	%rd26, %r82, 4;
	add.s64 	%rd27, %rd9, %rd26;
	.loc 1 77 1
	ld.f32 	%f75, [%rd27];
	ld.f32 	%f76, [%rd27+1056];
	ld.f32 	%f77, [%rd27+2112];
	ld.f32 	%f78, [%rd27+3168];
	.loc 1 77 1
	setp.ge.s32	%p28, %r208, %r33;
	@%p28 bra 	BB2_44;

	setp.eq.s32	%p29, %r35, 0;
	.loc 1 77 1
	@%p29 bra 	BB2_43;

	mul.wide.s32 	%rd28, %r13, 16;
	add.s64 	%rd29, %rd11, %rd28;
	.loc 1 77 1
	st.v4.f32 	[%rd29], {%f75, %f76, %f77, %f78};
	bra.uni 	BB2_44;

BB2_43:
	mul.wide.s32 	%rd30, %r13, 8;
	add.s64 	%rd31, %rd11, %rd30;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f75;
	mov.b16 	%rs17, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f76;
	mov.b16 	%rs18, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f77;
	mov.b16 	%rs19, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f78;
	mov.b16 	%rs20, %temp;
}
	st.v4.u16 	[%rd31], {%rs17, %rs18, %rs19, %rs20};

BB2_44:
	.loc 1 77 1
	shl.b32 	%r91, %r32, 2;
	add.s32 	%r14, %r13, %r91;
	.loc 1 77 1
	@%p19 bra 	BB2_49;

	.loc 1 77 1
	add.s32 	%r207, %r47, %r213;
	.loc 1 77 1
	mad.lo.s32 	%r94, %r42, 33, %r47;
	mul.wide.s32 	%rd32, %r94, 4;
	add.s64 	%rd33, %rd9, %rd32;
	.loc 1 77 1
	ld.f32 	%f83, [%rd33+528];
	ld.f32 	%f84, [%rd33+1584];
	ld.f32 	%f85, [%rd33+2640];
	ld.f32 	%f86, [%rd33+3696];
	.loc 1 77 1
	setp.ge.s32	%p31, %r207, %r33;
	@%p31 bra 	BB2_49;

	setp.eq.s32	%p32, %r35, 0;
	.loc 1 77 1
	@%p32 bra 	BB2_48;

	mul.wide.s32 	%rd34, %r14, 16;
	add.s64 	%rd35, %rd11, %rd34;
	.loc 1 77 1
	st.v4.f32 	[%rd35], {%f83, %f84, %f85, %f86};
	bra.uni 	BB2_49;

BB2_48:
	mul.wide.s32 	%rd36, %r14, 8;
	add.s64 	%rd37, %rd11, %rd36;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f83;
	mov.b16 	%rs21, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f84;
	mov.b16 	%rs22, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f85;
	mov.b16 	%rs23, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f86;
	mov.b16 	%rs24, %temp;
}
	st.v4.u16 	[%rd37], {%rs21, %rs22, %rs23, %rs24};

BB2_49:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r213, %r213, 32;
	.loc 1 77 1
	setp.lt.s32	%p33, %r213, %r33;
	add.s32 	%r211, %r1, -32;
	@%p33 bra 	BB2_2;

BB2_50:
	.loc 1 77 1
	setp.lt.s32	%p34, %r213, 1;
	@%p34 bra 	BB2_128;

	add.s32 	%r18, %r213, -1;
	mov.f32 	%f580, 0f00000000;
	mov.f32 	%f579, %f580;
	mov.f32 	%f578, %f580;
	mov.f32 	%f577, %f580;
	mov.u32 	%r214, 0;

BB2_52:
	.loc 1 77 1
	mov.u32 	%r96, %ctaid.y;
	shl.b32 	%r97, %r96, 3;
	mov.u32 	%r98, %tid.y;
	add.s32 	%r99, %r97, %r98;
	sub.s32 	%r100, %r34, %r31;
	shr.s32 	%r101, %r100, 1;
	sub.s32 	%r102, %r99, %r101;
	setp.lt.s32	%p35, %r102, %r31;
	.loc 1 77 1
	mad.lo.s32 	%r20, %r214, -32, %r18;
	.loc 1 77 1
	mov.u32 	%r103, %tid.x;
	add.s32 	%r104, %r20, %r103;
	.loc 1 77 1
	add.s32 	%r21, %r104, -31;
	sub.s32 	%r105, %r33, %r30;
	shr.s32 	%r106, %r105, 1;
	sub.s32 	%r22, %r21, %r106;
	mad.lo.s32 	%r23, %r102, %r29, %r22;
	.loc 1 77 1
	setp.gt.s32	%p36, %r22, -1;
	setp.gt.s32	%p37, %r102, -1;
	and.pred  	%p38, %p37, %p35;
	and.pred  	%p39, %p38, %p36;
	.loc 1 77 1
	setp.lt.s32	%p40, %r22, %r30;
	and.pred  	%p4, %p39, %p40;
	.loc 1 77 1
	setp.ge.s32	%p41, %r102, %r31;
	@%p41 bra 	BB2_69;

	.loc 1 77 1
	@%p4 bra 	BB2_55;

	mov.f32 	%f565, 0f00000000;
	mov.f32 	%f564, %f565;
	mov.f32 	%f563, %f565;
	mov.f32 	%f562, %f565;
	bra.uni 	BB2_68;

BB2_55:
	setp.eq.s32	%p42, %r35, 0;
	.loc 1 77 1
	@%p42 bra 	BB2_57;

	mul.wide.s32 	%rd38, %r23, 16;
	add.s64 	%rd39, %rd10, %rd38;
	ld.v4.f32 	{%f354, %f355, %f356, %f357}, [%rd39];
	mov.f32 	%f558, %f357;
	mov.f32 	%f557, %f356;
	mov.f32 	%f556, %f355;
	mov.f32 	%f555, %f354;
	bra.uni 	BB2_58;

BB2_57:
	mul.wide.s32 	%rd40, %r23, 8;
	add.s64 	%rd41, %rd10, %rd40;
	.loc 1 77 1
	ld.v4.u16 	{%rs25, %rs26, %rs27, %rs28}, [%rd41];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs25;
	cvt.f32.f16 	%f555, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs26;
	cvt.f32.f16 	%f556, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs27;
	cvt.f32.f16 	%f557, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs28;
	cvt.f32.f16 	%f558, %temp;
	}

BB2_58:
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f565, %f558;
	setp.ltu.ftz.f32	%p43, %f555, 0f00000000;
	@%p43 bra 	BB2_60;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f358, %f555;
	mul.ftz.f32 	%f359, %f358, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f559, %f359;
	bra.uni 	BB2_61;

BB2_60:
	neg.ftz.f32 	%f360, %f555;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f361, %f360;
	mul.ftz.f32 	%f362, %f361, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f363, %f362;
	neg.ftz.f32 	%f559, %f363;

BB2_61:
	setp.ltu.ftz.f32	%p44, %f556, 0f00000000;
	@%p44 bra 	BB2_63;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f364, %f556;
	mul.ftz.f32 	%f365, %f364, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f560, %f365;
	bra.uni 	BB2_64;

BB2_63:
	neg.ftz.f32 	%f366, %f556;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f367, %f366;
	mul.ftz.f32 	%f368, %f367, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f369, %f368;
	neg.ftz.f32 	%f560, %f369;

BB2_64:
	setp.ltu.ftz.f32	%p45, %f557, 0f00000000;
	@%p45 bra 	BB2_66;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f370, %f557;
	mul.ftz.f32 	%f371, %f370, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f561, %f371;
	bra.uni 	BB2_67;

BB2_66:
	neg.ftz.f32 	%f372, %f557;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f373, %f372;
	mul.ftz.f32 	%f374, %f373, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f375, %f374;
	neg.ftz.f32 	%f561, %f375;

BB2_67:
	mul.ftz.f32 	%f564, %f561, %f565;
	mul.ftz.f32 	%f563, %f560, %f565;
	mul.ftz.f32 	%f562, %f559, %f565;

BB2_68:
	.loc 1 77 1
	mad.lo.s32 	%r109, %r98, 33, %r103;
	mul.wide.s32 	%rd42, %r109, 4;
	add.s64 	%rd43, %rd9, %rd42;
	.loc 1 77 1
	st.f32 	[%rd43], %f562;
	st.f32 	[%rd43+1056], %f563;
	st.f32 	[%rd43+2112], %f564;
	st.f32 	[%rd43+3168], %f565;

BB2_69:
	.loc 1 77 1
	add.s32 	%r117, %r102, 4;
	.loc 1 77 1
	shl.b32 	%r119, %r29, 2;
	add.s32 	%r24, %r23, %r119;
	.loc 1 77 1
	setp.ge.s32	%p46, %r117, %r31;
	@%p46 bra 	BB2_86;

	.loc 1 77 1
	@%p4 bra 	BB2_72;

	mov.f32 	%f576, 0f00000000;
	mov.f32 	%f575, %f576;
	mov.f32 	%f574, %f576;
	mov.f32 	%f573, %f576;
	bra.uni 	BB2_85;

BB2_72:
	setp.eq.s32	%p54, %r35, 0;
	.loc 1 77 1
	@%p54 bra 	BB2_74;

	mul.wide.s32 	%rd44, %r24, 16;
	add.s64 	%rd45, %rd10, %rd44;
	ld.v4.f32 	{%f380, %f381, %f382, %f383}, [%rd45];
	mov.f32 	%f569, %f383;
	mov.f32 	%f568, %f382;
	mov.f32 	%f567, %f381;
	mov.f32 	%f566, %f380;
	bra.uni 	BB2_75;

BB2_74:
	mul.wide.s32 	%rd46, %r24, 8;
	add.s64 	%rd47, %rd10, %rd46;
	.loc 1 77 1
	ld.v4.u16 	{%rs33, %rs34, %rs35, %rs36}, [%rd47];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs33;
	cvt.f32.f16 	%f566, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs34;
	cvt.f32.f16 	%f567, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs35;
	cvt.f32.f16 	%f568, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs36;
	cvt.f32.f16 	%f569, %temp;
	}

BB2_75:
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f576, %f569;
	setp.ltu.ftz.f32	%p55, %f566, 0f00000000;
	@%p55 bra 	BB2_77;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f384, %f566;
	mul.ftz.f32 	%f385, %f384, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f570, %f385;
	bra.uni 	BB2_78;

BB2_77:
	neg.ftz.f32 	%f386, %f566;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f387, %f386;
	mul.ftz.f32 	%f388, %f387, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f389, %f388;
	neg.ftz.f32 	%f570, %f389;

BB2_78:
	setp.ltu.ftz.f32	%p56, %f567, 0f00000000;
	@%p56 bra 	BB2_80;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f390, %f567;
	mul.ftz.f32 	%f391, %f390, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f571, %f391;
	bra.uni 	BB2_81;

BB2_80:
	neg.ftz.f32 	%f392, %f567;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f393, %f392;
	mul.ftz.f32 	%f394, %f393, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f395, %f394;
	neg.ftz.f32 	%f571, %f395;

BB2_81:
	setp.ltu.ftz.f32	%p57, %f568, 0f00000000;
	@%p57 bra 	BB2_83;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f396, %f568;
	mul.ftz.f32 	%f397, %f396, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f572, %f397;
	bra.uni 	BB2_84;

BB2_83:
	neg.ftz.f32 	%f398, %f568;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f399, %f398;
	mul.ftz.f32 	%f400, %f399, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f401, %f400;
	neg.ftz.f32 	%f572, %f401;

BB2_84:
	mul.ftz.f32 	%f575, %f572, %f576;
	mul.ftz.f32 	%f574, %f571, %f576;
	mul.ftz.f32 	%f573, %f570, %f576;

BB2_85:
	.loc 1 77 1
	mad.lo.s32 	%r129, %r98, 33, %r103;
	mul.wide.s32 	%rd48, %r129, 4;
	add.s64 	%rd49, %rd9, %rd48;
	.loc 1 77 1
	st.f32 	[%rd49+528], %f573;
	st.f32 	[%rd49+1584], %f574;
	st.f32 	[%rd49+2640], %f575;
	st.f32 	[%rd49+3696], %f576;

BB2_86:
	.loc 1 77 1
	shl.b32 	%r131, %r98, 5;
	add.s32 	%r133, %r131, %r103;
	setp.lt.s32	%p5, %r133, 32;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	@!%p5 bra 	BB2_89;
	bra.uni 	BB2_87;

BB2_87:
	mov.u32 	%r215, 0;

BB2_88:
	.loc 1 77 1
	mad.lo.s32 	%r201, %r214, -32, %r18;
	.loc 1 77 1
	shl.b32 	%r136, %r98, 4;
	add.s32 	%r138, %r103, %r136;
	mad.lo.s32 	%r139, %r138, 33, 31;
	.loc 1 77 1
	sub.s32 	%r140, %r139, %r215;
	mul.wide.s32 	%rd50, %r140, 4;
	add.s64 	%rd51, %rd9, %rd50;
	.loc 1 77 1
	add.s32 	%r141, %r33, -1;
	sub.s32 	%r142, %r201, %r215;
	setp.eq.s32	%p58, %r142, %r141;
	setp.ne.s32	%p59, %r36, 0;
	and.pred  	%p60, %p58, %p59;
	.loc 1 77 1
	ld.f32 	%f402, [%rd51];
	.loc 1 77 1
	mul.ftz.f32 	%f403, %f402, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f404, %f403, %f578, %p60;
	selp.f32	%f405, %f403, %f577, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f406, %f579, %f248;
	fma.rn.ftz.f32 	%f407, %f580, %f247, %f406;
	fma.rn.ftz.f32 	%f408, %f404, %f249, %f407;
	fma.rn.ftz.f32 	%f409, %f405, %f250, %f408;
	ld.f32 	%f410, [%rd51+-4];
	ld.f32 	%f411, [%rd51+-8];
	ld.f32 	%f412, [%rd51+-12];
	st.f32 	[%rd51], %f409;
	not.b32 	%r143, %r215;
	.loc 1 77 1
	add.s32 	%r144, %r201, %r143;
	setp.eq.s32	%p61, %r144, %r141;
	and.pred  	%p62, %p61, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f413, %f410, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f414, %f413, %f409, %p62;
	selp.f32	%f415, %f413, %f404, %p62;
	.loc 1 77 1
	mul.ftz.f32 	%f416, %f580, %f248;
	fma.rn.ftz.f32 	%f417, %f402, %f247, %f416;
	fma.rn.ftz.f32 	%f418, %f414, %f249, %f417;
	fma.rn.ftz.f32 	%f419, %f415, %f250, %f418;
	st.f32 	[%rd51+-4], %f419;
	mov.u32 	%r145, -2;
	.loc 1 77 1
	sub.s32 	%r146, %r145, %r215;
	.loc 1 77 1
	add.s32 	%r147, %r201, %r146;
	setp.eq.s32	%p63, %r147, %r141;
	and.pred  	%p64, %p63, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f420, %f411, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f421, %f420, %f419, %p64;
	selp.f32	%f422, %f420, %f414, %p64;
	.loc 1 77 1
	mul.ftz.f32 	%f423, %f402, %f248;
	fma.rn.ftz.f32 	%f424, %f410, %f247, %f423;
	fma.rn.ftz.f32 	%f425, %f421, %f249, %f424;
	fma.rn.ftz.f32 	%f426, %f422, %f250, %f425;
	st.f32 	[%rd51+-8], %f426;
	mov.u32 	%r148, -3;
	.loc 1 77 1
	sub.s32 	%r149, %r148, %r215;
	.loc 1 77 1
	add.s32 	%r150, %r201, %r149;
	setp.eq.s32	%p65, %r150, %r141;
	and.pred  	%p66, %p65, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f427, %f412, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f428, %f427, %f426, %p66;
	selp.f32	%f429, %f427, %f421, %p66;
	.loc 1 77 1
	mul.ftz.f32 	%f430, %f410, %f248;
	fma.rn.ftz.f32 	%f431, %f411, %f247, %f430;
	fma.rn.ftz.f32 	%f432, %f428, %f249, %f431;
	fma.rn.ftz.f32 	%f433, %f429, %f250, %f432;
	st.f32 	[%rd51+-12], %f433;
	mov.u32 	%r151, -4;
	.loc 1 77 1
	sub.s32 	%r152, %r151, %r215;
	.loc 1 77 1
	add.s32 	%r153, %r201, %r152;
	setp.eq.s32	%p67, %r153, %r141;
	and.pred  	%p68, %p67, %p59;
	.loc 1 77 1
	ld.f32 	%f434, [%rd51+-16];
	.loc 1 77 1
	mul.ftz.f32 	%f435, %f434, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f436, %f435, %f433, %p68;
	selp.f32	%f437, %f435, %f428, %p68;
	.loc 1 77 1
	mul.ftz.f32 	%f438, %f411, %f248;
	fma.rn.ftz.f32 	%f439, %f412, %f247, %f438;
	fma.rn.ftz.f32 	%f440, %f436, %f249, %f439;
	fma.rn.ftz.f32 	%f441, %f437, %f250, %f440;
	ld.f32 	%f442, [%rd51+-20];
	ld.f32 	%f579, [%rd51+-24];
	ld.f32 	%f580, [%rd51+-28];
	st.f32 	[%rd51+-16], %f441;
	mov.u32 	%r154, -5;
	.loc 1 77 1
	sub.s32 	%r155, %r154, %r215;
	.loc 1 77 1
	add.s32 	%r156, %r201, %r155;
	setp.eq.s32	%p69, %r156, %r141;
	and.pred  	%p70, %p69, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f443, %f442, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f444, %f443, %f441, %p70;
	selp.f32	%f445, %f443, %f436, %p70;
	.loc 1 77 1
	mul.ftz.f32 	%f446, %f412, %f248;
	fma.rn.ftz.f32 	%f447, %f434, %f247, %f446;
	fma.rn.ftz.f32 	%f448, %f444, %f249, %f447;
	fma.rn.ftz.f32 	%f449, %f445, %f250, %f448;
	st.f32 	[%rd51+-20], %f449;
	mov.u32 	%r157, -6;
	.loc 1 77 1
	sub.s32 	%r158, %r157, %r215;
	.loc 1 77 1
	add.s32 	%r159, %r201, %r158;
	setp.eq.s32	%p71, %r159, %r141;
	and.pred  	%p72, %p71, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f450, %f579, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f451, %f450, %f449, %p72;
	selp.f32	%f452, %f450, %f444, %p72;
	.loc 1 77 1
	mul.ftz.f32 	%f453, %f434, %f248;
	fma.rn.ftz.f32 	%f454, %f442, %f247, %f453;
	fma.rn.ftz.f32 	%f455, %f451, %f249, %f454;
	fma.rn.ftz.f32 	%f456, %f452, %f250, %f455;
	st.f32 	[%rd51+-24], %f456;
	mov.u32 	%r160, -7;
	.loc 1 77 1
	sub.s32 	%r161, %r160, %r215;
	.loc 1 77 1
	add.s32 	%r162, %r201, %r161;
	setp.eq.s32	%p73, %r162, %r141;
	and.pred  	%p74, %p73, %p59;
	.loc 1 77 1
	mul.ftz.f32 	%f457, %f580, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f577, %f457, %f456, %p74;
	selp.f32	%f458, %f457, %f451, %p74;
	.loc 1 77 1
	mul.ftz.f32 	%f459, %f442, %f248;
	fma.rn.ftz.f32 	%f460, %f579, %f247, %f459;
	fma.rn.ftz.f32 	%f461, %f577, %f249, %f460;
	fma.rn.ftz.f32 	%f578, %f458, %f250, %f461;
	st.f32 	[%rd51+-28], %f578;
	.loc 1 77 1
	add.s32 	%r215, %r215, 8;
	setp.ne.s32	%p75, %r215, 32;
	@%p75 bra 	BB2_88;

BB2_89:
	setp.lt.s32	%p6, %r99, %r34;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	setp.lt.s32	%p76, %r21, %r33;
	.loc 1 77 1
	and.pred  	%p77, %p6, %p76;
	@!%p77 bra 	BB2_108;
	bra.uni 	BB2_90;

BB2_90:
	setp.eq.s32	%p78, %r35, 0;
	.loc 1 77 1
	@%p78 bra 	BB2_92;

	.loc 1 77 1
	mad.lo.s32 	%r171, %r99, %r32, %r21;
	mul.wide.s32 	%rd52, %r171, 16;
	add.s64 	%rd53, %rd11, %rd52;
	ld.v4.f32 	{%f462, %f463, %f464, %f465}, [%rd53];
	mov.f32 	%f584, %f465;
	mov.f32 	%f583, %f464;
	mov.f32 	%f582, %f463;
	mov.f32 	%f581, %f462;
	bra.uni 	BB2_93;

BB2_92:
	.loc 1 77 1
	mad.lo.s32 	%r176, %r99, %r32, %r21;
	mul.wide.s32 	%rd54, %r176, 8;
	add.s64 	%rd55, %rd11, %rd54;
	.loc 1 77 1
	ld.v4.u16 	{%rs41, %rs42, %rs43, %rs44}, [%rd55];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs41;
	cvt.f32.f16 	%f581, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs42;
	cvt.f32.f16 	%f582, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs43;
	cvt.f32.f16 	%f583, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs44;
	cvt.f32.f16 	%f584, %temp;
	}

BB2_93:
	.loc 1 77 1
	mad.lo.s32 	%r179, %r98, 33, %r103;
	mul.wide.s32 	%rd56, %r179, 4;
	add.s64 	%rd57, %rd9, %rd56;
	.loc 1 77 1
	ld.f32 	%f466, [%rd57];
	add.ftz.f32 	%f183, %f581, %f466;
	ld.f32 	%f467, [%rd57+1056];
	add.ftz.f32 	%f184, %f582, %f467;
	ld.f32 	%f468, [%rd57+2112];
	add.ftz.f32 	%f185, %f583, %f468;
	ld.f32 	%f469, [%rd57+3168];
	add.ftz.f32 	%f470, %f584, %f469;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f585, %f470;
	.loc 2 45 1
	add.ftz.f32 	%f471, %f585, 0fB70637BD;
	setp.gtu.ftz.f32	%p79, %f471, 0f00000000;
	@%p79 bra 	BB2_95;

	mov.f32 	%f588, 0f00000000;
	mov.f32 	%f587, %f588;
	mov.f32 	%f586, %f588;
	mov.f32 	%f585, %f588;
	bra.uni 	BB2_96;

BB2_95:
	mov.f32 	%f476, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f477, %f476, %f585;
	.loc 2 45 1
	mul.ftz.f32 	%f586, %f185, %f477;
	mul.ftz.f32 	%f587, %f184, %f477;
	mul.ftz.f32 	%f588, %f183, %f477;

BB2_96:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p80, %f588, 0f00000000;
	@%p80 bra 	BB2_98;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f478, %f588;
	mul.ftz.f32 	%f479, %f478, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f589, %f479;
	bra.uni 	BB2_99;

BB2_98:
	neg.ftz.f32 	%f480, %f588;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f481, %f480;
	mul.ftz.f32 	%f482, %f481, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f483, %f482;
	neg.ftz.f32 	%f589, %f483;

BB2_99:
	setp.ltu.ftz.f32	%p81, %f587, 0f00000000;
	@%p81 bra 	BB2_101;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f484, %f587;
	mul.ftz.f32 	%f485, %f484, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f590, %f485;
	bra.uni 	BB2_102;

BB2_101:
	neg.ftz.f32 	%f486, %f587;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f487, %f486;
	mul.ftz.f32 	%f488, %f487, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f489, %f488;
	neg.ftz.f32 	%f590, %f489;

BB2_102:
	setp.ltu.ftz.f32	%p82, %f586, 0f00000000;
	@%p82 bra 	BB2_104;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f490, %f586;
	mul.ftz.f32 	%f491, %f490, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f591, %f491;
	bra.uni 	BB2_105;

BB2_104:
	neg.ftz.f32 	%f492, %f586;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f493, %f492;
	mul.ftz.f32 	%f494, %f493, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f495, %f494;
	neg.ftz.f32 	%f591, %f495;

BB2_105:
	.loc 1 77 1
	@%p78 bra 	BB2_107;

	.loc 1 77 1
	mad.lo.s32 	%r184, %r99, %r32, %r21;
	mul.wide.s32 	%rd58, %r184, 16;
	add.s64 	%rd59, %rd11, %rd58;
	.loc 1 77 1
	st.v4.f32 	[%rd59], {%f589, %f590, %f591, %f585};
	bra.uni 	BB2_108;

BB2_107:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f589;
	mov.b16 	%rs49, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f590;
	mov.b16 	%rs50, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f591;
	mov.b16 	%rs51, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f585;
	mov.b16 	%rs52, %temp;
}
	.loc 1 77 1
	mad.lo.s32 	%r189, %r99, %r32, %r21;
	mul.wide.s32 	%rd60, %r189, 8;
	add.s64 	%rd61, %rd11, %rd60;
	.loc 1 77 238
	st.v4.u16 	[%rd61], {%rs49, %rs50, %rs51, %rs52};

BB2_108:
	.loc 1 77 1
	add.s32 	%r194, %r99, 4;
	setp.lt.s32	%p85, %r194, %r34;
	.loc 1 77 1
	mad.lo.s32 	%r195, %r99, %r32, %r21;
	.loc 1 77 1
	shl.b32 	%r196, %r32, 2;
	add.s32 	%r27, %r195, %r196;
	.loc 1 77 1
	and.pred  	%p86, %p85, %p76;
	@!%p86 bra 	BB2_127;
	bra.uni 	BB2_109;

BB2_109:
	setp.eq.s32	%p87, %r35, 0;
	mul.wide.s32 	%rd62, %r27, 16;
	add.s64 	%rd7, %rd11, %rd62;
	mul.wide.s32 	%rd63, %r27, 8;
	add.s64 	%rd8, %rd11, %rd63;
	.loc 1 77 1
	@%p87 bra 	BB2_111;

	ld.v4.f32 	{%f496, %f497, %f498, %f499}, [%rd7];
	mov.f32 	%f595, %f499;
	mov.f32 	%f594, %f498;
	mov.f32 	%f593, %f497;
	mov.f32 	%f592, %f496;
	bra.uni 	BB2_112;

BB2_111:
	.loc 1 77 1
	ld.v4.u16 	{%rs53, %rs54, %rs55, %rs56}, [%rd8];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs53;
	cvt.f32.f16 	%f592, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs54;
	cvt.f32.f16 	%f593, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs55;
	cvt.f32.f16 	%f594, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs56;
	cvt.f32.f16 	%f595, %temp;
	}

BB2_112:
	.loc 1 77 1
	mad.lo.s32 	%r199, %r98, 33, %r103;
	mul.wide.s32 	%rd64, %r199, 4;
	add.s64 	%rd65, %rd9, %rd64;
	.loc 1 77 1
	ld.f32 	%f500, [%rd65+528];
	add.ftz.f32 	%f219, %f592, %f500;
	ld.f32 	%f501, [%rd65+1584];
	add.ftz.f32 	%f220, %f593, %f501;
	ld.f32 	%f502, [%rd65+2640];
	add.ftz.f32 	%f221, %f594, %f502;
	ld.f32 	%f503, [%rd65+3696];
	add.ftz.f32 	%f504, %f595, %f503;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f596, %f504;
	.loc 2 45 1
	add.ftz.f32 	%f505, %f596, 0fB70637BD;
	setp.gtu.ftz.f32	%p88, %f505, 0f00000000;
	@%p88 bra 	BB2_114;

	mov.f32 	%f599, 0f00000000;
	mov.f32 	%f598, %f599;
	mov.f32 	%f597, %f599;
	mov.f32 	%f596, %f599;
	bra.uni 	BB2_115;

BB2_114:
	mov.f32 	%f510, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f511, %f510, %f596;
	.loc 2 45 1
	mul.ftz.f32 	%f597, %f221, %f511;
	mul.ftz.f32 	%f598, %f220, %f511;
	mul.ftz.f32 	%f599, %f219, %f511;

BB2_115:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p89, %f599, 0f00000000;
	@%p89 bra 	BB2_117;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f512, %f599;
	mul.ftz.f32 	%f513, %f512, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f600, %f513;
	bra.uni 	BB2_118;

BB2_117:
	neg.ftz.f32 	%f514, %f599;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f515, %f514;
	mul.ftz.f32 	%f516, %f515, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f517, %f516;
	neg.ftz.f32 	%f600, %f517;

BB2_118:
	setp.ltu.ftz.f32	%p90, %f598, 0f00000000;
	@%p90 bra 	BB2_120;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f518, %f598;
	mul.ftz.f32 	%f519, %f518, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f601, %f519;
	bra.uni 	BB2_121;

BB2_120:
	neg.ftz.f32 	%f520, %f598;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f521, %f520;
	mul.ftz.f32 	%f522, %f521, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f523, %f522;
	neg.ftz.f32 	%f601, %f523;

BB2_121:
	setp.ltu.ftz.f32	%p91, %f597, 0f00000000;
	@%p91 bra 	BB2_123;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f524, %f597;
	mul.ftz.f32 	%f525, %f524, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f602, %f525;
	bra.uni 	BB2_124;

BB2_123:
	neg.ftz.f32 	%f526, %f597;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f527, %f526;
	mul.ftz.f32 	%f528, %f527, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f529, %f528;
	neg.ftz.f32 	%f602, %f529;

BB2_124:
	.loc 1 77 1
	@%p87 bra 	BB2_126;

	.loc 1 77 1
	st.v4.f32 	[%rd7], {%f600, %f601, %f602, %f596};
	bra.uni 	BB2_127;

BB2_126:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f600;
	mov.b16 	%rs61, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f601;
	mov.b16 	%rs62, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f602;
	mov.b16 	%rs63, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f596;
	mov.b16 	%rs64, %temp;
}
	.loc 1 77 238
	st.v4.u16 	[%rd8], {%rs61, %rs62, %rs63, %rs64};

BB2_127:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	mad.lo.s32 	%r202, %r214, -32, %r18;
	add.s32 	%r200, %r202, -31;
	.loc 1 77 1
	setp.gt.s32	%p93, %r200, 0;
	add.s32 	%r214, %r214, 1;
	@%p93 bra 	BB2_52;

BB2_128:
	.loc 1 77 2
	ret;
}

.visible .entry VerticalRecursiveGaussianRGBAF16_kernel(
	.param .u64 VerticalRecursiveGaussianRGBAF16_kernel_param_0,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_1,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_2,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_3,
	.param .u64 VerticalRecursiveGaussianRGBAF16_kernel_param_4,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_5,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_6,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_7,
	.param .u32 VerticalRecursiveGaussianRGBAF16_kernel_param_8,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_9,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_10,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_11,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_12,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_13,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_14,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_15,
	.param .f32 VerticalRecursiveGaussianRGBAF16_kernel_param_16
)
{
	.reg .pred 	%p<44>;
	.reg .s16 	%rs<33>;
	.reg .s32 	%r<118>;
	.reg .f32 	%f<250>;
	.reg .s64 	%rd<36>;
	// demoted variable
	.shared .align 4 .b8 VerticalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_8773_non_const_smem[2048];

	ld.param.u64 	%rd1, [VerticalRecursiveGaussianRGBAF16_kernel_param_0];
	ld.param.u32 	%r38, [VerticalRecursiveGaussianRGBAF16_kernel_param_1];
	ld.param.u32 	%r39, [VerticalRecursiveGaussianRGBAF16_kernel_param_2];
	ld.param.u32 	%r40, [VerticalRecursiveGaussianRGBAF16_kernel_param_3];
	ld.param.u64 	%rd2, [VerticalRecursiveGaussianRGBAF16_kernel_param_4];
	ld.param.u32 	%r41, [VerticalRecursiveGaussianRGBAF16_kernel_param_5];
	ld.param.u32 	%r42, [VerticalRecursiveGaussianRGBAF16_kernel_param_6];
	ld.param.u32 	%r43, [VerticalRecursiveGaussianRGBAF16_kernel_param_7];
	ld.param.u32 	%r44, [VerticalRecursiveGaussianRGBAF16_kernel_param_8];
	ld.param.f32 	%f75, [VerticalRecursiveGaussianRGBAF16_kernel_param_9];
	ld.param.f32 	%f76, [VerticalRecursiveGaussianRGBAF16_kernel_param_10];
	ld.param.f32 	%f77, [VerticalRecursiveGaussianRGBAF16_kernel_param_11];
	ld.param.f32 	%f78, [VerticalRecursiveGaussianRGBAF16_kernel_param_12];
	ld.param.f32 	%f79, [VerticalRecursiveGaussianRGBAF16_kernel_param_13];
	ld.param.f32 	%f80, [VerticalRecursiveGaussianRGBAF16_kernel_param_14];
	ld.param.f32 	%f81, [VerticalRecursiveGaussianRGBAF16_kernel_param_15];
	ld.param.f32 	%f82, [VerticalRecursiveGaussianRGBAF16_kernel_param_16];
	.loc 1 77 1
	mov.u32 	%r45, %ctaid.x;
	mov.u32 	%r46, %ntid.x;
	mov.u32 	%r47, %tid.x;
	mad.lo.s32 	%r1, %r45, %r46, %r47;
	sub.s32 	%r48, %r42, %r39;
	shr.s32 	%r49, %r48, 1;
	sub.s32 	%r2, %r1, %r49;
	sub.s32 	%r50, %r43, %r40;
	shr.s32 	%r51, %r50, 1;
	mov.u32 	%r117, %tid.y;
	sub.s32 	%r115, %r117, %r51;
	.loc 1 77 1
	setp.ge.s32	%p1, %r1, %r42;
	@%p1 bra 	BB3_47;

	.loc 1 77 1
	mad.lo.s32 	%r113, %r115, %r38, %r2;
	mad.lo.s32 	%r107, %r117, %r41, %r1;
	mov.u32 	%r116, 0;
	.loc 1 77 1
	setp.gt.s32	%p2, %r43, 0;
	@%p2 bra 	BB3_2;
	bra.uni 	BB3_18;

BB3_2:
	mov.f32 	%f224, 0f00000000;
	mov.f32 	%f223, %f224;
	mov.f32 	%f222, %f224;
	mov.u32 	%r114, %r113;

BB3_3:
	setp.lt.s32	%p3, %r2, %r39;
	setp.gt.s32	%p4, %r2, -1;
	.loc 1 77 1
	setp.lt.s32	%p5, %r115, %r40;
	setp.gt.s32	%p6, %r115, -1;
	and.pred  	%p7, %p6, %p5;
	and.pred  	%p8, %p7, %p4;
	and.pred  	%p9, %p8, %p3;
	.loc 1 77 1
	@%p9 bra 	BB3_5;

	mov.f32 	%f231, 0f00000000;
	mov.f32 	%f230, %f231;
	mov.f32 	%f229, %f231;
	mov.f32 	%f228, %f231;
	bra.uni 	BB3_15;

BB3_5:
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r114, 8;
	add.s64 	%rd5, %rd3, %rd4;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd5];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f4, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f5, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f6, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f90, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f231, %f90;
	setp.ltu.ftz.f32	%p10, %f4, 0f00000000;
	@%p10 bra 	BB3_7;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f91, %f4;
	mul.ftz.f32 	%f92, %f91, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f225, %f92;
	bra.uni 	BB3_8;

BB3_7:
	neg.ftz.f32 	%f93, %f4;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f94, %f93;
	mul.ftz.f32 	%f95, %f94, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f96, %f95;
	neg.ftz.f32 	%f225, %f96;

BB3_8:
	setp.ltu.ftz.f32	%p11, %f5, 0f00000000;
	@%p11 bra 	BB3_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f97, %f5;
	mul.ftz.f32 	%f98, %f97, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f226, %f98;
	bra.uni 	BB3_11;

BB3_10:
	neg.ftz.f32 	%f99, %f5;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f100, %f99;
	mul.ftz.f32 	%f101, %f100, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f102, %f101;
	neg.ftz.f32 	%f226, %f102;

BB3_11:
	setp.ltu.ftz.f32	%p12, %f6, 0f00000000;
	@%p12 bra 	BB3_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f103, %f6;
	mul.ftz.f32 	%f104, %f103, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f227, %f104;
	bra.uni 	BB3_14;

BB3_13:
	neg.ftz.f32 	%f105, %f6;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f106, %f105;
	mul.ftz.f32 	%f107, %f106, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f108, %f107;
	neg.ftz.f32 	%f227, %f108;

BB3_14:
	mul.ftz.f32 	%f230, %f227, %f231;
	mul.ftz.f32 	%f229, %f226, %f231;
	mul.ftz.f32 	%f228, %f225, %f231;

BB3_15:
	.loc 1 77 1
	mov.u32 	%r12, %tid.y;
	.loc 1 77 1
	shl.b32 	%r61, %r12, 5;
	add.s32 	%r62, %r61, %r47;
	mul.wide.s32 	%rd6, %r62, 4;
	mov.u64 	%rd7, VerticalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_8773_non_const_smem;
	add.s64 	%rd8, %rd7, %rd6;
	.loc 1 77 1
	st.shared.f32 	[%rd8], %f228;
	st.shared.f32 	[%rd8+512], %f229;
	st.shared.f32 	[%rd8+1024], %f230;
	st.shared.f32 	[%rd8+1536], %f231;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	shl.b32 	%r63, %r12, 7;
	.loc 1 77 1
	add.s32 	%r64, %r47, %r63;
	mul.wide.s32 	%rd9, %r64, 4;
	add.s64 	%rd11, %rd7, %rd9;
	.loc 1 77 1
	setp.eq.s32	%p13, %r116, 0;
	setp.ne.s32	%p14, %r44, 0;
	and.pred  	%p15, %p13, %p14;
	.loc 1 77 1
	ld.shared.f32 	%f109, [%rd11];
	.loc 1 77 1
	mul.ftz.f32 	%f110, %f109, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f111, %f110, %f223, %p15;
	selp.f32	%f112, %f110, %f222, %p15;
	.loc 1 77 1
	mul.ftz.f32 	%f113, %f224, %f76;
	fma.rn.ftz.f32 	%f114, %f109, %f75, %f113;
	fma.rn.ftz.f32 	%f115, %f111, %f77, %f114;
	fma.rn.ftz.f32 	%f116, %f112, %f78, %f115;
	st.shared.f32 	[%rd11], %f116;
	ld.shared.f32 	%f117, [%rd11+128];
	mul.ftz.f32 	%f118, %f109, %f76;
	fma.rn.ftz.f32 	%f119, %f117, %f75, %f118;
	fma.rn.ftz.f32 	%f120, %f116, %f77, %f119;
	fma.rn.ftz.f32 	%f121, %f111, %f78, %f120;
	st.shared.f32 	[%rd11+128], %f121;
	ld.shared.f32 	%f122, [%rd11+256];
	mul.ftz.f32 	%f123, %f117, %f76;
	fma.rn.ftz.f32 	%f124, %f122, %f75, %f123;
	fma.rn.ftz.f32 	%f125, %f121, %f77, %f124;
	fma.rn.ftz.f32 	%f222, %f116, %f78, %f125;
	st.shared.f32 	[%rd11+256], %f222;
	ld.shared.f32 	%f224, [%rd11+384];
	mul.ftz.f32 	%f126, %f122, %f76;
	fma.rn.ftz.f32 	%f127, %f224, %f75, %f126;
	fma.rn.ftz.f32 	%f128, %f222, %f77, %f127;
	fma.rn.ftz.f32 	%f223, %f121, %f78, %f128;
	st.shared.f32 	[%rd11+384], %f223;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	setp.ge.s32	%p16, %r117, %r43;
	@%p16 bra 	BB3_17;

	cvta.to.global.u64 	%rd12, %rd2;
	mul.wide.s32 	%rd13, %r62, 4;
	add.s64 	%rd15, %rd7, %rd13;
	mul.wide.s32 	%rd16, %r107, 8;
	add.s64 	%rd17, %rd12, %rd16;
	.loc 1 77 1
	ld.shared.f32 	%f129, [%rd15];
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f129;
	mov.b16 	%rs9, %temp;
}
	.loc 1 77 1
	ld.shared.f32 	%f130, [%rd15+512];
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f130;
	mov.b16 	%rs10, %temp;
}
	.loc 1 77 1
	ld.shared.f32 	%f131, [%rd15+1024];
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f131;
	mov.b16 	%rs11, %temp;
}
	.loc 1 77 1
	ld.shared.f32 	%f132, [%rd15+1536];
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f132;
	mov.b16 	%rs12, %temp;
}
	st.global.v4.u16 	[%rd17], {%rs9, %rs10, %rs11, %rs12};

BB3_17:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r115, %r115, 4;
	add.s32 	%r117, %r117, 4;
	shl.b32 	%r69, %r38, 2;
	add.s32 	%r114, %r114, %r69;
	shl.b32 	%r70, %r41, 2;
	add.s32 	%r107, %r107, %r70;
	add.s32 	%r116, %r116, 4;
	.loc 1 77 1
	setp.lt.s32	%p17, %r116, %r43;
	mov.u32 	%r113, %r114;
	@%p17 bra 	BB3_3;

BB3_18:
	.loc 1 77 1
	mov.u32 	%r112, %r113;
	setp.lt.s32	%p18, %r116, 1;
	@%p18 bra 	BB3_47;

	add.s32 	%r23, %r116, -1;
	mov.f32 	%f235, 0f00000000;
	mov.f32 	%f234, %f235;
	mov.f32 	%f233, %f235;
	mov.f32 	%f232, %f235;
	mov.u32 	%r108, 0;

BB3_20:
	.loc 1 77 1
	mov.u32 	%r25, %r112;
	setp.lt.s32	%p19, %r2, %r39;
	setp.gt.s32	%p20, %r2, -1;
	.loc 1 77 1
	add.s32 	%r116, %r116, -4;
	add.s32 	%r117, %r117, -4;
	shl.b32 	%r79, %r38, 2;
	sub.s32 	%r31, %r25, %r79;
	add.s32 	%r115, %r115, -4;
	.loc 1 77 1
	setp.gt.s32	%p21, %r115, -1;
	setp.lt.s32	%p22, %r115, %r40;
	and.pred  	%p23, %p21, %p22;
	and.pred  	%p24, %p23, %p20;
	and.pred  	%p25, %p24, %p19;
	.loc 1 77 1
	@%p25 bra 	BB3_22;

	mov.f32 	%f242, 0f00000000;
	mov.f32 	%f241, %f242;
	mov.f32 	%f240, %f242;
	mov.f32 	%f239, %f242;
	bra.uni 	BB3_32;

BB3_22:
	cvta.to.global.u64 	%rd18, %rd1;
	mul.wide.s32 	%rd19, %r31, 8;
	add.s64 	%rd20, %rd18, %rd19;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs13, %rs14, %rs15, %rs16}, [%rd20];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs13;
	cvt.f32.f16 	%f31, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs14;
	cvt.f32.f16 	%f32, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs15;
	cvt.f32.f16 	%f33, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs16;
	cvt.f32.f16 	%f141, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f242, %f141;
	setp.ltu.ftz.f32	%p26, %f31, 0f00000000;
	@%p26 bra 	BB3_24;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f142, %f31;
	mul.ftz.f32 	%f143, %f142, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f236, %f143;
	bra.uni 	BB3_25;

BB3_24:
	neg.ftz.f32 	%f144, %f31;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f145, %f144;
	mul.ftz.f32 	%f146, %f145, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f147, %f146;
	neg.ftz.f32 	%f236, %f147;

BB3_25:
	setp.ltu.ftz.f32	%p27, %f32, 0f00000000;
	@%p27 bra 	BB3_27;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f148, %f32;
	mul.ftz.f32 	%f149, %f148, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f237, %f149;
	bra.uni 	BB3_28;

BB3_27:
	neg.ftz.f32 	%f150, %f32;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f151, %f150;
	mul.ftz.f32 	%f152, %f151, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f153, %f152;
	neg.ftz.f32 	%f237, %f153;

BB3_28:
	setp.ltu.ftz.f32	%p28, %f33, 0f00000000;
	@%p28 bra 	BB3_30;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f154, %f33;
	mul.ftz.f32 	%f155, %f154, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f238, %f155;
	bra.uni 	BB3_31;

BB3_30:
	neg.ftz.f32 	%f156, %f33;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f157, %f156;
	mul.ftz.f32 	%f158, %f157, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f159, %f158;
	neg.ftz.f32 	%f238, %f159;

BB3_31:
	mul.ftz.f32 	%f241, %f238, %f242;
	mul.ftz.f32 	%f240, %f237, %f242;
	mul.ftz.f32 	%f239, %f236, %f242;

BB3_32:
	.loc 1 77 1
	mov.u32 	%r33, %tid.y;
	.loc 1 77 1
	shl.b32 	%r80, %r33, 5;
	add.s32 	%r81, %r80, %r47;
	mul.wide.s32 	%rd21, %r81, 4;
	mov.u64 	%rd22, VerticalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_8773_non_const_smem;
	add.s64 	%rd23, %rd22, %rd21;
	.loc 1 77 1
	st.shared.f32 	[%rd23], %f239;
	st.shared.f32 	[%rd23+512], %f240;
	st.shared.f32 	[%rd23+1024], %f241;
	st.shared.f32 	[%rd23+1536], %f242;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	shl.b32 	%r82, %r33, 7;
	.loc 1 77 1
	add.s32 	%r83, %r47, %r82;
	mul.wide.s32 	%rd24, %r83, 4;
	add.s64 	%rd26, %rd22, %rd24;
	.loc 1 77 1
	mad.lo.s32 	%r84, %r108, -4, %r23;
	.loc 1 77 1
	add.s32 	%r85, %r43, -1;
	setp.eq.s32	%p29, %r84, %r85;
	setp.ne.s32	%p30, %r44, 0;
	and.pred  	%p31, %p29, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f160, [%rd26+384];
	.loc 1 77 1
	mul.ftz.f32 	%f161, %f160, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f162, %f161, %f233, %p31;
	selp.f32	%f163, %f161, %f232, %p31;
	.loc 1 77 1
	mul.ftz.f32 	%f164, %f234, %f80;
	fma.rn.ftz.f32 	%f165, %f235, %f79, %f164;
	fma.rn.ftz.f32 	%f166, %f162, %f81, %f165;
	fma.rn.ftz.f32 	%f167, %f163, %f82, %f166;
	st.shared.f32 	[%rd26+384], %f167;
	.loc 1 77 1
	setp.eq.s32	%p32, %r84, %r43;
	and.pred  	%p33, %p32, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f168, [%rd26+256];
	.loc 1 77 1
	mul.ftz.f32 	%f169, %f168, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f170, %f169, %f167, %p33;
	selp.f32	%f171, %f169, %f162, %p33;
	.loc 1 77 1
	mul.ftz.f32 	%f172, %f235, %f80;
	fma.rn.ftz.f32 	%f173, %f160, %f79, %f172;
	fma.rn.ftz.f32 	%f174, %f170, %f81, %f173;
	fma.rn.ftz.f32 	%f175, %f171, %f82, %f174;
	st.shared.f32 	[%rd26+256], %f175;
	.loc 1 77 1
	add.s32 	%r86, %r84, -2;
	setp.eq.s32	%p34, %r86, %r85;
	and.pred  	%p35, %p34, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f234, [%rd26+128];
	.loc 1 77 1
	mul.ftz.f32 	%f176, %f234, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f177, %f176, %f175, %p35;
	selp.f32	%f178, %f176, %f170, %p35;
	.loc 1 77 1
	mul.ftz.f32 	%f179, %f160, %f80;
	fma.rn.ftz.f32 	%f180, %f168, %f79, %f179;
	fma.rn.ftz.f32 	%f181, %f177, %f81, %f180;
	fma.rn.ftz.f32 	%f182, %f178, %f82, %f181;
	st.shared.f32 	[%rd26+128], %f182;
	.loc 1 77 1
	add.s32 	%r87, %r84, -3;
	setp.eq.s32	%p36, %r87, %r85;
	and.pred  	%p37, %p36, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f235, [%rd26];
	.loc 1 77 1
	mul.ftz.f32 	%f183, %f235, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f232, %f183, %f182, %p37;
	selp.f32	%f184, %f183, %f177, %p37;
	.loc 1 77 1
	mul.ftz.f32 	%f185, %f168, %f80;
	fma.rn.ftz.f32 	%f186, %f234, %f79, %f185;
	fma.rn.ftz.f32 	%f187, %f232, %f81, %f186;
	fma.rn.ftz.f32 	%f233, %f184, %f82, %f187;
	st.shared.f32 	[%rd26], %f233;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	setp.ge.s32	%p38, %r117, %r43;
	@%p38 bra 	BB3_46;

	cvta.to.global.u64 	%rd27, %rd2;
	.loc 1 77 1
	mad.lo.s32 	%r90, %r117, %r41, %r1;
	mul.wide.s32 	%rd28, %r90, 8;
	add.s64 	%rd29, %rd27, %rd28;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs21, %rs22, %rs23, %rs24}, [%rd29];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs21;
	cvt.f32.f16 	%f188, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs22;
	cvt.f32.f16 	%f189, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs23;
	cvt.f32.f16 	%f190, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs24;
	cvt.f32.f16 	%f191, %temp;
	}
	mul.wide.s32 	%rd30, %r81, 4;
	add.s64 	%rd32, %rd22, %rd30;
	.loc 1 77 1
	ld.shared.f32 	%f192, [%rd32];
	add.ftz.f32 	%f55, %f188, %f192;
	ld.shared.f32 	%f193, [%rd32+512];
	add.ftz.f32 	%f56, %f189, %f193;
	ld.shared.f32 	%f194, [%rd32+1024];
	add.ftz.f32 	%f57, %f190, %f194;
	ld.shared.f32 	%f195, [%rd32+1536];
	add.ftz.f32 	%f196, %f191, %f195;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f243, %f196;
	.loc 2 45 1
	add.ftz.f32 	%f197, %f243, 0fB70637BD;
	setp.gtu.ftz.f32	%p39, %f197, 0f00000000;
	@%p39 bra 	BB3_35;

	mov.f32 	%f246, 0f00000000;
	mov.f32 	%f245, %f246;
	mov.f32 	%f244, %f246;
	mov.f32 	%f243, %f246;
	bra.uni 	BB3_36;

BB3_35:
	mov.f32 	%f202, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f203, %f202, %f243;
	.loc 2 45 1
	mul.ftz.f32 	%f244, %f57, %f203;
	mul.ftz.f32 	%f245, %f56, %f203;
	mul.ftz.f32 	%f246, %f55, %f203;

BB3_36:
	.loc 1 77 164
	setp.ltu.ftz.f32	%p40, %f246, 0f00000000;
	@%p40 bra 	BB3_38;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f204, %f246;
	mul.ftz.f32 	%f205, %f204, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f247, %f205;
	bra.uni 	BB3_39;

BB3_38:
	neg.ftz.f32 	%f206, %f246;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f207, %f206;
	mul.ftz.f32 	%f208, %f207, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f209, %f208;
	neg.ftz.f32 	%f247, %f209;

BB3_39:
	setp.ltu.ftz.f32	%p41, %f245, 0f00000000;
	@%p41 bra 	BB3_41;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f210, %f245;
	mul.ftz.f32 	%f211, %f210, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f248, %f211;
	bra.uni 	BB3_42;

BB3_41:
	neg.ftz.f32 	%f212, %f245;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f213, %f212;
	mul.ftz.f32 	%f214, %f213, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f215, %f214;
	neg.ftz.f32 	%f248, %f215;

BB3_42:
	setp.ltu.ftz.f32	%p42, %f244, 0f00000000;
	@%p42 bra 	BB3_44;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f216, %f244;
	mul.ftz.f32 	%f217, %f216, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f249, %f217;
	bra.uni 	BB3_45;

BB3_44:
	neg.ftz.f32 	%f218, %f244;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f219, %f218;
	mul.ftz.f32 	%f220, %f219, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f221, %f220;
	neg.ftz.f32 	%f249, %f221;

BB3_45:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f247;
	mov.b16 	%rs29, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f248;
	mov.b16 	%rs30, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f249;
	mov.b16 	%rs31, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f243;
	mov.b16 	%rs32, %temp;
}
	mul.wide.s32 	%rd34, %r90, 8;
	add.s64 	%rd35, %rd27, %rd34;
	.loc 1 77 238
	st.global.v4.u16 	[%rd35], {%rs29, %rs30, %rs31, %rs32};

BB3_46:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r108, %r108, 1;
	.loc 1 77 1
	mad.lo.s32 	%r112, %r115, %r38, %r2;
	.loc 1 77 1
	setp.gt.s32	%p43, %r116, 0;
	@%p43 bra 	BB3_20;

BB3_47:
	.loc 1 77 2
	ret;
}

//
// VerticalRecursiveGaussianRGBAF32_kernel
//
// NOTE: this is compiler-generated PTX; the comments in this kernel were added
// by hand and describe only what the instructions below visibly do.
//
// Overview
//   Each thread owns one destination column (%r1) and walks the rows in tiles
//   of four.  Two passes are made over the column:
//     pass 1 (BB4_3 loop)  : rows in increasing order, results are stored to
//                            the destination buffer;
//     pass 2 (BB4_20 loop) : rows in decreasing order, results are added to
//                            what pass 1 stored, post-processed and stored
//                            back to the same destination element.
//   Both passes evaluate a second-order recurrence of the form
//     y[n] = c0*x[..] + c1*x[..] + c2*y[prev] + c3*y[prev2]
//   on data staged through shared memory.
//
// Parameters (as used by the code below)
//   param_0  (u64) source pointer; elements are read as 16-byte v4.f32
//   param_1  (u32) source row stride, in 16-byte elements
//   param_2  (u32) exclusive upper bound of the valid source column index
//   param_3  (u32) exclusive upper bound of the valid source row index
//   param_4  (u64) destination pointer; elements are 16-byte v4.f32
//   param_5  (u32) destination row stride, in 16-byte elements
//   param_6  (u32) destination column count (threads at or beyond it return)
//   param_7  (u32) destination row count (store guard and loop bound)
//   param_8  (u32) flag; when non-zero the recurrence history is seeded with
//                  0.5 * input at row 0 (pass 1) and at row param_7-1 (pass 2)
//   param_9  .. param_12 (f32) the four coefficients of pass 1
//   param_13 .. param_16 (f32) the four coefficients of pass 2
//
// Source pixels outside [0,param_2) x [0,param_3) are treated as all-zero.
// The source rectangle is offset inside the destination by
// (param_6-param_2)>>1 columns and (param_7-param_3)>>1 rows.
//
// NOTE(review): the shared-memory indexing (tid.y*32 + tid.x on the store
// side, tid.y*128 + tid.x + {0,32,64,96} on the filter side, 2048 bytes
// total) only stays in bounds for tid.x < 32 and tid.y < 4 -- presumably the
// kernel is launched with a 32x4 block; confirm against the host launch code.
//
.visible .entry VerticalRecursiveGaussianRGBAF32_kernel(
	.param .u64 VerticalRecursiveGaussianRGBAF32_kernel_param_0,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_1,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_2,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_3,
	.param .u64 VerticalRecursiveGaussianRGBAF32_kernel_param_4,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_5,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_6,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_7,
	.param .u32 VerticalRecursiveGaussianRGBAF32_kernel_param_8,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_9,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_10,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_11,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_12,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_13,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_14,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_15,
	.param .f32 VerticalRecursiveGaussianRGBAF32_kernel_param_16
)
{
	.reg .pred 	%p<44>;
	.reg .s32 	%r<118>;
	.reg .f32 	%f<263>;
	.reg .s64 	%rd<36>;
	// demoted variable
	// 2048 bytes of staging space, addressed below as four 512-byte planes
	// (one per pixel component, at byte offsets 0 / 512 / 1024 / 1536).
	.shared .align 4 .b8 VerticalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_9460_non_const_smem[2048];

	// Register roles after the parameter loads:
	//   %rd1 = source ptr        %rd2 = destination ptr
	//   %r38 = source stride     %r41 = destination stride
	//   %r39 / %r40 = source column / row bounds
	//   %r42 / %r43 = destination column / row counts
	//   %r44 = seeding flag
	//   %f77..%f80 = pass-1 coefficients, %f81..%f84 = pass-2 coefficients
	ld.param.u64 	%rd1, [VerticalRecursiveGaussianRGBAF32_kernel_param_0];
	ld.param.u32 	%r38, [VerticalRecursiveGaussianRGBAF32_kernel_param_1];
	ld.param.u32 	%r39, [VerticalRecursiveGaussianRGBAF32_kernel_param_2];
	ld.param.u32 	%r40, [VerticalRecursiveGaussianRGBAF32_kernel_param_3];
	ld.param.u64 	%rd2, [VerticalRecursiveGaussianRGBAF32_kernel_param_4];
	ld.param.u32 	%r41, [VerticalRecursiveGaussianRGBAF32_kernel_param_5];
	ld.param.u32 	%r42, [VerticalRecursiveGaussianRGBAF32_kernel_param_6];
	ld.param.u32 	%r43, [VerticalRecursiveGaussianRGBAF32_kernel_param_7];
	ld.param.u32 	%r44, [VerticalRecursiveGaussianRGBAF32_kernel_param_8];
	ld.param.f32 	%f77, [VerticalRecursiveGaussianRGBAF32_kernel_param_9];
	ld.param.f32 	%f78, [VerticalRecursiveGaussianRGBAF32_kernel_param_10];
	ld.param.f32 	%f79, [VerticalRecursiveGaussianRGBAF32_kernel_param_11];
	ld.param.f32 	%f80, [VerticalRecursiveGaussianRGBAF32_kernel_param_12];
	ld.param.f32 	%f81, [VerticalRecursiveGaussianRGBAF32_kernel_param_13];
	ld.param.f32 	%f82, [VerticalRecursiveGaussianRGBAF32_kernel_param_14];
	ld.param.f32 	%f83, [VerticalRecursiveGaussianRGBAF32_kernel_param_15];
	ld.param.f32 	%f84, [VerticalRecursiveGaussianRGBAF32_kernel_param_16];
	// %r1   = destination column = ctaid.x * ntid.x + tid.x
	// %r2   = source column      = %r1 - ((%r42 - %r39) >> 1)
	// %r117 = destination row of this thread within the current tile (starts at tid.y)
	// %r115 = matching source row = %r117 - ((%r43 - %r40) >> 1)
	.loc 1 77 1
	mov.u32 	%r45, %ctaid.x;
	mov.u32 	%r46, %ntid.x;
	mov.u32 	%r47, %tid.x;
	mad.lo.s32 	%r1, %r45, %r46, %r47;
	sub.s32 	%r48, %r42, %r39;
	shr.s32 	%r49, %r48, 1;
	sub.s32 	%r2, %r1, %r49;
	sub.s32 	%r50, %r43, %r40;
	shr.s32 	%r51, %r50, 1;
	mov.u32 	%r117, %tid.y;
	sub.s32 	%r115, %r117, %r51;
	// Columns at or beyond the destination column count do no work at all.
	// NOTE(review): these threads return before any of the bar.sync
	// instructions below are reached by the rest of the block -- confirm
	// this is intended for partially covered blocks.
	.loc 1 77 1
	setp.ge.s32	%p1, %r1, %r42;
	@%p1 bra 	BB4_47;

	// %r113 = source element index      = %r115 * %r38 + %r2
	// %r107 = destination element index = %r117 * %r41 + %r1
	// %r116 = number of rows consumed so far (advances by 4 per tile)
	.loc 1 77 1
	mad.lo.s32 	%r113, %r115, %r38, %r2;
	mad.lo.s32 	%r107, %r117, %r41, %r1;
	mov.u32 	%r116, 0;
	// Skip pass 1 entirely when the destination has no rows.
	.loc 1 77 1
	setp.gt.s32	%p2, %r43, 0;
	@%p2 bra 	BB4_2;
	bra.uni 	BB4_18;

BB4_2:
	// Pass-1 history carried between tiles, all starting at zero:
	//   %f237 = input of the previous row
	//   %f236 = output of the previous row
	//   %f235 = output of the row before that
	mov.f32 	%f237, 0f00000000;
	mov.f32 	%f236, %f237;
	mov.f32 	%f235, %f237;
	mov.u32 	%r114, %r113;

// ---------------------------------------------------------------------------
// Pass 1: rows in increasing order, four rows per iteration.
// ---------------------------------------------------------------------------
BB4_3:
	// %p9 = source coordinate (%r2, %r115) lies inside the source rectangle.
	setp.lt.s32	%p3, %r2, %r39;
	setp.gt.s32	%p4, %r2, -1;
	.loc 1 77 1
	setp.lt.s32	%p5, %r115, %r40;
	setp.gt.s32	%p6, %r115, -1;
	and.pred  	%p7, %p6, %p5;
	and.pred  	%p8, %p7, %p4;
	and.pred  	%p9, %p8, %p3;
	.loc 1 77 1
	@%p9 bra 	BB4_5;

	// Out of range: stage an all-zero pixel.
	mov.f32 	%f244, 0f00000000;
	mov.f32 	%f243, %f244;
	mov.f32 	%f242, %f244;
	mov.f32 	%f241, %f244;
	bra.uni 	BB4_15;

BB4_5:
	// Load the four-component source pixel at element index %r114.
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.s32 	%rd4, %r114, 16;
	add.s64 	%rd5, %rd3, %rd4;
	ld.global.v4.f32 	{%f92, %f93, %f94, %f95}, [%rd5];
	// %f244 = 4th component clamped to [0,1] (presumably alpha -- the code
	// only shows it being used as a multiplier for the other three).
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f244, %f95;
	// For each of the first three components compute a sign-preserving
	// power: sign(x) * |x|^2.2, evaluated as ex2(lg2(|x|) * 2.2) with the
	// approximate instructions (0f400CCCCD is 2.2f).  The "ltu" compare also
	// sends NaN inputs down the negated branch.
	setp.ltu.ftz.f32	%p10, %f92, 0f00000000;
	@%p10 bra 	BB4_7;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f96, %f92;
	mul.ftz.f32 	%f97, %f96, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f238, %f97;
	bra.uni 	BB4_8;

BB4_7:
	neg.ftz.f32 	%f98, %f92;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f99, %f98;
	mul.ftz.f32 	%f100, %f99, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f101, %f100;
	neg.ftz.f32 	%f238, %f101;

BB4_8:
	// Same conversion for the 2nd component.
	setp.ltu.ftz.f32	%p11, %f93, 0f00000000;
	@%p11 bra 	BB4_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f102, %f93;
	mul.ftz.f32 	%f103, %f102, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f239, %f103;
	bra.uni 	BB4_11;

BB4_10:
	neg.ftz.f32 	%f104, %f93;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f105, %f104;
	mul.ftz.f32 	%f106, %f105, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f107, %f106;
	neg.ftz.f32 	%f239, %f107;

BB4_11:
	// Same conversion for the 3rd component.
	setp.ltu.ftz.f32	%p12, %f94, 0f00000000;
	@%p12 bra 	BB4_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f108, %f94;
	mul.ftz.f32 	%f109, %f108, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f240, %f109;
	bra.uni 	BB4_14;

BB4_13:
	neg.ftz.f32 	%f110, %f94;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f111, %f110;
	mul.ftz.f32 	%f112, %f111, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f113, %f112;
	neg.ftz.f32 	%f240, %f113;

BB4_14:
	// Scale the three converted components by the clamped 4th component.
	mul.ftz.f32 	%f243, %f240, %f244;
	mul.ftz.f32 	%f242, %f239, %f244;
	mul.ftz.f32 	%f241, %f238, %f244;

BB4_15:
	// Stage the pixel: float index tid.y*32 + tid.x inside each of the four
	// 512-byte component planes.
	.loc 1 77 1
	mov.u32 	%r12, %tid.y;
	.loc 1 77 1
	shl.b32 	%r61, %r12, 5;
	add.s32 	%r62, %r61, %r47;
	mul.wide.s32 	%rd6, %r62, 4;
	mov.u64 	%rd7, VerticalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_9460_non_const_smem;
	add.s64 	%rd8, %rd7, %rd6;
	.loc 1 77 1
	st.shared.f32 	[%rd8], %f241;
	st.shared.f32 	[%rd8+512], %f242;
	st.shared.f32 	[%rd8+1024], %f243;
	st.shared.f32 	[%rd8+1536], %f244;
	// Wait until every thread has staged its pixel.
	.loc 1 77 1
	bar.sync 	0;
	// Re-address shared memory for filtering: float index tid.y*128 + tid.x,
	// i.e. this thread now works on component plane tid.y, column tid.x, and
	// visits the four staged rows at byte offsets +0, +128, +256, +384.
	.loc 1 77 1
	shl.b32 	%r63, %r12, 7;
	.loc 1 77 1
	add.s32 	%r64, %r47, %r63;
	mul.wide.s32 	%rd9, %r64, 4;
	add.s64 	%rd11, %rd7, %rd9;
	// %p15 = very first row of the pass (%r116 == 0) and seeding enabled.
	.loc 1 77 1
	setp.eq.s32	%p13, %r116, 0;
	setp.ne.s32	%p14, %r44, 0;
	and.pred  	%p15, %p13, %p14;
	.loc 1 77 1
	ld.shared.f32 	%f114, [%rd11];
	// 0f3F000000 is 0.5f: the seed value is half of the first input.
	.loc 1 77 1
	mul.ftz.f32 	%f115, %f114, 0f3F000000;
	// When seeding, both previous outputs are replaced by the seed value.
	.loc 1 77 1
	selp.f32	%f116, %f115, %f236, %p15;
	selp.f32	%f117, %f115, %f235, %p15;
	// Row 0: out = in*%f77 + prev_in*%f78 + prev_out*%f79 + prev_out2*%f80,
	// written back over the input in shared memory.
	.loc 1 77 1
	mul.ftz.f32 	%f118, %f237, %f78;
	fma.rn.ftz.f32 	%f119, %f114, %f77, %f118;
	fma.rn.ftz.f32 	%f120, %f116, %f79, %f119;
	fma.rn.ftz.f32 	%f121, %f117, %f80, %f120;
	st.shared.f32 	[%rd11], %f121;
	// Row 1: same recurrence, history shifted by one row.
	ld.shared.f32 	%f122, [%rd11+128];
	mul.ftz.f32 	%f123, %f114, %f78;
	fma.rn.ftz.f32 	%f124, %f122, %f77, %f123;
	fma.rn.ftz.f32 	%f125, %f121, %f79, %f124;
	fma.rn.ftz.f32 	%f126, %f116, %f80, %f125;
	st.shared.f32 	[%rd11+128], %f126;
	// Row 2: its output becomes the carried "two rows back" value (%f235).
	ld.shared.f32 	%f127, [%rd11+256];
	mul.ftz.f32 	%f128, %f122, %f78;
	fma.rn.ftz.f32 	%f129, %f127, %f77, %f128;
	fma.rn.ftz.f32 	%f130, %f126, %f79, %f129;
	fma.rn.ftz.f32 	%f235, %f121, %f80, %f130;
	st.shared.f32 	[%rd11+256], %f235;
	// Row 3: its input (%f237) and output (%f236) are carried to the next tile.
	ld.shared.f32 	%f237, [%rd11+384];
	mul.ftz.f32 	%f131, %f127, %f78;
	fma.rn.ftz.f32 	%f132, %f237, %f77, %f131;
	fma.rn.ftz.f32 	%f133, %f235, %f79, %f132;
	fma.rn.ftz.f32 	%f236, %f126, %f80, %f133;
	st.shared.f32 	[%rd11+384], %f236;
	// Wait until all four rows of every plane have been filtered.
	.loc 1 77 1
	bar.sync 	0;
	// Only rows inside the destination are written out.
	.loc 1 77 1
	setp.ge.s32	%p16, %r117, %r43;
	@%p16 bra 	BB4_17;

	// Gather this thread's pixel back from the four planes (staging layout)
	// and store it to destination element %r107.
	cvta.to.global.u64 	%rd12, %rd2;
	mul.wide.s32 	%rd13, %r62, 4;
	add.s64 	%rd15, %rd7, %rd13;
	.loc 1 77 1
	ld.shared.f32 	%f134, [%rd15];
	ld.shared.f32 	%f135, [%rd15+512];
	ld.shared.f32 	%f136, [%rd15+1024];
	ld.shared.f32 	%f137, [%rd15+1536];
	mul.wide.s32 	%rd16, %r107, 16;
	add.s64 	%rd17, %rd12, %rd16;
	.loc 1 77 1
	st.global.v4.f32 	[%rd17], {%f134, %f135, %f136, %f137};

BB4_17:
	// Shared memory is reused by the next tile, so sync before overwriting it.
	.loc 1 77 1
	bar.sync 	0;
	// Advance everything by four rows and loop while rows remain.
	.loc 1 77 1
	add.s32 	%r115, %r115, 4;
	add.s32 	%r117, %r117, 4;
	shl.b32 	%r69, %r38, 2;
	add.s32 	%r114, %r114, %r69;
	shl.b32 	%r70, %r41, 2;
	add.s32 	%r107, %r107, %r70;
	add.s32 	%r116, %r116, 4;
	.loc 1 77 1
	setp.lt.s32	%p17, %r116, %r43;
	mov.u32 	%r113, %r114;
	@%p17 bra 	BB4_3;

// ---------------------------------------------------------------------------
// Pass 2: rows in decreasing order, starting from where pass 1 stopped.
// ---------------------------------------------------------------------------
BB4_18:
	.loc 1 77 1
	mov.u32 	%r112, %r113;
	// Nothing to do if pass 1 consumed no rows.
	setp.lt.s32	%p18, %r116, 1;
	@%p18 bra 	BB4_47;

	// %r23  = index of the last row pass 1 covered (row count rounded up to
	//         the tile size, minus one)
	// %r108 = pass-2 tile counter
	// Pass-2 history, all starting at zero:
	//   %f248 / %f247 = inputs of the two rows after the current one
	//   %f246 / %f245 = outputs of the two rows after the current one
	add.s32 	%r23, %r116, -1;
	mov.f32 	%f248, 0f00000000;
	mov.f32 	%f247, %f248;
	mov.f32 	%f246, %f248;
	mov.f32 	%f245, %f248;
	mov.u32 	%r108, 0;

BB4_20:
	.loc 1 77 1
	mov.u32 	%r25, %r112;
	setp.lt.s32	%p19, %r2, %r39;
	setp.gt.s32	%p20, %r2, -1;
	// Step all row counters back by one tile; %r31 is the source element
	// index of this thread's row in the tile being processed.
	.loc 1 77 1
	add.s32 	%r116, %r116, -4;
	add.s32 	%r117, %r117, -4;
	shl.b32 	%r79, %r38, 2;
	sub.s32 	%r31, %r25, %r79;
	add.s32 	%r115, %r115, -4;
	// %p25 = source coordinate (%r2, %r115) lies inside the source rectangle.
	.loc 1 77 1
	setp.gt.s32	%p21, %r115, -1;
	setp.lt.s32	%p22, %r115, %r40;
	and.pred  	%p23, %p21, %p22;
	and.pred  	%p24, %p23, %p20;
	and.pred  	%p25, %p24, %p19;
	.loc 1 77 1
	@%p25 bra 	BB4_22;

	// Out of range: stage an all-zero pixel.
	mov.f32 	%f255, 0f00000000;
	mov.f32 	%f254, %f255;
	mov.f32 	%f253, %f255;
	mov.f32 	%f252, %f255;
	bra.uni 	BB4_32;

BB4_22:
	// Re-load the source pixel and repeat the input conversion of pass 1:
	// clamp the 4th component, apply the sign-preserving 2.2 power to the
	// first three components, then scale them by the clamped 4th component.
	cvta.to.global.u64 	%rd18, %rd1;
	mul.wide.s32 	%rd19, %r31, 16;
	add.s64 	%rd20, %rd18, %rd19;
	ld.global.v4.f32 	{%f146, %f147, %f148, %f149}, [%rd20];
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f255, %f149;
	setp.ltu.ftz.f32	%p26, %f146, 0f00000000;
	@%p26 bra 	BB4_24;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f151, %f146;
	mul.ftz.f32 	%f152, %f151, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f249, %f152;
	bra.uni 	BB4_25;

BB4_24:
	neg.ftz.f32 	%f153, %f146;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f154, %f153;
	mul.ftz.f32 	%f155, %f154, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f156, %f155;
	neg.ftz.f32 	%f249, %f156;

BB4_25:
	setp.ltu.ftz.f32	%p27, %f147, 0f00000000;
	@%p27 bra 	BB4_27;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f157, %f147;
	mul.ftz.f32 	%f158, %f157, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f250, %f158;
	bra.uni 	BB4_28;

BB4_27:
	neg.ftz.f32 	%f159, %f147;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f160, %f159;
	mul.ftz.f32 	%f161, %f160, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f162, %f161;
	neg.ftz.f32 	%f250, %f162;

BB4_28:
	setp.ltu.ftz.f32	%p28, %f148, 0f00000000;
	@%p28 bra 	BB4_30;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f163, %f148;
	mul.ftz.f32 	%f164, %f163, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f251, %f164;
	bra.uni 	BB4_31;

BB4_30:
	neg.ftz.f32 	%f165, %f148;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f166, %f165;
	mul.ftz.f32 	%f167, %f166, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f168, %f167;
	neg.ftz.f32 	%f251, %f168;

BB4_31:
	mul.ftz.f32 	%f254, %f251, %f255;
	mul.ftz.f32 	%f253, %f250, %f255;
	mul.ftz.f32 	%f252, %f249, %f255;

BB4_32:
	// Stage the pixel with the same layout as pass 1
	// (float index tid.y*32 + tid.x in each 512-byte plane).
	.loc 1 77 1
	mov.u32 	%r33, %tid.y;
	.loc 1 77 1
	shl.b32 	%r80, %r33, 5;
	add.s32 	%r81, %r80, %r47;
	mul.wide.s32 	%rd21, %r81, 4;
	mov.u64 	%rd22, VerticalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_9460_non_const_smem;
	add.s64 	%rd23, %rd22, %rd21;
	.loc 1 77 1
	st.shared.f32 	[%rd23], %f252;
	st.shared.f32 	[%rd23+512], %f253;
	st.shared.f32 	[%rd23+1024], %f254;
	st.shared.f32 	[%rd23+1536], %f255;
	.loc 1 77 1
	bar.sync 	0;
	// Filter addressing as in pass 1: plane tid.y, column tid.x, rows at
	// byte offsets +384, +256, +128, +0 (visited in that order).
	.loc 1 77 1
	shl.b32 	%r82, %r33, 7;
	.loc 1 77 1
	add.s32 	%r83, %r47, %r82;
	mul.wide.s32 	%rd24, %r83, 4;
	add.s64 	%rd26, %rd22, %rd24;
	// %r84 = row index of the tile's last row (+384) = %r23 - 4 * %r108.
	.loc 1 77 1
	mad.lo.s32 	%r84, %r108, -4, %r23;
	// %r85 = last destination row (%r43 - 1).  Each of the four rows below
	// tests whether it is that row; if so (and the flag %r44 is set) its two
	// "later output" history values are replaced by 0.5 * its own input.
	.loc 1 77 1
	add.s32 	%r85, %r43, -1;
	setp.eq.s32	%p29, %r84, %r85;
	setp.ne.s32	%p30, %r44, 0;
	and.pred  	%p31, %p29, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f169, [%rd26+384];
	.loc 1 77 1
	mul.ftz.f32 	%f170, %f169, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f171, %f170, %f246, %p31;
	selp.f32	%f172, %f170, %f245, %p31;
	// Row +384: out = next_in*%f81 + next_in2*%f82 + next_out*%f83 + next_out2*%f84.
	// Note that the row's own input is not part of its output in this pass.
	.loc 1 77 1
	mul.ftz.f32 	%f173, %f247, %f82;
	fma.rn.ftz.f32 	%f174, %f248, %f81, %f173;
	fma.rn.ftz.f32 	%f175, %f171, %f83, %f174;
	fma.rn.ftz.f32 	%f176, %f172, %f84, %f175;
	st.shared.f32 	[%rd26+384], %f176;
	// Row +256 has index %r84 - 1; comparing %r84 with %r43 is the same test
	// as comparing (%r84 - 1) with %r85.
	.loc 1 77 1
	setp.eq.s32	%p32, %r84, %r43;
	and.pred  	%p33, %p32, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f177, [%rd26+256];
	.loc 1 77 1
	mul.ftz.f32 	%f178, %f177, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f179, %f178, %f176, %p33;
	selp.f32	%f180, %f178, %f171, %p33;
	.loc 1 77 1
	mul.ftz.f32 	%f181, %f248, %f82;
	fma.rn.ftz.f32 	%f182, %f169, %f81, %f181;
	fma.rn.ftz.f32 	%f183, %f179, %f83, %f182;
	fma.rn.ftz.f32 	%f184, %f180, %f84, %f183;
	st.shared.f32 	[%rd26+256], %f184;
	// Row +128 (index %r84 - 2).  Its input is kept in %f247 for the next tile.
	.loc 1 77 1
	add.s32 	%r86, %r84, -2;
	setp.eq.s32	%p34, %r86, %r85;
	and.pred  	%p35, %p34, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f247, [%rd26+128];
	.loc 1 77 1
	mul.ftz.f32 	%f185, %f247, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f186, %f185, %f184, %p35;
	selp.f32	%f187, %f185, %f179, %p35;
	.loc 1 77 1
	mul.ftz.f32 	%f188, %f169, %f82;
	fma.rn.ftz.f32 	%f189, %f177, %f81, %f188;
	fma.rn.ftz.f32 	%f190, %f186, %f83, %f189;
	fma.rn.ftz.f32 	%f191, %f187, %f84, %f190;
	st.shared.f32 	[%rd26+128], %f191;
	// Row +0 (index %r84 - 3).  Its input (%f248), its output (%f246) and the
	// selected later-output value (%f245) are carried to the next tile.
	.loc 1 77 1
	add.s32 	%r87, %r84, -3;
	setp.eq.s32	%p36, %r87, %r85;
	and.pred  	%p37, %p36, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f248, [%rd26];
	.loc 1 77 1
	mul.ftz.f32 	%f192, %f248, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f245, %f192, %f191, %p37;
	selp.f32	%f193, %f192, %f186, %p37;
	.loc 1 77 1
	mul.ftz.f32 	%f194, %f177, %f82;
	fma.rn.ftz.f32 	%f195, %f247, %f81, %f194;
	fma.rn.ftz.f32 	%f196, %f245, %f83, %f195;
	fma.rn.ftz.f32 	%f246, %f193, %f84, %f196;
	st.shared.f32 	[%rd26], %f246;
	.loc 1 77 1
	bar.sync 	0;
	// Only rows inside the destination are combined and written.
	.loc 1 77 1
	setp.ge.s32	%p38, %r117, %r43;
	@%p38 bra 	BB4_46;

	// %r90 = destination element index = %r117 * %r41 + %r1.
	cvta.to.global.u64 	%rd27, %rd2;
	.loc 1 77 1
	mad.lo.s32 	%r90, %r117, %r41, %r1;
	mul.wide.s32 	%rd28, %r90, 16;
	add.s64 	%rd29, %rd27, %rd28;
	mul.wide.s32 	%rd30, %r81, 4;
	add.s64 	%rd32, %rd22, %rd30;
	// Sum the value pass 1 stored in the destination with this pass's
	// result from shared memory, component by component.
	.loc 1 77 1
	ld.shared.f32 	%f197, [%rd32];
	ld.global.v4.f32 	{%f198, %f199, %f200, %f201}, [%rd29];
	.loc 1 77 1
	add.ftz.f32 	%f57, %f198, %f197;
	ld.shared.f32 	%f203, [%rd32+512];
	add.ftz.f32 	%f58, %f199, %f203;
	ld.shared.f32 	%f205, [%rd32+1024];
	add.ftz.f32 	%f59, %f200, %f205;
	ld.shared.f32 	%f207, [%rd32+1536];
	add.ftz.f32 	%f209, %f201, %f207;
	// %f256 = summed 4th component clamped to [0,1].
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f256, %f209;
	// Divide the first three components by %f256 only when it exceeds a
	// small threshold (0fB70637BD is about -8.0e-6, so the test is
	// %f256 - 8.0e-6 > 0); otherwise the whole pixel becomes zero.
	.loc 2 45 1
	add.ftz.f32 	%f210, %f256, 0fB70637BD;
	setp.gtu.ftz.f32	%p39, %f210, 0f00000000;
	@%p39 bra 	BB4_35;

	mov.f32 	%f259, 0f00000000;
	mov.f32 	%f258, %f259;
	mov.f32 	%f257, %f259;
	mov.f32 	%f256, %f259;
	bra.uni 	BB4_36;

BB4_35:
	// %f216 = approximate 1 / %f256 (0f3F800000 is 1.0f).
	mov.f32 	%f215, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f216, %f215, %f256;
	.loc 2 45 1
	mul.ftz.f32 	%f257, %f59, %f216;
	mul.ftz.f32 	%f258, %f58, %f216;
	mul.ftz.f32 	%f259, %f57, %f216;

BB4_36:
	// Output conversion: sign-preserving power with exponent 0f3EE8BA2E
	// (about 0.4545, i.e. 1/2.2) on each of the first three components --
	// the inverse of the 2.2 power applied on input.
	.loc 1 77 164
	setp.ltu.ftz.f32	%p40, %f259, 0f00000000;
	@%p40 bra 	BB4_38;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f217, %f259;
	mul.ftz.f32 	%f218, %f217, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f260, %f218;
	bra.uni 	BB4_39;

BB4_38:
	neg.ftz.f32 	%f219, %f259;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f220, %f219;
	mul.ftz.f32 	%f221, %f220, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f222, %f221;
	neg.ftz.f32 	%f260, %f222;

BB4_39:
	setp.ltu.ftz.f32	%p41, %f258, 0f00000000;
	@%p41 bra 	BB4_41;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f223, %f258;
	mul.ftz.f32 	%f224, %f223, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f261, %f224;
	bra.uni 	BB4_42;

BB4_41:
	neg.ftz.f32 	%f225, %f258;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f226, %f225;
	mul.ftz.f32 	%f227, %f226, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f228, %f227;
	neg.ftz.f32 	%f261, %f228;

BB4_42:
	setp.ltu.ftz.f32	%p42, %f257, 0f00000000;
	@%p42 bra 	BB4_44;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f229, %f257;
	mul.ftz.f32 	%f230, %f229, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f262, %f230;
	bra.uni 	BB4_45;

BB4_44:
	neg.ftz.f32 	%f231, %f257;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f232, %f231;
	mul.ftz.f32 	%f233, %f232, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f234, %f233;
	neg.ftz.f32 	%f262, %f234;

BB4_45:
	// Write the final pixel over the pass-1 value in the destination.
	mul.wide.s32 	%rd34, %r90, 16;
	add.s64 	%rd35, %rd27, %rd34;
	.loc 1 77 1
	st.global.v4.f32 	[%rd35], {%f260, %f261, %f262, %f256};

BB4_46:
	// Shared memory is reused by the next tile, so sync before overwriting it.
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r108, %r108, 1;
	// Recompute the source element index for the (already decremented) row.
	.loc 1 77 1
	mad.lo.s32 	%r112, %r115, %r38, %r2;
	// Continue while rows remain above the tile just processed.
	.loc 1 77 1
	setp.gt.s32	%p43, %r116, 0;
	@%p43 bra 	BB4_20;

BB4_47:
	.loc 1 77 2
	ret;
}

.visible .entry HorizontalRecursiveGaussianRGBAF16_kernel(
	.param .u64 HorizontalRecursiveGaussianRGBAF16_kernel_param_0,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_1,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_2,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_3,
	.param .u64 HorizontalRecursiveGaussianRGBAF16_kernel_param_4,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_5,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_6,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_7,
	.param .u32 HorizontalRecursiveGaussianRGBAF16_kernel_param_8,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_9,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_10,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_11,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_12,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_13,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_14,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_15,
	.param .f32 HorizontalRecursiveGaussianRGBAF16_kernel_param_16
)
{
	.reg .pred 	%p<91>;
	.reg .s16 	%rs<65>;
	.reg .s32 	%r<223>;
	.reg .f32 	%f<481>;
	.reg .s64 	%rd<69>;
	// demoted variable
	.shared .align 4 .b8 HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem[4224];

	ld.param.u64 	%rd8, [HorizontalRecursiveGaussianRGBAF16_kernel_param_0];
	ld.param.u32 	%r32, [HorizontalRecursiveGaussianRGBAF16_kernel_param_1];
	ld.param.u32 	%r33, [HorizontalRecursiveGaussianRGBAF16_kernel_param_2];
	ld.param.u32 	%r34, [HorizontalRecursiveGaussianRGBAF16_kernel_param_3];
	ld.param.u64 	%rd9, [HorizontalRecursiveGaussianRGBAF16_kernel_param_4];
	ld.param.u32 	%r35, [HorizontalRecursiveGaussianRGBAF16_kernel_param_5];
	ld.param.u32 	%r36, [HorizontalRecursiveGaussianRGBAF16_kernel_param_6];
	ld.param.u32 	%r37, [HorizontalRecursiveGaussianRGBAF16_kernel_param_7];
	ld.param.u32 	%r38, [HorizontalRecursiveGaussianRGBAF16_kernel_param_8];
	ld.param.f32 	%f157, [HorizontalRecursiveGaussianRGBAF16_kernel_param_9];
	ld.param.f32 	%f158, [HorizontalRecursiveGaussianRGBAF16_kernel_param_10];
	ld.param.f32 	%f159, [HorizontalRecursiveGaussianRGBAF16_kernel_param_11];
	ld.param.f32 	%f160, [HorizontalRecursiveGaussianRGBAF16_kernel_param_12];
	ld.param.f32 	%f161, [HorizontalRecursiveGaussianRGBAF16_kernel_param_13];
	ld.param.f32 	%f162, [HorizontalRecursiveGaussianRGBAF16_kernel_param_14];
	ld.param.f32 	%f163, [HorizontalRecursiveGaussianRGBAF16_kernel_param_15];
	ld.param.f32 	%f164, [HorizontalRecursiveGaussianRGBAF16_kernel_param_16];
	mov.u32 	%r220, 0;
	.loc 1 77 1
	setp.gt.s32	%p7, %r36, 0;
	@%p7 bra 	BB5_1;
	bra.uni 	BB5_40;

BB5_1:
	mov.f32 	%f448, 0f00000000;
	mov.f32 	%f447, %f448;
	mov.f32 	%f446, %f448;
	mov.u32 	%r217, %r220;

BB5_2:
	.loc 1 77 1
	mov.u32 	%r215, %r217;
	mov.u32 	%r1, %r215;
	.loc 1 77 1
	mov.u32 	%r42, %ctaid.y;
	shl.b32 	%r43, %r42, 3;
	mov.u32 	%r44, %tid.y;
	add.s32 	%r45, %r43, %r44;
	sub.s32 	%r46, %r37, %r34;
	shr.s32 	%r47, %r46, 1;
	sub.s32 	%r48, %r45, %r47;
	setp.lt.s32	%p8, %r48, %r34;
	.loc 1 77 1
	mov.u32 	%r49, %tid.x;
	.loc 1 77 1
	add.s32 	%r3, %r49, %r220;
	sub.s32 	%r50, %r36, %r33;
	shr.s32 	%r4, %r50, 1;
	.loc 1 77 1
	setp.gt.s32	%p9, %r48, -1;
	and.pred  	%p1, %p9, %p8;
	.loc 1 77 1
	setp.ge.s32	%p10, %r48, %r34;
	@%p10 bra 	BB5_16;

	.loc 1 77 1
	sub.s32 	%r51, %r3, %r4;
	.loc 1 77 1
	setp.gt.s32	%p11, %r51, -1;
	and.pred  	%p12, %p1, %p11;
	.loc 1 77 1
	setp.lt.s32	%p13, %r51, %r33;
	and.pred  	%p14, %p12, %p13;
	.loc 1 77 1
	@%p14 bra 	BB5_5;

	mov.f32 	%f438, 0f00000000;
	mov.f32 	%f437, %f438;
	mov.f32 	%f436, %f438;
	mov.f32 	%f435, %f438;
	bra.uni 	BB5_15;

BB5_5:
	cvta.to.global.u64 	%rd10, %rd8;
	.loc 1 77 1
	mad.lo.s32 	%r60, %r48, %r32, %r51;
	mul.wide.s32 	%rd11, %r60, 8;
	add.s64 	%rd12, %rd10, %rd11;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs1, %rs2, %rs3, %rs4}, [%rd12];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs1;
	cvt.f32.f16 	%f4, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs2;
	cvt.f32.f16 	%f5, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs3;
	cvt.f32.f16 	%f6, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs4;
	cvt.f32.f16 	%f172, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f438, %f172;
	setp.ltu.ftz.f32	%p15, %f4, 0f00000000;
	@%p15 bra 	BB5_7;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f173, %f4;
	mul.ftz.f32 	%f174, %f173, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f432, %f174;
	bra.uni 	BB5_8;

BB5_7:
	neg.ftz.f32 	%f175, %f4;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f176, %f175;
	mul.ftz.f32 	%f177, %f176, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f178, %f177;
	neg.ftz.f32 	%f432, %f178;

BB5_8:
	setp.ltu.ftz.f32	%p16, %f5, 0f00000000;
	@%p16 bra 	BB5_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f179, %f5;
	mul.ftz.f32 	%f180, %f179, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f433, %f180;
	bra.uni 	BB5_11;

BB5_10:
	neg.ftz.f32 	%f181, %f5;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f182, %f181;
	mul.ftz.f32 	%f183, %f182, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f184, %f183;
	neg.ftz.f32 	%f433, %f184;

BB5_11:
	setp.ltu.ftz.f32	%p17, %f6, 0f00000000;
	@%p17 bra 	BB5_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f185, %f6;
	mul.ftz.f32 	%f186, %f185, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f434, %f186;
	bra.uni 	BB5_14;

BB5_13:
	neg.ftz.f32 	%f187, %f6;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f188, %f187;
	mul.ftz.f32 	%f189, %f188, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f190, %f189;
	neg.ftz.f32 	%f434, %f190;

BB5_14:
	mul.ftz.f32 	%f437, %f434, %f438;
	mul.ftz.f32 	%f436, %f433, %f438;
	mul.ftz.f32 	%f435, %f432, %f438;

BB5_15:
	.loc 1 77 1
	mad.lo.s32 	%r63, %r44, 33, %r49;
	mul.wide.s32 	%rd13, %r63, 4;
	mov.u64 	%rd14, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd15, %rd14, %rd13;
	.loc 1 77 1
	st.shared.f32 	[%rd15], %f435;
	st.shared.f32 	[%rd15+1056], %f436;
	st.shared.f32 	[%rd15+2112], %f437;
	st.shared.f32 	[%rd15+3168], %f438;

BB5_16:
	.loc 1 77 1
	add.s32 	%r71, %r48, 4;
	.loc 1 77 1
	sub.s32 	%r74, %r3, %r4;
	mad.lo.s32 	%r75, %r48, %r32, %r74;
	.loc 1 77 1
	shl.b32 	%r76, %r32, 2;
	add.s32 	%r5, %r75, %r76;
	.loc 1 77 1
	setp.ge.s32	%p18, %r71, %r34;
	@%p18 bra 	BB5_30;

	.loc 1 77 1
	setp.gt.s32	%p19, %r74, -1;
	and.pred  	%p23, %p1, %p19;
	.loc 1 77 1
	setp.lt.s32	%p24, %r74, %r33;
	and.pred  	%p25, %p23, %p24;
	.loc 1 77 1
	@%p25 bra 	BB5_19;

	mov.f32 	%f445, 0f00000000;
	mov.f32 	%f444, %f445;
	mov.f32 	%f443, %f445;
	mov.f32 	%f442, %f445;
	bra.uni 	BB5_29;

BB5_19:
	cvta.to.global.u64 	%rd16, %rd8;
	mul.wide.s32 	%rd17, %r5, 8;
	add.s64 	%rd18, %rd16, %rd17;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs9, %rs10, %rs11, %rs12}, [%rd18];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs9;
	cvt.f32.f16 	%f24, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs10;
	cvt.f32.f16 	%f25, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs11;
	cvt.f32.f16 	%f26, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs12;
	cvt.f32.f16 	%f195, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f445, %f195;
	setp.ltu.ftz.f32	%p26, %f24, 0f00000000;
	@%p26 bra 	BB5_21;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f196, %f24;
	mul.ftz.f32 	%f197, %f196, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f439, %f197;
	bra.uni 	BB5_22;

BB5_21:
	neg.ftz.f32 	%f198, %f24;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f199, %f198;
	mul.ftz.f32 	%f200, %f199, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f201, %f200;
	neg.ftz.f32 	%f439, %f201;

BB5_22:
	setp.ltu.ftz.f32	%p27, %f25, 0f00000000;
	@%p27 bra 	BB5_24;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f202, %f25;
	mul.ftz.f32 	%f203, %f202, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f440, %f203;
	bra.uni 	BB5_25;

BB5_24:
	neg.ftz.f32 	%f204, %f25;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f205, %f204;
	mul.ftz.f32 	%f206, %f205, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f207, %f206;
	neg.ftz.f32 	%f440, %f207;

BB5_25:
	setp.ltu.ftz.f32	%p28, %f26, 0f00000000;
	@%p28 bra 	BB5_27;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f208, %f26;
	mul.ftz.f32 	%f209, %f208, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f441, %f209;
	bra.uni 	BB5_28;

BB5_27:
	neg.ftz.f32 	%f210, %f26;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f211, %f210;
	mul.ftz.f32 	%f212, %f211, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f213, %f212;
	neg.ftz.f32 	%f441, %f213;

BB5_28:
	mul.ftz.f32 	%f444, %f441, %f445;
	mul.ftz.f32 	%f443, %f440, %f445;
	mul.ftz.f32 	%f442, %f439, %f445;

BB5_29:
	.loc 1 77 1
	mad.lo.s32 	%r89, %r44, 33, %r49;
	mul.wide.s32 	%rd19, %r89, 4;
	mov.u64 	%rd20, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd21, %rd20, %rd19;
	.loc 1 77 1
	st.shared.f32 	[%rd21+528], %f442;
	st.shared.f32 	[%rd21+1584], %f443;
	st.shared.f32 	[%rd21+2640], %f444;
	st.shared.f32 	[%rd21+3696], %f445;

BB5_30:
	.loc 1 77 1
	mad.lo.s32 	%r92, %r44, 16, %r49;
	mul.lo.s32 	%r93, %r92, 33;
	mul.wide.s32 	%rd22, %r93, 4;
	mov.u64 	%rd23, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd67, %rd23, %rd22;
	.loc 1 77 1
	shl.b32 	%r94, %r44, 5;
	add.s32 	%r95, %r94, %r49;
	setp.lt.s32	%p2, %r95, 32;
	.loc 1 77 1
	bar.sync 	0;
	add.s64 	%rd68, %rd67, 16;
	.loc 1 77 1
	@!%p2 bra 	BB5_33;
	bra.uni 	BB5_31;

BB5_31:
	mov.u32 	%r218, 0;
	mov.u32 	%r216, %r1;

BB5_32:
	.loc 1 77 1
	mov.u32 	%r6, %r216;
	.loc 1 77 1
	mov.u64 	%rd4, %rd68;
	.loc 1 77 1
	setp.eq.s32	%p29, %r6, 0;
	setp.ne.s32	%p30, %r38, 0;
	and.pred  	%p31, %p29, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f214, [%rd67];
	.loc 1 77 1
	mul.ftz.f32 	%f215, %f214, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f216, %f215, %f447, %p31;
	selp.f32	%f217, %f215, %f446, %p31;
	.loc 1 77 1
	mul.ftz.f32 	%f218, %f448, %f158;
	fma.rn.ftz.f32 	%f219, %f214, %f157, %f218;
	fma.rn.ftz.f32 	%f220, %f216, %f159, %f219;
	fma.rn.ftz.f32 	%f221, %f217, %f160, %f220;
	st.shared.f32 	[%rd67], %f221;
	ld.shared.f32 	%f222, [%rd4+-12];
	mul.ftz.f32 	%f223, %f214, %f158;
	fma.rn.ftz.f32 	%f224, %f222, %f157, %f223;
	fma.rn.ftz.f32 	%f225, %f221, %f159, %f224;
	fma.rn.ftz.f32 	%f226, %f216, %f160, %f225;
	ld.shared.f32 	%f227, [%rd4+-8];
	ld.shared.f32 	%f228, [%rd4+-4];
	ld.shared.f32 	%f229, [%rd4];
	st.shared.f32 	[%rd4+-12], %f226;
	mul.ftz.f32 	%f230, %f222, %f158;
	fma.rn.ftz.f32 	%f231, %f227, %f157, %f230;
	fma.rn.ftz.f32 	%f232, %f226, %f159, %f231;
	fma.rn.ftz.f32 	%f233, %f221, %f160, %f232;
	st.shared.f32 	[%rd4+-8], %f233;
	mul.ftz.f32 	%f234, %f227, %f158;
	fma.rn.ftz.f32 	%f235, %f228, %f157, %f234;
	fma.rn.ftz.f32 	%f236, %f233, %f159, %f235;
	fma.rn.ftz.f32 	%f237, %f226, %f160, %f236;
	st.shared.f32 	[%rd4+-4], %f237;
	mul.ftz.f32 	%f238, %f228, %f158;
	fma.rn.ftz.f32 	%f239, %f229, %f157, %f238;
	fma.rn.ftz.f32 	%f240, %f237, %f159, %f239;
	fma.rn.ftz.f32 	%f241, %f233, %f160, %f240;
	st.shared.f32 	[%rd4], %f241;
	ld.shared.f32 	%f242, [%rd4+4];
	mul.ftz.f32 	%f243, %f229, %f158;
	fma.rn.ftz.f32 	%f244, %f242, %f157, %f243;
	fma.rn.ftz.f32 	%f245, %f241, %f159, %f244;
	fma.rn.ftz.f32 	%f246, %f237, %f160, %f245;
	ld.shared.f32 	%f247, [%rd4+8];
	ld.shared.f32 	%f448, [%rd4+12];
	st.shared.f32 	[%rd4+4], %f246;
	mul.ftz.f32 	%f248, %f242, %f158;
	fma.rn.ftz.f32 	%f249, %f247, %f157, %f248;
	fma.rn.ftz.f32 	%f250, %f246, %f159, %f249;
	fma.rn.ftz.f32 	%f446, %f241, %f160, %f250;
	st.shared.f32 	[%rd4+8], %f446;
	mul.ftz.f32 	%f251, %f247, %f158;
	fma.rn.ftz.f32 	%f252, %f448, %f157, %f251;
	fma.rn.ftz.f32 	%f253, %f446, %f159, %f252;
	fma.rn.ftz.f32 	%f447, %f246, %f160, %f253;
	st.shared.f32 	[%rd4+12], %f447;
	.loc 1 77 1
	add.s32 	%r8, %r6, -8;
	add.s64 	%rd68, %rd4, 32;
	.loc 1 77 1
	add.s32 	%r218, %r218, 32;
	add.s64 	%rd67, %rd4, 16;
	.loc 1 77 1
	setp.ne.s32	%p32, %r218, 128;
	mov.u32 	%r216, %r8;
	@%p32 bra 	BB5_32;

BB5_33:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	@!%p8 bra 	BB5_36;
	bra.uni 	BB5_34;

BB5_34:
	mov.u64 	%rd66, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	.loc 1 77 1
	mad.lo.s32 	%r106, %r44, 33, %r49;
	mul.wide.s32 	%rd24, %r106, 4;
	add.s64 	%rd26, %rd66, %rd24;
	.loc 1 77 1
	ld.shared.f32 	%f53, [%rd26];
	ld.shared.f32 	%f54, [%rd26+1056];
	ld.shared.f32 	%f55, [%rd26+2112];
	ld.shared.f32 	%f56, [%rd26+3168];
	.loc 1 77 1
	setp.ge.s32	%p33, %r3, %r36;
	@%p33 bra 	BB5_36;

	cvta.to.global.u64 	%rd27, %rd9;
	.loc 1 77 1
	mad.lo.s32 	%r111, %r45, %r35, %r3;
	mul.wide.s32 	%rd28, %r111, 8;
	add.s64 	%rd29, %rd27, %rd28;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f53;
	mov.b16 	%rs17, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f54;
	mov.b16 	%rs18, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f55;
	mov.b16 	%rs19, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f56;
	mov.b16 	%rs20, %temp;
}
	st.global.v4.u16 	[%rd29], {%rs17, %rs18, %rs19, %rs20};

BB5_36:
	.loc 1 77 1
	mad.lo.s32 	%r120, %r45, %r35, %r3;
	.loc 1 77 1
	shl.b32 	%r121, %r35, 2;
	add.s32 	%r10, %r120, %r121;
	.loc 1 77 1
	@%p18 bra 	BB5_39;

	mov.u64 	%rd65, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	.loc 1 77 1
	mad.lo.s32 	%r124, %r44, 33, %r49;
	mul.wide.s32 	%rd30, %r124, 4;
	add.s64 	%rd32, %rd65, %rd30;
	.loc 1 77 1
	ld.shared.f32 	%f57, [%rd32+528];
	ld.shared.f32 	%f58, [%rd32+1584];
	ld.shared.f32 	%f59, [%rd32+2640];
	ld.shared.f32 	%f60, [%rd32+3696];
	.loc 1 77 1
	setp.ge.s32	%p35, %r3, %r36;
	@%p35 bra 	BB5_39;

	cvta.to.global.u64 	%rd33, %rd9;
	mul.wide.s32 	%rd34, %r10, 8;
	add.s64 	%rd35, %rd33, %rd34;
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f57;
	mov.b16 	%rs21, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f58;
	mov.b16 	%rs22, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f59;
	mov.b16 	%rs23, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f60;
	mov.b16 	%rs24, %temp;
}
	st.global.v4.u16 	[%rd35], {%rs21, %rs22, %rs23, %rs24};

BB5_39:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r220, %r220, 32;
	.loc 1 77 1
	setp.lt.s32	%p36, %r220, %r36;
	add.s32 	%r217, %r1, -32;
	@%p36 bra 	BB5_2;

BB5_40:
	.loc 1 77 1
	setp.lt.s32	%p37, %r220, 1;
	@%p37 bra 	BB5_102;

	add.s32 	%r14, %r220, -1;
	mov.f32 	%f466, 0f00000000;
	mov.f32 	%f465, %f466;
	mov.f32 	%f464, %f466;
	mov.f32 	%f463, %f466;
	mov.u32 	%r219, 0;

BB5_42:
	.loc 1 77 1
	mov.u32 	%r126, %ctaid.y;
	shl.b32 	%r127, %r126, 3;
	mov.u32 	%r128, %tid.y;
	add.s32 	%r129, %r127, %r128;
	sub.s32 	%r130, %r37, %r34;
	shr.s32 	%r131, %r130, 1;
	sub.s32 	%r132, %r129, %r131;
	setp.lt.s32	%p38, %r132, %r34;
	.loc 1 77 1
	mov.u32 	%r133, %tid.x;
	add.s32 	%r17, %r133, %r220;
	.loc 1 77 1
	add.s32 	%r134, %r17, -32;
	sub.s32 	%r135, %r36, %r33;
	shr.s32 	%r136, %r135, 1;
	sub.s32 	%r137, %r134, %r136;
	mad.lo.s32 	%r18, %r132, %r32, %r137;
	.loc 1 77 1
	setp.gt.s32	%p39, %r137, -1;
	setp.gt.s32	%p40, %r132, -1;
	and.pred  	%p41, %p40, %p38;
	and.pred  	%p42, %p41, %p39;
	.loc 1 77 1
	setp.lt.s32	%p43, %r137, %r33;
	and.pred  	%p4, %p42, %p43;
	.loc 1 77 1
	setp.ge.s32	%p44, %r132, %r34;
	@%p44 bra 	BB5_56;

	.loc 1 77 1
	@%p4 bra 	BB5_45;

	mov.f32 	%f455, 0f00000000;
	mov.f32 	%f454, %f455;
	mov.f32 	%f453, %f455;
	mov.f32 	%f452, %f455;
	bra.uni 	BB5_55;

BB5_45:
	cvta.to.global.u64 	%rd36, %rd8;
	mul.wide.s32 	%rd37, %r18, 8;
	add.s64 	%rd38, %rd36, %rd37;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs25, %rs26, %rs27, %rs28}, [%rd38];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs25;
	cvt.f32.f16 	%f65, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs26;
	cvt.f32.f16 	%f66, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs27;
	cvt.f32.f16 	%f67, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs28;
	cvt.f32.f16 	%f262, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f455, %f262;
	setp.ltu.ftz.f32	%p45, %f65, 0f00000000;
	@%p45 bra 	BB5_47;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f263, %f65;
	mul.ftz.f32 	%f264, %f263, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f449, %f264;
	bra.uni 	BB5_48;

BB5_47:
	neg.ftz.f32 	%f265, %f65;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f266, %f265;
	mul.ftz.f32 	%f267, %f266, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f268, %f267;
	neg.ftz.f32 	%f449, %f268;

BB5_48:
	setp.ltu.ftz.f32	%p46, %f66, 0f00000000;
	@%p46 bra 	BB5_50;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f269, %f66;
	mul.ftz.f32 	%f270, %f269, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f450, %f270;
	bra.uni 	BB5_51;

BB5_50:
	neg.ftz.f32 	%f271, %f66;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f272, %f271;
	mul.ftz.f32 	%f273, %f272, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f274, %f273;
	neg.ftz.f32 	%f450, %f274;

BB5_51:
	setp.ltu.ftz.f32	%p47, %f67, 0f00000000;
	@%p47 bra 	BB5_53;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f275, %f67;
	mul.ftz.f32 	%f276, %f275, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f451, %f276;
	bra.uni 	BB5_54;

BB5_53:
	neg.ftz.f32 	%f277, %f67;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f278, %f277;
	mul.ftz.f32 	%f279, %f278, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f280, %f279;
	neg.ftz.f32 	%f451, %f280;

BB5_54:
	mul.ftz.f32 	%f454, %f451, %f455;
	mul.ftz.f32 	%f453, %f450, %f455;
	mul.ftz.f32 	%f452, %f449, %f455;

BB5_55:
	.loc 1 77 1
	mad.lo.s32 	%r140, %r128, 33, %r133;
	mul.wide.s32 	%rd39, %r140, 4;
	mov.u64 	%rd40, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd41, %rd40, %rd39;
	.loc 1 77 1
	st.shared.f32 	[%rd41], %f452;
	st.shared.f32 	[%rd41+1056], %f453;
	st.shared.f32 	[%rd41+2112], %f454;
	st.shared.f32 	[%rd41+3168], %f455;

BB5_56:
	.loc 1 77 1
	add.s32 	%r148, %r132, 4;
	.loc 1 77 1
	shl.b32 	%r149, %r32, 2;
	add.s32 	%r19, %r18, %r149;
	.loc 1 77 1
	setp.ge.s32	%p48, %r148, %r34;
	@%p48 bra 	BB5_70;

	.loc 1 77 1
	@%p4 bra 	BB5_59;

	mov.f32 	%f462, 0f00000000;
	mov.f32 	%f461, %f462;
	mov.f32 	%f460, %f462;
	mov.f32 	%f459, %f462;
	bra.uni 	BB5_69;

BB5_59:
	cvta.to.global.u64 	%rd42, %rd8;
	mul.wide.s32 	%rd43, %r19, 8;
	add.s64 	%rd44, %rd42, %rd43;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs33, %rs34, %rs35, %rs36}, [%rd44];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs33;
	cvt.f32.f16 	%f85, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs34;
	cvt.f32.f16 	%f86, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs35;
	cvt.f32.f16 	%f87, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs36;
	cvt.f32.f16 	%f285, %temp;
	}
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f462, %f285;
	setp.ltu.ftz.f32	%p56, %f85, 0f00000000;
	@%p56 bra 	BB5_61;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f286, %f85;
	mul.ftz.f32 	%f287, %f286, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f456, %f287;
	bra.uni 	BB5_62;

BB5_61:
	neg.ftz.f32 	%f288, %f85;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f289, %f288;
	mul.ftz.f32 	%f290, %f289, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f291, %f290;
	neg.ftz.f32 	%f456, %f291;

BB5_62:
	setp.ltu.ftz.f32	%p57, %f86, 0f00000000;
	@%p57 bra 	BB5_64;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f292, %f86;
	mul.ftz.f32 	%f293, %f292, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f457, %f293;
	bra.uni 	BB5_65;

BB5_64:
	neg.ftz.f32 	%f294, %f86;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f295, %f294;
	mul.ftz.f32 	%f296, %f295, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f297, %f296;
	neg.ftz.f32 	%f457, %f297;

BB5_65:
	setp.ltu.ftz.f32	%p58, %f87, 0f00000000;
	@%p58 bra 	BB5_67;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f298, %f87;
	mul.ftz.f32 	%f299, %f298, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f458, %f299;
	bra.uni 	BB5_68;

BB5_67:
	neg.ftz.f32 	%f300, %f87;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f301, %f300;
	mul.ftz.f32 	%f302, %f301, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f303, %f302;
	neg.ftz.f32 	%f458, %f303;

BB5_68:
	mul.ftz.f32 	%f461, %f458, %f462;
	mul.ftz.f32 	%f460, %f457, %f462;
	mul.ftz.f32 	%f459, %f456, %f462;

BB5_69:
	.loc 1 77 1
	mad.lo.s32 	%r163, %r128, 33, %r133;
	mul.wide.s32 	%rd45, %r163, 4;
	mov.u64 	%rd46, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd47, %rd46, %rd45;
	.loc 1 77 1
	st.shared.f32 	[%rd47+528], %f459;
	st.shared.f32 	[%rd47+1584], %f460;
	st.shared.f32 	[%rd47+2640], %f461;
	st.shared.f32 	[%rd47+3696], %f462;

BB5_70:
	.loc 1 77 1
	shl.b32 	%r164, %r128, 5;
	add.s32 	%r165, %r164, %r133;
	setp.lt.s32	%p5, %r165, 32;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	shl.b32 	%r166, %r128, 4;
	add.s32 	%r167, %r166, %r133;
	.loc 1 77 1
	mad.lo.s32 	%r222, %r167, 33, 31;
	.loc 1 77 1
	@!%p5 bra 	BB5_73;
	bra.uni 	BB5_71;

BB5_71:
	mov.u32 	%r221, 0;

BB5_72:
	.loc 1 77 1
	mad.lo.s32 	%r169, %r219, -32, %r14;
	.loc 1 77 1
	sub.s32 	%r170, %r169, %r221;
	mul.wide.s32 	%rd48, %r222, 4;
	mov.u64 	%rd49, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd50, %rd49, %rd48;
	.loc 1 77 1
	add.s32 	%r171, %r36, -1;
	setp.eq.s32	%p59, %r170, %r171;
	setp.ne.s32	%p60, %r38, 0;
	and.pred  	%p61, %p59, %p60;
	.loc 1 77 1
	ld.shared.f32 	%f304, [%rd50];
	.loc 1 77 1
	mul.ftz.f32 	%f305, %f304, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f306, %f305, %f464, %p61;
	selp.f32	%f307, %f305, %f463, %p61;
	.loc 1 77 1
	mul.ftz.f32 	%f308, %f465, %f162;
	fma.rn.ftz.f32 	%f309, %f466, %f161, %f308;
	fma.rn.ftz.f32 	%f310, %f306, %f163, %f309;
	fma.rn.ftz.f32 	%f311, %f307, %f164, %f310;
	ld.shared.f32 	%f312, [%rd50+-4];
	ld.shared.f32 	%f313, [%rd50+-8];
	ld.shared.f32 	%f314, [%rd50+-12];
	st.shared.f32 	[%rd50], %f311;
	not.b32 	%r172, %r221;
	.loc 1 77 1
	add.s32 	%r173, %r169, %r172;
	setp.eq.s32	%p62, %r173, %r171;
	and.pred  	%p63, %p62, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f315, %f312, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f316, %f315, %f311, %p63;
	selp.f32	%f317, %f315, %f306, %p63;
	.loc 1 77 1
	mul.ftz.f32 	%f318, %f466, %f162;
	fma.rn.ftz.f32 	%f319, %f304, %f161, %f318;
	fma.rn.ftz.f32 	%f320, %f316, %f163, %f319;
	fma.rn.ftz.f32 	%f321, %f317, %f164, %f320;
	st.shared.f32 	[%rd50+-4], %f321;
	mov.u32 	%r174, -2;
	.loc 1 77 1
	sub.s32 	%r175, %r174, %r221;
	.loc 1 77 1
	add.s32 	%r176, %r169, %r175;
	setp.eq.s32	%p64, %r176, %r171;
	and.pred  	%p65, %p64, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f322, %f313, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f323, %f322, %f321, %p65;
	selp.f32	%f324, %f322, %f316, %p65;
	.loc 1 77 1
	mul.ftz.f32 	%f325, %f304, %f162;
	fma.rn.ftz.f32 	%f326, %f312, %f161, %f325;
	fma.rn.ftz.f32 	%f327, %f323, %f163, %f326;
	fma.rn.ftz.f32 	%f328, %f324, %f164, %f327;
	st.shared.f32 	[%rd50+-8], %f328;
	mov.u32 	%r177, -3;
	.loc 1 77 1
	sub.s32 	%r178, %r177, %r221;
	.loc 1 77 1
	add.s32 	%r179, %r169, %r178;
	setp.eq.s32	%p66, %r179, %r171;
	and.pred  	%p67, %p66, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f329, %f314, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f330, %f329, %f328, %p67;
	selp.f32	%f331, %f329, %f323, %p67;
	.loc 1 77 1
	mul.ftz.f32 	%f332, %f312, %f162;
	fma.rn.ftz.f32 	%f333, %f313, %f161, %f332;
	fma.rn.ftz.f32 	%f334, %f330, %f163, %f333;
	fma.rn.ftz.f32 	%f335, %f331, %f164, %f334;
	st.shared.f32 	[%rd50+-12], %f335;
	mov.u32 	%r180, -4;
	.loc 1 77 1
	sub.s32 	%r181, %r180, %r221;
	.loc 1 77 1
	add.s32 	%r182, %r169, %r181;
	setp.eq.s32	%p68, %r182, %r171;
	and.pred  	%p69, %p68, %p60;
	.loc 1 77 1
	ld.shared.f32 	%f336, [%rd50+-16];
	.loc 1 77 1
	mul.ftz.f32 	%f337, %f336, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f338, %f337, %f335, %p69;
	selp.f32	%f339, %f337, %f330, %p69;
	.loc 1 77 1
	mul.ftz.f32 	%f340, %f313, %f162;
	fma.rn.ftz.f32 	%f341, %f314, %f161, %f340;
	fma.rn.ftz.f32 	%f342, %f338, %f163, %f341;
	fma.rn.ftz.f32 	%f343, %f339, %f164, %f342;
	ld.shared.f32 	%f344, [%rd50+-20];
	ld.shared.f32 	%f465, [%rd50+-24];
	ld.shared.f32 	%f466, [%rd50+-28];
	st.shared.f32 	[%rd50+-16], %f343;
	mov.u32 	%r183, -5;
	.loc 1 77 1
	sub.s32 	%r184, %r183, %r221;
	.loc 1 77 1
	add.s32 	%r185, %r169, %r184;
	setp.eq.s32	%p70, %r185, %r171;
	and.pred  	%p71, %p70, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f345, %f344, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f346, %f345, %f343, %p71;
	selp.f32	%f347, %f345, %f338, %p71;
	.loc 1 77 1
	mul.ftz.f32 	%f348, %f314, %f162;
	fma.rn.ftz.f32 	%f349, %f336, %f161, %f348;
	fma.rn.ftz.f32 	%f350, %f346, %f163, %f349;
	fma.rn.ftz.f32 	%f351, %f347, %f164, %f350;
	st.shared.f32 	[%rd50+-20], %f351;
	mov.u32 	%r186, -6;
	.loc 1 77 1
	sub.s32 	%r187, %r186, %r221;
	.loc 1 77 1
	add.s32 	%r188, %r169, %r187;
	setp.eq.s32	%p72, %r188, %r171;
	and.pred  	%p73, %p72, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f352, %f465, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f353, %f352, %f351, %p73;
	selp.f32	%f354, %f352, %f346, %p73;
	.loc 1 77 1
	mul.ftz.f32 	%f355, %f336, %f162;
	fma.rn.ftz.f32 	%f356, %f344, %f161, %f355;
	fma.rn.ftz.f32 	%f357, %f353, %f163, %f356;
	fma.rn.ftz.f32 	%f358, %f354, %f164, %f357;
	st.shared.f32 	[%rd50+-24], %f358;
	mov.u32 	%r189, -7;
	.loc 1 77 1
	sub.s32 	%r190, %r189, %r221;
	.loc 1 77 1
	add.s32 	%r191, %r169, %r190;
	setp.eq.s32	%p74, %r191, %r171;
	and.pred  	%p75, %p74, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f359, %f466, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f463, %f359, %f358, %p75;
	selp.f32	%f360, %f359, %f353, %p75;
	.loc 1 77 1
	mul.ftz.f32 	%f361, %f344, %f162;
	fma.rn.ftz.f32 	%f362, %f465, %f161, %f361;
	fma.rn.ftz.f32 	%f363, %f463, %f163, %f362;
	fma.rn.ftz.f32 	%f464, %f360, %f164, %f363;
	st.shared.f32 	[%rd50+-28], %f464;
	add.s32 	%r222, %r222, -8;
	.loc 1 77 1
	add.s32 	%r221, %r221, 8;
	setp.ne.s32	%p76, %r221, 32;
	@%p76 bra 	BB5_72;

BB5_73:
	setp.lt.s32	%p6, %r129, %r37;
	.loc 1 77 1
	bar.sync 	0;
	add.s32 	%r214, %r133, %r220;
	.loc 1 77 1
	add.s32 	%r213, %r214, -32;
	mad.lo.s32 	%r28, %r129, %r35, %r214;
	.loc 1 77 1
	setp.lt.s32	%p77, %r213, %r36;
	.loc 1 77 1
	and.pred  	%p78, %p6, %p77;
	@!%p78 bra 	BB5_87;
	bra.uni 	BB5_74;

BB5_74:
	cvta.to.global.u64 	%rd51, %rd9;
	.loc 1 77 1
	add.s32 	%r196, %r28, -32;
	mul.wide.s32 	%rd52, %r196, 8;
	add.s64 	%rd53, %rd51, %rd52;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs41, %rs42, %rs43, %rs44}, [%rd53];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs41;
	cvt.f32.f16 	%f364, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs42;
	cvt.f32.f16 	%f365, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs43;
	cvt.f32.f16 	%f366, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs44;
	cvt.f32.f16 	%f367, %temp;
	}
	.loc 1 77 1
	mad.lo.s32 	%r199, %r128, 33, %r133;
	mul.wide.s32 	%rd54, %r199, 4;
	mov.u64 	%rd55, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd56, %rd55, %rd54;
	.loc 1 77 1
	ld.shared.f32 	%f368, [%rd56];
	add.ftz.f32 	%f117, %f364, %f368;
	ld.shared.f32 	%f369, [%rd56+1056];
	add.ftz.f32 	%f118, %f365, %f369;
	ld.shared.f32 	%f370, [%rd56+2112];
	add.ftz.f32 	%f119, %f366, %f370;
	ld.shared.f32 	%f371, [%rd56+3168];
	add.ftz.f32 	%f372, %f367, %f371;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f467, %f372;
	.loc 2 45 1
	add.ftz.f32 	%f373, %f467, 0fB70637BD;
	setp.gtu.ftz.f32	%p79, %f373, 0f00000000;
	@%p79 bra 	BB5_76;

	mov.f32 	%f470, 0f00000000;
	mov.f32 	%f469, %f470;
	mov.f32 	%f468, %f470;
	mov.f32 	%f467, %f470;
	bra.uni 	BB5_77;

BB5_76:
	mov.f32 	%f378, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f379, %f378, %f467;
	.loc 2 45 1
	mul.ftz.f32 	%f468, %f119, %f379;
	mul.ftz.f32 	%f469, %f118, %f379;
	mul.ftz.f32 	%f470, %f117, %f379;

BB5_77:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p80, %f470, 0f00000000;
	@%p80 bra 	BB5_79;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f380, %f470;
	mul.ftz.f32 	%f381, %f380, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f471, %f381;
	bra.uni 	BB5_80;

BB5_79:
	neg.ftz.f32 	%f382, %f470;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f383, %f382;
	mul.ftz.f32 	%f384, %f383, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f385, %f384;
	neg.ftz.f32 	%f471, %f385;

BB5_80:
	setp.ltu.ftz.f32	%p81, %f469, 0f00000000;
	@%p81 bra 	BB5_82;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f386, %f469;
	mul.ftz.f32 	%f387, %f386, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f472, %f387;
	bra.uni 	BB5_83;

BB5_82:
	neg.ftz.f32 	%f388, %f469;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f389, %f388;
	mul.ftz.f32 	%f390, %f389, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f391, %f390;
	neg.ftz.f32 	%f472, %f391;

BB5_83:
	setp.ltu.ftz.f32	%p82, %f468, 0f00000000;
	@%p82 bra 	BB5_85;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f392, %f468;
	mul.ftz.f32 	%f393, %f392, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f473, %f393;
	bra.uni 	BB5_86;

BB5_85:
	neg.ftz.f32 	%f394, %f468;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f395, %f394;
	mul.ftz.f32 	%f396, %f395, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f397, %f396;
	neg.ftz.f32 	%f473, %f397;

BB5_86:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f471;
	mov.b16 	%rs49, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f472;
	mov.b16 	%rs50, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f473;
	mov.b16 	%rs51, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f467;
	mov.b16 	%rs52, %temp;
}
	mul.wide.s32 	%rd58, %r196, 8;
	add.s64 	%rd59, %rd51, %rd58;
	.loc 1 77 238
	st.global.v4.u16 	[%rd59], {%rs49, %rs50, %rs51, %rs52};

BB5_87:
	.loc 1 77 1
	add.s32 	%r206, %r129, 4;
	setp.lt.s32	%p84, %r206, %r37;
	.loc 1 77 1
	shl.b32 	%r208, %r35, 2;
	add.s32 	%r29, %r28, %r208;
	.loc 1 77 1
	and.pred  	%p85, %p84, %p77;
	@!%p85 bra 	BB5_101;
	bra.uni 	BB5_88;

BB5_88:
	cvta.to.global.u64 	%rd60, %rd9;
	mul.wide.s32 	%rd61, %r29, 8;
	add.s64 	%rd7, %rd60, %rd61;
	.loc 1 77 1
	ld.global.v4.u16 	{%rs53, %rs54, %rs55, %rs56}, [%rd7+-256];
	.loc 3 3518 10
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs53;
	cvt.f32.f16 	%f398, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs54;
	cvt.f32.f16 	%f399, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs55;
	cvt.f32.f16 	%f400, %temp;
	}
	{
	.reg .b16 %temp;
	mov.b16 	%temp, %rs56;
	cvt.f32.f16 	%f401, %temp;
	}
	.loc 1 77 1
	mad.lo.s32 	%r211, %r128, 33, %r133;
	mul.wide.s32 	%rd62, %r211, 4;
	mov.u64 	%rd63, HorizontalRecursiveGaussianRGBAF16_kernel$__cuda_local_var_169785_10149_non_const_smem;
	add.s64 	%rd64, %rd63, %rd62;
	.loc 1 77 1
	ld.shared.f32 	%f402, [%rd64+528];
	add.ftz.f32 	%f137, %f398, %f402;
	ld.shared.f32 	%f403, [%rd64+1584];
	add.ftz.f32 	%f138, %f399, %f403;
	ld.shared.f32 	%f404, [%rd64+2640];
	add.ftz.f32 	%f139, %f400, %f404;
	ld.shared.f32 	%f405, [%rd64+3696];
	add.ftz.f32 	%f406, %f401, %f405;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f474, %f406;
	.loc 2 45 1
	add.ftz.f32 	%f407, %f474, 0fB70637BD;
	setp.gtu.ftz.f32	%p86, %f407, 0f00000000;
	@%p86 bra 	BB5_90;

	mov.f32 	%f477, 0f00000000;
	mov.f32 	%f476, %f477;
	mov.f32 	%f475, %f477;
	mov.f32 	%f474, %f477;
	bra.uni 	BB5_91;

BB5_90:
	mov.f32 	%f412, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f413, %f412, %f474;
	.loc 2 45 1
	mul.ftz.f32 	%f475, %f139, %f413;
	mul.ftz.f32 	%f476, %f138, %f413;
	mul.ftz.f32 	%f477, %f137, %f413;

BB5_91:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p87, %f477, 0f00000000;
	@%p87 bra 	BB5_93;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f414, %f477;
	mul.ftz.f32 	%f415, %f414, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f478, %f415;
	bra.uni 	BB5_94;

BB5_93:
	neg.ftz.f32 	%f416, %f477;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f417, %f416;
	mul.ftz.f32 	%f418, %f417, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f419, %f418;
	neg.ftz.f32 	%f478, %f419;

BB5_94:
	setp.ltu.ftz.f32	%p88, %f476, 0f00000000;
	@%p88 bra 	BB5_96;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f420, %f476;
	mul.ftz.f32 	%f421, %f420, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f479, %f421;
	bra.uni 	BB5_97;

BB5_96:
	neg.ftz.f32 	%f422, %f476;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f423, %f422;
	mul.ftz.f32 	%f424, %f423, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f425, %f424;
	neg.ftz.f32 	%f479, %f425;

BB5_97:
	setp.ltu.ftz.f32	%p89, %f475, 0f00000000;
	@%p89 bra 	BB5_99;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f426, %f475;
	mul.ftz.f32 	%f427, %f426, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f480, %f427;
	bra.uni 	BB5_100;

BB5_99:
	neg.ftz.f32 	%f428, %f475;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f429, %f428;
	mul.ftz.f32 	%f430, %f429, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f431, %f430;
	neg.ftz.f32 	%f480, %f431;

BB5_100:
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f478;
	mov.b16 	%rs61, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f479;
	mov.b16 	%rs62, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f480;
	mov.b16 	%rs63, %temp;
}
	.loc 3 3513 10
	{
	.reg .b16 %temp;
	cvt.rn.ftz.f16.f32 	%temp, %f474;
	mov.b16 	%rs64, %temp;
}
	.loc 1 77 238
	st.global.v4.u16 	[%rd7+-256], {%rs61, %rs62, %rs63, %rs64};

BB5_101:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	mad.lo.s32 	%r212, %r219, -32, %r14;
	add.s32 	%r220, %r212, -31;
	.loc 1 77 1
	setp.gt.s32	%p90, %r220, 0;
	add.s32 	%r219, %r219, 1;
	@%p90 bra 	BB5_42;

BB5_102:
	.loc 1 77 2
	ret;
}

.visible .entry HorizontalRecursiveGaussianRGBAF32_kernel(
	.param .u64 HorizontalRecursiveGaussianRGBAF32_kernel_param_0,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_1,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_2,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_3,
	.param .u64 HorizontalRecursiveGaussianRGBAF32_kernel_param_4,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_5,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_6,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_7,
	.param .u32 HorizontalRecursiveGaussianRGBAF32_kernel_param_8,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_9,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_10,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_11,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_12,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_13,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_14,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_15,
	.param .f32 HorizontalRecursiveGaussianRGBAF32_kernel_param_16
)
{
	.reg .pred 	%p<91>;
	.reg .s32 	%r<214>;
	.reg .f32 	%f<507>;
	.reg .s64 	%rd<69>;
	// demoted variable
	.shared .align 4 .b8 HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem[4224];

	ld.param.u64 	%rd8, [HorizontalRecursiveGaussianRGBAF32_kernel_param_0];
	ld.param.u32 	%r35, [HorizontalRecursiveGaussianRGBAF32_kernel_param_1];
	ld.param.u32 	%r36, [HorizontalRecursiveGaussianRGBAF32_kernel_param_2];
	ld.param.u32 	%r37, [HorizontalRecursiveGaussianRGBAF32_kernel_param_3];
	ld.param.u64 	%rd9, [HorizontalRecursiveGaussianRGBAF32_kernel_param_4];
	ld.param.u32 	%r38, [HorizontalRecursiveGaussianRGBAF32_kernel_param_5];
	ld.param.u32 	%r39, [HorizontalRecursiveGaussianRGBAF32_kernel_param_6];
	ld.param.u32 	%r40, [HorizontalRecursiveGaussianRGBAF32_kernel_param_7];
	ld.param.u32 	%r41, [HorizontalRecursiveGaussianRGBAF32_kernel_param_8];
	ld.param.f32 	%f161, [HorizontalRecursiveGaussianRGBAF32_kernel_param_9];
	ld.param.f32 	%f162, [HorizontalRecursiveGaussianRGBAF32_kernel_param_10];
	ld.param.f32 	%f163, [HorizontalRecursiveGaussianRGBAF32_kernel_param_11];
	ld.param.f32 	%f164, [HorizontalRecursiveGaussianRGBAF32_kernel_param_12];
	ld.param.f32 	%f165, [HorizontalRecursiveGaussianRGBAF32_kernel_param_13];
	ld.param.f32 	%f166, [HorizontalRecursiveGaussianRGBAF32_kernel_param_14];
	ld.param.f32 	%f167, [HorizontalRecursiveGaussianRGBAF32_kernel_param_15];
	ld.param.f32 	%f168, [HorizontalRecursiveGaussianRGBAF32_kernel_param_16];
	mov.u32 	%r211, 0;
	.loc 1 77 1
	setp.gt.s32	%p7, %r39, 0;
	@%p7 bra 	BB6_1;
	bra.uni 	BB6_40;

BB6_1:
	mov.f32 	%f474, 0f00000000;
	mov.f32 	%f473, %f474;
	mov.f32 	%f472, %f474;
	mov.u32 	%r208, %r211;

BB6_2:
	.loc 1 77 1
	mov.u32 	%r206, %r208;
	mov.u32 	%r1, %r206;
	.loc 1 77 1
	mov.u32 	%r45, %ctaid.y;
	shl.b32 	%r46, %r45, 3;
	mov.u32 	%r47, %tid.y;
	add.s32 	%r48, %r46, %r47;
	sub.s32 	%r49, %r40, %r37;
	shr.s32 	%r50, %r49, 1;
	sub.s32 	%r51, %r48, %r50;
	setp.lt.s32	%p8, %r51, %r37;
	.loc 1 77 1
	mov.u32 	%r52, %tid.x;
	.loc 1 77 1
	add.s32 	%r3, %r52, %r211;
	sub.s32 	%r53, %r39, %r36;
	shr.s32 	%r4, %r53, 1;
	.loc 1 77 1
	setp.gt.s32	%p9, %r51, -1;
	and.pred  	%p1, %p9, %p8;
	.loc 1 77 1
	setp.ge.s32	%p10, %r51, %r37;
	@%p10 bra 	BB6_16;

	.loc 1 77 1
	sub.s32 	%r54, %r3, %r4;
	.loc 1 77 1
	setp.gt.s32	%p11, %r54, -1;
	and.pred  	%p12, %p1, %p11;
	.loc 1 77 1
	setp.lt.s32	%p13, %r54, %r36;
	and.pred  	%p14, %p12, %p13;
	.loc 1 77 1
	@%p14 bra 	BB6_5;

	mov.f32 	%f464, 0f00000000;
	mov.f32 	%f463, %f464;
	mov.f32 	%f462, %f464;
	mov.f32 	%f461, %f464;
	bra.uni 	BB6_15;

BB6_5:
	cvta.to.global.u64 	%rd10, %rd8;
	.loc 1 77 1
	mad.lo.s32 	%r63, %r51, %r35, %r54;
	mul.wide.s32 	%rd11, %r63, 16;
	add.s64 	%rd12, %rd10, %rd11;
	ld.global.v4.f32 	{%f176, %f177, %f178, %f179}, [%rd12];
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f464, %f179;
	setp.ltu.ftz.f32	%p15, %f176, 0f00000000;
	@%p15 bra 	BB6_7;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f180, %f176;
	mul.ftz.f32 	%f181, %f180, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f458, %f181;
	bra.uni 	BB6_8;

BB6_7:
	neg.ftz.f32 	%f182, %f176;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f183, %f182;
	mul.ftz.f32 	%f184, %f183, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f185, %f184;
	neg.ftz.f32 	%f458, %f185;

BB6_8:
	setp.ltu.ftz.f32	%p16, %f177, 0f00000000;
	@%p16 bra 	BB6_10;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f186, %f177;
	mul.ftz.f32 	%f187, %f186, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f459, %f187;
	bra.uni 	BB6_11;

BB6_10:
	neg.ftz.f32 	%f188, %f177;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f189, %f188;
	mul.ftz.f32 	%f190, %f189, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f191, %f190;
	neg.ftz.f32 	%f459, %f191;

BB6_11:
	setp.ltu.ftz.f32	%p17, %f178, 0f00000000;
	@%p17 bra 	BB6_13;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f192, %f178;
	mul.ftz.f32 	%f193, %f192, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f460, %f193;
	bra.uni 	BB6_14;

BB6_13:
	neg.ftz.f32 	%f194, %f178;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f195, %f194;
	mul.ftz.f32 	%f196, %f195, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f197, %f196;
	neg.ftz.f32 	%f460, %f197;

BB6_14:
	mul.ftz.f32 	%f463, %f460, %f464;
	mul.ftz.f32 	%f462, %f459, %f464;
	mul.ftz.f32 	%f461, %f458, %f464;

BB6_15:
	.loc 1 77 1
	mad.lo.s32 	%r66, %r47, 33, %r52;
	mul.wide.s32 	%rd13, %r66, 4;
	mov.u64 	%rd14, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd15, %rd14, %rd13;
	.loc 1 77 1
	st.shared.f32 	[%rd15], %f461;
	st.shared.f32 	[%rd15+1056], %f462;
	st.shared.f32 	[%rd15+2112], %f463;
	st.shared.f32 	[%rd15+3168], %f464;

BB6_16:
	.loc 1 77 1
	add.s32 	%r74, %r51, 4;
	.loc 1 77 1
	sub.s32 	%r77, %r3, %r4;
	mad.lo.s32 	%r78, %r51, %r35, %r77;
	.loc 1 77 1
	shl.b32 	%r79, %r35, 2;
	add.s32 	%r5, %r78, %r79;
	.loc 1 77 1
	setp.ge.s32	%p18, %r74, %r37;
	@%p18 bra 	BB6_30;

	.loc 1 77 1
	setp.gt.s32	%p19, %r77, -1;
	and.pred  	%p23, %p1, %p19;
	.loc 1 77 1
	setp.lt.s32	%p24, %r77, %r36;
	and.pred  	%p25, %p23, %p24;
	.loc 1 77 1
	@%p25 bra 	BB6_19;

	mov.f32 	%f471, 0f00000000;
	mov.f32 	%f470, %f471;
	mov.f32 	%f469, %f471;
	mov.f32 	%f468, %f471;
	bra.uni 	BB6_29;

BB6_19:
	cvta.to.global.u64 	%rd16, %rd8;
	mul.wide.s32 	%rd17, %r5, 16;
	add.s64 	%rd18, %rd16, %rd17;
	ld.global.v4.f32 	{%f202, %f203, %f204, %f205}, [%rd18];
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f471, %f205;
	setp.ltu.ftz.f32	%p26, %f202, 0f00000000;
	@%p26 bra 	BB6_21;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f206, %f202;
	mul.ftz.f32 	%f207, %f206, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f465, %f207;
	bra.uni 	BB6_22;

BB6_21:
	neg.ftz.f32 	%f208, %f202;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f209, %f208;
	mul.ftz.f32 	%f210, %f209, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f211, %f210;
	neg.ftz.f32 	%f465, %f211;

BB6_22:
	setp.ltu.ftz.f32	%p27, %f203, 0f00000000;
	@%p27 bra 	BB6_24;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f212, %f203;
	mul.ftz.f32 	%f213, %f212, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f466, %f213;
	bra.uni 	BB6_25;

BB6_24:
	neg.ftz.f32 	%f214, %f203;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f215, %f214;
	mul.ftz.f32 	%f216, %f215, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f217, %f216;
	neg.ftz.f32 	%f466, %f217;

BB6_25:
	setp.ltu.ftz.f32	%p28, %f204, 0f00000000;
	@%p28 bra 	BB6_27;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f218, %f204;
	mul.ftz.f32 	%f219, %f218, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f467, %f219;
	bra.uni 	BB6_28;

BB6_27:
	neg.ftz.f32 	%f220, %f204;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f221, %f220;
	mul.ftz.f32 	%f222, %f221, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f223, %f222;
	neg.ftz.f32 	%f467, %f223;

BB6_28:
	mul.ftz.f32 	%f470, %f467, %f471;
	mul.ftz.f32 	%f469, %f466, %f471;
	mul.ftz.f32 	%f468, %f465, %f471;

BB6_29:
	.loc 1 77 1
	mad.lo.s32 	%r92, %r47, 33, %r52;
	mul.wide.s32 	%rd19, %r92, 4;
	mov.u64 	%rd20, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd21, %rd20, %rd19;
	.loc 1 77 1
	st.shared.f32 	[%rd21+528], %f468;
	st.shared.f32 	[%rd21+1584], %f469;
	st.shared.f32 	[%rd21+2640], %f470;
	st.shared.f32 	[%rd21+3696], %f471;

BB6_30:
	.loc 1 77 1
	mad.lo.s32 	%r95, %r47, 16, %r52;
	mul.lo.s32 	%r96, %r95, 33;
	mul.wide.s32 	%rd22, %r96, 4;
	mov.u64 	%rd23, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd67, %rd23, %rd22;
	.loc 1 77 1
	shl.b32 	%r97, %r47, 5;
	add.s32 	%r98, %r97, %r52;
	setp.lt.s32	%p2, %r98, 32;
	.loc 1 77 1
	bar.sync 	0;
	add.s64 	%rd68, %rd67, 16;
	.loc 1 77 1
	@!%p2 bra 	BB6_33;
	bra.uni 	BB6_31;

BB6_31:
	mov.u32 	%r209, 0;
	mov.u32 	%r207, %r1;

BB6_32:
	.loc 1 77 1
	mov.u32 	%r6, %r207;
	.loc 1 77 1
	mov.u64 	%rd4, %rd68;
	.loc 1 77 1
	setp.eq.s32	%p29, %r6, 0;
	setp.ne.s32	%p30, %r41, 0;
	and.pred  	%p31, %p29, %p30;
	.loc 1 77 1
	ld.shared.f32 	%f224, [%rd67];
	.loc 1 77 1
	mul.ftz.f32 	%f225, %f224, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f226, %f225, %f473, %p31;
	selp.f32	%f227, %f225, %f472, %p31;
	.loc 1 77 1
	mul.ftz.f32 	%f228, %f474, %f162;
	fma.rn.ftz.f32 	%f229, %f224, %f161, %f228;
	fma.rn.ftz.f32 	%f230, %f226, %f163, %f229;
	fma.rn.ftz.f32 	%f231, %f227, %f164, %f230;
	st.shared.f32 	[%rd67], %f231;
	ld.shared.f32 	%f232, [%rd4+-12];
	mul.ftz.f32 	%f233, %f224, %f162;
	fma.rn.ftz.f32 	%f234, %f232, %f161, %f233;
	fma.rn.ftz.f32 	%f235, %f231, %f163, %f234;
	fma.rn.ftz.f32 	%f236, %f226, %f164, %f235;
	ld.shared.f32 	%f237, [%rd4+-8];
	ld.shared.f32 	%f238, [%rd4+-4];
	ld.shared.f32 	%f239, [%rd4];
	st.shared.f32 	[%rd4+-12], %f236;
	mul.ftz.f32 	%f240, %f232, %f162;
	fma.rn.ftz.f32 	%f241, %f237, %f161, %f240;
	fma.rn.ftz.f32 	%f242, %f236, %f163, %f241;
	fma.rn.ftz.f32 	%f243, %f231, %f164, %f242;
	st.shared.f32 	[%rd4+-8], %f243;
	mul.ftz.f32 	%f244, %f237, %f162;
	fma.rn.ftz.f32 	%f245, %f238, %f161, %f244;
	fma.rn.ftz.f32 	%f246, %f243, %f163, %f245;
	fma.rn.ftz.f32 	%f247, %f236, %f164, %f246;
	st.shared.f32 	[%rd4+-4], %f247;
	mul.ftz.f32 	%f248, %f238, %f162;
	fma.rn.ftz.f32 	%f249, %f239, %f161, %f248;
	fma.rn.ftz.f32 	%f250, %f247, %f163, %f249;
	fma.rn.ftz.f32 	%f251, %f243, %f164, %f250;
	st.shared.f32 	[%rd4], %f251;
	ld.shared.f32 	%f252, [%rd4+4];
	mul.ftz.f32 	%f253, %f239, %f162;
	fma.rn.ftz.f32 	%f254, %f252, %f161, %f253;
	fma.rn.ftz.f32 	%f255, %f251, %f163, %f254;
	fma.rn.ftz.f32 	%f256, %f247, %f164, %f255;
	ld.shared.f32 	%f257, [%rd4+8];
	ld.shared.f32 	%f474, [%rd4+12];
	st.shared.f32 	[%rd4+4], %f256;
	mul.ftz.f32 	%f258, %f252, %f162;
	fma.rn.ftz.f32 	%f259, %f257, %f161, %f258;
	fma.rn.ftz.f32 	%f260, %f256, %f163, %f259;
	fma.rn.ftz.f32 	%f472, %f251, %f164, %f260;
	st.shared.f32 	[%rd4+8], %f472;
	mul.ftz.f32 	%f261, %f257, %f162;
	fma.rn.ftz.f32 	%f262, %f474, %f161, %f261;
	fma.rn.ftz.f32 	%f263, %f472, %f163, %f262;
	fma.rn.ftz.f32 	%f473, %f256, %f164, %f263;
	st.shared.f32 	[%rd4+12], %f473;
	.loc 1 77 1
	add.s32 	%r8, %r6, -8;
	add.s64 	%rd68, %rd4, 32;
	.loc 1 77 1
	add.s32 	%r209, %r209, 32;
	add.s64 	%rd67, %rd4, 16;
	.loc 1 77 1
	setp.ne.s32	%p32, %r209, 128;
	mov.u32 	%r207, %r8;
	@%p32 bra 	BB6_32;

BB6_33:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	mov.u32 	%r205, %ctaid.y;
	shl.b32 	%r204, %r205, 3;
	add.s32 	%r203, %r204, %r47;
	.loc 1 77 1
	mad.lo.s32 	%r12, %r203, %r38, %r3;
	.loc 1 77 1
	@!%p8 bra 	BB6_36;
	bra.uni 	BB6_34;

BB6_34:
	mov.u64 	%rd66, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	.loc 1 77 1
	mad.lo.s32 	%r107, %r47, 33, %r52;
	mul.wide.s32 	%rd24, %r107, 4;
	add.s64 	%rd26, %rd66, %rd24;
	.loc 1 77 1
	ld.shared.f32 	%f57, [%rd26];
	ld.shared.f32 	%f58, [%rd26+1056];
	ld.shared.f32 	%f59, [%rd26+2112];
	ld.shared.f32 	%f60, [%rd26+3168];
	.loc 1 77 1
	setp.ge.s32	%p33, %r3, %r39;
	@%p33 bra 	BB6_36;

	cvta.to.global.u64 	%rd27, %rd9;
	mul.wide.s32 	%rd28, %r12, 16;
	add.s64 	%rd29, %rd27, %rd28;
	.loc 1 77 1
	st.global.v4.f32 	[%rd29], {%f57, %f58, %f59, %f60};

BB6_36:
	.loc 1 77 1
	shl.b32 	%r109, %r38, 2;
	add.s32 	%r13, %r12, %r109;
	.loc 1 77 1
	@%p18 bra 	BB6_39;

	mov.u64 	%rd65, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	.loc 1 77 1
	mad.lo.s32 	%r112, %r47, 33, %r52;
	mul.wide.s32 	%rd30, %r112, 4;
	add.s64 	%rd32, %rd65, %rd30;
	.loc 1 77 1
	ld.shared.f32 	%f61, [%rd32+528];
	ld.shared.f32 	%f62, [%rd32+1584];
	ld.shared.f32 	%f63, [%rd32+2640];
	ld.shared.f32 	%f64, [%rd32+3696];
	.loc 1 77 1
	setp.ge.s32	%p35, %r3, %r39;
	@%p35 bra 	BB6_39;

	cvta.to.global.u64 	%rd33, %rd9;
	mul.wide.s32 	%rd34, %r13, 16;
	add.s64 	%rd35, %rd33, %rd34;
	.loc 1 77 1
	st.global.v4.f32 	[%rd35], {%f61, %f62, %f63, %f64};

BB6_39:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	add.s32 	%r211, %r211, 32;
	.loc 1 77 1
	setp.lt.s32	%p36, %r211, %r39;
	add.s32 	%r208, %r1, -32;
	@%p36 bra 	BB6_2;

BB6_40:
	.loc 1 77 1
	setp.lt.s32	%p37, %r211, 1;
	@%p37 bra 	BB6_102;

	add.s32 	%r17, %r211, -1;
	mov.f32 	%f492, 0f00000000;
	mov.f32 	%f491, %f492;
	mov.f32 	%f490, %f492;
	mov.f32 	%f489, %f492;
	mov.u32 	%r210, 0;

BB6_42:
	.loc 1 77 1
	mov.u32 	%r114, %ctaid.y;
	shl.b32 	%r115, %r114, 3;
	mov.u32 	%r116, %tid.y;
	add.s32 	%r117, %r115, %r116;
	sub.s32 	%r118, %r40, %r37;
	shr.s32 	%r119, %r118, 1;
	sub.s32 	%r120, %r117, %r119;
	setp.lt.s32	%p38, %r120, %r37;
	.loc 1 77 1
	mov.u32 	%r121, %tid.x;
	add.s32 	%r20, %r121, %r211;
	.loc 1 77 1
	add.s32 	%r122, %r20, -32;
	sub.s32 	%r123, %r39, %r36;
	shr.s32 	%r124, %r123, 1;
	sub.s32 	%r125, %r122, %r124;
	mad.lo.s32 	%r21, %r120, %r35, %r125;
	.loc 1 77 1
	setp.gt.s32	%p39, %r125, -1;
	setp.gt.s32	%p40, %r120, -1;
	and.pred  	%p41, %p40, %p38;
	and.pred  	%p42, %p41, %p39;
	.loc 1 77 1
	setp.lt.s32	%p43, %r125, %r36;
	and.pred  	%p4, %p42, %p43;
	.loc 1 77 1
	setp.ge.s32	%p44, %r120, %r37;
	@%p44 bra 	BB6_56;

	.loc 1 77 1
	@%p4 bra 	BB6_45;

	mov.f32 	%f481, 0f00000000;
	mov.f32 	%f480, %f481;
	mov.f32 	%f479, %f481;
	mov.f32 	%f478, %f481;
	bra.uni 	BB6_55;

BB6_45:
	cvta.to.global.u64 	%rd36, %rd8;
	mul.wide.s32 	%rd37, %r21, 16;
	add.s64 	%rd38, %rd36, %rd37;
	ld.global.v4.f32 	{%f272, %f273, %f274, %f275}, [%rd38];
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f481, %f275;
	setp.ltu.ftz.f32	%p45, %f272, 0f00000000;
	@%p45 bra 	BB6_47;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f277, %f272;
	mul.ftz.f32 	%f278, %f277, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f475, %f278;
	bra.uni 	BB6_48;

BB6_47:
	neg.ftz.f32 	%f279, %f272;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f280, %f279;
	mul.ftz.f32 	%f281, %f280, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f282, %f281;
	neg.ftz.f32 	%f475, %f282;

BB6_48:
	setp.ltu.ftz.f32	%p46, %f273, 0f00000000;
	@%p46 bra 	BB6_50;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f283, %f273;
	mul.ftz.f32 	%f284, %f283, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f476, %f284;
	bra.uni 	BB6_51;

BB6_50:
	neg.ftz.f32 	%f285, %f273;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f286, %f285;
	mul.ftz.f32 	%f287, %f286, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f288, %f287;
	neg.ftz.f32 	%f476, %f288;

BB6_51:
	setp.ltu.ftz.f32	%p47, %f274, 0f00000000;
	@%p47 bra 	BB6_53;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f289, %f274;
	mul.ftz.f32 	%f290, %f289, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f477, %f290;
	bra.uni 	BB6_54;

BB6_53:
	neg.ftz.f32 	%f291, %f274;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f292, %f291;
	mul.ftz.f32 	%f293, %f292, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f294, %f293;
	neg.ftz.f32 	%f477, %f294;

BB6_54:
	mul.ftz.f32 	%f480, %f477, %f481;
	mul.ftz.f32 	%f479, %f476, %f481;
	mul.ftz.f32 	%f478, %f475, %f481;

BB6_55:
	.loc 1 77 1
	mad.lo.s32 	%r128, %r116, 33, %r121;
	mul.wide.s32 	%rd39, %r128, 4;
	mov.u64 	%rd40, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd41, %rd40, %rd39;
	.loc 1 77 1
	st.shared.f32 	[%rd41], %f478;
	st.shared.f32 	[%rd41+1056], %f479;
	st.shared.f32 	[%rd41+2112], %f480;
	st.shared.f32 	[%rd41+3168], %f481;

BB6_56:
	.loc 1 77 1
	add.s32 	%r136, %r120, 4;
	.loc 1 77 1
	shl.b32 	%r137, %r35, 2;
	add.s32 	%r22, %r21, %r137;
	.loc 1 77 1
	setp.ge.s32	%p48, %r136, %r37;
	@%p48 bra 	BB6_70;

	.loc 1 77 1
	@%p4 bra 	BB6_59;

	mov.f32 	%f488, 0f00000000;
	mov.f32 	%f487, %f488;
	mov.f32 	%f486, %f488;
	mov.f32 	%f485, %f488;
	bra.uni 	BB6_69;

BB6_59:
	cvta.to.global.u64 	%rd42, %rd8;
	mul.wide.s32 	%rd43, %r22, 16;
	add.s64 	%rd44, %rd42, %rd43;
	ld.global.v4.f32 	{%f299, %f300, %f301, %f302}, [%rd44];
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f488, %f302;
	setp.ltu.ftz.f32	%p56, %f299, 0f00000000;
	@%p56 bra 	BB6_61;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f304, %f299;
	mul.ftz.f32 	%f305, %f304, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f482, %f305;
	bra.uni 	BB6_62;

BB6_61:
	neg.ftz.f32 	%f306, %f299;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f307, %f306;
	mul.ftz.f32 	%f308, %f307, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f309, %f308;
	neg.ftz.f32 	%f482, %f309;

BB6_62:
	setp.ltu.ftz.f32	%p57, %f300, 0f00000000;
	@%p57 bra 	BB6_64;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f310, %f300;
	mul.ftz.f32 	%f311, %f310, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f483, %f311;
	bra.uni 	BB6_65;

BB6_64:
	neg.ftz.f32 	%f312, %f300;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f313, %f312;
	mul.ftz.f32 	%f314, %f313, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f315, %f314;
	neg.ftz.f32 	%f483, %f315;

BB6_65:
	setp.ltu.ftz.f32	%p58, %f301, 0f00000000;
	@%p58 bra 	BB6_67;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f316, %f301;
	mul.ftz.f32 	%f317, %f316, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f484, %f317;
	bra.uni 	BB6_68;

BB6_67:
	neg.ftz.f32 	%f318, %f301;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f319, %f318;
	mul.ftz.f32 	%f320, %f319, 0f400CCCCD;
	ex2.approx.ftz.f32 	%f321, %f320;
	neg.ftz.f32 	%f484, %f321;

BB6_68:
	mul.ftz.f32 	%f487, %f484, %f488;
	mul.ftz.f32 	%f486, %f483, %f488;
	mul.ftz.f32 	%f485, %f482, %f488;

BB6_69:
	.loc 1 77 1
	mad.lo.s32 	%r151, %r116, 33, %r121;
	mul.wide.s32 	%rd45, %r151, 4;
	mov.u64 	%rd46, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd47, %rd46, %rd45;
	.loc 1 77 1
	st.shared.f32 	[%rd47+528], %f485;
	st.shared.f32 	[%rd47+1584], %f486;
	st.shared.f32 	[%rd47+2640], %f487;
	st.shared.f32 	[%rd47+3696], %f488;

BB6_70:
	.loc 1 77 1
	shl.b32 	%r152, %r116, 5;
	add.s32 	%r153, %r152, %r121;
	setp.lt.s32	%p5, %r153, 32;
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	shl.b32 	%r154, %r116, 4;
	add.s32 	%r155, %r154, %r121;
	.loc 1 77 1
	mad.lo.s32 	%r213, %r155, 33, 31;
	.loc 1 77 1
	@!%p5 bra 	BB6_73;
	bra.uni 	BB6_71;

BB6_71:
	mov.u32 	%r212, 0;

BB6_72:
	.loc 1 77 1
	mad.lo.s32 	%r157, %r210, -32, %r17;
	.loc 1 77 1
	sub.s32 	%r158, %r157, %r212;
	mul.wide.s32 	%rd48, %r213, 4;
	mov.u64 	%rd49, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd50, %rd49, %rd48;
	.loc 1 77 1
	add.s32 	%r159, %r39, -1;
	setp.eq.s32	%p59, %r158, %r159;
	setp.ne.s32	%p60, %r41, 0;
	and.pred  	%p61, %p59, %p60;
	.loc 1 77 1
	ld.shared.f32 	%f322, [%rd50];
	.loc 1 77 1
	mul.ftz.f32 	%f323, %f322, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f324, %f323, %f490, %p61;
	selp.f32	%f325, %f323, %f489, %p61;
	.loc 1 77 1
	mul.ftz.f32 	%f326, %f491, %f166;
	fma.rn.ftz.f32 	%f327, %f492, %f165, %f326;
	fma.rn.ftz.f32 	%f328, %f324, %f167, %f327;
	fma.rn.ftz.f32 	%f329, %f325, %f168, %f328;
	ld.shared.f32 	%f330, [%rd50+-4];
	ld.shared.f32 	%f331, [%rd50+-8];
	ld.shared.f32 	%f332, [%rd50+-12];
	st.shared.f32 	[%rd50], %f329;
	not.b32 	%r160, %r212;
	.loc 1 77 1
	add.s32 	%r161, %r157, %r160;
	setp.eq.s32	%p62, %r161, %r159;
	and.pred  	%p63, %p62, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f333, %f330, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f334, %f333, %f329, %p63;
	selp.f32	%f335, %f333, %f324, %p63;
	.loc 1 77 1
	mul.ftz.f32 	%f336, %f492, %f166;
	fma.rn.ftz.f32 	%f337, %f322, %f165, %f336;
	fma.rn.ftz.f32 	%f338, %f334, %f167, %f337;
	fma.rn.ftz.f32 	%f339, %f335, %f168, %f338;
	st.shared.f32 	[%rd50+-4], %f339;
	mov.u32 	%r162, -2;
	.loc 1 77 1
	sub.s32 	%r163, %r162, %r212;
	.loc 1 77 1
	add.s32 	%r164, %r157, %r163;
	setp.eq.s32	%p64, %r164, %r159;
	and.pred  	%p65, %p64, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f340, %f331, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f341, %f340, %f339, %p65;
	selp.f32	%f342, %f340, %f334, %p65;
	.loc 1 77 1
	mul.ftz.f32 	%f343, %f322, %f166;
	fma.rn.ftz.f32 	%f344, %f330, %f165, %f343;
	fma.rn.ftz.f32 	%f345, %f341, %f167, %f344;
	fma.rn.ftz.f32 	%f346, %f342, %f168, %f345;
	st.shared.f32 	[%rd50+-8], %f346;
	mov.u32 	%r165, -3;
	.loc 1 77 1
	sub.s32 	%r166, %r165, %r212;
	.loc 1 77 1
	add.s32 	%r167, %r157, %r166;
	setp.eq.s32	%p66, %r167, %r159;
	and.pred  	%p67, %p66, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f347, %f332, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f348, %f347, %f346, %p67;
	selp.f32	%f349, %f347, %f341, %p67;
	.loc 1 77 1
	mul.ftz.f32 	%f350, %f330, %f166;
	fma.rn.ftz.f32 	%f351, %f331, %f165, %f350;
	fma.rn.ftz.f32 	%f352, %f348, %f167, %f351;
	fma.rn.ftz.f32 	%f353, %f349, %f168, %f352;
	st.shared.f32 	[%rd50+-12], %f353;
	mov.u32 	%r168, -4;
	.loc 1 77 1
	sub.s32 	%r169, %r168, %r212;
	.loc 1 77 1
	add.s32 	%r170, %r157, %r169;
	setp.eq.s32	%p68, %r170, %r159;
	and.pred  	%p69, %p68, %p60;
	.loc 1 77 1
	ld.shared.f32 	%f354, [%rd50+-16];
	.loc 1 77 1
	mul.ftz.f32 	%f355, %f354, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f356, %f355, %f353, %p69;
	selp.f32	%f357, %f355, %f348, %p69;
	.loc 1 77 1
	mul.ftz.f32 	%f358, %f331, %f166;
	fma.rn.ftz.f32 	%f359, %f332, %f165, %f358;
	fma.rn.ftz.f32 	%f360, %f356, %f167, %f359;
	fma.rn.ftz.f32 	%f361, %f357, %f168, %f360;
	ld.shared.f32 	%f362, [%rd50+-20];
	ld.shared.f32 	%f491, [%rd50+-24];
	ld.shared.f32 	%f492, [%rd50+-28];
	st.shared.f32 	[%rd50+-16], %f361;
	mov.u32 	%r171, -5;
	.loc 1 77 1
	sub.s32 	%r172, %r171, %r212;
	.loc 1 77 1
	add.s32 	%r173, %r157, %r172;
	setp.eq.s32	%p70, %r173, %r159;
	and.pred  	%p71, %p70, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f363, %f362, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f364, %f363, %f361, %p71;
	selp.f32	%f365, %f363, %f356, %p71;
	.loc 1 77 1
	mul.ftz.f32 	%f366, %f332, %f166;
	fma.rn.ftz.f32 	%f367, %f354, %f165, %f366;
	fma.rn.ftz.f32 	%f368, %f364, %f167, %f367;
	fma.rn.ftz.f32 	%f369, %f365, %f168, %f368;
	st.shared.f32 	[%rd50+-20], %f369;
	mov.u32 	%r174, -6;
	.loc 1 77 1
	sub.s32 	%r175, %r174, %r212;
	.loc 1 77 1
	add.s32 	%r176, %r157, %r175;
	setp.eq.s32	%p72, %r176, %r159;
	and.pred  	%p73, %p72, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f370, %f491, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f371, %f370, %f369, %p73;
	selp.f32	%f372, %f370, %f364, %p73;
	.loc 1 77 1
	mul.ftz.f32 	%f373, %f354, %f166;
	fma.rn.ftz.f32 	%f374, %f362, %f165, %f373;
	fma.rn.ftz.f32 	%f375, %f371, %f167, %f374;
	fma.rn.ftz.f32 	%f376, %f372, %f168, %f375;
	st.shared.f32 	[%rd50+-24], %f376;
	mov.u32 	%r177, -7;
	.loc 1 77 1
	sub.s32 	%r178, %r177, %r212;
	.loc 1 77 1
	add.s32 	%r179, %r157, %r178;
	setp.eq.s32	%p74, %r179, %r159;
	and.pred  	%p75, %p74, %p60;
	.loc 1 77 1
	mul.ftz.f32 	%f377, %f492, 0f3F000000;
	.loc 1 77 1
	selp.f32	%f489, %f377, %f376, %p75;
	selp.f32	%f378, %f377, %f371, %p75;
	.loc 1 77 1
	mul.ftz.f32 	%f379, %f362, %f166;
	fma.rn.ftz.f32 	%f380, %f491, %f165, %f379;
	fma.rn.ftz.f32 	%f381, %f489, %f167, %f380;
	fma.rn.ftz.f32 	%f490, %f378, %f168, %f381;
	st.shared.f32 	[%rd50+-28], %f490;
	add.s32 	%r213, %r213, -8;
	.loc 1 77 1
	add.s32 	%r212, %r212, 8;
	setp.ne.s32	%p76, %r212, 32;
	@%p76 bra 	BB6_72;

BB6_73:
	setp.lt.s32	%p6, %r117, %r40;
	.loc 1 77 1
	bar.sync 	0;
	add.s32 	%r202, %r121, %r211;
	.loc 1 77 1
	add.s32 	%r201, %r202, -32;
	mad.lo.s32 	%r31, %r117, %r38, %r202;
	.loc 1 77 1
	setp.lt.s32	%p77, %r201, %r39;
	.loc 1 77 1
	and.pred  	%p78, %p6, %p77;
	@!%p78 bra 	BB6_87;
	bra.uni 	BB6_74;

BB6_74:
	cvta.to.global.u64 	%rd51, %rd9;
	.loc 1 77 1
	add.s32 	%r184, %r31, -32;
	mul.wide.s32 	%rd52, %r184, 16;
	add.s64 	%rd53, %rd51, %rd52;
	.loc 1 77 1
	mad.lo.s32 	%r187, %r116, 33, %r121;
	mul.wide.s32 	%rd54, %r187, 4;
	mov.u64 	%rd55, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd56, %rd55, %rd54;
	.loc 1 77 1
	ld.shared.f32 	%f382, [%rd56];
	ld.global.v4.f32 	{%f383, %f384, %f385, %f386}, [%rd53];
	.loc 1 77 1
	add.ftz.f32 	%f121, %f383, %f382;
	ld.shared.f32 	%f388, [%rd56+1056];
	add.ftz.f32 	%f122, %f384, %f388;
	ld.shared.f32 	%f390, [%rd56+2112];
	add.ftz.f32 	%f123, %f385, %f390;
	ld.shared.f32 	%f392, [%rd56+3168];
	add.ftz.f32 	%f394, %f386, %f392;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f493, %f394;
	.loc 2 45 1
	add.ftz.f32 	%f395, %f493, 0fB70637BD;
	setp.gtu.ftz.f32	%p79, %f395, 0f00000000;
	@%p79 bra 	BB6_76;

	mov.f32 	%f496, 0f00000000;
	mov.f32 	%f495, %f496;
	mov.f32 	%f494, %f496;
	mov.f32 	%f493, %f496;
	bra.uni 	BB6_77;

BB6_76:
	mov.f32 	%f400, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f401, %f400, %f493;
	.loc 2 45 1
	mul.ftz.f32 	%f494, %f123, %f401;
	mul.ftz.f32 	%f495, %f122, %f401;
	mul.ftz.f32 	%f496, %f121, %f401;

BB6_77:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p80, %f496, 0f00000000;
	@%p80 bra 	BB6_79;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f402, %f496;
	mul.ftz.f32 	%f403, %f402, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f497, %f403;
	bra.uni 	BB6_80;

BB6_79:
	neg.ftz.f32 	%f404, %f496;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f405, %f404;
	mul.ftz.f32 	%f406, %f405, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f407, %f406;
	neg.ftz.f32 	%f497, %f407;

BB6_80:
	setp.ltu.ftz.f32	%p81, %f495, 0f00000000;
	@%p81 bra 	BB6_82;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f408, %f495;
	mul.ftz.f32 	%f409, %f408, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f498, %f409;
	bra.uni 	BB6_83;

BB6_82:
	neg.ftz.f32 	%f410, %f495;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f411, %f410;
	mul.ftz.f32 	%f412, %f411, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f413, %f412;
	neg.ftz.f32 	%f498, %f413;

BB6_83:
	setp.ltu.ftz.f32	%p82, %f494, 0f00000000;
	@%p82 bra 	BB6_85;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f414, %f494;
	mul.ftz.f32 	%f415, %f414, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f499, %f415;
	bra.uni 	BB6_86;

BB6_85:
	neg.ftz.f32 	%f416, %f494;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f417, %f416;
	mul.ftz.f32 	%f418, %f417, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f419, %f418;
	neg.ftz.f32 	%f499, %f419;

BB6_86:
	mul.wide.s32 	%rd58, %r184, 16;
	add.s64 	%rd59, %rd51, %rd58;
	.loc 1 77 1
	st.global.v4.f32 	[%rd59], {%f497, %f498, %f499, %f493};

BB6_87:
	.loc 1 77 1
	add.s32 	%r194, %r117, 4;
	setp.lt.s32	%p84, %r194, %r40;
	.loc 1 77 1
	shl.b32 	%r196, %r38, 2;
	add.s32 	%r32, %r31, %r196;
	.loc 1 77 1
	and.pred  	%p85, %p84, %p77;
	@!%p85 bra 	BB6_101;
	bra.uni 	BB6_88;

BB6_88:
	cvta.to.global.u64 	%rd60, %rd9;
	mul.wide.s32 	%rd61, %r32, 16;
	.loc 1 77 1
	mad.lo.s32 	%r199, %r116, 33, %r121;
	mul.wide.s32 	%rd62, %r199, 4;
	mov.u64 	%rd63, HorizontalRecursiveGaussianRGBAF32_kernel$__cuda_local_var_169785_10844_non_const_smem;
	add.s64 	%rd64, %rd63, %rd62;
	.loc 1 77 1
	ld.shared.f32 	%f420, [%rd64+528];
	add.s64 	%rd7, %rd60, %rd61;
	ld.global.v4.f32 	{%f421, %f422, %f423, %f424}, [%rd7+-512];
	.loc 1 77 1
	add.ftz.f32 	%f141, %f421, %f420;
	ld.shared.f32 	%f426, [%rd64+1584];
	add.ftz.f32 	%f142, %f422, %f426;
	ld.shared.f32 	%f428, [%rd64+2640];
	add.ftz.f32 	%f143, %f423, %f428;
	ld.shared.f32 	%f430, [%rd64+3696];
	add.ftz.f32 	%f432, %f424, %f430;
	.loc 3 2820 10
	cvt.ftz.sat.f32.f32	%f500, %f432;
	.loc 2 45 1
	add.ftz.f32 	%f433, %f500, 0fB70637BD;
	setp.gtu.ftz.f32	%p86, %f433, 0f00000000;
	@%p86 bra 	BB6_90;

	mov.f32 	%f503, 0f00000000;
	mov.f32 	%f502, %f503;
	mov.f32 	%f501, %f503;
	mov.f32 	%f500, %f503;
	bra.uni 	BB6_91;

BB6_90:
	mov.f32 	%f438, 0f3F800000;
	.loc 3 3606 10
	div.approx.ftz.f32 	%f439, %f438, %f500;
	.loc 2 45 1
	mul.ftz.f32 	%f501, %f143, %f439;
	mul.ftz.f32 	%f502, %f142, %f439;
	mul.ftz.f32 	%f503, %f141, %f439;

BB6_91:
	.loc 1 77 165
	setp.ltu.ftz.f32	%p87, %f503, 0f00000000;
	@%p87 bra 	BB6_93;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f440, %f503;
	mul.ftz.f32 	%f441, %f440, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f504, %f441;
	bra.uni 	BB6_94;

BB6_93:
	neg.ftz.f32 	%f442, %f503;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f443, %f442;
	mul.ftz.f32 	%f444, %f443, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f445, %f444;
	neg.ftz.f32 	%f504, %f445;

BB6_94:
	setp.ltu.ftz.f32	%p88, %f502, 0f00000000;
	@%p88 bra 	BB6_96;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f446, %f502;
	mul.ftz.f32 	%f447, %f446, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f505, %f447;
	bra.uni 	BB6_97;

BB6_96:
	neg.ftz.f32 	%f448, %f502;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f449, %f448;
	mul.ftz.f32 	%f450, %f449, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f451, %f450;
	neg.ftz.f32 	%f505, %f451;

BB6_97:
	setp.ltu.ftz.f32	%p89, %f501, 0f00000000;
	@%p89 bra 	BB6_99;

	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f452, %f501;
	mul.ftz.f32 	%f453, %f452, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f506, %f453;
	bra.uni 	BB6_100;

BB6_99:
	neg.ftz.f32 	%f454, %f501;
	.loc 3 3600 10
	lg2.approx.ftz.f32 	%f455, %f454;
	mul.ftz.f32 	%f456, %f455, 0f3EE8BA2E;
	ex2.approx.ftz.f32 	%f457, %f456;
	neg.ftz.f32 	%f506, %f457;

BB6_100:
	.loc 1 77 1
	st.global.v4.f32 	[%rd7+-512], {%f504, %f505, %f506, %f500};

BB6_101:
	.loc 1 77 1
	bar.sync 	0;
	.loc 1 77 1
	mad.lo.s32 	%r200, %r210, -32, %r17;
	add.s32 	%r211, %r200, -31;
	.loc 1 77 1
	setp.gt.s32	%p90, %r211, 0;
	add.s32 	%r210, %r210, 1;
	@%p90 bra 	BB6_42;

BB6_102:
	.loc 1 77 2
	ret;
}


