19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
22#if defined(__cplusplus) || defined(c_plusplus)
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
31#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE() "\n #""else " " \n"
34#define OPENCL_ENDIF() "\n #""endif " " \n"
35#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
38const char *accelerateKernels =
43 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49 OPENCL_DEFINE(SigmaRandom, (attenuate))
50 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53 OPENCL_DEFINE(QuantumScale, (1.0/QuantumRange))
80 Rec601YCbCrColorspace,
81 Rec709YCbCrColorspace,
85 TransparentColorspace,
101 UndefinedCompositeOp,
107 ChangeMaskCompositeOp,
109 ColorBurnCompositeOp,
110 ColorDodgeCompositeOp,
112 CopyBlackCompositeOp,
116 CopyGreenCompositeOp,
117 CopyMagentaCompositeOp,
118 CopyAlphaCompositeOp,
120 CopyYellowCompositeOp,
122 DarkenIntensityCompositeOp,
123 DifferenceCompositeOp,
127 DivideDstCompositeOp,
128 DivideSrcCompositeOp,
134 ExclusionCompositeOp,
135 HardLightCompositeOp,
139 IntensityCompositeOp,
141 LightenIntensityCompositeOp,
142 LinearBurnCompositeOp,
143 LinearDodgeCompositeOp,
144 LinearLightCompositeOp,
146 MathematicsCompositeOp,
150 ModulusAddCompositeOp,
151 ModulusSubtractCompositeOp,
157 PegtopLightCompositeOp,
163 SoftLightCompositeOp,
169 ThresholdCompositeOp,
170 VividLightCompositeOp,
193 MultiplicativeGaussianNoise,
204 UndefinedPixelIntensityMethod = 0,
205 AveragePixelIntensityMethod,
206 BrightnessPixelIntensityMethod,
207 LightnessPixelIntensityMethod,
208 MSPixelIntensityMethod,
209 Rec601LumaPixelIntensityMethod,
210 Rec601LuminancePixelIntensityMethod,
211 Rec709LumaPixelIntensityMethod,
212 Rec709LuminancePixelIntensityMethod,
213 RMSPixelIntensityMethod
214 } PixelIntensityMethod;
220 BoxWeightingFunction = 0,
221 TriangleWeightingFunction,
222 CubicBCWeightingFunction,
223 HannWeightingFunction,
224 HammingWeightingFunction,
225 BlackmanWeightingFunction,
226 GaussianWeightingFunction,
227 QuadraticWeightingFunction,
228 JincWeightingFunction,
229 SincWeightingFunction,
230 SincFastWeightingFunction,
231 KaiserWeightingFunction,
232 WelchWeightingFunction,
233 BohmanWeightingFunction,
234 LagrangeWeightingFunction,
235 CosineWeightingFunction
236 } ResizeWeightingFunctionType;
242 UndefinedChannel = 0x0000,
244 GrayChannel = 0x0001,
245 CyanChannel = 0x0001,
246 GreenChannel = 0x0002,
247 MagentaChannel = 0x0002,
248 BlueChannel = 0x0004,
249 YellowChannel = 0x0004,
250 BlackChannel = 0x0008,
251 AlphaChannel = 0x0010,
252 OpacityChannel = 0x0010,
253 IndexChannel = 0x0020,
254 ReadMaskChannel = 0x0040,
255 WriteMaskChannel = 0x0080,
256 MetaChannel = 0x0100,
257 CompositeChannels = 0x001F,
258 AllChannels = 0x7ffffff,
266 TrueAlphaChannel = 0x0100,
267 RGBChannels = 0x0200,
268 GrayChannels = 0x0400,
269 SyncChannels = 0x20000,
270 DefaultChannels = AllChannels
278OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
281 static inline CLQuantum ScaleCharToQuantum(const
unsigned char value)
283 return((CLQuantum) value);
287OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
290 static inline CLQuantum ScaleCharToQuantum(
const unsigned char value)
292 return((CLQuantum) (257.0f*value));
296OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
299 static inline CLQuantum ScaleCharToQuantum(
const unsigned char value)
301 return((CLQuantum) (16843009.0*value));
307OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
310 static inline CLQuantum ClampToQuantum(const
float value)
312 return (CLQuantum) clamp(value, 0.0f, QuantumRange);
319 static inline CLQuantum ClampToQuantum(
const float value)
321 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
328 static inline
int ClampToCanvas(const
int offset,const
int range)
330 return clamp(offset, (
int)0, range-1);
335 static inline uint ScaleQuantumToMap(CLQuantum value)
337 if (value >= (CLQuantum) MaxMap)
338 return ((uint)MaxMap);
340 return ((uint)value);
345 static inline float PerceptibleReciprocal(
const float x)
347 float sign = x < (float) 0.0 ? (
float) -1.0 : (float) 1.0;
348 return((sign*x) >= MagickEpsilon ? (
float) 1.0/x : sign*((
float) 1.0/MagickEpsilon));
354 static inline unsigned int getPixelIndex(
const unsigned int number_channels,
355 const unsigned int columns,
const unsigned int x,
const unsigned int y)
357 return (x * number_channels) + (y * columns * number_channels);
360 static inline float getPixelRed(
const __global CLQuantum *p) {
return (
float)*p; }
361 static inline float getPixelGreen(
const __global CLQuantum *p) {
return (
float)*(p+1); }
362 static inline float getPixelBlue(
const __global CLQuantum *p) {
return (
float)*(p+2); }
363 static inline float getPixelAlpha(
const __global CLQuantum *p,
const unsigned int number_channels) {
return (
float)*(p+number_channels-1); }
365 static inline void setPixelRed(__global CLQuantum *p,
const CLQuantum value) { *p=value; }
366 static inline void setPixelGreen(__global CLQuantum *p,
const CLQuantum value) { *(p+1)=value; }
367 static inline void setPixelBlue(__global CLQuantum *p,
const CLQuantum value) { *(p+2)=value; }
368 static inline void setPixelAlpha(__global CLQuantum *p,
const unsigned int number_channels,
const CLQuantum value) { *(p+number_channels-1)=value; }
370 static inline CLQuantum getBlue(CLPixelType p) {
return p.x; }
371 static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
372 static inline float getBlueF4(float4 p) {
return p.x; }
373 static inline void setBlueF4(float4* p,
float value) { (*p).x = value; }
375 static inline CLQuantum getGreen(CLPixelType p) {
return p.y; }
376 static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
377 static inline float getGreenF4(float4 p) {
return p.y; }
378 static inline void setGreenF4(float4* p,
float value) { (*p).y = value; }
380 static inline CLQuantum getRed(CLPixelType p) {
return p.z; }
381 static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
382 static inline float getRedF4(float4 p) {
return p.z; }
383 static inline void setRedF4(float4* p,
float value) { (*p).z = value; }
385 static inline CLQuantum getAlpha(CLPixelType p) {
return p.w; }
386 static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
387 static inline float getAlphaF4(float4 p) {
return p.w; }
388 static inline void setAlphaF4(float4* p,
float value) { (*p).w = value; }
390 static inline void ReadChannels(
const __global CLQuantum *p,
const unsigned int number_channels,
391 const ChannelType channel,
float *red,
float *green,
float *blue,
float *alpha)
393 if ((channel & RedChannel) != 0)
396 if (number_channels > 2)
398 if ((channel & GreenChannel) != 0)
399 *green=getPixelGreen(p);
401 if ((channel & BlueChannel) != 0)
402 *blue=getPixelBlue(p);
405 if (((number_channels == 4) || (number_channels == 2)) &&
406 ((channel & AlphaChannel) != 0))
407 *alpha=getPixelAlpha(p,number_channels);
410 static inline float4 ReadAllChannels(
const __global CLQuantum *image,
const unsigned int number_channels,
411 const unsigned int columns,
const unsigned int x,
const unsigned int y)
413 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
417 pixel.x=getPixelRed(p);
419 if (number_channels > 2)
421 pixel.y=getPixelGreen(p);
422 pixel.z=getPixelBlue(p);
425 if ((number_channels == 4) || (number_channels == 2))
426 pixel.w=getPixelAlpha(p,number_channels);
430 static inline float4 ReadFloat4(
const __global CLQuantum *image,
const unsigned int number_channels,
431 const unsigned int columns,
const unsigned int x,
const unsigned int y,
const ChannelType channel)
433 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
440 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
441 return (float4)(red, green, blue, alpha);
444 static inline void WriteChannels(__global CLQuantum *p,
const unsigned int number_channels,
445 const ChannelType channel,
float red,
float green,
float blue,
float alpha)
447 if ((channel & RedChannel) != 0)
448 setPixelRed(p,ClampToQuantum(red));
450 if (number_channels > 2)
452 if ((channel & GreenChannel) != 0)
453 setPixelGreen(p,ClampToQuantum(green));
455 if ((channel & BlueChannel) != 0)
456 setPixelBlue(p,ClampToQuantum(blue));
459 if (((number_channels == 4) || (number_channels == 2)) &&
460 ((channel & AlphaChannel) != 0))
461 setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
464 static inline void WriteAllChannels(__global CLQuantum *image,
const unsigned int number_channels,
465 const unsigned int columns,
const unsigned int x,
const unsigned int y, float4 pixel)
467 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
469 setPixelRed(p,ClampToQuantum(pixel.x));
471 if (number_channels > 2)
473 setPixelGreen(p,ClampToQuantum(pixel.y));
474 setPixelBlue(p,ClampToQuantum(pixel.z));
477 if ((number_channels == 4) || (number_channels == 2))
478 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
481 static inline void WriteFloat4(__global CLQuantum *image,
const unsigned int number_channels,
482 const unsigned int columns,
const unsigned int x,
const unsigned int y,
const ChannelType channel,
485 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
486 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
489 static inline float GetPixelIntensity(
const unsigned int colorspace,
490 const unsigned int method,
float red,
float green,
float blue)
494 if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
499 case AveragePixelIntensityMethod:
501 intensity=(red+green+blue)/3.0;
504 case BrightnessPixelIntensityMethod:
506 intensity=MagickMax(MagickMax(red,green),blue);
509 case LightnessPixelIntensityMethod:
511 intensity=(MagickMin(MagickMin(red,green),blue)+
512 MagickMax(MagickMax(red,green),blue))/2.0;
515 case MSPixelIntensityMethod:
517 intensity=(float) (((
float) red*red+green*green+blue*blue)/
521 case Rec601LumaPixelIntensityMethod:
531 intensity=0.298839*red+0.586811*green+0.114350*blue;
534 case Rec601LuminancePixelIntensityMethod:
544 intensity=0.298839*red+0.586811*green+0.114350*blue;
547 case Rec709LumaPixelIntensityMethod:
558 intensity=0.212656*red+0.715158*green+0.072186*blue;
561 case Rec709LuminancePixelIntensityMethod:
571 intensity=0.212656*red+0.715158*green+0.072186*blue;
574 case RMSPixelIntensityMethod:
576 intensity=(float) (sqrt((
float) red*red+green*green+blue*blue)/
585 static inline int mirrorBottom(
int value)
587 return (value < 0) ? - (value) : value;
590 static inline int mirrorTop(
int value,
int width)
592 return (value >= width) ? (2 * width - value - 1) : value;
617 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
621 if( (v>=M) || (convert_float(v) < convert_float(a)) )
632 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
637 r=MWC_AddMod64(r,b,M);
638 b=MWC_AddMod64(b,b,M);
648 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
653 acc=MWC_MulMod64(acc,sqr,M);
654 sqr=MWC_MulMod64(sqr,sqr,M);
660 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
662 ulong m=MWC_PowMod64(A, distance, M);
663 ulong x=curr.x*(ulong)A+curr.y;
664 x=MWC_MulMod64(x, m, M);
665 return (uint2)((uint)(x/A), (uint)(x%A));
668 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
675 enum{ MWC_BASEID = 4077358422479273989UL };
677 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
678 ulong m=MWC_PowMod64(A, dist, M);
680 ulong x=MWC_MulMod64(MWC_BASEID, m, M);
681 return (uint2)((uint)(x/A), (uint)(x%A));
685 typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
687 void MWC64X_Step(mwc64x_state_t *s)
691 uint Xn=s->seed0*X+C;
692 uint carry=(uint)(Xn<C);
693 uint Cn=mad_hi(s->seed0,X,carry);
699 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
701 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
706 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
708 uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
714 uint MWC64X_NextUint(mwc64x_state_t *s)
716 uint res=s->x ^ s->c;
725 float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
727 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff);
730 float mwcGenerateDifferentialNoise(mwc64x_state_t* r,
float pixel, NoiseType noise_type,
float attenuate)
739 alpha=mwcReadPseudoRandomValue(r);
745 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
756 beta=mwcReadPseudoRandomValue(r);
757 gamma=sqrt(-2.0f*log(alpha));
758 sigma=gamma*cospi((2.0f*beta));
759 tau=gamma*sinpi((2.0f*beta));
760 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
765 if (alpha < (SigmaImpulse/2.0f))
768 if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
778 if (alpha <= MagickEpsilon)
779 noise=(pixel-QuantumRange);
781 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
786 if (beta <= (0.5f*MagickEpsilon))
787 noise=(pixel+QuantumRange);
789 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
792 case MultiplicativeGaussianNoise:
795 if (alpha > MagickEpsilon)
796 sigma=sqrt(-2.0f*log(alpha));
797 beta=mwcReadPseudoRandomValue(r);
798 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
799 cospi((2.0f*beta))/2.0f);
807 poisson=exp(-SigmaPoisson*QuantumScale*pixel);
808 for (i=0; alpha > poisson; i++)
810 beta=mwcReadPseudoRandomValue(r);
813 noise=(QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
818 noise=(QuantumRange*SigmaRandom*alpha);
842 __kernel
void BlurRow(
const __global CLQuantum *image,
843 const unsigned int number_channels,
const ChannelType channel,
844 __constant
float *filter,
const unsigned int width,
845 const unsigned int imageColumns,
const unsigned int imageRows,
846 __local float4 *temp,__global float4 *tempImage)
848 const int x = get_global_id(0);
849 const int y = get_global_id(1);
851 const int columns = imageColumns;
853 const unsigned int radius = (width-1)/2;
854 const int wsize = get_local_size(0);
855 const unsigned int loadSize = wsize+width;
858 const int groupX=get_local_size(0)*get_group_id(0);
861 for (
int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
863 int cx = ClampToCanvas(i + groupX - radius, columns);
864 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
868 barrier(CLK_LOCAL_MEM_FENCE);
871 if (get_global_id(0) < columns)
874 float4 result = (float4) 0;
878 for ( ; i+7 < width; )
880 for (
int j=0; j < 8; j++)
881 result+=filter[i+j]*temp[i+j+get_local_id(0)];
885 for ( ; i < width; i++)
886 result+=filter[i]*temp[i+get_local_id(0)];
889 tempImage[y*columns+x] = result;
898 __kernel
void BlurColumn(
const __global float4 *blurRowData,
899 const unsigned int number_channels,
const ChannelType channel,
900 __constant
float *filter,
const unsigned int width,
901 const unsigned int imageColumns,
const unsigned int imageRows,
902 __local float4 *temp,__global CLQuantum *filteredImage)
904 const int x = get_global_id(0);
905 const int y = get_global_id(1);
907 const int columns = imageColumns;
908 const int rows = imageRows;
910 unsigned int radius = (width-1)/2;
911 const int wsize = get_local_size(1);
912 const unsigned int loadSize = wsize+width;
915 const int groupX=get_local_size(0)*get_group_id(0);
916 const int groupY=get_local_size(1)*get_group_id(1);
921 for (
int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
922 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
925 barrier(CLK_LOCAL_MEM_FENCE);
928 if (get_global_id(1) < rows)
931 float4 result = (float4) 0;
935 for ( ; i+7 < width; )
937 for (
int j=0; j < 8; j++)
938 result+=filter[i+j]*temp[i+j+get_local_id(1)];
942 for ( ; i < width; i++)
943 result+=filter[i]*temp[i+get_local_id(1)];
946 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
965 static inline float4 ConvertRGBToHSB(
const float4 pixel)
969 float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
972 float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
973 float delta=tmax-tmin;
976 result.z=QuantumScale*tmax;
979 result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
980 result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
981 (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
983 result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
989 static inline float4 ConvertHSBToRGB(
const float4 pixel)
992 float saturation=pixel.y;
993 float brightness=pixel.z;
997 if (saturation == 0.0f)
999 result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1003 float h=6.0f*(hue-floor(hue));
1005 float p=brightness*(1.0f-saturation);
1006 float q=brightness*(1.0f-saturation*f);
1007 float t=brightness*(1.0f-(saturation*(1.0f-f)));
1012 result.x=ClampToQuantum(QuantumRange*q);
1013 result.y=ClampToQuantum(QuantumRange*brightness);
1014 result.z=ClampToQuantum(QuantumRange*p);
1018 result.x=ClampToQuantum(QuantumRange*p);
1019 result.y=ClampToQuantum(QuantumRange*brightness);
1020 result.z=ClampToQuantum(QuantumRange*t);
1024 result.x=ClampToQuantum(QuantumRange*p);
1025 result.y=ClampToQuantum(QuantumRange*q);
1026 result.z=ClampToQuantum(QuantumRange*brightness);
1030 result.x=ClampToQuantum(QuantumRange*t);
1031 result.y=ClampToQuantum(QuantumRange*p);
1032 result.z=ClampToQuantum(QuantumRange*brightness);
1036 result.x=ClampToQuantum(QuantumRange*brightness);
1037 result.y=ClampToQuantum(QuantumRange*p);
1038 result.z=ClampToQuantum(QuantumRange*q);
1042 result.x=ClampToQuantum(QuantumRange*brightness);
1043 result.y=ClampToQuantum(QuantumRange*t);
1044 result.z=ClampToQuantum(QuantumRange*p);
1050 __kernel
void Contrast(__global CLQuantum *image,
1051 const unsigned int number_channels,
const int sign)
1053 const int x=get_global_id(0);
1054 const int y=get_global_id(1);
1055 const unsigned int columns=get_global_size(0);
1057 float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1058 if (number_channels < 3)
1059 pixel.y=pixel.z=pixel.x;
1061 pixel=ConvertRGBToHSB(pixel);
1062 float brightness=pixel.z;
1063 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1064 brightness=clamp(brightness,0.0f,1.0f);
1066 pixel=ConvertHSBToRGB(pixel);
1068 WriteAllChannels(image,number_channels,columns,x,y,pixel);
1087 __kernel
void Histogram(__global CLPixelType * restrict im,
1088 const ChannelType channel,
1089 const unsigned int colorspace,
1090 const unsigned int method,
1091 __global uint4 * restrict histogram)
1093 const int x = get_global_id(0);
1094 const int y = get_global_id(1);
1095 const int columns = get_global_size(0);
1096 const int c = x + y * columns;
1097 if ((channel & SyncChannels) != 0)
1099 float red=(float)getRed(im[c]);
1100 float green=(float)getGreen(im[c]);
1101 float blue=(float)getBlue(im[c]);
1103 float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1104 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1105 atomic_inc((__global uint *)(&(histogram[pos]))+2);
1118 __kernel
void ContrastStretch(__global CLPixelType * restrict im,
1119 const ChannelType channel,
1120 __global CLPixelType * restrict stretch_map,
1121 const float4 white,
const float4 black)
1123 const int x = get_global_id(0);
1124 const int y = get_global_id(1);
1125 const int columns = get_global_size(0);
1126 const int c = x + y * columns;
1129 CLPixelType oValue, eValue;
1130 CLQuantum red, green, blue, alpha;
1135 if ((channel & RedChannel) != 0)
1137 if (getRedF4(white) != getRedF4(black))
1139 ePos = ScaleQuantumToMap(getRed(oValue));
1140 eValue = stretch_map[ePos];
1141 red = getRed(eValue);
1145 if ((channel & GreenChannel) != 0)
1147 if (getGreenF4(white) != getGreenF4(black))
1149 ePos = ScaleQuantumToMap(getGreen(oValue));
1150 eValue = stretch_map[ePos];
1151 green = getGreen(eValue);
1155 if ((channel & BlueChannel) != 0)
1157 if (getBlueF4(white) != getBlueF4(black))
1159 ePos = ScaleQuantumToMap(getBlue(oValue));
1160 eValue = stretch_map[ePos];
1161 blue = getBlue(eValue);
1165 if ((channel & AlphaChannel) != 0)
1167 if (getAlphaF4(white) != getAlphaF4(black))
1169 ePos = ScaleQuantumToMap(getAlpha(oValue));
1170 eValue = stretch_map[ePos];
1171 alpha = getAlpha(eValue);
1176 im[c]=(CLPixelType)(blue, green, red, alpha);
1194 __kernel
void HullPass1(
const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1195 ,
const unsigned int imageWidth,
const unsigned int imageHeight
1196 ,
const int2 offset,
const int polarity,
const int matte) {
1198 int x = get_global_id(0);
1199 int y = get_global_id(1);
1201 CLPixelType v = inputImage[y*imageWidth+x];
1204 neighbor.y = y + offset.y;
1205 neighbor.x = x + offset.x;
1207 int2 clampedNeighbor;
1208 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1209 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1211 CLPixelType r = (clampedNeighbor.x == neighbor.x
1212 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1228 \n #pragma unroll 4\n
1229 for (
unsigned int i = 0; i < 4; i++) {
1230 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1234 \n #pragma unroll 4\n
1235 for (
unsigned int i = 0; i < 4; i++) {
1236 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1241 v.x = (CLQuantum)sv[0];
1242 v.y = (CLQuantum)sv[1];
1243 v.z = (CLQuantum)sv[2];
1246 v.w = (CLQuantum)sv[3];
1248 outputImage[y*imageWidth+x] = v;
1257 __kernel
void HullPass2(
const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1258 ,
const unsigned int imageWidth,
const unsigned int imageHeight
1259 ,
const int2 offset,
const int polarity,
const int matte) {
1261 int x = get_global_id(0);
1262 int y = get_global_id(1);
1264 CLPixelType v = inputImage[y*imageWidth+x];
1266 int2 neighbor, clampedNeighbor;
1268 neighbor.y = y + offset.y;
1269 neighbor.x = x + offset.x;
1270 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1271 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1273 CLPixelType r = (clampedNeighbor.x == neighbor.x
1274 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1278 neighbor.y = y - offset.y;
1279 neighbor.x = x - offset.x;
1280 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1281 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1283 CLPixelType s = (clampedNeighbor.x == neighbor.x
1284 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1307 \n #pragma unroll 4\n
1308 for (
unsigned int i = 0; i < 4; i++) {
1313 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1317 \n #pragma unroll 4\n
1318 for (
unsigned int i = 0; i < 4; i++) {
1322 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1326 v.x = (CLQuantum)sv[0];
1327 v.y = (CLQuantum)sv[1];
1328 v.z = (CLQuantum)sv[2];
1331 v.w = (CLQuantum)sv[3];
1333 outputImage[y*imageWidth+x] = v;
1353 __kernel
void Equalize(__global CLPixelType * restrict im,
1354 const ChannelType channel,
1355 __global CLPixelType * restrict equalize_map,
1356 const float4 white,
const float4 black)
1358 const int x = get_global_id(0);
1359 const int y = get_global_id(1);
1360 const int columns = get_global_size(0);
1361 const int c = x + y * columns;
1364 CLPixelType oValue, eValue;
1365 CLQuantum red, green, blue, alpha;
1370 if ((channel & SyncChannels) != 0)
1372 if (getRedF4(white) != getRedF4(black))
1374 ePos = ScaleQuantumToMap(getRed(oValue));
1375 eValue = equalize_map[ePos];
1376 red = getRed(eValue);
1377 ePos = ScaleQuantumToMap(getGreen(oValue));
1378 eValue = equalize_map[ePos];
1379 green = getRed(eValue);
1380 ePos = ScaleQuantumToMap(getBlue(oValue));
1381 eValue = equalize_map[ePos];
1382 blue = getRed(eValue);
1383 ePos = ScaleQuantumToMap(getAlpha(oValue));
1384 eValue = equalize_map[ePos];
1385 alpha = getRed(eValue);
1388 im[c]=(CLPixelType)(blue, green, red, alpha);
1415 CLQuantum ApplyFunction(
float pixel,
const MagickFunction function,
1416 const unsigned int number_parameters,__constant
float *parameters)
1418 float result = 0.0f;
1422 case PolynomialFunction:
1424 for (
unsigned int i=0; i < number_parameters; i++)
1425 result = result*QuantumScale*pixel + parameters[i];
1426 result *= QuantumRange;
1429 case SinusoidFunction:
1431 float freq,phase,ampl,bias;
1432 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1433 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1434 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1435 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1436 result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1437 (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1440 case ArcsinFunction:
1442 float width,range,center,bias;
1443 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1444 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1445 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1446 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1448 result = 2.0f/width*(QuantumScale*pixel - center);
1449 result = range/MagickPI*asin(result)+bias;
1450 result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1451 result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1452 result *= QuantumRange;
1455 case ArctanFunction:
1457 float slope,range,center,bias;
1458 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1459 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1460 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1461 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1462 result = MagickPI*slope*(QuantumScale*pixel-center);
1463 result = QuantumRange*(range/MagickPI*atan(result) + bias);
1466 case UndefinedFunction:
1469 return(ClampToQuantum(result));
1481 __kernel
void ComputeFunction(__global CLQuantum *image,
const unsigned int number_channels,
1482 const ChannelType channel,
const MagickFunction function,
const unsigned int number_parameters,
1483 __constant
float *parameters)
1485 const unsigned int x = get_global_id(0);
1486 const unsigned int y = get_global_id(1);
1487 const unsigned int columns = get_global_size(0);
1488 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1495 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1497 if ((channel & RedChannel) != 0)
1498 red=ApplyFunction(red, function, number_parameters, parameters);
1500 if (number_channels > 2)
1502 if ((channel & GreenChannel) != 0)
1503 green=ApplyFunction(green, function, number_parameters, parameters);
1505 if ((channel & BlueChannel) != 0)
1506 blue=ApplyFunction(blue, function, number_parameters, parameters);
1509 if (((number_channels == 4) || (number_channels == 2)) &&
1510 ((channel & AlphaChannel) != 0))
1511 alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1513 WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1530 __kernel
void Grayscale(__global CLQuantum *image,
const int number_channels,
1531 const unsigned int colorspace,
const unsigned int method)
1533 const unsigned int x = get_global_id(0);
1534 const unsigned int y = get_global_id(1);
1535 const unsigned int columns = get_global_size(0);
1536 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1544 green=getPixelGreen(p);
1545 blue=getPixelBlue(p);
1547 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1549 setPixelRed(p,intensity);
1550 setPixelGreen(p,intensity);
1551 setPixelBlue(p,intensity);
1569 __kernel
void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global
float *tmpImage,
1571 const int imageWidth,
1572 const int imageHeight)
1574 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1576 int x = get_local_id(0);
1577 int y = get_global_id(1);
1579 if ((x >= imageWidth) || (y >= imageHeight))
1582 global CLPixelType *src = srcImage + y * imageWidth;
1584 for (
int i = x; i < imageWidth; i += get_local_size(0)) {
1586 float weight = 1.0f;
1589 while ((j + 7) < i) {
1590 for (
int k = 0; k < 8; ++k)
1591 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1596 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1601 while ((j + 7) < radius + i) {
1602 for (
int k = 0; k < 8; ++k)
1603 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1607 while (j < radius + i) {
1608 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1613 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1619 __kernel
void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global
float *blurImage,
1621 const float strength,
1622 const int imageWidth,
1623 const int imageHeight)
1625 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1627 int x = get_global_id(0);
1628 int y = get_global_id(1);
1630 if ((x >= imageWidth) || (y >= imageHeight))
1633 global
float *src = blurImage + x;
1636 float weight = 1.0f;
1639 while ((j + 7) < y) {
1640 for (
int k = 0; k < 8; ++k)
1641 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1646 sum += weight * src[mirrorBottom(j) * imageWidth];
1651 while ((j + 7) < radius + y) {
1652 for (
int k = 0; k < 8; ++k)
1653 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1657 while (j < radius + y) {
1658 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1663 CLPixelType pixel = srcImage[x + y * imageWidth];
1664 float srcVal = dot(RGB, convert_float4(pixel));
1665 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1666 mult = (srcVal + mult) / srcVal;
1668 pixel.x = ClampToQuantum(pixel.x * mult);
1669 pixel.y = ClampToQuantum(pixel.y * mult);
1670 pixel.z = ClampToQuantum(pixel.z * mult);
1672 dstImage[x + y * imageWidth] = pixel;
1690 static inline void ConvertRGBToHSL(
const CLQuantum red,
const CLQuantum green,
const CLQuantum blue,
1691 float *hue,
float *saturation,
float *lightness)
1701 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1702 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1706 *lightness=(tmax+tmin)/2.0;
1714 if (tmax == (QuantumScale*red))
1716 *hue=(QuantumScale*green-QuantumScale*blue)/c;
1717 if ((QuantumScale*green) < (QuantumScale*blue))
1721 if (tmax == (QuantumScale*green))
1722 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1724 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1727 if (*lightness <= 0.5)
1728 *saturation=c/(2.0*(*lightness));
1730 *saturation=c/(2.0-2.0*(*lightness));
1733 static inline void ConvertHSLToRGB(
const float hue,
const float saturation,
const float lightness,
1734 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1749 if (lightness <= 0.5)
1750 c=2.0*lightness*saturation;
1752 c=(2.0-2.0*lightness)*saturation;
1753 tmin=lightness-0.5*c;
1754 h-=360.0*floor(h/360.0);
1756 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
1757 switch ((
int) floor(h) % 6)
1803 *red=ClampToQuantum(QuantumRange*r);
1804 *green=ClampToQuantum(QuantumRange*g);
1805 *blue=ClampToQuantum(QuantumRange*b);
1808 static inline void ModulateHSL(
const float percent_hue,
const float percent_saturation,
const float percent_lightness,
1809 CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1819 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
1820 hue+=0.5*(0.01*percent_hue-1.0);
1825 saturation*=0.01*percent_saturation;
1826 lightness*=0.01*percent_lightness;
1827 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
1830 __kernel
void Modulate(__global CLPixelType *im,
1831 const float percent_brightness,
1832 const float percent_hue,
1833 const float percent_saturation,
1834 const int colorspace)
1837 const int x = get_global_id(0);
1838 const int y = get_global_id(1);
1839 const int columns = get_global_size(0);
1840 const int c = x + y * columns;
1842 CLPixelType pixel = im[c];
1850 green=getGreen(pixel);
1851 blue=getBlue(pixel);
1858 ModulateHSL(percent_hue, percent_saturation, percent_brightness,
1859 &red, &green, &blue);
1864 CLPixelType filteredPixel;
1866 setRed(&filteredPixel, red);
1867 setGreen(&filteredPixel, green);
1868 setBlue(&filteredPixel, blue);
1869 filteredPixel.w = pixel.w;
1871 im[c] = filteredPixel;
1889 void MotionBlur(
const __global CLPixelType *input, __global CLPixelType *output,
1890 const unsigned int imageWidth,
const unsigned int imageHeight,
1891 const __global
float *filter,
const unsigned int width,
const __global int2* offset,
1893 const ChannelType channel,
const unsigned int matte) {
1896 currentPixel.x = get_global_id(0);
1897 currentPixel.y = get_global_id(1);
1899 if (currentPixel.x >= imageWidth
1900 || currentPixel.y >= imageHeight)
1904 pixel.x = (float)bias.x;
1905 pixel.y = (float)bias.y;
1906 pixel.z = (float)bias.z;
1907 pixel.w = (float)bias.w;
1909 if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1911 for (
int i = 0; i < width; i++) {
1914 int2 samplePixel = currentPixel + offset[i];
1915 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1916 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1917 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1919 pixel.x += (filter[i] * (float)samplePixelValue.x);
1920 pixel.y += (filter[i] * (float)samplePixelValue.y);
1921 pixel.z += (filter[i] * (float)samplePixelValue.z);
1922 pixel.w += (filter[i] * (float)samplePixelValue.w);
1925 CLPixelType outputPixel;
1926 outputPixel.x = ClampToQuantum(pixel.x);
1927 outputPixel.y = ClampToQuantum(pixel.y);
1928 outputPixel.z = ClampToQuantum(pixel.z);
1929 outputPixel.w = ClampToQuantum(pixel.w);
1930 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1935 for (
int i = 0; i < width; i++) {
1938 int2 samplePixel = currentPixel + offset[i];
1939 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1940 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1942 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1944 float alpha = QuantumScale*samplePixelValue.w;
1945 float k = filter[i];
1946 pixel.x = pixel.x + k * alpha * samplePixelValue.x;
1947 pixel.y = pixel.y + k * alpha * samplePixelValue.y;
1948 pixel.z = pixel.z + k * alpha * samplePixelValue.z;
1950 pixel.w += k * alpha * samplePixelValue.w;
1954 gamma = PerceptibleReciprocal(gamma);
1955 pixel.xyz = gamma*pixel.xyz;
1957 CLPixelType outputPixel;
1958 outputPixel.x = ClampToQuantum(pixel.x);
1959 outputPixel.y = ClampToQuantum(pixel.y);
1960 outputPixel.z = ClampToQuantum(pixel.z);
1961 outputPixel.w = ClampToQuantum(pixel.w);
1962 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1981 float BoxResizeFilter(
const float x)
1989 float CubicBC(
const float x,
const __global
float* resizeFilterCoefficients)
2021 return(resizeFilterCoefficients[0]+x*(x*
2022 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2024 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2025 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2031 float Sinc(
const float x)
2035 const float alpha=(float) (MagickPI*x);
2036 return sinpi(x)/alpha;
2043 float Triangle(
const float x)
2050 return ((x<1.0f)?(1.0f-x):0.0f);
2056 float Hann(
const float x)
2062 const float cosine=cos((MagickPI*x));
2063 return(0.5f+0.5f*cosine);
2068 float Hamming(
const float x)
2074 const float cosine=cos((MagickPI*x));
2075 return(0.54f+0.46f*cosine);
2080 float Blackman(
const float x)
2089 const float cosine=cos((MagickPI*x));
2090 return(0.34f+cosine*(0.5f+cosine*0.16f));
2095 static inline float applyResizeFilter(
const float x,
const ResizeWeightingFunctionType filterType,
const __global
float* filterCoefficients)
2101 case SincWeightingFunction:
2102 case SincFastWeightingFunction:
2104 case CubicBCWeightingFunction:
2105 return CubicBC(x,filterCoefficients);
2106 case BoxWeightingFunction:
2107 return BoxResizeFilter(x);
2108 case TriangleWeightingFunction:
2110 case HannWeightingFunction:
2112 case HammingWeightingFunction:
2114 case BlackmanWeightingFunction:
2125 static inline float getResizeFilterWeight(
const __global
float* resizeFilterCubicCoefficients,
const ResizeWeightingFunctionType resizeFilterType
2126 ,
const ResizeWeightingFunctionType resizeWindowType
2127 ,
const float resizeFilterScale,
const float resizeWindowSupport,
const float resizeFilterBlur,
const float x)
2130 float xBlur = fabs(x/resizeFilterBlur);
2131 if (resizeWindowSupport < MagickEpsilon
2132 || resizeWindowType == BoxWeightingFunction)
2138 scale = resizeFilterScale;
2139 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2141 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2148 const char *accelerateKernels2 =
2152 static inline unsigned int getNumWorkItemsPerPixel(
const unsigned int pixelPerWorkgroup,
const unsigned int numWorkItems) {
2153 return (numWorkItems/pixelPerWorkgroup);
2158 static inline int pixelToCompute(
const unsigned itemID,
const unsigned int pixelPerWorkgroup,
const unsigned int numWorkItems) {
2159 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2160 int pixelIndex = itemID/numWorkItemsPerPixel;
2161 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2168 __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2169 void ResizeHorizontalFilter(
const __global CLQuantum *inputImage,
const unsigned int number_channels,
2170 const unsigned int inputColumns,
const unsigned int inputRows, __global CLQuantum *filteredImage,
2171 const unsigned int filteredColumns,
const unsigned int filteredRows,
const float xFactor,
2172 const int resizeFilterType,
const int resizeWindowType,
const __global
float *resizeFilterCubicCoefficients,
2173 const float resizeFilterScale,
const float resizeFilterSupport,
const float resizeFilterWindowSupport,
2174 const float resizeFilterBlur, __local CLQuantum *inputImageCache,
const int numCachedPixels,
2175 const unsigned int pixelPerWorkgroup,
const unsigned int pixelChunkSize,
2176 __local float4 *outputPixelCache, __local
float *densityCache, __local
float *gammaCache)
2179 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2180 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2181 const unsigned int actualNumPixelToCompute = stopX - startX;
2184 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2185 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2186 scale = PerceptibleReciprocal(scale);
2188 const int cacheRangeStartX = MagickMax((
int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(
int)(0));
2189 const int cacheRangeEndX = MagickMin((
int)(cacheRangeStartX + numCachedPixels), (
int)inputColumns);
2192 const unsigned int y = get_global_id(1);
2193 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2194 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2195 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2196 wait_group_events(1, &e);
2198 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2199 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2200 for (
unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2202 const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2203 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2204 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2207 const unsigned int itemID = get_local_id(0);
2208 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2210 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2212 float4 filteredPixel = (float4)0.0f;
2213 float density = 0.0f;
2216 if (pixelIndex != -1)
2219 const int x = chunkStartX + pixelIndex;
2222 const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2223 const unsigned int start = (
unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2224 const unsigned int stop = (
unsigned int)MagickMin(bisect+support+0.5f,(
float)inputColumns);
2225 const unsigned int n = stop - start;
2228 unsigned int numStepsPerWorkItem = n / numItems;
2229 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2231 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2234 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2236 unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2237 for (
unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2239 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2240 (ResizeWeightingFunctionType) resizeFilterType,
2241 (ResizeWeightingFunctionType) resizeWindowType,
2242 resizeFilterScale, resizeFilterWindowSupport,
2243 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2245 float4 cp = (float4)0.0f;
2247 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2248 cp.x = (float) *(p);
2249 if (number_channels > 2)
2251 cp.y = (float) *(p + 1);
2252 cp.z = (float) *(p + 2);
2255 if (alpha_index != 0)
2257 cp.w = (float) *(p + alpha_index);
2259 float alpha = weight * QuantumScale * cp.w;
2261 filteredPixel.x += alpha * cp.x;
2262 filteredPixel.y += alpha * cp.y;
2263 filteredPixel.z += alpha * cp.z;
2264 filteredPixel.w += weight * cp.w;
2268 filteredPixel += ((float4) weight)*cp;
2276 if (itemID < actualNumPixelInThisChunk) {
2277 outputPixelCache[itemID] = (float4)0.0f;
2278 densityCache[itemID] = 0.0f;
2279 if (alpha_index != 0)
2280 gammaCache[itemID] = 0.0f;
2282 barrier(CLK_LOCAL_MEM_FENCE);
2285 for (
unsigned int i = 0; i < numItems; i++) {
2286 if (pixelIndex != -1) {
2287 if (itemID%numItems == i) {
2288 outputPixelCache[pixelIndex]+=filteredPixel;
2289 densityCache[pixelIndex]+=density;
2290 if (alpha_index != 0)
2291 gammaCache[pixelIndex]+=gamma;
2294 barrier(CLK_LOCAL_MEM_FENCE);
2297 if (itemID < actualNumPixelInThisChunk)
2299 float4 filteredPixel = outputPixelCache[itemID];
2302 if (alpha_index != 0)
2303 gamma = gammaCache[itemID];
2305 float density = densityCache[itemID];
2306 if ((density != 0.0f) && (density != 1.0f))
2308 density = PerceptibleReciprocal(density);
2309 filteredPixel *= (float4) density;
2310 if (alpha_index != 0)
2314 if (alpha_index != 0)
2316 gamma = PerceptibleReciprocal(gamma);
2317 filteredPixel.x *= gamma;
2318 filteredPixel.y *= gamma;
2319 filteredPixel.z *= gamma;
2322 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2330 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2331 void ResizeVerticalFilter(
const __global CLQuantum *inputImage,
const unsigned int number_channels,
2332 const unsigned int inputColumns,
const unsigned int inputRows, __global CLQuantum *filteredImage,
2333 const unsigned int filteredColumns,
const unsigned int filteredRows,
const float yFactor,
2334 const int resizeFilterType,
const int resizeWindowType,
const __global
float *resizeFilterCubicCoefficients,
2335 const float resizeFilterScale,
const float resizeFilterSupport,
const float resizeFilterWindowSupport,
2336 const float resizeFilterBlur, __local CLQuantum *inputImageCache,
const int numCachedPixels,
2337 const unsigned int pixelPerWorkgroup,
const unsigned int pixelChunkSize,
2338 __local float4 *outputPixelCache, __local
float *densityCache, __local
float *gammaCache)
2341 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2342 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2343 const unsigned int actualNumPixelToCompute = stopY - startY;
2346 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2347 const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2348 scale = PerceptibleReciprocal(scale);
2350 const int cacheRangeStartY = MagickMax((
int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(
int)(0));
2351 const int cacheRangeEndY = MagickMin((
int)(cacheRangeStartY + numCachedPixels), (
int)inputRows);
2354 const unsigned int x = get_global_id(0);
2355 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2356 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2357 unsigned int stride = inputColumns * number_channels;
2358 for (
unsigned int i = 0; i < number_channels; i++)
2360 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2361 wait_group_events(1,&e);
2364 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2365 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2366 for (
unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2368 const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2369 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2370 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2373 const unsigned int itemID = get_local_id(1);
2374 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2376 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2378 float4 filteredPixel = (float4)0.0f;
2379 float density = 0.0f;
2382 if (pixelIndex != -1)
2385 const int y = chunkStartY + pixelIndex;
2388 const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2389 const unsigned int start = (
unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2390 const unsigned int stop = (
unsigned int)MagickMin(bisect+support+0.5f,(
float)inputRows);
2391 const unsigned int n = stop - start;
2394 unsigned int numStepsPerWorkItem = n / numItems;
2395 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2397 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2400 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2402 unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2403 for (
unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2405 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2406 (ResizeWeightingFunctionType) resizeFilterType,
2407 (ResizeWeightingFunctionType) resizeWindowType,
2408 resizeFilterScale, resizeFilterWindowSupport,
2409 resizeFilterBlur, scale*(start + i - bisect + 0.5));
2411 float4 cp = (float4)0.0f;
2413 __local CLQuantum *p = inputImageCache + cacheIndex;
2414 cp.x = (float) *(p);
2415 if (number_channels > 2)
2417 cp.y = (float) *(p + rangeLength);
2418 cp.z = (float) *(p + (rangeLength * 2));
2421 if (alpha_index != 0)
2423 cp.w = (float) *(p + (rangeLength * alpha_index));
2425 float alpha = weight * QuantumScale * cp.w;
2427 filteredPixel.x += alpha * cp.x;
2428 filteredPixel.y += alpha * cp.y;
2429 filteredPixel.z += alpha * cp.z;
2430 filteredPixel.w += weight * cp.w;
2434 filteredPixel += ((float4) weight)*cp;
2442 if (itemID < actualNumPixelInThisChunk) {
2443 outputPixelCache[itemID] = (float4)0.0f;
2444 densityCache[itemID] = 0.0f;
2445 if (alpha_index != 0)
2446 gammaCache[itemID] = 0.0f;
2448 barrier(CLK_LOCAL_MEM_FENCE);
2451 for (
unsigned int i = 0; i < numItems; i++) {
2452 if (pixelIndex != -1) {
2453 if (itemID%numItems == i) {
2454 outputPixelCache[pixelIndex]+=filteredPixel;
2455 densityCache[pixelIndex]+=density;
2456 if (alpha_index != 0)
2457 gammaCache[pixelIndex]+=gamma;
2460 barrier(CLK_LOCAL_MEM_FENCE);
2463 if (itemID < actualNumPixelInThisChunk)
2465 float4 filteredPixel = outputPixelCache[itemID];
2468 if (alpha_index != 0)
2469 gamma = gammaCache[itemID];
2471 float density = densityCache[itemID];
2472 if ((density != 0.0f) && (density != 1.0f))
2474 density = PerceptibleReciprocal(density);
2475 filteredPixel *= (float4) density;
2476 if (alpha_index != 0)
2480 if (alpha_index != 0)
2482 gamma = PerceptibleReciprocal(gamma);
2483 filteredPixel.x *= gamma;
2484 filteredPixel.y *= gamma;
2485 filteredPixel.z *= gamma;
2488 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2507 __kernel
void RotationalBlur(
const __global CLQuantum *image,
2508 const unsigned int number_channels,
const unsigned int channel,
2509 const float2 blurCenter,__constant
float *cos_theta,
2510 __constant
float *sin_theta,
const unsigned int cossin_theta_size,
2511 __global CLQuantum *filteredImage)
2513 const int x = get_global_id(0);
2514 const int y = get_global_id(1);
2515 const int columns = get_global_size(0);
2516 const int rows = get_global_size(1);
2517 unsigned int step = 1;
2518 float center_x = (float) x - blurCenter.x;
2519 float center_y = (float) y - blurCenter.y;
2520 float radius = hypot(center_x, center_y);
2522 float blur_radius = hypot(blurCenter.x, blurCenter.y);
2524 if (radius > MagickEpsilon)
2526 step = (
unsigned int) (blur_radius / radius);
2529 if (step >= cossin_theta_size)
2530 step = cossin_theta_size-1;
2533 float4 result = 0.0f;
2534 float normalize = 0.0f;
2537 for (
unsigned int i=0; i<cossin_theta_size; i+=step)
2539 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2540 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2542 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2544 if ((number_channels == 4) || (number_channels == 2))
2546 float alpha = (float)(QuantumScale*pixel.w);
2550 result.x += alpha * pixel.x;
2551 result.y += alpha * pixel.y;
2552 result.z += alpha * pixel.z;
2553 result.w += pixel.w;
2561 normalize = PerceptibleReciprocal(normalize);
2563 if ((number_channels == 4) || (number_channels == 2))
2565 gamma = PerceptibleReciprocal(gamma);
2569 result.w *= normalize;
2572 result *= normalize;
2574 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2591 __kernel
void UnsharpMaskBlurColumn(
const __global CLQuantum* image,
2592 const __global float4 *blurRowData,
const unsigned int number_channels,
2593 const ChannelType channel,
const unsigned int columns,
2594 const unsigned int rows,__local float4* cachedData,
2595 __local
float* cachedFilter,
const __global
float *filter,
2596 const unsigned int width,
const float gain,
const float threshold,
2597 __global CLQuantum *filteredImage)
2599 const unsigned int radius = (width-1)/2;
2602 const int groupX = get_group_id(0);
2603 const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2604 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2606 if ((groupStartY >= 0) && (groupStopY < rows))
2608 event_t e = async_work_group_strided_copy(cachedData,
2609 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2610 wait_group_events(1,&e);
2614 for (
int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2615 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2617 barrier(CLK_LOCAL_MEM_FENCE);
2620 event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2621 wait_group_events(1,&e);
2624 const int cy = get_global_id(1);
2628 float4 blurredPixel = (float4) 0.0f;
2632 for ( ; i+7 < width; )
2634 for (
int j=0; j < 8; j++, i++)
2635 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2638 for ( ; i < width; i++)
2639 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2641 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2642 float4 outputPixel = inputImagePixel - blurredPixel;
2644 float quantumThreshold = QuantumRange*threshold;
2646 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2647 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2650 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2656 __kernel
void UnsharpMask(
const __global CLQuantum *image,
const unsigned int number_channels,
2657 const ChannelType channel,__constant
float *filter,
const unsigned int width,
2658 const unsigned int columns,
const unsigned int rows,__local float4 *pixels,
2659 const float gain,
const float threshold,__global CLQuantum *filteredImage)
2661 const unsigned int x = get_global_id(0);
2662 const unsigned int y = get_global_id(1);
2664 const unsigned int radius = (width - 1) / 2;
2666 int row = y - radius;
2667 int baseRow = get_group_id(1) * get_local_size(1) - radius;
2668 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2670 while (row < endRow) {
2671 int srcy = (row < 0) ? -row : row;
2672 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2674 float4 value = 0.0f;
2676 int ix = x - radius;
2679 while (i + 7 < width) {
2680 for (
int j = 0; j < 8; ++j) {
2682 srcx = (srcx < 0) ? -srcx : srcx;
2683 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2684 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2691 int srcx = (ix < 0) ? -ix : ix;
2692 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2693 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2697 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2698 row += get_local_size(1);
2701 barrier(CLK_LOCAL_MEM_FENCE);
2703 const int px = get_local_id(0);
2704 const int py = get_local_id(1);
2705 const int prp = get_local_size(0);
2706 float4 value = (float4)(0.0f);
2709 while (i + 7 < width) {
2710 for (
int j = 0; j < 8; ++j)
2711 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2715 value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2719 if ((x < columns) && (y < rows)) {
2720 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2721 float4 diff = srcPixel - value;
2723 float quantumThreshold = QuantumRange*threshold;
2725 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
2726 value = select(srcPixel + diff * gain, srcPixel, mask);
2728 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
2746 __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
2747 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
2748 const unsigned int number_channels,
const unsigned int max_channels,
2749 const float threshold,
const int passes,
const unsigned int imageWidth,
2750 const unsigned int imageHeight)
2752 const int pad = (1 << (passes - 1));
2753 const int tileSize = 64;
2754 const int tileRowPixels = 64;
2755 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
2757 CLQuantum stage[48];
2759 local
float buffer[64 * 64];
2761 int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
2762 int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
2764 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2765 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
2766 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
2768 for (
int channel = 0; channel < max_channels; ++channel)
2769 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
2772 for (
int channel = 0; channel < max_channels; ++channel) {
2774 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2775 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
2783 for (
int i = 0; i < 16; i++)
2786 for (
int pass = 0; pass < passes; ++pass) {
2787 const int radius = 1 << pass;
2788 const int x = get_local_id(0);
2789 const float thresh = threshold * noise[pass];
2792 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2793 const int offset = i * tileRowPixels;
2795 tmp[i / 4] = buffer[x + offset];
2796 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
2797 barrier(CLK_LOCAL_MEM_FENCE);
2798 buffer[x + offset] = pixel;
2800 barrier(CLK_LOCAL_MEM_FENCE);
2803 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2804 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
2805 float delta = tmp[i / 4] - pixel;
2807 if (delta < -thresh)
2809 else if (delta > thresh)
2813 accum[i / 4] += delta;
2815 barrier(CLK_LOCAL_MEM_FENCE);
2817 if (pass < passes - 1)
2818 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2819 buffer[x + i * tileRowPixels] = tmp[i / 4];
2821 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2822 accum[i / 4] += tmp[i / 4];
2823 barrier(CLK_LOCAL_MEM_FENCE);
2826 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2827 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
2829 barrier(CLK_LOCAL_MEM_FENCE);
2834 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
2835 for (
int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2836 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
2837 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
2838 for (
int channel = 0; channel < max_channels; ++channel) {
2839 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
2851#if defined(__cplusplus) || defined(c_plusplus)