@@ -7507,10 +7507,17 @@ static void ggml_compute_forward_upscale_f32(
75077507 float sf1 = (float )ne1/src0->ne [1 ];
75087508 float sf2 = (float )ne2/src0->ne [2 ];
75097509 float sf3 = (float )ne3/src0->ne [3 ];
7510+ float pixel_offset = 0 .5f ;
75107511
75117512 const int32_t mode_flags = ggml_get_op_params_i32 (dst, 0 );
75127513 const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF );
75137514
7515+ if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
7516+ pixel_offset = 0 .0f ;
7517+ sf0 = ne0 > 1 && ne00 > 1 ? (float )(ne0 - 1 ) / (ne00 - 1 ) : sf0;
7518+ sf1 = ne1 > 1 && ne01 > 1 ? (float )(ne1 - 1 ) / (ne01 - 1 ) : sf1;
7519+ }
7520+
75147521 if (mode == GGML_SCALE_MODE_NEAREST) {
75157522 for (int64_t i3 = 0 ; i3 < ne3; i3++) {
75167523 const int64_t i03 = i3 / sf3;
@@ -7530,13 +7537,6 @@ static void ggml_compute_forward_upscale_f32(
75307537 }
75317538 }
75327539 } else if (mode == GGML_SCALE_MODE_BILINEAR) {
7533- float pixel_offset = 0 .5f ;
7534- if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
7535- pixel_offset = 0 .0f ;
7536- sf0 = ne0 > 1 && ne00 > 1 ? (float )(ne0 - 1 ) / (ne00 - 1 ) : sf0;
7537- sf1 = ne1 > 1 && ne01 > 1 ? (float )(ne1 - 1 ) / (ne01 - 1 ) : sf1;
7538- }
7539-
75407540 for (int64_t i3 = 0 ; i3 < ne3; i3++) {
75417541 const int64_t i03 = i3 / sf3;
75427542 for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7571,6 +7571,51 @@ static void ggml_compute_forward_upscale_f32(
75717571
75727572 const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
75737573
7574+ float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
7575+ *y_dst = val;
7576+ }
7577+ }
7578+ }
7579+ }
7580+ } else if (mode == GGML_SCALE_MODE_BICUBIC) {
7581+ // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
7582+ const float a = -0 .75f ; // use alpha = -0.75 (same as PyTorch)
7583+ auto weight1 = [a](float x) { return ((a + 2 ) * x - (a + 3 )) * x * x + 1 ; };
7584+ auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
7585+ auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
7586+ const float w0 = weight2 (x + 1 );
7587+ const float w1 = weight1 (x + 0 );
7588+ const float w2 = weight1 (1 - x);
7589+ const float w3 = weight2 (2 - x);
7590+ return p0*w0 + p1*w1 + p2*w2 + p3*w3;
7591+ };
7592+
7593+ for (int64_t i3 = 0 ; i3 < ne3; i3++) {
7594+ const int64_t i03 = i3 / sf3;
7595+ for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
7596+ const int64_t i02 = i2 / sf2;
7597+ for (int64_t i1 = 0 ; i1 < ne1; i1++) {
7598+ const float y = ((float )i1 + pixel_offset) / sf1 - pixel_offset;
7599+ const int64_t y0 = (int64_t )floorf (y);
7600+ const float dy = y - (float )y0;
7601+
7602+ for (int64_t i0 = 0 ; i0 < ne0; i0++) {
7603+ const float x = ((float )i0 + pixel_offset) / sf0 - pixel_offset;
7604+ const int64_t x0 = (int64_t )floorf (x);
7605+ const float dx = x - (float )x0;
7606+
7607+ auto p = [=](int64_t x_off, int64_t y_off) -> float {
7608+ int64_t i00 = std::max (int64_t (0 ), std::min (x0 + x_off, ne00 - 1 ));
7609+ int64_t i01 = std::max (int64_t (0 ), std::min (y0 + y_off, ne01 - 1 ));
7610+ return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
7611+ };
7612+
7613+ const float val = bicubic (
7614+ bicubic (p (-1 ,-1 ), p (0 ,-1 ), p (1 ,-1 ), p (2 ,-1 ), dx),
7615+ bicubic (p (-1 , 0 ), p (0 , 0 ), p (1 , 0 ), p (2 , 0 ), dx),
7616+ bicubic (p (-1 , 1 ), p (0 , 1 ), p (1 , 1 ), p (2 , 1 ), dx),
7617+ bicubic (p (-1 , 2 ), p (0 , 2 ), p (1 , 2 ), p (2 , 2 ), dx), dy);
7618+
75747619 float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
75757620 *y_dst = val;
75767621 }
0 commit comments