From e1b143c48f0b6c7626660351fa79ab9e4ba5f82e Mon Sep 17 00:00:00 2001 From: Bouke van der Bijl Date: Thu, 15 Jun 2023 14:34:10 +0200 Subject: [PATCH] Speed up cluster merge by batch copying arrays merge_clusters was using zarray_add_all which was copying over elements one-by-one doing 2 memcpys and a potential array resize per element. Here we replace it by a range copy that does a single resize and memcpy for the operation which is a lot faster. In my testing it reduces the total runtime for an image that's 2000x3000 by 20%. --- apriltag_quad_thresh.c | 73 ++++++++++++++++++------------------------ common/zarray.h | 29 +++++++++-------- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/apriltag_quad_thresh.c b/apriltag_quad_thresh.c index 09dc2023..4bf67f5b 100644 --- a/apriltag_quad_thresh.c +++ b/apriltag_quad_thresh.c @@ -1062,10 +1062,10 @@ static void do_quad_task(void *p) for (int cidx = task->cidx0; cidx < task->cidx1; cidx++) { - zarray_t *cluster; - zarray_get(clusters, cidx, &cluster); + zarray_t **cluster; + zarray_get_volatile(clusters, cidx, &cluster); - if (zarray_size(cluster) < td->qtp.min_cluster_pixels) + if (zarray_size(*cluster) < td->qtp.min_cluster_pixels) continue; // a cluster should contain only boundary points around the @@ -1074,14 +1074,14 @@ static void do_quad_task(void *p) // fit quads to.) A typical point along an edge is added three // times (because it has 3 neighbors). The maximum perimeter // is 2w+2h. 
- if (zarray_size(cluster) > 3*(2*w+2*h)) { + if (zarray_size(*cluster) > 3*(2*w+2*h)) { continue; } struct quad quad; memset(&quad, 0, sizeof(struct quad)); - if (fit_quad(td, task->im, cluster, &quad, task->tag_width, task->normal_border, task->reversed_border)) { + if (fit_quad(td, task->im, *cluster, &quad, task->tag_width, task->normal_border, task->reversed_border)) { pthread_mutex_lock(&td->mutex); zarray_add(quads, &quad); pthread_mutex_unlock(&td->mutex); @@ -1588,14 +1588,14 @@ zarray_t* do_gradient_clusters(image_u8_t* threshim, int ts, int y0, int y1, int int n = end - start; for (int j = 0; j < n - 1; j++) { for (int k = 0; k < n - j - 1; k++) { - struct cluster_hash* hash1; - struct cluster_hash* hash2; - zarray_get(clusters, start + k, &hash1); - zarray_get(clusters, start + k + 1, &hash2); - if (hash1->id > hash2->id) { - struct cluster_hash tmp = *hash2; - *hash2 = *hash1; - *hash1 = tmp; + struct cluster_hash** hash1; + struct cluster_hash** hash2; + zarray_get_volatile(clusters, start + k, &hash1); + zarray_get_volatile(clusters, start + k + 1, &hash2); + if ((*hash1)->id > (*hash2)->id) { + struct cluster_hash tmp = **hash2; + **hash2 = **hash1; + **hash1 = tmp; } } } @@ -1626,38 +1626,29 @@ zarray_t* merge_clusters(zarray_t* c1, zarray_t* c2) { int l2 = zarray_size(c2); while (i1 < l1 && i2 < l2) { - struct cluster_hash* h1; - struct cluster_hash* h2; - zarray_get(c1, i1, &h1); - zarray_get(c2, i2, &h2); - - if (h1->hash == h2->hash && h1->id == h2->id) { - zarray_add_all(h1->data, h2->data); - zarray_add(ret, &h1); + struct cluster_hash** h1; + struct cluster_hash** h2; + zarray_get_volatile(c1, i1, &h1); + zarray_get_volatile(c2, i2, &h2); + + if ((*h1)->hash == (*h2)->hash && (*h1)->id == (*h2)->id) { + zarray_add_range((*h1)->data, (*h2)->data, 0, zarray_size((*h2)->data)); + zarray_add(ret, h1); i1++; i2++; - zarray_destroy(h2->data); - free(h2); - } else if (h2->hash < h1->hash || (h2->hash == h1->hash && h2->id < h1->id)) { - 
zarray_add(ret, &h2); + zarray_destroy((*h2)->data); + free(*h2); + } else if ((*h2)->hash < (*h1)->hash || ((*h2)->hash == (*h1)->hash && (*h2)->id < (*h1)->id)) { + zarray_add(ret, h2); i2++; } else { - zarray_add(ret, &h1); + zarray_add(ret, h1); i1++; } } - for (; i1 < l1; i1++) { - struct cluster_hash* h1; - zarray_get(c1, i1, &h1); - zarray_add(ret, &h1); - } - - for (; i2 < l2; i2++) { - struct cluster_hash* h2; - zarray_get(c2, i2, &h2); - zarray_add(ret, &h2); - } + zarray_add_range(ret, c1, i1, l1); + zarray_add_range(ret, c2, i2, l2); zarray_destroy(c1); zarray_destroy(c2); @@ -1716,10 +1707,10 @@ zarray_t* gradient_clusters(apriltag_detector_t *td, image_u8_t* threshim, int w clusters = zarray_create(sizeof(zarray_t*)); zarray_ensure_capacity(clusters, zarray_size(clusters_list[0])); for (int i = 0; i < zarray_size(clusters_list[0]); i++) { - struct cluster_hash* hash; - zarray_get(clusters_list[0], i, &hash); - zarray_add(clusters, &hash->data); - free(hash); + struct cluster_hash** hash; + zarray_get_volatile(clusters_list[0], i, &hash); + zarray_add(clusters, &(*hash)->data); + free(*hash); } zarray_destroy(clusters_list[0]); free(clusters_list); diff --git a/common/zarray.h b/common/zarray.h index 1a882c28..22b4c2bb 100644 --- a/common/zarray.h +++ b/common/zarray.h @@ -437,26 +437,27 @@ static inline int zarray_index_of(const zarray_t *za, const void *p) return -1; } - - /** - * Add all elements from 'source' into 'dest'. el_size must be the same - * for both lists + * Add elements from start up to and excluding end from 'source' into 'dest'. 
+ * el_sz must be the same for both lists; an empty range (start == end) copies nothing **/ -static inline void zarray_add_all(zarray_t * dest, const zarray_t * source) +static inline void zarray_add_range(zarray_t *dest, const zarray_t *source, int start, int end) { + assert(dest != NULL); + assert(source != NULL); assert(dest->el_sz == source->el_sz); + assert(start >= 0); + assert(end <= source->size); + if (start == end) { + return; + } + assert(start < end); - // Don't allocate on stack because el_sz could be larger than ~8 MB - // stack size - char *tmp = (char*)calloc(1, dest->el_sz); - - for (int i = 0; i < zarray_size(source); i++) { - zarray_get(source, i, tmp); - zarray_add(dest, tmp); - } + int count = end - start; + zarray_ensure_capacity(dest, dest->size + count); - free(tmp); + memcpy(&dest->data[dest->size*dest->el_sz], &source->data[source->el_sz*start], dest->el_sz*count); + dest->size += count; } #ifdef __cplusplus