Skip to content

Commit

Permalink
Optimize handling of contiguous with gaps datatypes.
Browse files Browse the repository at this point in the history
Both the pack and unpack have been optimized, and brought in sync.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
  • Loading branch information
bosilca committed May 29, 2019
1 parent 1217c0d commit 660fb00
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 162 deletions.
223 changes: 94 additions & 129 deletions opal/datatype/opal_datatype_pack.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2016 The University of Tennessee and The University
* Copyright (c) 2004-2019 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
Expand Down Expand Up @@ -53,8 +53,6 @@
#endif /* defined(CHECKSUM) */


#define IOVEC_MEM_LIMIT 8192

/* The contig versions do not use the stack. They can easily retrieve
* the status with just the information from pConvertor->bConverted.
*/
Expand All @@ -68,9 +66,8 @@ opal_pack_homogeneous_contig_function( opal_convertor_t* pConv,
unsigned char *source_base = NULL;
uint32_t iov_count;
size_t length = pConv->local_size - pConv->bConverted, initial_amount = pConv->bConverted;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;

source_base = (pConv->pBaseBuf + initial_displ + pStack[0].disp + pStack[1].disp);
source_base = (pConv->pBaseBuf + pConv->pDesc->true_lb + pStack[0].disp + pStack[1].disp);

/* There are some optimizations that can be done if the upper level
* does not provide a buffer.
Expand Down Expand Up @@ -111,155 +108,123 @@ opal_pack_homogeneous_contig_with_gaps_function( opal_convertor_t* pConv,
uint32_t* out_size,
size_t* max_data )
{
size_t remaining, length, initial_bytes_converted = pConv->bConverted;
const opal_datatype_t* pData = pConv->pDesc;
dt_stack_t* stack = pConv->pStack;
ptrdiff_t extent = pData->ub - pData->lb;
unsigned char *user_memory, *packed_buffer;
uint32_t iov_count, index;
uint32_t idx = 0;
size_t i;
size_t bConverted, remaining, length, initial_bytes_converted = pConv->bConverted;
ptrdiff_t extent= pData->ub - pData->lb;
ptrdiff_t initial_displ = pConv->use_desc->desc[pConv->use_desc->used].end_loop.first_elem_disp;

/* The memory layout is contiguous with gaps at the beginning and at the end. The datatype true_lb
* is the initial displacement, the size is the length of the contiguous area, and the extent represents
* how much we should jump between elements.
*/
assert( (pData->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && ((ptrdiff_t)pData->size != extent) );
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( pBaseBuf %p, iov_count %d )\n",
(void*)pConv->pBaseBuf, *out_size ); );
if( stack[1].type != opal_datatype_uint1.id ) {
stack[1].count *= opal_datatype_basicDatatypes[stack[1].type]->size;
stack[1].type = opal_datatype_uint1.id;
}
/* We can provide directly the pointers in the user buffers (like the convertor_raw) */
if( NULL == iov[0].iov_base ) {
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
if( stack[1].count != pData->size ) {
iov[idx].iov_base = user_memory;
iov[idx].iov_len = stack[1].count;
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
stack[0].count--; /* update the first stack position */
stack[0].disp += extent;
stack[1].count = pData->size; /* for safety */
stack[1].disp = 0;
idx++; /* update next iovec */
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp;
pConv->bConverted += stack[1].count;
}
for( ; (idx < (*out_size)) && stack[0].count; idx++ ) {
iov[idx].iov_base = user_memory;
iov[idx].iov_len = pData->size;
COMPUTE_CSUM( iov[idx].iov_base, iov[idx].iov_len, pConv );
stack[0].count--;
stack[0].disp += extent;
user_memory += extent;
pConv->bConverted += pData->size;
}
goto update_status_and_return;
}

/* There are some optimizations that can be done if the upper level
* does not provide a buffer.
*/
for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
for( idx = 0; idx < (*out_size); idx++ ) {
/* Limit the amount of packed data to the data left over on this convertor */
remaining = pConv->local_size - pConv->bConverted;
if( 0 == remaining ) break; /* we're done this time */
if( remaining > iov[iov_count].iov_len )
remaining = iov[iov_count].iov_len;
packed_buffer = (unsigned char *)iov[iov_count].iov_base;
bConverted = remaining; /* how much will get unpacked this time */
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp + stack[1].disp;
i = pConv->count - stack[0].count; /* how many we already packed */
assert(i == (pConv->bConverted / pData->size));

if( packed_buffer == NULL ) {
/* special case for small data. We avoid allocating memory if we
* can fill the iovec directly with the address of the remaining
* data.
*/
if( stack->count < (size_t)((*out_size) - iov_count) ) {
stack[1].count = pData->size - (pConv->bConverted % pData->size);
for( index = iov_count; i < pConv->count; i++, index++ ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = stack[1].count;
stack[0].disp += extent;
pConv->bConverted += stack[1].count;
stack[1].disp = 0; /* reset it for the next round */
stack[1].count = pData->size;
user_memory = pConv->pBaseBuf + initial_displ + stack[0].disp;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
}
*out_size = iov_count + index;
*max_data = (pConv->bConverted - initial_bytes_converted);
pConv->flags |= CONVERTOR_COMPLETED;
return 1; /* we're done */
}
/* now special case for big contiguous data with gaps around */
if( pData->size >= IOVEC_MEM_LIMIT ) {
/* as we dont have to copy any data, we can simply fill the iovecs
* with data from the user data description.
*/
for( index = iov_count; (i < pConv->count) && (index < (*out_size));
i++, index++ ) {
if( remaining < pData->size ) {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = remaining;
remaining = 0;
COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
break;
} else {
iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
iov[index].iov_len = pData->size;
user_memory += extent;
COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
}
remaining -= iov[index].iov_len;
pConv->bConverted += iov[index].iov_len;
}
*out_size = index;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
}
return 0;
if( remaining > iov[idx].iov_len )
remaining = iov[idx].iov_len;
packed_buffer = (unsigned char *)iov[idx].iov_base;
pConv->bConverted += remaining;
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;

DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, remaining ); );

length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
/* data left from last round and enough space in the buffer */
if( (pData->size != length) && (length <= remaining)) {
/* copy the partial left-over from the previous round */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [prologue]\n",
(void*)user_memory, (void*)packed_buffer, length ); );
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
packed_buffer += length;
remaining -= length;
stack[1].count -= length;
stack[1].disp += length; /* just in case, we overwrite this below */
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--;
stack[0].disp += extent;
if( 0 == stack[0].count ) /* not yet done */
break;
stack[1].count = pData->size;
stack[1].disp = 0;
}
user_memory = pConv->pBaseBuf + pData->true_lb + stack[0].disp + stack[1].disp;
}

{
DO_DEBUG( opal_output( 0, "pack_homogeneous_contig( user_memory %p, packed_buffer %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );

length = (0 == pConv->stack_pos ? 0 : stack[1].count); /* left over from the last pack */
/* data left from last round and enough space in the buffer */
if( (0 != length) && (length <= remaining)) {
/* copy the partial left-over from the previous round */
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, length, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "2. pack dest %p src %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)length ); );
MEMCPY_CSUM( packed_buffer, user_memory, length, pConv );
packed_buffer += length;
user_memory += (extent - pData->size + length);
remaining -= length;
stack[1].count -= length;
if( 0 == stack[1].count) { /* one completed element */
stack[0].count--;
stack[0].disp += extent;
if( 0 != stack[0].count ) { /* not yet done */
stack[1].count = pData->size;
stack[1].disp = 0;
}
}
}
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "3. pack dest %p src %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)pData->size ); );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer += pData->size;
user_memory += extent;
remaining -= pData->size;
}
stack[0].count -= i; /* the filled up and the entire types */
stack[0].disp += (i * extent);
stack[1].disp += remaining;
/* Copy the last bits */
if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %lu\n",
(void*)user_memory, (void*)packed_buffer, (unsigned long)remaining ); );
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
user_memory += remaining;
stack[1].count -= remaining;
}
for( i = 0; pData->size <= remaining; i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, pData->size, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "pack dest %p src %p length %" PRIsize_t " [%" PRIsize_t "/%" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, pData->size, remaining, iov[idx].iov_len ); );
MEMCPY_CSUM( packed_buffer, user_memory, pData->size, pConv );
packed_buffer += pData->size;
user_memory += extent;
remaining -= pData->size;
}
stack[0].count -= i; /* the entire datatype copied above */
stack[0].disp += (i * extent);

/* Copy the last bits */
if( 0 != remaining ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( user_memory, remaining, pConv->pBaseBuf,
pData, pConv->count );
DO_DEBUG( opal_output( 0, "4. pack dest %p src %p length %" PRIsize_t "\n",
(void*)user_memory, (void*)packed_buffer, remaining ); );
MEMCPY_CSUM( packed_buffer, user_memory, remaining, pConv );
stack[1].count -= remaining;
stack[1].disp += remaining; /* keep the += in case we are copying less that the datatype size */
if( 0 == stack[1].count ) { /* prepare for the next element */
stack[1].count = pData->size;
stack[1].disp = 0;
}
}
pConv->bConverted += bConverted;
}
*out_size = iov_count;
*max_data = (pConv->bConverted - initial_bytes_converted);
if( pConv->bConverted == pConv->local_size ) {
pConv->flags |= CONVERTOR_COMPLETED;
return 1;
}
return 0;

update_status_and_return:
*out_size = idx;
*max_data = pConv->bConverted - initial_bytes_converted;
if( pConv->bConverted == pConv->local_size ) pConv->flags |= CONVERTOR_COMPLETED;
return !!(pConv->flags & CONVERTOR_COMPLETED); /* done or not */
}

/* The pack/unpack functions need a cleanup. I have to create a proper interface to access
Expand Down
Loading

0 comments on commit 660fb00

Please sign in to comment.