Skip to content

Commit

Permalink
Get rid of the division in the critical path.
Browse files Browse the repository at this point in the history
Amazing how bad instruction scheduling can have such a drastic impact
on code performance. With this change, we get a boost of at least
50% in performance for data with a small blocklen and/or count.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
  • Loading branch information
bosilca committed Jul 10, 2019
1 parent a802552 commit 3562d70
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 17 deletions.
28 changes: 19 additions & 9 deletions opal/datatype/opal_datatype_pack.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,24 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = *(COUNT), do_now, do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;

assert( *(COUNT) <= _elem->count * _elem->blocklen);

if( cando_count > *(COUNT) )
cando_count = *(COUNT);
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;

do_now = *(COUNT); /* save the COUNT for later */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;

if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
goto do_epilog;
}
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
*(COUNT) -= cando_count;
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
Expand All @@ -59,17 +64,19 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
}
goto update_and_return;
}
blocklen_bytes *= _elem->blocklen;

blocklen_bytes *= _elem->blocklen;
if( (_elem->count * _elem->blocklen) == cando_count ) {
goto skip_prolog;
}
/**
* First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
do_now = do_now % _elem->blocklen; /* any partial elements ? */

if( 0 != do_now ) {
size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (do_now > cando_count ) ? cando_count : do_now;
Expand All @@ -88,6 +95,7 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
cando_count -= do_now;
}

skip_prolog:
/* Do as many full blocklen as possible */
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
Expand All @@ -104,6 +112,8 @@ pack_predefined_data( opal_convertor_t* CONVERTOR,
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {

do_epilog:
assert( cando_count < _elem->blocklen );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
Expand Down
27 changes: 19 additions & 8 deletions opal/datatype/opal_datatype_unpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,24 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
size_t* SPACE )
{
const ddt_elem_desc_t* _elem = &((ELEM)->elem);
size_t cando_count = (*SPACE) / opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t do_now, do_now_bytes;
size_t blocklen_bytes = opal_datatype_basicDatatypes[_elem->common.type]->size;
size_t cando_count = (*COUNT), do_now, do_now_bytes;
unsigned char* _memory = (*memory) + _elem->disp;
unsigned char* _packed = *packed;

assert( *(COUNT) <= (_elem->count * _elem->blocklen));

if( cando_count > *(COUNT) )
cando_count = *(COUNT);
if( (blocklen_bytes * cando_count) > *(SPACE) )
cando_count = (*SPACE) / blocklen_bytes;

do_now = *(COUNT); /* save the COUNT for later */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;

if( 1 == _elem->count ) { /* Everything is contiguous, handle it as a prologue */
goto do_epilog;
}
if( 1 == _elem->blocklen ) { /* Do as many full blocklen as possible */
*(COUNT) -= cando_count;
for(; cando_count > 0; cando_count--) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
(CONVERTOR)->pDesc, (CONVERTOR)->count );
Expand All @@ -59,17 +64,20 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
}
goto update_and_return;
}

blocklen_bytes *= _elem->blocklen;
if( (_elem->count * _elem->blocklen) == cando_count ) {
goto skip_prolog;
}

/**
* First check if we already did something on this element ? The COUNT is the number
* of remaining predefined types in the current elem, not how many predefined types
* should be manipulated in the current call (this number is instead reflected on the
* SPACE).
*/
do_now = *(COUNT) % _elem->blocklen; /* any partial elements ? */
/* preemptively update the number of COUNT we will return. */
*(COUNT) -= cando_count;
do_now = do_now % _elem->blocklen; /* any partial elements ? */

if( 0 != do_now ) {
size_t left_in_block = do_now; /* left in the current blocklen */
do_now = (do_now > cando_count ) ? cando_count : do_now;
Expand All @@ -88,6 +96,7 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
cando_count -= do_now;
}

skip_prolog:
/* Do as many full blocklen as possible */
for(size_t _i = 0; _elem->blocklen <= cando_count; _i++ ) {
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, blocklen_bytes, (CONVERTOR)->pBaseBuf,
Expand All @@ -104,6 +113,8 @@ unpack_predefined_data( opal_convertor_t* CONVERTOR,
* As an epilog do anything left from the last blocklen.
*/
if( 0 != cando_count ) {

do_epilog:
assert( cando_count < _elem->blocklen );
do_now_bytes = cando_count * opal_datatype_basicDatatypes[_elem->common.type]->size;
OPAL_DATATYPE_SAFEGUARD_POINTER( _memory, do_now_bytes, (CONVERTOR)->pBaseBuf,
Expand Down

0 comments on commit 3562d70

Please sign in to comment.