Skip to content

Commit

Permalink
Fix cuda compression device lost (#347)
Browse files Browse the repository at this point in the history
* Fixing issue with cuda decompression context
remaining unusable after an error.

* Fix control flow

* Fix virtual memory allocation issues.

Implementes missing SysHost::VirtualProtect on Windows.

* Fix dllimport/export w/ interface target on harvester

* Return harvester to configurable lib.

Defaults to dynamic.

* Fix harvester build action

* Fix interface include dirs

* Fix device lost on non-errors.

Fixes CUDA decompressor also not being
torn-down or recreated properly.
  • Loading branch information
haorldbchi authored Jun 30, 2023
1 parent 6ab28ed commit 0ee2a1e
Show file tree
Hide file tree
Showing 8 changed files with 232 additions and 94 deletions.
2 changes: 1 addition & 1 deletion .github/actions/build-harvester.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ mkdir -p build-harvester
pushd build-harvester
cmake .. -DCMAKE_BUILD_TYPE=Release -DBB_HARVESTER_ONLY=ON

cmake --build . --config Release --target bladebit_harvester --target bladebit_harvester_dynamic -j$procs
cmake --build . --config Release --target bladebit_harvester
cmake --install . --prefix harvester_dist

pushd harvester_dist/green_reaper
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ option(BB_ENABLE_TESTS "Enable tests." OFF)
option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)


#
Expand Down
64 changes: 31 additions & 33 deletions Harvester.cmake
Original file line number Diff line number Diff line change
@@ -1,27 +1,21 @@
add_library(bladebit_harvester_base INTERFACE)

add_library(bladebit_harvester STATIC)
target_link_libraries(bladebit_harvester PRIVATE bladebit_harvester_base)

add_library(bladebit_harvester_dynamic SHARED)
target_link_libraries(bladebit_harvester_dynamic PRIVATE bladebit_harvester_base)

if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
set_target_properties(bladebit_harvester_dynamic PROPERTIES OUTPUT_NAME "bladebit_harvester")
if(NOT ${BB_HARVESTER_STATIC})
add_library(bladebit_harvester SHARED)
else()
add_library(bladebit_harvester STATIC)
endif()


set_property(TARGET bladebit_harvester bladebit_harvester_dynamic PROPERTY PUBLIC_HEADER
set_property(TARGET bladebit_harvester PROPERTY PUBLIC_HEADER
src/harvesting/GreenReaper.h
src/harvesting/GreenReaperPortable.h)

install(TARGETS bladebit_harvester bladebit_harvester_dynamic
install(TARGETS bladebit_harvester
LIBRARY DESTINATION green_reaper/lib
ARCHIVE DESTINATION green_reaper/lib
PUBLIC_HEADER DESTINATION green_reaper/include
)

target_sources(bladebit_harvester_base INTERFACE
target_sources(bladebit_harvester PRIVATE
src/pch.cpp

src/pos/chacha8.cpp
Expand Down Expand Up @@ -121,41 +115,45 @@ target_sources(bladebit_harvester_base INTERFACE
>
)

target_include_directories(bladebit_harvester_base INTERFACE src SYSTEM cuda)
target_include_directories(bladebit_harvester PRIVATE src SYSTEM cuda INTERFACE src/harvesting)

target_compile_features(bladebit_harvester_base INTERFACE cxx_std_17)
target_compile_features(bladebit_harvester PUBLIC cxx_std_17)

target_compile_definitions(bladebit_harvester
PRIVATE
THRUST_IGNORE_CUB_VERSION_CHECK=1
GR_EXPORT=1

target_compile_definitions(bladebit_harvester_base INTERFACE
BB_IS_HARVESTER=1
THRUST_IGNORE_CUB_VERSION_CHECK=1
GR_EXPORT=1
$<${have_cuda}:
BB_CUDA_ENABLED=1
>

PUBLIC
BB_IS_HARVESTER=1
INTERFACE
$<$<BOOL:${BB_HARVESTER_STATIC}>:GR_NO_IMPORT=1>
)

target_compile_options(bladebit_harvester_base INTERFACE

target_compile_options(bladebit_harvester PRIVATE
${preinclude_pch}
# $<${have_cuda}:${cuda_archs}>
)

target_link_options(bladebit_harvester_base INTERFACE $<DEVICE_LINK: ${cuda_archs}>)

target_link_libraries(bladebit_harvester_base INTERFACE
bladebit_config
Threads::Threads

$<${have_cuda}:CUDA::cudart_static>
if(${have_cuda})
target_link_options(bladebit_harvester PUBLIC $<DEVICE_LINK: ${cuda_archs}>)
endif()

INTERFACE
$<$<PLATFORM_ID:Linux>:
# ${NUMA_LIBRARY}
# dl
>
target_link_libraries(bladebit_harvester
PRIVATE
bladebit_config
PUBLIC
Threads::Threads
$<${have_cuda}:CUDA::cudart_static>
)

if(CUDAToolkit_FOUND)
set_target_properties(bladebit_harvester_base PROPERTIES
set_target_properties(bladebit_harvester PROPERTIES
EXCLUDE_FROM_ALL ON
MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>
CUDA_RUNTIME_LIBRARY Static
Expand Down
97 changes: 73 additions & 24 deletions cuda/harvesting/CudaThresher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class CudaThresher : public IThresher
bool _isDecompressing = false; // Are we currently decompressing a proof?
TableId _currentTable = TableId::Table1; // Current table being decompressed

uint32 _maxCompressionLevel = 0; // Max compression level for which we have allocated buffers

size_t _bufferCapacity = 0;
size_t _matchCapacity = 0;
size_t _sortBufferSize = 0;
Expand Down Expand Up @@ -97,8 +99,14 @@ public:
ReleaseBuffers();
}

bool AllocateBuffers( const uint k, const uint maxCompressionLevel ) override
bool AllocateBuffers( const uint k, uint maxCompressionLevel ) override
{
// #NOTE: For now we always preallocate for the maximum compression level
maxCompressionLevel = 9;

if( _maxCompressionLevel >= maxCompressionLevel )
return true;

_info.k = 32;
_info.bucketCount = 64; // #TODO: Make this configurable
_info.yBits = _info.k + kExtraBits;
Expand Down Expand Up @@ -185,6 +193,7 @@ public:
}

//cErr = cudaMalloc( &cuda.devYBufferF1, sizeof( uint32 ) * allocEntryCount );
_maxCompressionLevel = maxCompressionLevel;
return true;

FAIL:
Expand All @@ -194,11 +203,13 @@ public:

void ReleaseBuffers() override
{
_bufferCapacity = 0;
_maxCompressionLevel = 0;

// Release all buffers
CudaSafeFreeHost( _hostMatchCount );

CudaSafeFree( _devSortTmpBuffer );

CudaSafeFree( _devChaChaInput );

CudaSafeFree( _devYBufferF1 );
Expand Down Expand Up @@ -227,7 +238,7 @@ public:
if( _downloadEvent ) cudaEventDestroy( _downloadEvent ); _downloadEvent = nullptr;
}

bool DecompressInitialTable(
ThresherResult DecompressInitialTable(
GreenReaperContext& cx,
const byte plotId[32],
const uint32 entryCountPerX,
Expand All @@ -240,16 +251,25 @@ public:
// Only k32 for now
ASSERT( x0 <= 0xFFFFFFFF );
ASSERT( x1 <= 0xFFFFFFFF );

ASSERT( entryCountPerX*2 < _bufferCapacity );

// Ensure our state is good
if( cudaStreamSynchronize( _computeStream ) != cudaSuccess ) return false;
if( cudaStreamSynchronize( _downloadStream ) != cudaSuccess ) return false;
ThresherResult result{};
result.kind = ThresherResultKind::Success;

if( entryCountPerX*2 > _bufferCapacity )
{
result.kind = ThresherResultKind::Error;
result.error = ThresherError::UnexpectedError;
return result;
}

uint64 table1EntryCount = 0;
cudaError cErr = cudaSuccess;

// Ensure we're in a good state
cErr = cudaStreamSynchronize( _computeStream ); if( cErr != cudaSuccess ) goto FAIL;
cErr = cudaStreamSynchronize( _downloadStream ); if( cErr != cudaSuccess ) goto FAIL;


{
byte key[32] = { 1 };
Expand Down Expand Up @@ -292,12 +312,12 @@ public:
const auto timer = TimerBegin();
#endif

cErr = cudaMemcpyAsync( _devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, _computeStream );
if( cErr != cudaSuccess ) return false;

uint64* f1Y = _devYBufferF1;
uint32* f1X = _devXBufferTmp;

cErr = cudaMemcpyAsync( _devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, _computeStream );
if( cErr != cudaSuccess ) goto FAIL;

for( uint32 i = 0; i < f1Iterations; i++ )
{
CudaGenF1K32(
Expand Down Expand Up @@ -337,8 +357,8 @@ public:

cErr = cub::DeviceRadixSort::SortPairs<uint64, uint32>(
_devSortTmpBuffer, _sortBufferSize,
_devYBufferF1, _devYBufferIn,
_devXBufferTmp, _devXBuffer,
_devYBufferF1, _devYBufferIn,
_devXBufferTmp, _devXBuffer,
f1EntryCount, 0, _info.k+kExtraBits,
_computeStream );
if( cErr != cudaSuccess ) goto FAIL;
Expand All @@ -361,7 +381,7 @@ public:
cErr = CudaHarvestMatchK32(
_devMatchesOut,
_devMatchCount,
_bufferCapacity,
(uint32)_bufferCapacity,
_devYBufferIn,
(uint32)table1EntryCount,
0,
Expand All @@ -382,6 +402,12 @@ public:
timer = TimerBegin();
#endif

if( matchCount < 1 )
{
result.kind = ThresherResultKind::NoMatches;
return result;
}

// Compute table 2 Fx
CudaFxHarvestK32(
TableId::Table2,
Expand Down Expand Up @@ -441,19 +467,28 @@ public:
#if BB_CUDA_HARVEST_USE_TIMINGS
_timings.download += TimerEndTicks( timer );
#endif

if( matchCount < 1 )
{
result.kind = ThresherResultKind::NoMatches;
return result;
}
}

return true;
return result;

FAIL:
// Log::Line( "DecompressInitialTable() Failed with CUDA error '%s': %s", cudaGetErrorName( cErr ), cudaGetErrorString( cErr ) );
ASSERT( cErr == cudaSuccess ); // Force debugger break
cudaStreamSynchronize( _computeStream );
cudaStreamSynchronize( _downloadStream );
return false;

result.kind = ThresherResultKind::Error;
result.error = ThresherError::CudaError;
result.internalError = (i32)cErr;

return result;
}

bool DecompressTableGroup(
ThresherResult DecompressTableGroup(
GreenReaperContext& cx,
const TableId table,
uint32 entryCount,
Expand All @@ -472,12 +507,20 @@ public:

outMatchCount = 0;

ThresherResult result{};
result.kind = ThresherResultKind::Success;

cudaError_t cErr = cudaSuccess;

const size_t inMetaMultiplier = GetTableMetaMultiplier( table - 1 );
const size_t inMetaByteSize = CDiv( _info.k * inMetaMultiplier, 8 );
uint32 matchCount = 0;

// Ensure we're in a good state
cErr = cudaStreamSynchronize( _uploadStream ); if( cErr != cudaSuccess ) goto FAIL;
cErr = cudaStreamSynchronize( _computeStream ); if( cErr != cudaSuccess ) goto FAIL;
cErr = cudaStreamSynchronize( _downloadStream ); if( cErr != cudaSuccess ) goto FAIL;

/// Upload input data
#if BB_CUDA_HARVEST_USE_TIMINGS
auto timer = TimerBegin();
Expand Down Expand Up @@ -558,6 +601,7 @@ public:
if( matchCount < 1 )
{
// Log::Line( "CUDA: No matches!" );
result.kind = ThresherResultKind::NoMatches;
goto FAIL;
}

Expand Down Expand Up @@ -617,16 +661,21 @@ public:
#endif

outMatchCount = matchCount;
return true;
return result;

FAIL:
// Log::Line( "DecompressTableGroup() Failed with CUDA error '%s': %s", cudaGetErrorName( cErr ), cudaGetErrorString( cErr ) );

ASSERT( cErr == cudaSuccess );
cudaStreamSynchronize( _uploadStream );
cudaStreamSynchronize( _computeStream );
cudaStreamSynchronize( _downloadStream );
return false;
ASSERT( cErr == cudaSuccess ); // Force debugger break

if( result.kind == ThresherResultKind::Success )
{
result.kind = ThresherResultKind::Error;
result.error = ThresherError::CudaError;
result.internalError = (i32)cErr;
}

return result;
}

cudaError_t SortEntriesOnY(
Expand Down
Loading

0 comments on commit 0ee2a1e

Please sign in to comment.