Skip to content

Commit

Permalink
[SYSTEMML-1325] Bugfix for GPU memory manager clear temporary memory.…
Browse files Browse the repository at this point in the history
… Fixes bug in GPU cleanup with JMLC.

In GPUMemoryManager.clearTemporaryMemory() we deallocate pointers but do not set the corresponding Pointer slots to null in the associated GPUObject instances. This can lead to attempted double-freeing of Pointers which results in an exception. This commit fixes this issue by creating a list of GPU objects associated with Pointers that have been freed as part of clearTemporaryMemory() and setting the corresponding pointer slots to null. This commit also addresses a minor issue with cleanup in JMLC which was causing Pointers for pinned data to be improperly cleared. Note this commit will reduce performance of GPUMemoryManager.clearTemporaryMemory() because it is now necessary to search through the list of managed GPUObjects to find the ones corresponding to pointers being freed. However, this method is only called once at the end of script invocation and so the performance cost will be small.

Closes #839.
  • Loading branch information
thomas9t authored and Niketan Pansare committed Nov 3, 2018
1 parent be2b3e2 commit cc7ec76
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
*/
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

Expand All @@ -43,8 +45,7 @@ public GPUMatrixMemoryManager(GPUMemoryManager gpuManager) {
void addGPUObject(GPUObject gpuObj) {
gpuObjects.add(gpuObj);
}



/**
* Get list of all Pointers in a GPUObject
* @param gObj gpu object
Expand Down Expand Up @@ -81,6 +82,20 @@ else if(sparsePtr.val != null)
* so that an extraneous host to dev transfer can be avoided
*/
HashSet<GPUObject> gpuObjects = new HashSet<>();

/**
* Return a set of GPU Objects associated with a list of pointers
* @param pointers A list of pointers
* @return A set of GPU objects corresponding to any of these pointers
*/
Set<GPUObject> getGpuObjects(Set<Pointer> pointers) {
Set<GPUObject> gObjs = new HashSet<>();
for (GPUObject g : gpuObjects) {
if (!Collections.disjoint(getPointers(g), pointers))
gObjs.add(g);
}
return gObjs;
}

/**
* Return all pointers in the first section
Expand All @@ -94,10 +109,14 @@ Set<Pointer> getPointers() {
* Get pointers from the first memory sections "Matrix Memory"
* @param locked return locked pointers if true
* @param dirty return dirty pointers if true
* @param isCleanupEnabled return pointers marked for cleanup if true
* @return set of pointers
*/
Set<Pointer> getPointers(boolean locked, boolean dirty) {
return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet());
Set<Pointer> getPointers(boolean locked, boolean dirty, boolean isCleanupEnabled) {
return gpuObjects.stream().filter(
gObj -> (gObj.isLocked() == locked && gObj.isDirty() == dirty) ||
(gObj.mat.isCleanupEnabled() == isCleanupEnabled)).flatMap(
gObj -> getPointers(gObj).stream()).collect(Collectors.toSet());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,6 @@ void guardedCudaFree(Pointer toFree) {
else {
throw new RuntimeException("ERROR : Internal state corrupted, attempting to free an unaccounted pointer:" + toFree);
}

}

/**
Expand Down Expand Up @@ -521,11 +520,19 @@ private Set<Pointer> nonIn(Set<Pointer> superset, Set<Pointer> subset) {
*/
public void clearTemporaryMemory() {
// To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up.
Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true);
Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true, false);
Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers);
for(Pointer tmpPtr : temporaryPointers) {
for (Pointer tmpPtr : temporaryPointers) {
guardedCudaFree(tmpPtr);
}

// Also set the pointer(s) to null in the corresponding GPU objects to avoid double freeing pointers
Set<GPUObject> gObjs = matrixMemoryManager.getGpuObjects(temporaryPointers);
for (GPUObject g : gObjs) {
g.jcudaDenseMatrixPtr = null;
g.jcudaSparseMatrixPtr = null;
removeGPUObject(g);
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public class GPUObject {
/**
* Pointer to the underlying sparse matrix block on GPU
*/
private CSRPointer jcudaSparseMatrixPtr = null;
CSRPointer jcudaSparseMatrixPtr = null;

/**
* whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host
Expand Down

0 comments on commit cc7ec76

Please sign in to comment.