From 75dd112513a0574feb39a031272b8b1b11e06239 Mon Sep 17 00:00:00 2001
From: Banit Agrawal
Date: Fri, 10 Mar 2023 13:04:26 -0800
Subject: [PATCH 01/34] using different mechanism for host mapped pinned memory (#1638)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1638

This diff adds another mechanism for allocating host-mapped pinned memory, to reduce the adverse effect on other processes running on the same host when one process is doing some large allocations.

Reviewed By: zyan0, jianyuh

Differential Revision: D43950253

fbshipit-source-id: 41a434cb63354509d32e00c851c5f3a2d68be686
---
 fbgemm_gpu/src/cumem_utils.cu | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/fbgemm_gpu/src/cumem_utils.cu b/fbgemm_gpu/src/cumem_utils.cu
index 7a060681f0..7b49040a83 100644
--- a/fbgemm_gpu/src/cumem_utils.cu
+++ b/fbgemm_gpu/src/cumem_utils.cu
@@ -41,7 +41,8 @@ struct CUDAHostMappedContext {
   ~CUDAHostMappedContext() {
     at::cuda::OptionalCUDAGuard device_guard;
     device_guard.set_index(cuda_device_);
-    AT_CUDA_CHECK(cudaFreeHost(ptr_));
+    AT_CUDA_CHECK(cudaHostUnregister(ptr_));
+    free(ptr_);
   }

   static void release(void* ptr) {
@@ -206,9 +207,28 @@ Tensor new_host_mapped_tensor(
   auto strides = defaultStrides(sizes);
   size_t size_bytes =
       at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
-  void* ptr;
-  AT_CUDA_CHECK(cudaHostAlloc(
-      &ptr, size_bytes, cudaHostAllocWriteCombined | cudaHostAllocMapped));
+
+  // When using cudaHostAlloc for large allocations, we found that it can
+  // potentially take a global lock and lock out CUDA APIs from other processes.
+  // The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
+  // of using this CUDA API, we can do a regular malloc, pre-fault the pages, and
+  // then do cudaHostRegister with GPU mapping flags to lock the pages, so we
+  // can minimize the cost while holding this global lock.
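+  //
+  // Note: madvise(MADV_HUGEPAGE) is advisory and only takes effect when
+  // transparent hugepages are enabled on the host. If THP is disabled, the
+  // pre-faulting loop below touches just one 4KB page per 2MB stride, and
+  // cudaHostRegister faults in the remaining pages.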
+  void* const ptr = malloc(size_bytes);
+
+  // advise the kernel to allocate large 2M pages
+  madvise(ptr, size_bytes, MADV_HUGEPAGE);
+
+  // pre-fault/map the pages by setting the first byte of each page
+  size_t pageSize = (1 << 21);
+  uintptr_t alignedPtr = (((uintptr_t)ptr + pageSize - 1) & ~(pageSize - 1));
+  for (uintptr_t p = alignedPtr; p < ((uintptr_t)ptr + size_bytes);
+       p += pageSize) {
+    memset((void*)p, 0, 1);
+  }
+
+  AT_CUDA_CHECK(cudaHostRegister(
+      ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));

   void* dev_ptr;
   AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));

From 30833faf5fcea8917d3f22e69f1826c1431e8e3c Mon Sep 17 00:00:00 2001
From: Li Li
Date: Mon, 13 Mar 2023 15:19:53 -0700
Subject: [PATCH 02/34] disable use_cpu test (#1635)

Summary:
This PR addresses issue https://github.com/pytorch/FBGEMM/issues/1636, akin to https://github.com/pytorch/FBGEMM/blob/8616ed701015f8b9e4c2825ce592b204b4cfaf28/fbgemm_gpu/test/split_table_batched_embeddings_test.py#L1009

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1635

Reviewed By: shintaro-iwasaki

Differential Revision: D44033725

Pulled By: q10

fbshipit-source-id: 49f28fc2f1c20948a42728eebf3defc5195baa5d
---
 fbgemm_gpu/test/jagged_tensor_ops_test.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py
index 9ed5a39f47..98021007f4 100644
--- a/fbgemm_gpu/test/jagged_tensor_ops_test.py
+++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py
@@ -20,7 +20,12 @@
     from fbgemm_gpu import open_source  # noqa: F401

     # pyre-ignore[21]
-    from test_utils import gpu_available, gpu_unavailable, running_on_github
+    from test_utils import (
+        gpu_available,
+        gpu_unavailable,
+        running_on_github,
+        TEST_WITH_ROCM,
+    )
 except Exception:
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
     torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
@@ -28,6 +33,7 @@
         gpu_available,
         gpu_unavailable,
         running_on_github,
+        TEST_WITH_ROCM,
    )


@@ -1466,7 +1472,11 @@ def jagged_index_select_2d_ref(
                 torch.long,
             ]  # Disable torch.bfloat16 due to large error bound
         ),
-        use_cpu=st.booleans() if gpu_available else st.just(True),
+        use_cpu=st.booleans()
+        if (gpu_available and not TEST_WITH_ROCM)
+        else st.just(False)
+        if (gpu_available and TEST_WITH_ROCM)
+        else st.just(True),
     )
     @settings(max_examples=20, deadline=None)
     def test_jagged_index_select_2d(

From b8241da8ab33b3093cb01f57af2d991f9686a457 Mon Sep 17 00:00:00 2001
From: Sabin Devkota
Date: Mon, 13 Mar 2023 18:52:06 -0700
Subject: [PATCH 03/34] Update API interface and reroute backend for exact_rowwise_adagrad FE when using freq based methods (#1352)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1352

1. Update interface to accommodate rowwise_adagrad_with_counter.
2. Route backend for rowwise_adagrad to the new rowwise_adagrad_with_counter when freq based methods (e.g. freq sgd, counter adjusted regularization) are used.
Reviewed By: csmiler Differential Revision: D36788395 fbshipit-source-id: 8eb5da8a5c8b52bc1e237af1054aac9f7245c443 --- fbgemm_gpu/codegen/__init__.template | 2 + .../embedding_backward_code_generator.py | 11 +- fbgemm_gpu/codegen/lookup_args.py | 7 + ..._embedding_codegen_lookup_invoker.template | 85 +++++ .../split_table_batched_embeddings_ops.py | 349 +++++++++++++++--- .../ssd_split_table_batched_embeddings_ops.py | 19 + .../split_table_batched_embeddings_test.py | 219 ++++++++++- 7 files changed, 618 insertions(+), 74 deletions(-) diff --git a/fbgemm_gpu/codegen/__init__.template b/fbgemm_gpu/codegen/__init__.template index de8bf21dd0..661622eff9 100644 --- a/fbgemm_gpu/codegen/__init__.template +++ b/fbgemm_gpu/codegen/__init__.template @@ -13,7 +13,9 @@ import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_lars_sgd as loo import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_adam as lookup_partial_rowwise_adam # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_partial_rowwise_lamb as lookup_partial_rowwise_lamb # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad as lookup_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_adagrad_with_counter as lookup_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_sgd as lookup_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_sgd as lookup_approx_sgd # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad as lookup_approx_rowwise_adagrad # noqa: F401 +import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_approx_rowwise_adagrad_with_counter as lookup_approx_rowwise_adagrad_with_counter # noqa: F401 import fbgemm_gpu.split_embedding_codegen_lookup_invokers.lookup_rowwise_weighted_adagrad as lookup_rowwise_weighted_adagrad # noqa: F401 diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index 9d67358902..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -646,6 +646,11 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation = """ at::acc_type freq = 1.0; at::acc_type l2_wd = 0.0; + at::acc_type tail_id_threshold_val = tail_id_threshold; + CUDA_KERNEL_ASSERT(max_counter > 0.0); // avoid divide by zero error + if (is_tail_id_thresh_ratio == 1){ + tail_id_threshold_val = floorf(tail_id_threshold * max_counter); + } if (counter_halflife > 0 && threadIdx.x == 0) { // if id occurs multiple times in a batch, iter_delta=1 const auto iter_delta = prev_iter[idx] == 0 ? 
1.0 : iter * 1.0 - prev_iter[idx]; @@ -660,6 +665,7 @@ def rowwise_adagrad_with_counter() -> None: } freq = SHFL_SYNC(freq, 0); l2_wd = SHFL_SYNC(l2_wd, 0); + tail_id_threshold_val = SHFL_SYNC(tail_id_threshold_val, 0); at::acc_type g_local_sum_square = 0.0; @@ -682,10 +688,7 @@ def rowwise_adagrad_with_counter() -> None: at::acc_type multiplier; at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; - at::acc_type tail_id_threshold_val = tail_id_threshold; - if (is_tail_id_thresh_ratio == 1){ - tail_id_threshold_val = floorf(tail_id_threshold * max_counter); - } + if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; diff --git a/fbgemm_gpu/codegen/lookup_args.py b/fbgemm_gpu/codegen/lookup_args.py index c5a3d465e9..8c98a96a1a 100644 --- a/fbgemm_gpu/codegen/lookup_args.py +++ b/fbgemm_gpu/codegen/lookup_args.py @@ -44,6 +44,13 @@ class OptimizerArgs(NamedTuple): weight_decay_mode: int eta: float momentum: float + counter_halflife: int + adjustment_iter: int + adjustment_ub: float + learning_rate_mode: int + grad_sum_decay: int + tail_id_threshold: float + is_tail_id_thresh_ratio: int class Momentum(NamedTuple): diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 4cdc5b8766..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -36,9 +36,18 @@ def invoke( {% if "momentum2_dev" in args.split_function_arg_names %} momentum2: Momentum, {% endif %} + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter: Momentum, + {% endif %} + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter: Momentum, + {% endif %} {% if "iter" in args.split_function_arg_names %} iter: int, {% endif %} + {% if "max_counter" in args.split_function_arg_names %} + max_counter: float, + {% endif %} ) -> torch.Tensor: if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( @@ -84,6 +93,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_host=momentum1.host, @@ -96,10 +126,26 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + 
prev_iter_host=prev_iter.host, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_host=row_counter.host, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} ) else: return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( @@ -151,6 +197,27 @@ def invoke( {% if "momentum" in args.split_function_arg_names %} momentum=optimizer_args.momentum, {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} # momentum1 {% if "momentum1_dev" in args.split_function_arg_names %} momentum1_dev=momentum1.dev, @@ -165,9 +232,27 @@ def invoke( momentum2_offsets=momentum2.offsets, momentum2_placements=momentum2.placements, {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} # iter {% if "iter" in args.split_function_arg_names %} iter=iter, {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} output_dtype=common_args.output_dtype, ) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 0552e9c981..87b9b1a559 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -71,6 +71,43 @@ class WeightDecayMode(enum.IntEnum): NONE = 0 L2 = 1 DECOUPLE = 2 + COUNTER = 3 + + +class CounterWeightDecayMode(enum.IntEnum): + NONE = 0 + L2 = 1 + DECOUPLE = 2 + + +class LearningRateMode(enum.IntEnum): + EQUAL = -1 + TAIL_ID_LR_INCREASE = 0 + TAIL_ID_LR_DECREASE = 1 + COUNTER_SGD = 2 + + +class GradSumDecay(enum.IntEnum): + NO_DECAY = -1 + CTR_DECAY = 0 + + +@dataclass +class TailIdThreshold: + val: float = 0 + is_ratio: bool = False + + +@dataclass +class CounterBasedRegularizationDefinition: + counter_weight_decay_mode: CounterWeightDecayMode = CounterWeightDecayMode.NONE + counter_halflife: 
int = -1 + adjustment_iter: int = -1 + adjustment_ub: float = 1.0 + learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL + grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY + tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False) + max_counter_update_freq: int = 1000 RecordCacheMetrics: NamedTuple = NamedTuple( @@ -235,6 +272,9 @@ def __init__( # noqa C901 eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, device: Optional[Union[str, int, torch.device]] = None, bounds_check_mode: BoundsCheckMode = BoundsCheckMode.WARNING, @@ -408,6 +448,34 @@ def __init__( # noqa C901 self.stochastic_rounding = stochastic_rounding self.optimizer = optimizer + self.weight_decay_mode = weight_decay_mode + if ( + weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is None + ): + raise AssertionError( + "weight_decay_mode is set to WeightDecayMode.COUNTER but counter_based_regularization is None" + ) + + self._used_rowwise_adagrad_with_counter: bool = ( + optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.ROWWISE_ADAGRAD) + and weight_decay_mode == WeightDecayMode.COUNTER + and counter_based_regularization is not None + ) + + if counter_based_regularization is None: + counter_based_regularization = CounterBasedRegularizationDefinition() + self._max_counter_update_freq: int = -1 + if self._used_rowwise_adagrad_with_counter: + self._max_counter_update_freq = ( + counter_based_regularization.max_counter_update_freq + ) + opt_arg_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + else: + opt_arg_weight_decay_mode = weight_decay_mode + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -417,9 +485,18 @@ def __init__( # noqa C901 beta1=beta1, beta2=beta2, weight_decay=weight_decay, - weight_decay_mode=weight_decay_mode.value, + weight_decay_mode=opt_arg_weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) if optimizer in ( @@ -427,25 +504,7 @@ def __init__( # noqa C901 OptimType.EXACT_SGD, ): # NOTE: make TorchScript work! 
- self.register_buffer( - "momentum1_dev", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_host", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_uvm", torch.tensor([0], dtype=torch.int64), persistent=False - ) - self.register_buffer( - "momentum1_placements", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) - self.register_buffer( - "momentum1_offsets", - torch.tensor([0], dtype=torch.int64), - persistent=False, - ) + self._register_nonpersistent_buffers("momentum1") else: self._apply_split( construct_split_state( @@ -484,29 +543,40 @@ def __init__( # noqa C901 ) else: # NOTE: make TorchScript work! - self.register_buffer( - "momentum2_dev", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_host", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, - ) - self.register_buffer( - "momentum2_uvm", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._register_nonpersistent_buffers("momentum2") + if self._used_rowwise_adagrad_with_counter: + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="prev_iter", + # TODO: ideally we should use int64 to track iter but it failed to compile. + # It may be related to low precision training code. Currently using float32 + # as a workaround while investigating the issue. + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) - self.register_buffer( - "momentum2_placements", - torch.zeros(1, dtype=torch.int64, device=self.current_device), - persistent=False, + self._apply_split( + construct_split_state( + embedding_specs, + rowwise=True, + cacheable=False, + ), + prefix="row_counter", + # pyre-fixme[6]: Expected `Type[Type[torch._dtype]]` for 3rd param + # but got `Type[torch.float32]`. + dtype=torch.float32, ) + self.register_buffer("max_counter", torch.tensor([1], dtype=torch.float32)) + else: + self._register_nonpersistent_buffers("prev_iter") + self._register_nonpersistent_buffers("row_counter") self.register_buffer( - "momentum2_offsets", - torch.zeros(1, dtype=torch.int64, device=self.current_device), + "max_counter", + torch.ones(1, dtype=torch.float32, device=self.current_device), persistent=False, ) if optimizer in ( @@ -519,6 +589,7 @@ def __init__( # noqa C901 self.register_buffer( "iter", torch.zeros(1, dtype=torch.int64, device=self.current_device) ) + else: self.register_buffer( "iter", @@ -572,6 +643,34 @@ def __init__( # noqa C901 self.step = 0 + def _register_nonpersistent_buffers(self, prefix: str) -> None: + # NOTE: make TorchScript work! 
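+        # Register zero-filled, non-persistent placeholder buffers
+        # ({prefix}_dev/_host/_uvm/_placements/_offsets) so that attribute
+        # lookups still resolve under TorchScript when this optimizer state
+        # is unused.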
+ self.register_buffer( + f"{prefix}_dev", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_host", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_uvm", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_placements", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + self.register_buffer( + f"{prefix}_offsets", + torch.zeros(1, dtype=torch.int64, device=self.current_device), + persistent=False, + ) + def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: if not hasattr(self, f"{prefix}_physical_placements"): raise DoesNotHavePrefix() @@ -590,7 +689,7 @@ def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tenso def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]: all_states = [] - for prefix in ["weights", "momentum1", "momentum2"]: + for prefix in ["weights", "momentum1", "momentum2", "prev_iter", "row_counter"]: try: all_states.append(self.get_states(prefix)) except DoesNotHavePrefix: @@ -681,10 +780,20 @@ def forward( return invokers.lookup_approx_sgd.invoke(common_args, self.optimizer_args) momentum1 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum1_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum1_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum1_placements, ) @@ -696,21 +805,22 @@ def forward( return invokers.lookup_adagrad.invoke( common_args, self.optimizer_args, momentum1 ) - if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: - return invokers.lookup_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) - if self.optimizer == OptimType.ROWWISE_ADAGRAD: - assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" - return invokers.lookup_approx_rowwise_adagrad.invoke( - common_args, self.optimizer_args, momentum1 - ) momentum2 = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. dev=self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. host=self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. uvm=self.momentum2_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. offsets=self.momentum2_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. placements=self.momentum2_placements, ) # Ensure iter is always on CPU so the increment doesn't synchronize. @@ -768,6 +878,79 @@ def forward( self.iter.item(), ) + prev_iter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. 
+ host=self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.prev_iter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.prev_iter_placements, + ) + row_counter = invokers.lookup_args.Momentum( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got `Union[Tensor, + # nn.Module]`. + dev=self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got `Union[Tensor, + # nn.Module]`. + host=self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got `Union[Tensor, + # nn.Module]`. + uvm=self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got `Union[Tensor, + # nn.Module]`. + offsets=self.row_counter_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got `Union[Tensor, + # nn.Module]`. + placements=self.row_counter_placements, + ) + if self._used_rowwise_adagrad_with_counter: + if self.iter.item() % self._max_counter_update_freq == 0: + max_counter = torch.max(self.row_counter_dev.detach()) + self.max_counter = max_counter.cpu() + 1 + + if self.optimizer == OptimType.EXACT_ROWWISE_ADAGRAD: + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + if self.optimizer == OptimType.ROWWISE_ADAGRAD: + assert self.use_cpu, "Approx rowwise AdaGrad is only supported in CPU mode" + if self._used_rowwise_adagrad_with_counter: + return invokers.lookup_approx_rowwise_adagrad_with_counter.invoke( + common_args, + self.optimizer_args, + momentum1, + prev_iter, + row_counter, + # pyre-fixme[6]: Expected `int` for 6th param but got `Union[float, int]`. + self.iter.item(), + self.max_counter.item(), + ) + else: + return invokers.lookup_approx_rowwise_adagrad.invoke( + common_args, self.optimizer_args, momentum1 + ) + raise ValueError(f"Invalid OptimType: {self.optimizer}") def reset_uvm_cache_stats(self) -> None: @@ -1013,8 +1196,12 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: or self.optimizer == OptimType.ROWWISE_ADAGRAD or self.optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD ): + split_optimizer_states = self.split_optimizer_states() list_of_state_dict = [ - {"sum": _sum[0]} for _sum in self.split_optimizer_states() + {"sum": states[0], "prev_iter": states[1], "row_counter": states[2]} + if self._used_rowwise_adagrad_with_counter + else {"sum": states[0]} + for states in split_optimizer_states ] else: raise NotImplementedError( @@ -1024,7 +1211,9 @@ def get_optimizer_state(self) -> List[Dict[str, torch.Tensor]]: return list_of_state_dict @torch.jit.ignore - def split_optimizer_states(self) -> List[Tuple[torch.Tensor]]: + def split_optimizer_states( + self, + ) -> List[List[torch.Tensor]]: """ Returns a list of states, split by table """ @@ -1062,8 +1251,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. 
self.momentum1_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum1_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1087,8 +1282,14 @@ def get_optimizer_states( ): states.append( get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. self.momentum2_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. self.momentum2_uvm, # pyre-fixme[6]: Expected `Tensor` for 4th param but got # `Union[Tensor, nn.Module]`. @@ -1100,7 +1301,49 @@ def get_optimizer_states( in (OptimType.PARTIAL_ROWWISE_ADAM, OptimType.PARTIAL_ROWWISE_LAMB), ) ) - return list(zip(*states)) + if self._used_rowwise_adagrad_with_counter: + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. + self.prev_iter_physical_placements, + rowwise=True, + ) + ) + states.append( + get_optimizer_states( + # pyre-fixme[6]: Expected `Tensor` for 1st param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_dev, + # pyre-fixme[6]: Expected `Tensor` for 2nd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_host, + # pyre-fixme[6]: Expected `Tensor` for 3rd param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_uvm, + # pyre-fixme[6]: Expected `Tensor` for 4th param but got + # `Union[Tensor, nn.Module]`. + self.row_counter_physical_offsets, + # pyre-fixme[6]: Expected `Tensor` for 5th param but got + # `Union[Tensor, nn.Module]`. 
+ self.row_counter_physical_placements, + rowwise=True, + ) + ) + return_states = [list(s) for s in zip(*states)] + return return_states @torch.jit.export def set_learning_rate(self, lr: float) -> None: diff --git a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py index 1eec03fdd9..250f84abb6 100644 --- a/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/ssd_split_table_batched_embeddings_ops.py @@ -18,6 +18,7 @@ from fbgemm_gpu.split_table_batched_embeddings_ops import ( align_to_cacheline, CacheAlgorithm, + CounterBasedRegularizationDefinition, DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, EmbeddingLocation, PoolingMode, @@ -88,6 +89,9 @@ def __init__( eta: float = 0.001, # used by LARS-SGD, beta1: float = 0.9, # used by LAMB and ADAM beta2: float = 0.999, # used by LAMB and ADAM + counter_based_regularization: Optional[ + CounterBasedRegularizationDefinition + ] = None, # used by Rowwise Adagrad pooling_mode: PoolingMode = PoolingMode.SUM, ) -> None: super(SSDTableBatchedEmbeddingBags, self).__init__() @@ -217,6 +221,12 @@ def __init__( self.ssd_set_end = torch.cuda.Event() self.timesteps_prefetched: List[int] = [] + if weight_decay_mode == WeightDecayMode.COUNTER or counter_based_regularization: + raise AssertionError( + "weight_decay_mode = WeightDecayMode.COUNTER is not supported for SSD TBE." + ) + counter_based_regularization = CounterBasedRegularizationDefinition() + self.optimizer_args = invokers.lookup_args.OptimizerArgs( stochastic_rounding=stochastic_rounding, gradient_clipping=gradient_clipping, @@ -229,6 +239,15 @@ def __init__( weight_decay_mode=weight_decay_mode.value, eta=eta, momentum=momentum, + counter_halflife=counter_based_regularization.counter_halflife, + adjustment_iter=counter_based_regularization.adjustment_iter, + adjustment_ub=counter_based_regularization.adjustment_ub, + learning_rate_mode=counter_based_regularization.learning_rate_mode.value, + grad_sum_decay=counter_based_regularization.grad_sum_decay.value, + tail_id_threshold=counter_based_regularization.tail_id_threshold.val, + is_tail_id_thresh_ratio=int( + counter_based_regularization.tail_id_threshold.is_ratio + ), ) self.weights_dev = nn.Parameter( torch.empty((0,), device=self.current_device, dtype=torch.float32) diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index c0ba5f6f64..6a4d299b80 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -9,10 +9,11 @@ import copy +import math import pickle import random import unittest -from typing import List, Optional +from typing import List, Optional, Tuple import fbgemm_gpu import fbgemm_gpu.split_table_batched_embeddings_ops as split_table_batched_embeddings_ops @@ -31,11 +32,16 @@ ) from fbgemm_gpu.split_table_batched_embeddings_ops import ( BoundsCheckMode, + CounterBasedRegularizationDefinition, + CounterWeightDecayMode, + GradSumDecay, INT8_EMB_ROW_DIM_OFFSET, + LearningRateMode, OptimType, RecordCacheMetrics, rounded_row_size_in_bytes, SparseType, + TailIdThreshold, WeightDecayMode, ) from hypothesis import assume, given, HealthCheck, settings, Verbosity @@ -1627,6 +1633,7 @@ def execute_backward_adagrad_( # noqa C901 use_cpu: bool, exact: bool, output_dtype: SparseType, + weight_decay_mode: WeightDecayMode = WeightDecayMode.NONE, ) -> None: # NOTE: cache is not applicable to CPU version. 
assume(not use_cpu or not use_cache) @@ -1826,31 +1833,39 @@ def execute_backward_adagrad_( # noqa C901 goc = torch.cat(gos, dim=0) fc2.backward(goc) cc.flush() - split_optimizer_states = [s for (s,) in cc.split_optimizer_states()] + split_optimizer_states = cc.split_optimizer_states() + assert len(split_optimizer_states) == T tolerance = ( 1.0e-4 if weights_precision == SparseType.FP32 and output_dtype == SparseType.FP32 else 1.0e-2 ) for t in range(T): + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # pyre-fixme[16]: `Optional` has no attribute `float`. ref_optimizer_state = bs[t].weight.grad.float().cpu().to_dense().pow(2) torch.testing.assert_close( - split_optimizer_states[t].float().cpu(), + m1.float().cpu(), ref_optimizer_state.mean(dim=1) if row_wise else ref_optimizer_state, atol=tolerance, rtol=tolerance, ) for t in range(T): # optimizer_state = squares (no row-wise) or sum squares (row-wise) + if row_wise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, c1, c2) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] torch.testing.assert_close( cc.split_embedding_weights()[t].float().cpu(), torch.addcdiv( bs[t].weight.float().cpu(), value=-lr, tensor1=bs[t].weight.grad.float().cpu().to_dense(), - tensor2=split_optimizer_states[t] - .float() + tensor2=m1.float() .sqrt_() .add_(eps) .view(Es[t], 1 if row_wise else Ds[t]) @@ -2589,6 +2604,8 @@ def execute_backward_optimizers_( # noqa C901 0.9, 0.01, ) + counter_based_regularization: CounterBasedRegularizationDefinition + if optimizer == OptimType.EXACT_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2596,6 +2613,21 @@ def execute_backward_optimizers_( # noqa C901 optimizer_kwargs["eps"] = eps optimizer_kwargs["weight_decay"] = weight_decay optimizer_kwargs["weight_decay_mode"] = weight_decay_mode + if weight_decay_mode == WeightDecayMode.COUNTER: + counter_based_regularization = CounterBasedRegularizationDefinition( + counter_weight_decay_mode=CounterWeightDecayMode.DECOUPLE, + counter_halflife=20000, + adjustment_iter=24000, + adjustment_ub=0.1, + learning_rate_mode=LearningRateMode.TAIL_ID_LR_DECREASE, + grad_sum_decay=GradSumDecay.NO_DECAY, + tail_id_threshold=TailIdThreshold(val=1000, is_ratio=False), + ) + + optimizer_kwargs[ + "counter_based_regularization" + # pyre-fixme[6]: Expected `float` for 2nd param but got `CounterBasedRegularizationDefinition`. + ] = counter_based_regularization if optimizer == OptimType.EXACT_ROWWISE_WEIGHTED_ADAGRAD: optimizer_kwargs["eps"] = eps @@ -2654,15 +2686,39 @@ def execute_backward_optimizers_( # noqa C901 if optimizer in (OptimType.EXACT_ROWWISE_ADAGRAD, OptimType.EXACT_ADAGRAD): rowwise = optimizer == OptimType.EXACT_ROWWISE_ADAGRAD for t in range(T): - (m1,) = split_optimizer_states[t] + row_counter: Optional[torch.Tensor] = None + freq: Optional[torch.Tensor] = None + iter_: int = -1 + + if rowwise and weight_decay_mode == WeightDecayMode.COUNTER: + (m1, prev_iter, row_counter) = split_optimizer_states[t] + else: + (m1,) = split_optimizer_states[t] # to_dense in GPU is non-deterministic due to atmomics used in # coalescing and floating point non-associativity. # pyre-fixme[16]: `Optional` has no attribute `cpu`. 
dense_cpu_grad = bs[t].weight.grad.cpu().to_dense() - if rowwise and not use_cpu and weight_decay_mode == WeightDecayMode.L2: + if rowwise and not use_cpu: # We need to skip when using cpu because use_fbgemm (https://fburl.com/code/12131iub) # is true and the template code (https://fburl.com/code/1kctlup3) is not executed. - dense_cpu_grad += weight_decay * bs[t].weight.cpu() + if weight_decay_mode == WeightDecayMode.L2: + dense_cpu_grad += weight_decay * bs[t].weight.cpu() + elif weight_decay_mode == WeightDecayMode.COUNTER: + iter_ = int(cc.iter.item()) + ( + dense_cpu_grad, + row_counter, + freq, + ) = self.get_grad_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + counter_based_regularization, + row_counter.cpu(), + prev_iter.cpu(), + iter_, + weight_decay, + ) + m1_ref = ( dense_cpu_grad.pow(2) if not rowwise @@ -2681,14 +2737,31 @@ def execute_backward_optimizers_( # noqa C901 ) + eps ) - if ( - rowwise - and not use_cpu - and weight_decay_mode == WeightDecayMode.DECOUPLE - ): - weights_ref = bs[t].weight.cpu() - lr * ( - dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() - ) + if rowwise and not use_cpu: + if weight_decay_mode == WeightDecayMode.DECOUPLE: + weights_ref = bs[t].weight.cpu() - lr * ( + dense_cpu_grad / denom + weight_decay * bs[t].weight.cpu() + ) + elif weight_decay_mode == WeightDecayMode.L2: + # pyre-fixme[58]: `/` is not supported for operand types `float` + # and `Tensor`. + weights_ref = bs[t].weight.cpu() - lr * dense_cpu_grad / denom + elif weight_decay_mode == WeightDecayMode.COUNTER: + max_counter = cc.max_counter.item() + weights_ref = self.get_wts_from_counter_adagrad( + dense_cpu_grad, + bs[t].weight.cpu(), + denom, + counter_based_regularization, + row_counter, + # pyre-fixme[6]: Expected `Tensor` for 6th param but got `Optional[Tensor]` + freq, + max_counter, + iter_, + eps, + lr, + weight_decay, + ) else: # pyre-fixme[58]: `/` is not supported for operand types `float` # and `Tensor`. 
@@ -2833,6 +2906,117 @@ def execute_backward_optimizers_( # noqa C901 rtol=1.0e-4, ) + def get_grad_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + prev_iter: torch.Tensor, + iter_: int, + weight_decay: float, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + row_counter = row_counter.view(row_counter.numel(), 1) + prev_iter = prev_iter.view(prev_iter.numel(), 1) + freq = torch.ones_like(row_counter) + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + l2_wd = 1.0 if counter_weight_decay_mode == CounterWeightDecayMode.L2 else 0.0 + + if counter_halflife > 0: + counter_log_rho = math.log(2.0) / counter_halflife + # if id occurs multiple times in a batch, iter_delta=1 + iter_delta = torch.where(prev_iter == 0.0, 1.0, iter_ * 1.0 - prev_iter) + prev_iter = iter_ * torch.ones_like(prev_iter) + row_counter = 1.0 + torch.exp(-iter_delta * counter_log_rho) * row_counter + freq = torch.tensor([counter_halflife]) / row_counter + + dense_cpu_grad += l2_wd * freq * weight_decay * weights + return dense_cpu_grad, row_counter, freq + + def get_wts_from_counter_adagrad( + self, + dense_cpu_grad: torch.Tensor, + weights: torch.Tensor, + denom: torch.Tensor, + counter_based_regularization: CounterBasedRegularizationDefinition, + row_counter: torch.Tensor, + freq: torch.Tensor, + max_counter: float, + iter_: int, + eps: float, + learning_rate: float, + weight_decay: float, + ) -> torch.Tensor: + counter_weight_decay_mode = ( + counter_based_regularization.counter_weight_decay_mode + ) + counter_halflife = counter_based_regularization.counter_halflife + tail_id_threshold_val = counter_based_regularization.tail_id_threshold.val + if counter_based_regularization.tail_id_threshold.is_ratio: + tail_id_threshold_val = math.floor(tail_id_threshold_val * max_counter) + learning_rate_mode = counter_based_regularization.learning_rate_mode + adjustment_iter = counter_based_regularization.adjustment_iter + adjustment_ub = counter_based_regularization.adjustment_ub + + multiplier = torch.tensor([learning_rate]) / denom + adjusted_multiplier = multiplier + exp_reg_correction = torch.ones_like(row_counter) + + if counter_halflife > 0: + if adjustment_iter <= 0 or ( + adjustment_iter > 0 and iter_ > adjustment_iter + ): + if learning_rate_mode == LearningRateMode.TAIL_ID_LR_INCREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.maximum( + torch.minimum( + torch.pow( + torch.tensor([max_counter]) / (row_counter + 1.0), + adjustment_ub, + ), + torch.Tensor([10.0]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.TAIL_ID_LR_DECREASE: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + multiplier + * torch.minimum( + torch.maximum( + torch.pow( + (row_counter + 1.0) / max_counter, + adjustment_ub, + ), + torch.Tensor([0.1]), + ), + torch.Tensor([1.0]), + ), + multiplier, + ) + elif learning_rate_mode == LearningRateMode.COUNTER_SGD: + adjusted_multiplier = torch.where( + row_counter > tail_id_threshold_val, + torch.Tensor([learning_rate]) + / (torch.sqrt(adjustment_ub * row_counter) + eps), + multiplier, + ) + + if counter_weight_decay_mode == CounterWeightDecayMode.DECOUPLE: + exp_reg_correction = 1.0 - freq * weight_decay * learning_rate + elif 
counter_weight_decay_mode == CounterWeightDecayMode.L2: + exp_reg_correction = 1.0 - freq * weight_decay * multiplier + + weights = exp_reg_correction * weights - adjusted_multiplier * dense_cpu_grad + return weights + @given( T=st.integers(min_value=1, max_value=5), D=st.integers(min_value=2, max_value=256), @@ -2901,7 +3085,7 @@ def test_backward_optimizers_adam( # noqa C901 D=st.integers(min_value=2, max_value=256), B=st.integers(min_value=1, max_value=128), log_E=st.integers(min_value=3, max_value=5), - L=st.integers(min_value=0, max_value=20), + L=st.integers(min_value=2, max_value=20), weighted=st.booleans(), mixed=st.booleans(), optimizer=st.sampled_from( @@ -2928,6 +3112,7 @@ def test_backward_optimizers_adam( # noqa C901 [ WeightDecayMode.L2, WeightDecayMode.DECOUPLE, + WeightDecayMode.COUNTER, ] ), ) From f388b955cf5d3f080e750fb3fd445cf69d523f65 Mon Sep 17 00:00:00 2001 From: Junjie Yang Date: Tue, 14 Mar 2023 00:08:01 -0700 Subject: [PATCH 04/34] Remove sync point in jagged_dense_elementwise_add_jagged_output backward (#1642) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1642 Remove sync point in jagged_dense_elementwise_add_jagged_output backward Reviewed By: brad-mengchi Differential Revision: D44039901 fbshipit-source-id: 8e7e23e4d9e01359e67e5b166adc57f894a1224d --- fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp index 283422b7ae..347ec089e0 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp +++ b/fbgemm_gpu/src/jagged_tensor_ops_autograd.cpp @@ -644,7 +644,7 @@ jagged_dense_elementwise_add_jagged_output( const Tensor& y) { // Convert to jagged auto jagged_values = - DenseToJaggedOp::apply(y, x_offsets, c10::optional())[0]; + DenseToJaggedOp::apply(y, x_offsets, x_values.size(0))[0]; // Add jagged_values + x_values -> sum_values auto sum_values = x_values + jagged_values; From f158490f6fe6e1ce3646a2f1dc80c1da86d1ad53 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 14 Mar 2023 19:28:13 -0700 Subject: [PATCH 05/34] Add Comprehensive Build Instructions and Isolate CPU and ROCm Builds (#1639) Summary: - Remove `.post0` suffix from the autogenerated package version - Document the full FBGEMM_GPU OSS build process in a separate Markdown file - Remove installation of packages not needed for ROCm builds - Migrate CPU and ROCm jobs to run on top of Docker containers instead of bare metal instances - Update GitHub workflow configuration to cancel previous jobs for a PR if a new commit is pushed to the PR Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1639 Reviewed By: shintaro-iwasaki Differential Revision: D44076312 Pulled By: q10 fbshipit-source-id: 6b2d083022feb7421b26da2d998678e00c11f283 --- .github/scripts/setup_env.bash | 213 +++++---- .github/workflows/fbgemm_ci.yml | 5 + .github/workflows/fbgemm_gpu_ci.yml | 19 +- .github/workflows/fbgemm_gpu_lint.yml | 5 + .github/workflows/fbgemm_nightly_build.yml | 5 + .../workflows/fbgemm_nightly_build_cpu.yml | 18 + .github/workflows/fbgemm_release_build.yml | 5 + .../workflows/fbgemm_release_build_cpu.yml | 17 + fbgemm_gpu/docs/BuildInstructions.md | 430 ++++++++++++++++++ fbgemm_gpu/docs/README.md | 2 +- fbgemm_gpu/setup.py | 6 +- 11 files changed, 639 insertions(+), 86 deletions(-) create mode 100644 fbgemm_gpu/docs/BuildInstructions.md diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 
4f1c808598..ccdac79097 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -13,8 +13,13 @@ print_exec () { echo "+ $*" echo "" - "$@" + if "$@"; then + local retcode=0 + else + local retcode=$? + fi echo "" + return $retcode } exec_with_retries () { @@ -205,7 +210,7 @@ run_python_test () { echo "################################################################################" fi - if conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" else echo "[TEST] Python test suite FAILED: ${python_test_file}" @@ -313,7 +318,7 @@ print_ec2_info () { ################################################################################ -# Environment Setup and Install Functions +# Miniconda Setup Functions ################################################################################ setup_miniconda () { @@ -398,6 +403,11 @@ create_conda_environment () { echo "[SETUP] Successfully created Conda environment: ${env_name}" } + +################################################################################ +# PyTorch Setup Functions +################################################################################ + install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" @@ -553,6 +563,28 @@ install_pytorch_pip () { echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" } + +################################################################################ +# CUDA Setup Functions +################################################################################ + +install_nvidia_drivers_centos () { + echo "################################################################################" + echo "# Install NVIDIA Drivers" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "[SETUP] Adding NVIDIA repos to yum ..." + print_exec sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + print_exec sudo yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + print_exec sudo yum clean expire-cache + + echo "[SETUP] Installing NVIDIA drivers ..." 
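+  # The -dkms driver variant rebuilds the NVIDIA kernel module against the
+  # running kernel via DKMS, so the driver survives kernel upgrades.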
+ install_system_packages nvidia-driver-latest-dkms +} + install_cuda () { local env_name="$1" local cuda_version="$2" @@ -604,6 +636,86 @@ install_cuda () { echo "[INSTALL] Successfully installed CUDA ${cuda_version}" } +install_cudnn () { + local env_name="$1" + local install_path="$2" + local cuda_version="$3" + if [ "$cuda_version" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" + echo "Example:" + echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" + return 1 + else + echo "################################################################################" + echo "# Install cuDNN" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + # Install cuDNN manually + # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh + local cudnn_packages=( + ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" + ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" + ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" + ) + + # Split version string by dot into array, i.e. 11.7.1 => [11, 7, 1] + # shellcheck disable=SC2206 + local cuda_version_arr=(${cuda_version//./ }) + # Fetch the major and minor version to concat + local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" + + # Get the URL + local cudnn_url="${cudnn_packages[cuda_concat_version]}" + if [ "$cudnn_url" == "" ]; then + # Default to cuDNN for 11.7 if no CUDA version fits + echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" + cudnn_url="${cudnn_packages[117]}" + fi + + # Clear the install path + rm -rf "$install_path" + mkdir -p "$install_path" + + # Create temporary directory + # shellcheck disable=SC2155 + local tmp_dir=$(mktemp -d) + cd "$tmp_dir" || return 1 + + # Download cuDNN + echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." + (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 + + # Unpack the tarball + echo "[INSTALL] Unpacking cuDNN ..." + tar -xvf cudnn.tar.xz + + # Copy the includes and libs over to the install path + echo "[INSTALL] Moving cuDNN files to ${install_path} ..." + rm -rf "${install_path:?}/include" + rm -rf "${install_path:?}/lib" + mv cudnn-linux-*/include "$install_path" + mv cudnn-linux-*/lib "$install_path" + + # Delete the temporary directory + cd - || return 1 + rm -rf "$tmp_dir" + + # Export the environment variables to the Conda environment + echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." 
+ print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" + + echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" +} + +################################################################################ +# ROCm Setup Functions +################################################################################ + install_rocm_ubuntu () { local env_name="$1" local rocm_version="$2" @@ -652,15 +764,25 @@ install_rocm_ubuntu () { (exec_with_retries amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms) || return 1 echo "[INSTALL] Installing HIP-relevant packages ..." - install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev install_system_packages hipify-clang miopen-hip miopen-hip-dev + # There is no need to install these packages for ROCm + # install_system_packages mesa-common-dev clang comgr libopenblas-dev jp intel-mkl-full locales libnuma-dev + echo "[INSTALL] Cleaning up ..." print_exec rm -f "${package_name}" + echo "[INFO] Check ROCM GPU info ..." + print_exec rocm-smi + echo "[INSTALL] Successfully installed ROCm ${rocm_version}" } + +################################################################################ +# Build Tools Setup Functions +################################################################################ + install_cxx_compiler () { local env_name="$1" local use_system_package_manager="$2" @@ -759,82 +881,6 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } -install_cudnn () { - local env_name="$1" - local install_path="$2" - local cuda_version="$3" - if [ "$cuda_version" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME INSTALL_PATH CUDA_VERSION" - echo "Example:" - echo " ${FUNCNAME[0]} build_env \$(pwd)/cudnn_install 11.7" - return 1 - else - echo "################################################################################" - echo "# Install cuDNN" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - fi - - # Install cuDNN manually - # Based on install script in https://github.com/pytorch/builder/blob/main/common/install_cuda.sh - local cudnn_packages=( - ["115"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["116"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive.tar.xz" - ["117"]="https://ossci-linux.s3.amazonaws.com/cudnn-linux-x86_64-8.5.0.96_cuda11-archive.tar.xz" - ["118"]="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz" - ) - - # Split version string by dot into array, i.e. 
11.7.1 => [11, 7, 1] - # shellcheck disable=SC2206 - local cuda_version_arr=(${cuda_version//./ }) - # Fetch the major and minor version to concat - local cuda_concat_version="${cuda_version_arr[0]}${cuda_version_arr[1]}" - - # Get the URL - local cudnn_url="${cudnn_packages[cuda_concat_version]}" - if [ "$cudnn_url" == "" ]; then - # Default to cuDNN for 11.7 if no CUDA version fits - echo "[INSTALL] Defaulting to cuDNN for CUDA 11.7" - cudnn_url="${cudnn_packages[117]}" - fi - - # Clear the install path - rm -rf "$install_path" - mkdir -p "$install_path" - - # Create temporary directory - # shellcheck disable=SC2155 - local tmp_dir=$(mktemp -d) - cd "$tmp_dir" || return 1 - - # Download cuDNN - echo "[INSTALL] Downloading cuDNN to ${tmp_dir} ..." - (exec_with_retries wget -q "$cudnn_url" -O cudnn.tar.xz) || return 1 - - # Unpack the tarball - echo "[INSTALL] Unpacking cuDNN ..." - tar -xvf cudnn.tar.xz - - # Copy the includes and libs over to the install path - echo "[INSTALL] Moving cuDNN files to ${install_path} ..." - rm -rf "${install_path:?}/include" - rm -rf "${install_path:?}/lib" - mv cudnn-linux-*/include "$install_path" - mv cudnn-linux-*/lib "$install_path" - - # Delete the temporary directory - cd - || return 1 - rm -rf "$tmp_dir" - - # Export the environment variables to the Conda environment - echo "[INSTALL] Set environment variables CUDNN_INCLUDE_DIR and CUDNN_LIBRARY ..." - print_exec conda env config vars set -n "${env_name}" CUDNN_INCLUDE_DIR="${install_path}/include" CUDNN_LIBRARY="${install_path}/lib" - - echo "[INSTALL] Successfully installed cuDNN (for CUDA ${cuda_version})" -} - ################################################################################ # Combination Functions @@ -876,7 +922,7 @@ create_conda_pytorch_environment () { ################################################################################ -# Build Functions +# FBGEMM_GPU Build Functions ################################################################################ prepare_fbgemm_gpu_build () { @@ -895,6 +941,11 @@ prepare_fbgemm_gpu_build () { echo "" fi + if [[ "${GITHUB_WORKSPACE}" ]]; then + # https://github.com/actions/checkout/issues/841 + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + fi + echo "[BUILD] Running git submodules update ..." 
git submodule sync git submodule update --init --recursive diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index f6bae56123..977b443a2b 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -13,6 +13,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-posix: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 8e021c4451..bd62f23761 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -13,9 +13,17 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build_and_test_amd: runs-on: ${{ matrix.os }} + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -25,11 +33,18 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04 ] + os: [ linux.12xlarge ] + container-image: [ "ubuntu:20.04" ] python-version: [ "3.10" ] rocm-version: [ "5.3" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -74,7 +89,7 @@ jobs: print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a print_exec conda run -n $BUILD_ENV python setup.py build develop - - name: Test FBGEMM_GPU-ROCM Nightly installation + - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index dc2b6344ce..1ff7203108 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -14,6 +14,11 @@ on: branches: - main +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: run_pylint: runs-on: ubuntu-latest diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index 4cdb10aaa8..bc699ef62b 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -30,6 +30,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 72a0af01e7..1125b17a0d 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -30,10 +30,19 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + # https://stackoverflow.com/questions/66335225/how-to-cancel-previous-runs-in-the-pr-when-you-push-new-commitsupdate-the-curre + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -48,6 +57,9 @@ jobs: python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -93,6 +105,9 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: runs-on: ${{ matrix.os }} + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -107,6 +122,9 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 5e3d369fe4..def6002a76 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -22,6 +22,11 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # Build on CPU hosts and upload to GHA build_artifact: diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index a652c89854..c7fb53cabd 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -22,10 +22,18 @@ on: # workflow_dispatch: +concurrency: + # Cancel previous runs in the PR if a new commit is pushed + group: ${{ github.workflow }}-${{ 
github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   # Build on CPU hosts, run tests, and upload to GHA
   build_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -40,6 +48,9 @@ jobs:
         python-version: [ "3.8", "3.9", "3.10" ]

     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
@@ -85,6 +96,9 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
     runs-on: ${{ matrix.os }}
+    container:
+      image: amazonlinux:2023
+      options: --user root
     defaults:
       run:
         shell: bash
@@ -99,6 +113,9 @@ jobs:
     needs: build_artifact

     steps:
+    - name: Setup Build Container
+      run: yum update -y; yum install -y binutils findutils git sudo wget which
+
     - name: Checkout the Repository
       uses: actions/checkout@v3
       with:
diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md
new file mode 100644
index 0000000000..a90a059b40
--- /dev/null
+++ b/fbgemm_gpu/docs/BuildInstructions.md
@@ -0,0 +1,430 @@
+# FBGEMM_GPU Build Instructions
+
+The most up-to-date instructions are embedded in
+[`setup_env.bash`](../../.github/scripts/setup_env.bash). The general steps for
+building FBGEMM_GPU are as follows:
+
+1. Set up an isolated environment for building (Miniconda)
+1. Install the relevant build tools (C/C++ compiler)
+1. Set up for either CUDA, ROCm, or CPU build
+1. Install PyTorch
+1. Run the build
+
+
+## Set Up an Isolated Build Environment
+
+### Install Miniconda
+
+Setting up a [Miniconda](https://docs.conda.io/en/latest/miniconda.html)
+environment is recommended for reproducible builds:
+
+```sh
+# Set the Miniconda prefix directory
+miniconda_prefix=$HOME/miniconda
+
+# Download the Miniconda installer
+wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+
+# Run the installer
+bash miniconda.sh -b -p "$miniconda_prefix" -u
+
+# Load the shortcuts
+. ~/.bashrc
+
+# Run updates
+conda update -n base -c defaults -y conda
+```
+
+From here on out, all installation commands will be run against or inside a
+Conda environment.
+
+
+### Set Up the Conda Environment
+
+Create a Conda environment with the specified Python version:
+
+```sh
+env_name=
+python_version=3.10
+
+# Create the environment
+conda create -y --name "${env_name}" python="${python_version}"
+
+# Upgrade PIP and the pyOpenSSL package
+conda run -n "${env_name}" pip install --upgrade pip
+conda run -n "${env_name}" python -m pip install "pyOpenSSL>22.1.0"
+```
+
+## Install the Build Tools
+
+### C/C++ Compiler
+
+Install the GCC toolchain. Note that GCC (as opposed to LLVM, for example) is
+required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++`
+being in the path.
+
+```sh
+conda install -n "${env_name}" -y gxx_linux-64=9.3.0
+```
+
+Note that while newer versions of GCC can be used, binaries compiled under newer
+versions of GCC will not be compatible with older systems such as Ubuntu 20.04
+or CentOS Stream 8, because the compiled library will reference symbols from
+versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support.
+To see which versions of GLIBCXX the available `libstdc++.so.6` supports:
+
+```sh
+libcxx_path=/path/to/libstdc++.so.6
+objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat
+```
+
+### Other Build Tools
+
+Install the other necessary build tools such as `ninja`, `cmake`, etc:
+
+```sh
+conda install -n "${env_name}" -y \
+    click \
+    cmake \
+    hypothesis \
+    jinja2 \
+    ninja \
+    numpy \
+    scikit-build \
+    wheel
+```
+
+
+## Set Up for CUDA Build
+
+The CUDA build of FBGEMM_GPU requires a version of `nvcc` that supports compute
+capability 3.5+. Setting the machine up for CUDA builds of FBGEMM_GPU can be
+done either through pre-built Docker images or through Conda installation on
+bare metal. Note that neither a GPU nor the NVIDIA drivers need to be present
+for builds, since they are only used at runtime.
+
+### Docker Image
+
+For setups through Docker, simply pull the pre-installed
+[Docker image for CUDA](https://hub.docker.com/r/nvidia/cuda) for the desired
+Linux distribution and CUDA version.
+
+```sh
+# Run for Ubuntu 22.04, CUDA 11.8
+docker run -it --entrypoint "/bin/bash" nvidia/cuda:11.8.0-devel-ubuntu22.04
+```
+
+From there, the rest of the build environment may be constructed through Conda.
+
+### Install CUDA
+
+Install the full CUDA package through Conda, which includes
+[NVML](https://developer.nvidia.com/nvidia-management-library-nvml):
+
+```sh
+cuda_version=11.7.1
+
+# Install the full CUDA package
+conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}"
+```
+
+Ensure that, at the minimum, **`cuda_runtime.h`** and **`libnvidia-ml.so`** are
+found:
+
+```sh
+conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
+find "${conda_prefix}" -name cuda_runtime.h
+find "${conda_prefix}" -name libnvidia-ml.so
+```
+
+### Install cuDNN
+
+[cuDNN](https://developer.nvidia.com/cudnn) is a build-time dependency for the
+CUDA variant of FBGEMM_GPU. Download and extract the cuDNN package for the
+given CUDA version:
+
+```sh
+# cuDNN package URLs can be found in: https://github.com/pytorch/builder/blob/main/common/install_cuda.sh
+cudnn_url=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
+
+# Download and unpack cuDNN
+wget -q "${cudnn_url}" -O cudnn.tar.xz
+```
+
+### [OPTIONAL] Install CUB
+
+[CUB](https://docs.nvidia.com/cuda/cub/index.html) is a build-time dependency for
+the CUDA variant of FBGEMM_GPU. It must be installed separately for
+**older versions of CUDA (prior to 11.1)**, since those versions did not come
+packaged with CUB.
+
+To install CUB through Conda:
+
+```sh
+conda install -c bottler nvidiacub
+```
+
+Alternatively, CUB may be installed manually by downloading it from the
+[GitHub Releases](https://github.com/NVIDIA/cub/releases) page and unpacking
+the package:
+
+```sh
+# Download and unpack CUB
+wget -q https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz
+```
+
+
+## Set Up for ROCm Build
+
+Setting the machine up for ROCm builds of FBGEMM_GPU can be done either through
+pre-built Docker images or through bare metal.
+
+### Docker Image
+
+For setups through Docker, simply pull the pre-installed
+[Docker image for ROCm](https://hub.docker.com/r/rocm/rocm-terminal) for the
+desired ROCm version.
+
+```sh
+# Run for ROCm 5.4.2
+docker run -it --entrypoint "/bin/bash" rocm/rocm-terminal:5.4.2
+```
+
+From there, the rest of the build environment may be constructed through Conda.
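+
+As a quick sanity check, one can confirm the ROCm toolchain inside the
+container before proceeding with the rest of the setup (a minimal sketch,
+assuming the image ships `hipcc` on the `PATH` and installs ROCm under
+`/opt/rocm`):
+
+```sh
+# Confirm that the HIP compiler is present (no GPU is needed for this check)
+hipcc --version
+
+# List the installed ROCm components
+ls /opt/rocm
+```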
+
+### Install ROCm
+
+Install the full ROCm package through the operating system package manager. The
+full instructions can be found in the
+[ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html):
+
+```sh
+# [OPTIONAL] Disable apt installation prompts
+export DEBIAN_FRONTEND=noninteractive
+
+# Update the repo DB
+apt update
+
+# Download the installer
+wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/focal/amdgpu-install_5.4.50403-1_all.deb
+
+# Run the installer
+apt install ./amdgpu-install_5.4.50403-1_all.deb
+
+# Install ROCm
+amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms
+```
+
+### Install MIOpen
+
+[MIOpen](https://github.com/ROCmSoftwarePlatform/MIOpen) is a dependency for the
+ROCm variant of FBGEMM_GPU that needs to be installed:
+
+```sh
+apt install hipify-clang miopen-hip miopen-hip-dev
+```
+
+
+## Install PyTorch
+
+The official [PyTorch Homepage](https://pytorch.org/get-started/locally/) contains
+the most authoritative instructions on how to install PyTorch, either through
+Conda or through PIP.
+
+### Installation Through Conda
+
+```sh
+# Install the latest nightly
+conda install -n "${env_name}" -y pytorch -c pytorch-nightly
+# Install the latest test (RC)
+conda install -n "${env_name}" -y pytorch -c pytorch-test
+# Install a specific version
+conda install -n "${env_name}" -y pytorch==1.13.1 -c pytorch
+```
+
+Note that installing PyTorch through Conda without specifying a version (as in
+the case of nightly builds) may not always be reliable. For example, it is known
+that the GPU builds for PyTorch nightlies arrive in Conda 2 hours later than the
+CPU-only builds. As such, a Conda installation of `pytorch-nightly` in that time
+window will silently fall back to installing the CPU-only version.
+
+Also note that, because both the GPU and CPU-only versions of PyTorch are placed
+into the same artifact bucket, the PyTorch variant that is selected during
+installation will depend on whether or not CUDA is installed on the system. Thus
+for GPU builds, it is important to install CUDA first, prior to PyTorch.
+
+### Installation Through PIP
+
+Note that PIP is the only supported method of installing PyTorch for ROCm builds.
+
+```sh
+# Install the latest nightly
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu117/
+# Install the latest test (RC)
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cu117/
+# Install a specific version
+conda run -n "${env_name}" pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117/
+# Install the latest nightly (ROCm 5.3)
+conda run -n "${env_name}" pip install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/rocm5.3/
+```
+
+### Post-Install Checks
+
+Verify the PyTorch installation with an `import` test:
+
+```sh
+conda run -n "${env_name}" python -c "import torch.distributed"
+```
+
+For the GPU variant of PyTorch, ensure that, at the minimum, **`cuda_cmake_macros.h`**
+is found:
+
+```sh
+conda_prefix=$(conda run -n "${env_name}" printenv CONDA_PREFIX)
+find "${conda_prefix}" -name cuda_cmake_macros.h
+```
+
+
+## Build the FBGEMM_GPU Package
+
+### Preparing the Build
+
+Clone the repo along with its submodules, and install the packages listed in
+`requirements.txt`:
+
+```sh
+# !! Run inside the Conda environment !!
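+
+# [NOTE] The clone step below must use --recursive: FBGEMM's third-party
+# dependencies (e.g. asmjit, cpuinfo, googletest) are vendored as git
+# submodules, and the build will fail if they are missing.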
+
+# Select a version tag
+FBGEMM_VERSION=v0.4.0
+
+# Clone the repo along with its submodules
+git clone --recursive -b ${FBGEMM_VERSION} https://github.com/pytorch/FBGEMM.git fbgemm_${FBGEMM_VERSION}
+
+# Install additional required packages for building and testing
+cd fbgemm_${FBGEMM_VERSION}/fbgemm_gpu
+pip install -r requirements.txt
+```
+
+### The Build Process
+
+The FBGEMM_GPU build process uses a scikit-build, CMake-based build flow, and it
+keeps state across install runs. As such, builds can become stale and can cause
+problems when re-runs are attempted after a build failure due to missing
+dependencies, etc. To address this, simply clear the build cache:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+python setup.py clean
+```
+
+### CUDA Build
+
+Building FBGEMM_GPU for CUDA requires both NVML and cuDNN to be installed and
+made available to the build through environment variables:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+# [OPTIONAL] Specify the CUDA installation paths
+# This may be required if CMake is unable to find nvcc
+export CUDACXX=/path/to/nvcc
+export CUDA_BIN_PATH=/path/to/cuda/installation
+
+# [OPTIONAL] Provide the CUB installation directory (applicable only to CUDA versions prior to 11.1)
+export CUB_DIR=/path/to/cub
+
+# Specify cuDNN header and library paths
+export CUDNN_INCLUDE_DIR=/path/to/cudnn/include
+export CUDNN_LIBRARY=/path/to/cudnn/lib
+
+# Specify NVML path
+export NVML_LIB_PATH=/path/to/libnvidia-ml.so
+
+# Update to reflect the version of Python in the Conda environment
+python_tag=py310
+package_name=fbgemm_gpu
+
+# Build for SM70/80 (V100/A100 GPU); update as needed
+# If not specified, only the CUDA architecture supported by the current system will be targeted
+# If no CUDA device is present either, all CUDA architectures will be targeted
+cuda_arch_list="7.0;8.0"
+
+# Build the wheel artifact only
+python setup.py bdist_wheel \
+    --package_name="${package_name}" \
+    --python-tag="${python_tag}" \
+    --plat-name=manylinux1_x86_64 \
+    --nvml_lib_path=${NVML_LIB_PATH} \
+    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
+
+# Build and install the library into the Conda environment
+python setup.py install \
+    --nvml_lib_path=${NVML_LIB_PATH} \
+    -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}"
+```
+
+### ROCm Build
+
+For ROCm builds, `ROCM_PATH` and `PYTORCH_ROCM_ARCH` need to be specified:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
+
+export ROCM_PATH=/path/to/rocm
+
+# Build for the ROCm architecture on the current machine; update as needed (e.g. 'gfx906;gfx908;gfx90a')
+export PYTORCH_ROCM_ARCH=$(${ROCM_PATH}/bin/rocminfo | grep -o -m 1 'gfx.*')
+
+python_tag=py310
+package_name=fbgemm_gpu_rocm
+
+# Build the wheel artifact only
+python setup.py bdist_wheel \
+    --package_name="${package_name}" \
+    --python-tag="${python_tag}" \
+    --plat-name=manylinux1_x86_64
+
+# Build and install the library into the Conda environment
+python setup.py build develop
+```
+
+### CPU-Only Build
+
+For CPU-only builds, the `--cpu_only` flag needs to be specified:
+
+```sh
+# !! Run in fbgemm_gpu/ directory inside the Conda environment !!
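+
+# [NOTE] The package name below distinguishes the CPU-only artifact
+# (fbgemm_gpu_cpu) from the CUDA (fbgemm_gpu) and ROCm (fbgemm_gpu_rocm)
+# variants built in the preceding sections.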
+ +python_tag=py310 +package_name=fbgemm_gpu_cpu + +# Build the wheel artifact only +python setup.py bdist_wheel \ + --package_name="${package_name}" \ + --python-tag="${python_tag}" \ + --plat-name=manylinux1_x86_64 \ + --cpu_only + +# Build and install the library into the Conda environment +python setup.py install --cpu_only +``` + +### Post-Build Checks + +After the build completes, it is useful to check the built library and verify +the version numbers of GLIBCXX referenced as well as the availability of certain +function symbols: + +```sh +# !! Run in fbgemm_gpu/ directory inside the Conda environment !! + +# Locate the built .SO file +fbgemm_gpu_lib_path=$(find . -name fbgemm_gpu_py.so) + +# Note the versions of GLIBCXX referenced by the .SO +# The libstdc++.so.6 available on the install target must support these versions +objdump -TC "${fbgemm_gpu_lib_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Test for the existence of a given function symbol in the .SO +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::merge_pooled_embeddings(" +nm -gDC "${fbgemm_gpu_lib_path}" | grep " fbgemm_gpu::jagged_2d_to_dense(" +``` diff --git a/fbgemm_gpu/docs/README.md b/fbgemm_gpu/docs/README.md index 097cde17dc..e2b0c81ae7 100644 --- a/fbgemm_gpu/docs/README.md +++ b/fbgemm_gpu/docs/README.md @@ -123,7 +123,7 @@ Follow these instructions to document, generate, and publish a new C++ descripti ``` pip3 install -r requirements.txt - doxygen Doxygen.ini + doxygen Doxyfile.in make html ``` diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py index 6b8ebbb570..2b34cb240a 100644 --- a/fbgemm_gpu/setup.py +++ b/fbgemm_gpu/setup.py @@ -7,6 +7,7 @@ import argparse import os import random +import re import subprocess import sys @@ -38,8 +39,9 @@ def generate_package_version(package_name: str): print( f"[SETUP.PY] TAG: {gitversion.get_tag()}, BRANCH: {gitversion.get_branch()}, SHA: {gitversion.get_sha()}" ) - # Remove the local version identifier, if any (0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) - version = gitversion.version_from_git().split("+")[0] + # Remove the local version identifier, if any (e.g. 0.4.0rc0.post0+git.6a63116c.dirty => 0.4.0rc0.post0) + # Then remove post0 (keep postN for N > 0) (e.g. 0.4.0rc0.post0 => 0.4.0rc0) + version = re.sub(".post0$", "", gitversion.version_from_git().split("+")[0]) print(f"[SETUP.PY] Setting the package version: {version}") return version From da01a59556fec9776733bf20aea8fe8fb29cdd3d Mon Sep 17 00:00:00 2001 From: Alfredo Tupone Date: Tue, 14 Mar 2023 19:39:40 -0700 Subject: [PATCH 06/34] include cstdint (#1640) Summary: fix build with gcc-13 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1640 Reviewed By: shintaro-iwasaki Differential Revision: D44044422 Pulled By: q10 fbshipit-source-id: 692ec9c34f4aaf726294a2b643fbceabf8159033 --- include/fbgemm/UtilsAvx2.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index a1af6078a8..4fb1220eba 100644 --- a/include/fbgemm/UtilsAvx2.h +++ b/include/fbgemm/UtilsAvx2.h @@ -8,6 +8,7 @@ // This file defines common utilities used in code compiled with avx2/avx512 // flags. 
+#include #include namespace fbgemm { From ae6235bc4b521102155fad4b54f92df34b5a6afe Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 15 Mar 2023 13:55:25 -0700 Subject: [PATCH 07/34] Add support for group size > 54 in group_index_select (#1611) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1611 If group size is larger than 54, internally breaks the group down into smaller groups (each subgroup size is less than or equal to 54). Reviewed By: jianyuh Differential Revision: D43585937 fbshipit-source-id: bf14eeb79881a5737dcf7660e3e0f56d21f7b326 --- fbgemm_gpu/src/sparse_ops_gpu.cpp | 39 +++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops_gpu.cpp index e3e1225fb9..0126ff414f 100644 --- a/fbgemm_gpu/src/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops_gpu.cpp @@ -500,12 +500,41 @@ Tensor index_select_dim0_gpu( std::vector group_index_select_dim0_gpu( const std::vector& input_group, const std::vector& indices_group) { + const auto group_size = input_group.size(); std::vector output_group; - apply_( - [&](auto&&... args) { - output_group = GroupIndexSelectDim0GPUOp::apply(indices_group, args...); - }, - input_group); + // We use the APPLY_AUTOGRAD_FN macros to instantiate + // GroupIndexSelectDim0GPUOp for different group sizes. We only instantiate + // up to group size of 54. + constexpr size_t max_group_size = 54; + // Specialize this path to avoid copy + if (group_size <= max_group_size) { + apply_( + [&](auto&&... args) { + output_group = + GroupIndexSelectDim0GPUOp::apply(indices_group, args...); + }, + input_group); + return output_group; + } + + const auto input_itr = input_group.begin(); + const auto indices_itr = indices_group.begin(); + + for (size_t start = 0; start < group_size; start += max_group_size) { + const auto end = std::min(start + max_group_size, group_size); + std::vector input_subgroup(input_itr + start, input_itr + end); + std::vector indices_subgroup( + indices_itr + start, indices_itr + end); + std::vector output_subgroup; + apply_( + [&](auto&&... args) { + output_subgroup = + GroupIndexSelectDim0GPUOp::apply(indices_subgroup, args...); + }, + input_subgroup); + output_group.insert( + output_group.end(), output_subgroup.begin(), output_subgroup.end()); + } return output_group; } From fd0eb83bd68a0980a03d64da2e846675777a095c Mon Sep 17 00:00:00 2001 From: Doe Hyun Yoon Date: Thu, 16 Mar 2023 10:18:15 -0700 Subject: [PATCH 08/34] Implement cache miss emulation in UVM_CACHING (#1637) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1637 Enforce cache misses (even if trace-driven testing doesn't experience cache miss due to limited trace size) so that we can evaluate performance under cache misses. Note that it's not exactly cache misses; enforce access to UVM by overriding lxu_cache_locations -- N / 256 requests. 
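For example, a hypothetical invocation (the benchmark binary name here is
illustrative only; `tbe_uvm_cache_enforced_misses` is the gflag added in this
diff) that forces 32 out of every 256 lookups (12.5%) to be treated as misses:

```sh
./tbe_benchmark --tbe_uvm_cache_enforced_misses=32
```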
Reviewed By: YuzeDaiMeta Differential Revision: D42194019 fbshipit-source-id: ab04c1cc7a749e84d605cfe4f1687489ceab5725 --- .../embedding_forward_quantized_host.cpp | 65 ++++++++-- .../split_embeddings_cache_cuda.cuh | 6 + fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 92 +++++++++++--- .../test/uvm_cache_miss_emulate_test.cpp | 119 ++++++++++++++++++ 4 files changed, 258 insertions(+), 24 deletions(-) create mode 100644 fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 6d4426cb27..43a182b6b1 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -4,12 +4,12 @@ * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ + #include #include #include #include #include -#include #include "c10/core/ScalarType.h" #ifdef FBCODE_CAFFE2 #include "common/stats/Stats.h" @@ -18,6 +18,8 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" +#include + using Tensor = at::Tensor; using namespace fbgemm_gpu; @@ -37,7 +39,7 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); -// Miss rate due to conflict in cache associativity. +// (Unique) Miss rate due to conflict in cache associativity. // # unique misses due to conflict / # requested indices. DEFINE_quantile_stat( tbe_uvm_cache_conflict_unique_miss_rate, @@ -45,6 +47,21 @@ DEFINE_quantile_stat( facebook::fb303::ExportTypeConsts::kNone, std::array{{.25, .50, .75, .99}}); +// Miss rate due to conflict in cache associativity. +// # misses due to conflict / # requested indices. +DEFINE_quantile_stat( + tbe_uvm_cache_conflict_miss_rate, + "tbe_uvm_cache_conflict_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + +// Total miss rate. +DEFINE_quantile_stat( + tbe_uvm_cache_total_miss_rate, + "tbe_uvm_cache_total_miss_rate_per_mille", + facebook::fb303::ExportTypeConsts::kNone, + std::array{{.25, .50, .75, .99}}); + // FLAGs to control UVMCacheStats. DEFINE_int32( tbe_uvm_cache_stat_report, @@ -58,6 +75,12 @@ DEFINE_int32( "If tbe_uvm_cache_stat_report is enabled, more detailed raw stats will be printed with this " "period. This should be an integer multiple of tbe_uvm_cache_stat_report."); +DEFINE_int32( + tbe_uvm_cache_enforced_misses, + 0, + "If set to non-zero, some cache lookups (tbe_uvm_cache_enforced_misses / 256) are enforced to be misses; " + "this is performance evaluation purposes only; and should be zero otherwise."); + // TODO: align this with uvm_cache_stats_index in // split_embeddings_cache_cuda.cu. const int kUvmCacheStatsSize = 6; @@ -84,10 +107,11 @@ void process_uvm_cache_stats( // uvm_cache_stats_counters[0]: num_req_indices // uvm_cache_stats_counters[1]: num_unique_indices // uvm_cache_stats_counters[2]: num_unique_misses - // uvm_cache_stats_counters[3]: num_unique_conflict_misses + // uvm_cache_stats_counters[3]: num_conflict_unique_misses + // uvm_cache_stats_counters[4]: num_conflict_misses // They should be zero-out after the calculated rates are populated into // cache counters. - static std::vector uvm_cache_stats_counters(4); + static std::vector uvm_cache_stats_counters(5); // Export cache stats. 
auto uvm_cache_stats_cpu = uvm_cache_stats.cpu(); @@ -107,19 +131,32 @@ void process_uvm_cache_stats( // Calculate cache related ratios based on the cumulated numbers and // push them into the counter pools. if (populate_uvm_stats && uvm_cache_stats_counters[0] > 0) { - double unique_rate = + const double unique_rate = static_cast(uvm_cache_stats_counters[1]) / uvm_cache_stats_counters[0] * 1000; - double unique_miss_rate = + const double unique_miss_rate = static_cast(uvm_cache_stats_counters[2]) / uvm_cache_stats_counters[0] * 1000; - double unique_conflict_miss_rate = + const double conflict_unique_miss_rate = static_cast(uvm_cache_stats_counters[3]) / uvm_cache_stats_counters[0] * 1000; + const double conflict_miss_rate = + static_cast(uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + // total # misses = unique misses - conflict_unique_misses + conflict + // misses. + const double total_miss_rate = + static_cast( + uvm_cache_stats_counters[2] - uvm_cache_stats_counters[3] + + uvm_cache_stats_counters[4]) / + uvm_cache_stats_counters[0] * 1000; + STATS_tbe_uvm_cache_unique_rate.addValue(unique_rate); STATS_tbe_uvm_cache_unique_miss_rate.addValue(unique_miss_rate); STATS_tbe_uvm_cache_conflict_unique_miss_rate.addValue( - unique_conflict_miss_rate); + conflict_unique_miss_rate); + STATS_tbe_uvm_cache_conflict_miss_rate.addValue(conflict_miss_rate); + STATS_tbe_uvm_cache_total_miss_rate.addValue(total_miss_rate); // Fill all the elements of the vector uvm_cache_stats_counters as 0 // to zero out the cumulated counters. @@ -365,7 +402,7 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( // cache_index_table_map: (linearized) index to table number map. // 1D tensor, dtype=int32. c10::optional cache_index_table_map, - // lxu_cache_state: Cache state (cached idnex, or invalid). + // lxu_cache_state: Cache state (cached index, or invalid). // 2D tensor: # sets x assoc. dtype=int64. c10::optional lxu_cache_state, // lxu_state: meta info for replacement (time stamp for LRU). @@ -461,6 +498,16 @@ Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function( uvm_cache_stats); #ifdef FBCODE_CAFFE2 + if (FLAGS_tbe_uvm_cache_enforced_misses > 0) { + // Override some lxu_cache_locations (N for every 256 indices) with cache + // miss to enforce access to UVM. + lxu_cache_locations = emulate_cache_miss( + lxu_cache_locations.value(), + FLAGS_tbe_uvm_cache_enforced_misses, + gather_uvm_stats, + uvm_cache_stats); + } + process_uvm_cache_stats( signature, total_cache_hash_size.value(), diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh index 52854a4f2e..3532928963 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh @@ -155,6 +155,12 @@ at::Tensor lxu_cache_lookup_cuda( bool gather_cache_stats, c10::optional uvm_cache_stats); +at::Tensor emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::Tensor uvm_cache_stats); + ///@ingroup table-batched-embed-cuda /// Lookup the LRU/LFU cache: find the cache weights location for all indices. 
/// Look up the slots in the cache corresponding to `linear_cache_indices`, with diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index 9d23ee9fff..e5930ab745 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -79,6 +79,18 @@ enum uvm_cache_stats_index { num_conflict_misses = 5, }; +// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is +// not sensitive to grid size as long as the number thread blocks per SM is not +// too small nor too big. +constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; + +int get_max_thread_blocks_for_cache_kernels_() { + cudaDeviceProp* deviceProp = + at::cuda::getDeviceProperties(c10::cuda::current_device()); + return deviceProp->multiProcessorCount * + MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; +} + } // namespace int64_t host_lxu_cache_slot(int64_t h_in, int64_t C) { @@ -495,6 +507,69 @@ std::tuple> get_unique_indices_cuda( namespace { +template +__global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( + at::PackedTensorAccessor32 + lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + at::PackedTensorAccessor32 + uvm_cache_stats) { + const int32_t N = lxu_cache_locations.size(0); + int64_t n_enforced_misses = 0; + CUDA_KERNEL_LOOP(n, N) { + if ((n & 0x00FF) < enforced_misses_per_256) { + if (lxu_cache_locations[n] >= 0) { + n_enforced_misses++; + } + lxu_cache_locations[n] = kCacheLocationMissing; + } + } + if (gather_cache_stats && n_enforced_misses > 0) { + atomicAdd( + &uvm_cache_stats[uvm_cache_stats_index::num_conflict_misses], + n_enforced_misses); + } +} +} // namespace + +Tensor emulate_cache_miss( + Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_cache_stats, + Tensor uvm_cache_stats) { + TENSOR_ON_CUDA_GPU(lxu_cache_locations); + TENSOR_ON_CUDA_GPU(uvm_cache_stats); + + const auto N = lxu_cache_locations.numel(); + if (lxu_cache_locations.numel() == 0) { + // nothing to do + return lxu_cache_locations; + } + + const dim3 blocks(std::min( + div_round_up(N, kMaxThreads), + get_max_thread_blocks_for_cache_kernels_())); + + AT_DISPATCH_INDEX_TYPES( + lxu_cache_locations.scalar_type(), "emulate_cache_miss", [&] { + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats + .packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return lxu_cache_locations; +} + +namespace { template __global__ __launch_bounds__(kMaxThreads) void lru_cache_find_uncached_kernel( const at::PackedTensorAccessor32 @@ -622,19 +697,6 @@ __launch_bounds__(kMaxThreads) void direct_mapped_lru_cache_find_uncached_kernel } } } - -// Experiments showed that performance of lru/lxu_cache_find_uncached_kernel is -// not sensitive to grid size as long as the number thread blocks per SM is not -// too small nor too big. 
-constexpr int MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS = 16; - -int get_max_thread_blocks_for_cache_kernels_() { - cudaDeviceProp* deviceProp = - at::cuda::getDeviceProperties(c10::cuda::current_device()); - return deviceProp->multiProcessorCount * - MAX_THREAD_BLOCKS_PER_SM_FOR_CACHE_KERNELS; -} - } // namespace std::pair lru_cache_find_uncached_cuda( @@ -798,8 +860,8 @@ __global__ __launch_bounds__(kMaxThreads) void lru_cache_insert_kernel( at::PackedTensorAccessor32 uvm_cache_stats) { const int32_t C = lxu_cache_state.size(0); - int64_t n_conflict_misses = 0; - int64_t n_inserted = 0; + int32_t n_conflict_misses = 0; + int32_t n_inserted = 0; for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < *N_unique; n += gridDim.x * blockDim.y) { // check if this warp is responsible for this whole segment. diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp new file mode 100644 index 0000000000..808ed33624 --- /dev/null +++ b/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#include + +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh" + +using namespace ::testing; + +// Helper function that generates input tensor for emulate_cache_miss testing. +at::Tensor generate_lxu_cache_locations( + const int64_t num_requests, + const int64_t num_sets, + const int64_t associativity = 32) { + const auto lxu_cache_locations = at::randint( + 0, + num_sets * associativity, + {num_requests}, + at::device(at::kCPU).dtype(at::kInt)); + return lxu_cache_locations; +} + +// Wrapper function that takes lxu_cache_locations on CPU, copies it to GPU, +// runs emulate_cache_miss(), and then returns the result, placed on CPU. +std::pair run_emulate_cache_miss( + at::Tensor lxu_cache_locations, + const int64_t enforced_misses_per_256, + const bool gather_uvm_stats = false) { + at::Tensor lxu_cache_locations_copy = at::_to_copy(lxu_cache_locations); + const auto options = + lxu_cache_locations.options().device(at::kCUDA).dtype(at::kInt); + const auto uvm_cache_stats = + gather_uvm_stats ? 
at::zeros({6}, options) : at::empty({0}, options); + + const auto lxu_cache_location_with_cache_misses = emulate_cache_miss( + lxu_cache_locations_copy.to(at::kCUDA), + enforced_misses_per_256, + gather_uvm_stats, + uvm_cache_stats); + return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()}; +} + +TEST(uvm_cache_miss_emulate_test, no_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss(lxu_cache_locations_cpu, 0); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_TRUE( + at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); +} + +TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) { + constexpr int64_t num_requests = 10000; + constexpr int64_t num_sets = 32768; + constexpr int64_t associativity = 32; + constexpr std::array enforced_misses_per_256_for_testing = { + 1, 5, 7, 33, 100, 256}; + + for (const bool miss_in_lxu_cache_locations : {false, true}) { + for (const bool gather_cache_stats : {false, true}) { + for (const auto enforced_misses_per_256 : + enforced_misses_per_256_for_testing) { + auto lxu_cache_locations_cpu = + generate_lxu_cache_locations(num_requests, num_sets, associativity); + if (miss_in_lxu_cache_locations) { + // one miss in the original lxu_cache_locations; shouldn't be counted + // as enforced misses from emulate_cache_miss(). + auto z = lxu_cache_locations_cpu.data_ptr(); + z[0] = -1; + } + auto lxu_cache_location_with_cache_misses_and_uvm_cache_stats = + run_emulate_cache_miss( + lxu_cache_locations_cpu, + enforced_misses_per_256, + gather_cache_stats); + auto lxu_cache_location_with_cache_misses = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.first; + EXPECT_FALSE(at::equal( + lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses)); + + auto x = lxu_cache_locations_cpu.data_ptr(); + auto y = lxu_cache_location_with_cache_misses.data_ptr(); + int64_t enforced_misses = 0; + for (int32_t i = 0; i < lxu_cache_locations_cpu.numel(); ++i) { + if (x[i] != y[i]) { + EXPECT_EQ(y[i], -1); + enforced_misses++; + } + } + int64_t num_requests_over_256 = + static_cast(num_requests / 256); + int64_t expected_misses = num_requests_over_256 * + enforced_misses_per_256 + + std::min((num_requests - num_requests_over_256 * 256), + enforced_misses_per_256); + if (miss_in_lxu_cache_locations) { + expected_misses--; + } + EXPECT_EQ(expected_misses, enforced_misses); + if (gather_cache_stats) { + auto uvm_cache_stats = + lxu_cache_location_with_cache_misses_and_uvm_cache_stats.second; + auto cache_stats_ptr = uvm_cache_stats.data_ptr(); + // enforced misses are recorded as conflict misses. + EXPECT_EQ(expected_misses, cache_stats_ptr[5]); + } + } + } + } +} From 39c5aa4f9fc32202b13a33af9bebdd3dd8266b9a Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Thu, 16 Mar 2023 11:01:37 -0700 Subject: [PATCH 09/34] Add TensorAccessor with memcheck (#1602) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1602 Illegal memory access is a common problem during GPU kernel execution. The FBGEMM GPU relies on PyTorch's `C10_CUDA_KERNEL_LAUNCH_CHECK()` and the CUDA runtime to detect such problems and throw an error. However, there are a few known issues with this approach. 
(1) `C10_CUDA_KERNEL_LAUNCH_CHECK()` detects errors on the host. However, due to the non-blocking, asynchronous nature of GPU kernel execution, the error is caught on the host at a later point than where the problematic kernel was launched. This can cause the stack trace to be inaccurate and make debugging more difficult. Although the issue can be fixed by running the code with `CUDA_LAUNCH_BLOCKING=1`, this can change the state of the execution and cause Heisenbugs. (2) Not all illegal memory accesses are caught by the runtime. This means that the system may not always throw an error when illegal memory access occurs. (3) Although the runtime throws an error for illegal memory access, it is difficult to pinpoint the specific kernel and memory buffer/address that is causing the problem. For all the aforementioned reasons, we attempt to catch and throw an error as soon as possible in the kernel when illegal memory accesses occur in FBGEMM GPU. We introduce the `FBGEMM_GPU_MEMCHECK` flag to enable memory checking during compile time. We copy PyTorch's `TensorAccessor.h` into the FBGEMM GPU and extend it to check every memory access through the `PackedTensorAccessor`. If an invalid memory access occurs, we throw an error using `CUDA_KERNEL_ASSERT`. The error message includes the name of the tensor and the kernel that caused the problem. If `FBGEMM_GPU_MEMCHECK` is enabled, FBGEMM operators will use `fbgemm::PackedTensorAccessor`. Otherwise, they will use `at::PackedTensorAccessor` `FBGEMM_GPU_MEMCHECK` integration in FBGEMM ops will be done in subsequent diffs Reviewed By: r-barnes Differential Revision: D43421838 fbshipit-source-id: c8ef04970d94bb097cb5f09b42f994db72845167 --- fbgemm_gpu/CMakeLists.txt | 3 +- .../fbgemm_gpu/fbgemm_tensor_accessor.h | 575 ++++++++++++++++++ 2 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 5f393b0010..036470adf2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -167,7 +167,8 @@ set(codegen_dependencies ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/quantize_ops_utils.h ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_utils.cuh ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/sparse_ops_utils.h - ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh) + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh + ${CMAKE_CURRENT_SOURCE_DIR}/include/fbgemm_gpu/fbgemm_tensor_accessor.h) if(USE_ROCM) message(STATUS "${PYTHON_EXECUTABLE}" "${CMAKE_CODEGEN_DIR}/embedding_backward_code_generator.py" "--opensource --is_rocm") diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h new file mode 100644 index 0000000000..750d315d05 --- /dev/null +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace fbgemm_gpu { + +static constexpr size_t PTR_NAME_MAX_LEN = 16; +static constexpr size_t FUNC_NAME_MAX_LEN = 64; + +// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor +// is used to enable the __restrict__ keyword/modifier for the data +// passed to cuda. +template +struct DefaultPtrTraits { + typedef T* PtrType; +}; + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +struct RestrictPtrTraits { + typedef T* __restrict__ PtrType; +}; +#endif + +// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors. +// For CUDA tensors it is used in device code (only). This means that we +// restrict ourselves to functions and types available there (e.g. +// at::IntArrayRef isn't). + +// The PtrTraits argument is only relevant to cuda to support `__restrict__` +// pointers. +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data), + sizes_(sizes), + strides_(strides), + ptr_name_(ptr_name), + func_name_(func_name) { + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + } + C10_HOST at::IntArrayRef sizes() const { + return at::IntArrayRef(sizes_, N); + } + C10_HOST at::IntArrayRef strides() const { + return at::IntArrayRef(strides_, N); + } + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_); + } + return data_[idx]; + } + + protected: + PtrType data_; + const index_t* const sizes_; + const index_t* const strides_; + index_t numel_; + const char* const ptr_name_; + const char* const func_name_; +}; + +// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using +// `Tensor.accessor()`. +// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and +// only indexing on the device uses `TensorAccessor`s. 
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class TensorAccessor : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_HOST_DEVICE TensorAccessor operator[]( + index_t i) { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } + + C10_HOST_DEVICE const TensorAccessor operator[]( + index_t i) const { + return TensorAccessor( + this->data_ + this->strides_[0] * i, + this->sizes_ + 1, + this->strides_ + 1, + this->ptr_name_, + this->func_name); + } +}; + +template class PtrTraits, typename index_t> +class TensorAccessor + : public TensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST_DEVICE TensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* func_name) + : TensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + C10_HOST_DEVICE T& operator[](index_t i) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } + C10_HOST_DEVICE const T& operator[](index_t i) const { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + return this->at(this->strides_[0] * i); + } +}; + +// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on +// for CUDA `Tensor`s on the host and as In contrast to `TensorAccessor`s, they +// copy the strides and sizes on instantiation (on the host) in order to +// transfer them on the device when calling kernels. On the device, indexing of +// multidimensional tensors gives to `TensorAccessor`s. Use RestrictPtrTraits as +// PtrTraits if you want the tensor's data pointer to be marked as __restrict__. +// Instantiation from data, sizes, strides is only needed on the host and +// std::copy isn't available on the device, so those functions are host only. 
+template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + std::copy(sizes, sizes + N, std::begin(sizes_)); + std::copy(strides, strides + N, std::begin(strides_)); + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessorBase( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : data_(data) { + for (const auto i : c10::irange(N)) { + this->sizes_[i] = sizes[i]; + this->strides_[i] = strides[i]; + } + // Compute numel_ + numel_ = 0; + for (size_t d = 0; d < N; d++) { + numel_ += sizes[d]; + } + copy_str(ptr_name_, ptr_name, PTR_NAME_MAX_LEN); + copy_str(func_name_, func_name, FUNC_NAME_MAX_LEN); + } + + C10_HOST void copy_str(char* dst, const char* src, const size_t max_len) { + const auto len = std::min(strlen(src), max_len - 1); + std::memcpy(dst, src, sizeof(char) * len); + dst[len] = '\0'; + } + + C10_HOST_DEVICE T& at(index_t idx) const { + if (idx < 0) { + printf( + "ERROR: idx < 0, tensor %s in %s, idx %lld\n", + ptr_name_, + func_name_, + static_cast(idx)); + CUDA_KERNEL_ASSERT(idx >= 0) + } else if (idx >= numel_) { + printf( + "ERROR: idx >= numel, tensor %s in %s, idx %lld, numel %lld\n", + ptr_name_, + func_name_, + static_cast(idx), + static_cast(numel_)); + CUDA_KERNEL_ASSERT(idx < numel_) + } + return data_[idx]; + } + + C10_HOST_DEVICE index_t stride(index_t i) const { + return strides_[i]; + } + C10_HOST_DEVICE index_t size(index_t i) const { + return sizes_[i]; + } + C10_HOST_DEVICE PtrType data() { + return data_; + } + C10_HOST_DEVICE const PtrType data() const { + return data_; + } + + protected: + PtrType data_; + index_t sizes_[N]; + index_t strides_[N]; + index_t numel_; + char ptr_name_[PTR_NAME_MAX_LEN]; + char func_name_[FUNC_NAME_MAX_LEN]; + C10_HOST void bounds_check_(index_t i) const { + TORCH_CHECK_INDEX( + 0 <= i && i < index_t{N}, + "Index ", + i, + " is not within bounds of a tensor of dimension ", + N); + } +}; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* 
const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE TensorAccessor operator[]( + index_t i) { + index_t* new_sizes = this->sizes_ + 1; + index_t* new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + C10_DEVICE const TensorAccessor operator[]( + index_t i) const { + const index_t* const new_sizes = this->sizes_ + 1; + const index_t* const new_strides = this->strides_ + 1; + return TensorAccessor( + this->data_ + this->strides_[0] * i, + new_sizes, + new_strides, + this->ptr_name_, + this->func_name_); + } + + /// Returns a PackedTensorAccessor of the same dimension after transposing the + /// two dimensions given. Does not actually move elements; transposition is + /// made by permuting the size/stride arrays. If the dimensions are not valid, + /// asserts. + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + GenericPackedTensorAccessor result( + this->data_, this->sizes_, this->strides_); + std::swap(result.strides_[dim1], result.strides_[dim2]); + std::swap(result.sizes_[dim1], result.sizes_[dim2]); + return result; + } +}; + +template class PtrTraits, typename index_t> +class GenericPackedTensorAccessor + : public GenericPackedTensorAccessorBase { + public: + typedef typename PtrTraits::PtrType PtrType; + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const index_t* const sizes, + const index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + // if index_t is not int64_t, we want to have an int64_t constructor + template < + typename source_index_t, + class = typename std::enable_if< + std::is_same::value>::type> + C10_HOST GenericPackedTensorAccessor( + PtrType data, + const source_index_t* const sizes, + const source_index_t* const strides, + const char* const ptr_name, + const char* const func_name) + : GenericPackedTensorAccessorBase( + data, + sizes, + strides, + ptr_name, + func_name) {} + + C10_DEVICE T& operator[](index_t i) { + return this->at(this->strides_[0] * i); + } + C10_DEVICE const T& operator[](index_t i) const { + return this->at(this->strides_[0] * i); + } + + // Same as in the general N-dimensional case, but note that in the + // 1-dimensional case the returned PackedTensorAccessor will always be an + // identical copy of the original + C10_HOST GenericPackedTensorAccessor transpose( + index_t dim1, + index_t dim2) const { + this->bounds_check_(dim1); + this->bounds_check_(dim2); + return GenericPackedTensorAccessor( + this->data_, this->sizes_, this->strides_); + } +}; + +// Can't put this directly into the macro function args because of commas +#define AT_X GenericPackedTensorAccessor + +// Old name for `GenericPackedTensorAccessor` +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits, + typename index_t = int64_t> +C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X) + +#undef AT_X + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor32 = + GenericPackedTensorAccessor; + +template < + typename T, + size_t N, + template class PtrTraits = DefaultPtrTraits> +using PackedTensorAccessor64 = + GenericPackedTensorAccessor; + +} // 
namespace fbgemm_gpu + +#ifdef FBGEMM_GPU_MEMCHECK +namespace pta = fbgemm_gpu; +#else +namespace pta = at; +#endif + +#ifdef FBGEMM_GPU_MEMCHECK +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits, + typename index_t = int64_t> +const fbgemm_gpu::GenericPackedTensorAccessor +make_generic_packed_tensor_accessor( + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + static_assert( + N > 0, + "accessor is used for indexing tensor, for scalars use *data_ptr()"); + TORCH_CHECK( + tensor.dim() == N, + "TensorAccessor expected ", + N, + " dims but tensor has ", + tensor.dim()); + return fbgemm_gpu::GenericPackedTensorAccessor( + static_cast::PtrType>(tensor.data_ptr()), + tensor.sizes().data(), + tensor.strides().data(), + ptr_name, + func_name); +} +#endif + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor32 +make_packed_tensor_accessor32( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { +#else + at::Tensor& tensor) { +#endif + TORCH_CHECK( + tensor.numel() <= + static_cast(std::numeric_limits::max()), + "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64"); +#ifdef FBGEMM_GPU_MEMCHECK + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + return tensor.packed_accessor32(); +#endif +} + +template < + typename T, + size_t N, + template class PtrTraits = at::DefaultPtrTraits> +const pta::PackedTensorAccessor64 +make_packed_tensor_accessor64( +#ifdef FBGEMM_GPU_MEMCHECK + at::Tensor& tensor, + const char* const ptr_name, + const char* const func_name) { + return make_generic_packed_tensor_accessor( + tensor, ptr_name, func_name); +#else + at::Tensor& tensor) { + return tensor.packed_accessor64(); +#endif +} + +#ifdef FBGEMM_GPU_MEMCHECK +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS( \ + TENSOR, #TENSOR, FUNC_NAME) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR, #TENSOR, FUNC_NAME) +#else +#define MAKE_PACKED_TENSOR_ACCESSOR_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS(TENSOR) + +#define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( \ + FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS) \ + make_packed_tensor_accessor##INDEX_NBITS< \ + at::acc_type, \ + N, \ + PTR_TRAITS>(TENSOR) +#endif From c7cddecb8ddeaad71bb42f53f2e2cebc20a0cf91 Mon Sep 17 00:00:00 2001 From: Matt Galloway Date: Thu, 16 Mar 2023 11:37:50 -0700 Subject: [PATCH 10/34] Fix compiling with Xcode 14.3 (#1648) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1648 This hack is not needed in Xcode 14.3 anymore, where the clang version is 14.0.3. So change the workaround to only include up to 14.0.2. 
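To check whether a local toolchain is affected, one can inspect the Apple clang
version directly (a quick, illustrative check; Xcode 14.3 ships clang 14.0.3,
while Xcode 14.0 through 14.2 ship clang 14.0.0):

```sh
clang --version | head -1
```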
Reviewed By: MatzeB Differential Revision: D44130421 fbshipit-source-id: 1fb2948567941bdf6ee9487ccfaa9dfb2caf92dd --- src/InlineAsmDefines.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/InlineAsmDefines.h b/src/InlineAsmDefines.h index 80612536b7..fa3f706602 100644 --- a/src/InlineAsmDefines.h +++ b/src/InlineAsmDefines.h @@ -10,13 +10,14 @@ // We need to do a hack in inline assembly in some clang versions where we have // to do `.intel_syntax noprefix`. This was fixed in clang in // https://reviews.llvm.org/D113707, which made it into clang-14, but not in -// Apple's clang-14 that ships with Xcode 14. +// Apple's clang-14 that ships with Xcode 14.2. It was first fixed in Xcode 14.3 +// where the clang version is 14.0.3. #if defined(__clang__) #if ( \ defined(__apple_build_version__) || \ (defined(__has_builtin) && __has_builtin(__builtin_pika_xxhash64))) && \ - (__clang_major__ < 15) + (__clang_major__ < 15 && __clang_minor__ == 0 && __clang_patchlevel__ < 3) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 #elif (__clang_major__ < 14) #define FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK 1 From 64833b5185893cbc71ea80c9b01443f762b5cba4 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 17 Mar 2023 12:21:44 -0700 Subject: [PATCH 11/34] Add support for building FBGEMM_GPU against Python 3.11 in OSS (#1646) Summary: - Parallelize the FBGEMM CI builds to build and test static and shared libraries independently instead of in serial - Move the FBGEMM CI builds to run inside Docker containers - Add support for building FBGEMM_GPU against Python 3.11 in OSS - Move all FBGEMM_GPU nightly and release build jobs to run inside `amazonlinux:2023` Docker container - Assuming no build errors or resource starvation, the full OSS build process now runs under 30 minutes. Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1646 Reviewed By: shintaro-iwasaki Differential Revision: D44157228 Pulled By: q10 fbshipit-source-id: 6403ea9955856157785c50837b0b8e4c0cd26d53 --- .github/scripts/setup_env.bash | 100 +++++-- .github/workflows/fbgemm_ci.yml | 244 ++++++++---------- .github/workflows/fbgemm_gpu_ci.yml | 32 +-- .github/workflows/fbgemm_nightly_build.yml | 27 +- .../workflows/fbgemm_nightly_build_cpu.yml | 24 +- .github/workflows/fbgemm_release_build.yml | 27 +- .../workflows/fbgemm_release_build_cpu.yml | 24 +- fbgemm_gpu/docs/BuildInstructions.md | 10 +- .../split_table_batched_embeddings_ops.py | 4 +- 9 files changed, 257 insertions(+), 235 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index ccdac79097..a22a09b19e 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -264,22 +264,13 @@ print_gpu_info () { if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error (print_exec nvidia-smi) || true + else + echo "[CHECK] nvidia-smi not found" fi fi } -print_system_info () { - echo "################################################################################" - echo "# Print System Info" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" - - echo "################################################################################" - echo "[INFO] Printing environment variables ..." 
- print_exec printenv - +__print_system_info_linux () { echo "################################################################################" echo "[INFO] Check ldd version ..." print_exec ldd --version @@ -296,6 +287,36 @@ print_system_info () { print_exec cat /etc/os-release } +__print_system_info_macos () { + echo "################################################################################" + echo "[INFO] Check CPU info ..." + sysctl -a | grep machdep.cpu + + echo "################################################################################" + echo "[INFO] Check MacOS version info ..." + print_exec uname -a + print_exec sw_vers +} + +print_system_info () { + echo "################################################################################" + echo "# Print System Info" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + echo "################################################################################" + echo "[INFO] Printing environment variables ..." + print_exec printenv + + if [[ $OSTYPE == 'darwin'* ]]; then + __print_system_info_macos + else + __print_system_info_linux + fi +} + print_ec2_info () { echo "################################################################################" echo "# Print EC2 Instance Info" @@ -316,6 +337,30 @@ print_ec2_info () { echo "instance-type: $(get_ec2_metadata instance-type)" } +print_glibc_info () { + local library_path="$1" + if [ "$library_path" == "" ]; then + echo "Usage: ${FUNCNAME[0]} LIBRARY_PATH" + echo "Example(s):" + echo " ${FUNCNAME[0]} /usr/lib/x86_64-linux-gnu/libstdc++.so.6" + return 1 + fi + + if [ -f "${library_path}" ]; then + echo "[CHECK] Listing out the GLIBC versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + echo "" + + echo "[CHECK] Listing out the GLIBCXX versions referenced by: ${library_path}" + objdump -TC "${library_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + echo "" + + else + echo "[CHECK] No file at path: ${library_path}" + return 1 + fi +} + ################################################################################ # Miniconda Setup Functions @@ -342,7 +387,7 @@ setup_miniconda () { print_exec mkdir -p "$miniconda_prefix" echo "[SETUP] Downloading the Miniconda installer ..." - print_exec wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh + (exec_with_retries wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh) || return 1 echo "[SETUP] Installing Miniconda ..." print_exec bash miniconda.sh -b -p "$miniconda_prefix" -u @@ -360,9 +405,16 @@ setup_miniconda () { print_exec conda info # These variables will be exported outside + echo "[SETUP] Exporting Miniconda variables ..." export PATH="${miniconda_prefix}/bin:${PATH}" export CONDA="${miniconda_prefix}" + if [ -f "${GITHUB_PATH}" ]; then + echo "[SETUP] Saving Miniconda variables to ${GITHUB_PATH} ..." 
+ echo "${miniconda_prefix}/bin" >> "${GITHUB_PATH}" + echo "CONDA=${miniconda_prefix}" >> "${GITHUB_PATH}" + fi + echo "[SETUP] Successfully set up Miniconda at ${miniconda_prefix}" } @@ -448,9 +500,11 @@ install_pytorch_conda () { fi # Install PyTorch packages + # NOTE: Installation of large package might fail due to corrupt package download + # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 - (exec_with_retries conda install -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 # Run check for GPU variant if [ "$pytorch_cpu" == "" ]; then @@ -612,7 +666,7 @@ install_cuda () { # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." - (exec_with_retries conda install -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 + (exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 # Ensure that nvcc is properly installed (test_binpath "${env_name}" nvcc) || return 1 @@ -806,15 +860,19 @@ install_cxx_compiler () { install_system_packages gcc gcc-c++ else - # Install gxx_linux-64 from main instead of cxx-compiler from conda-forge, as - # the latter breaks builds: + # Install gxx_linux-64 from conda-forge instead of from anaconda channel. + # sysroot_linux-64 needs to be installed alongside this: + # # https://root-forum.cern.ch/t/error-timespec-get-has-not-been-declared-with-conda-root-package/45712/6 + # https://github.com/conda-forge/conda-forge.github.io/issues/1625 + # https://conda-forge.org/docs/maintainer/knowledge_base.html#using-centos-7 + # https://github.com/conda/conda-build/issues/4371 # - # NOTE: Install g++ 9.x instead of 11.x becaue 11.x builds libraries with - # references to GLIBCXX_3.4.29, which is not available on systems with older + # NOTE: We install g++ 10.x instead of 11.x becaue 11.x builds binaries that + # reference GLIBCXX_3.4.29, which may not be available on systems with older # versions of libstdc++.so.6 such as CentOS Stream 8 and Ubuntu 20.04 echo "[INSTALL] Installing C/C++ compilers through Conda ..." - (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=9.3.0) || return 1 + (exec_with_retries conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge) || return 1 # The compilers are visible in the PATH as `x86_64-conda-linux-gnu-cc` and # `x86_64-conda-linux-gnu-c++`, so symlinks will need to be created @@ -1055,7 +1113,7 @@ check_fbgemm_gpu_build () { for library in "${fbgemm_gpu_so_files[@]}"; do echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}" - objdump -TC "${library}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + print_glibc_info "${library}" echo "[CHECK] Verifying sample subset of symbols in the library ..." 
for symbol in "${lib_symbols_to_check[@]}"; do diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 977b443a2b..9b18dfb884 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -19,185 +19,165 @@ concurrency: cancel-in-progress: true jobs: - build-posix: - runs-on: ${{ matrix.os }} + build-linux: + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + DEBIAN_FRONTEND: noninteractive strategy: + fail-fast: false matrix: - os: [ ubuntu-latest, macos-latest ] + container-image: [ "ubuntu:20.04" ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash + - name: Setup Build Container run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo wget + git config --global --add safe.directory '*' - - name: Get CPU info on Ubuntu - if: contains(runner.os, 'linux') - run: | - cat /proc/cpuinfo + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - - name: Get CPU info on macOS - if: contains(runner.os, 'macOs') - run: | - sysctl -a | grep machdep.cpu + - name: Display System Info + run: . $PRELUDE; print_system_info - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_static - cd build_static - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=static .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. + make -j - - name: Test static FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | set -e - cd build_static + cd $BUILD_DIR ctest --rerun-failed --output-on-failure - - name: Build shared FBGEMM lib + + build-macos: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_DIR: build_${{ matrix.library-type }} + strategy: + fail-fast: false + matrix: + os: [ macos-latest ] + library-type: [ static, shared ] + + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + # Build but skip tests due to lack of support for AVX2 + - name: Build FBGEMM Library (${{ matrix.library-type }}) run: | set -e - mkdir build_shared - cd build_shared - cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=shared .. - make + mkdir $BUILD_DIR; cd $BUILD_DIR + cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. 
+ make -j + + + build-bazel: + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] - - name: Test shared FBGEMM lib - if: contains(runner.os, 'linux') # not run on macos-latest now due to supporting AVX2 + steps: + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Download bazel run: | set -e - cd build_shared - ctest --rerun-failed --output-on-failure + wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel + # verify content + echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c + chmod +x bazel + + - name: Build FBGEMM with bazel + run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* + + - name: Test FBGEMM bazel build + run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* + build-windows: runs-on: ${{ matrix.os }} + defaults: + run: + shell: cmd + env: + BUILD_DIR: build_${{ matrix.library-type }} strategy: + fail-fast: false matrix: - os: [windows-2019] + os: [ windows-2019 ] + library-type: [ static, shared ] steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true - name: Get CPU info on Windows shell: cmd run: | wmic cpu list full - - name: Build static FBGEMM lib + - name: Build FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 echo "INSTALL NINJA:" pip install ninja which ninja - mkdir build_static - cd build_static + mkdir %BUILD_DIR% + cd %BUILD_DIR% echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=static -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. + cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. ninja all echo "Build Success" - - name: Test static FBGEMM lib - shell: cmd - run: | - echo %cd% - cd build_static - ctest --rerun-failed --output-on-failure - if errorlevel 1 exit /b 1 - - - name: Build shared FBGEMM lib - shell: cmd - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - echo "INSTALL NINJA:" - pip install ninja - which ninja - mkdir build_shared - cd build_shared - echo "STARTING CMAKE" - cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=shared -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. 
- ninja all - if errorlevel 1 exit /b 1 - - - name: Test shared FBGEMM lib + - name: Test FBGEMM Library (${{ matrix.library-type }}) shell: cmd run: | echo %cd% - cd build_shared + cd %BUILD_DIR% set PATH=%PATH%;%cd%;%cd%\asmjit echo %PATH% ctest --rerun-failed --output-on-failure if errorlevel 1 exit /b 1 - - build-bazel: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ ubuntu-latest ] - - steps: - - uses: actions/checkout@v3 - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Get env vars - run: | - echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW - echo HOME = $HOME - echo GITHUB_ACTION = $GITHUB_ACTION - echo GITHUB_ACTIONS = $GITHUB_ACTIONS - echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY - echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME - echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH - echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE - echo GITHUB_SHA = $GITHUB_SHA - echo GITHUB_REF = $GITHUB_REF - c++ --verbose - - - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel - - - - name: Build FBGEMM with bazel - run: | - set -e - ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* - - - name: Test FBGEMM bazel build - run: | - set -e - ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index bd62f23761..adf8443eae 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -20,7 +20,7 @@ concurrency: jobs: build_and_test_amd: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge container: image: ${{ matrix.container-image }} options: --user root @@ -33,9 +33,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.12xlarge ] container-image: [ "ubuntu:20.04" ] - python-version: [ "3.10" ] + python-version: [ "3.8", "3.9", "3.10" ] rocm-version: [ "5.3" ] steps: @@ -60,10 +59,7 @@ jobs: run: . $PRELUDE; free_disk_space - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -77,7 +73,7 @@ jobs: - name: Install PyTorch-ROCm Nightly run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly @@ -146,7 +142,10 @@ jobs: build_and_test_cpu: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash @@ -156,10 +155,16 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-20.04, ubuntu-latest ] + container-image: [ "ubuntu:20.04", "ubuntu:22.04" ] python-version: [ "3.8", "3.9", "3.10" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -172,10 +177,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -186,7 +188,7 @@ jobs: - name: Install PyTorch run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build and Install FBGEMM_GPU (CPU version) diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index bc699ef62b..b0ac76900c 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -38,7 +38,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -49,11 +52,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -66,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +91,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly @@ -116,7 +118,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -135,10 +137,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -149,7 +148,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index 1125b17a0d..d99c3f73ee 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -39,7 +39,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -53,8 +53,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -72,10 +71,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -89,7 +85,7 @@ jobs: - name: Install PyTorch-CPU Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly (CPU version) @@ -104,7 +100,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -117,8 +113,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -137,10 +132,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -148,7 +140,7 @@ jobs: - name: Install PyTorch Nightly run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index def6002a76..75d5235b69 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -30,7 +30,10 @@ concurrency: jobs: # Build on CPU hosts and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.24xlarge + container: + image: amazonlinux:2023 + options: --user root defaults: run: shell: bash @@ -41,11 +44,13 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.12xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git sudo tar wget which + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -58,10 +63,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -81,7 +83,7 @@ jobs: - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU @@ -108,7 +110,7 @@ jobs: fail-fast: false matrix: os: [ linux.g5.4xlarge.nvidia.gpu ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] @@ -126,10 +128,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -140,7 +139,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index c7fb53cabd..f13ebd32c9 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -30,7 +30,7 @@ concurrency: jobs: # Build on CPU hosts, run tests, and upload to GHA build_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -44,8 +44,7 @@ jobs: # Don't fast-fail all the other builds if one of the them fails fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] steps: - name: Setup Build Container @@ -63,10 +62,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . 
$PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -80,7 +76,7 @@ jobs: - name: Install PyTorch-CPU Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU (CPU version) @@ -95,7 +91,7 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.4xlarge container: image: amazonlinux:2023 options: --user root @@ -108,8 +104,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.4xlarge ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11" ] needs: build_artifact steps: @@ -128,10 +123,7 @@ jobs: run: . $PRELUDE; print_gpu_info - name: Setup Miniconda - run: | - . $PRELUDE; setup_miniconda $HOME/miniconda - echo "${HOME}/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=${HOME}/miniconda" >> $GITHUB_PATH + run: . $PRELUDE; setup_miniconda $HOME/miniconda - name: Create Conda Environment run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} @@ -139,7 +131,7 @@ jobs: - name: Install PyTorch Test run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly - - name: Prepare FBGEMM Build + - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Download Wheel Artifact from GHA diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index a90a059b40..56aa780fe3 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -22,7 +22,7 @@ environment is recommended for reproducible builds: # Set the Miniconda prefix directory miniconda_prefix=$HOME/miniconda -# Download the Miniconfs installer +# Download the Miniconda installer wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh # Run the installer @@ -59,7 +59,7 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 ### C/C++ Compiler -Install the GCC toolchain. Note that GCC (as opposed to LLVM for example) is +Install the GCC toolchain. Note that GCC (as opposed to Clang for example) is required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. @@ -71,7 +71,7 @@ Note that while newer versions of GCC can be used, binaries compiled under newer versions of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS Stream 8, because the compiled library will reference symbols from versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support. To -see what versions of GLIBCXX that the available `libstdc++.so.6` supports: +see what versions of GLIBCXX the available `libstdc++.so.6` supports: ```sh libcxx_path=/path/to/libstdc++.so.6 @@ -193,7 +193,7 @@ From there, the rest of the build environment may be constructed through Conda. ### Install ROCm -Install the full ROCm package through the operating system package manger. The +Install the full ROCm package through the operating system package manager. 
The full instructions can be found in the [ROCm installation guide](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.3/page/How_to_Install_ROCm.html): @@ -346,7 +346,7 @@ package_name=fbgemm_gpu # Build for SM70/80 (V100/A100 GPU); update as needed # If not specified, only the CUDA architecture supported by current system will be targeted -# Ifo CUDA device is present either, all CUDA architectures will be targeted +# If no CUDA device is present either, all CUDA architectures will be targeted cuda_arch_list=7.0;8.0 # Build the wheel artifact only diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 87b9b1a559..8120cdcb03 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -9,7 +9,7 @@ import enum import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from itertools import accumulate from math import log2 from typing import Dict, List, NamedTuple, Optional, Tuple, Type, Union @@ -106,7 +106,7 @@ class CounterBasedRegularizationDefinition: adjustment_ub: float = 1.0 learning_rate_mode: LearningRateMode = LearningRateMode.EQUAL grad_sum_decay: GradSumDecay = GradSumDecay.NO_DECAY - tail_id_threshold: TailIdThreshold = TailIdThreshold(val=0, is_ratio=False) + tail_id_threshold: TailIdThreshold = field(default_factory=TailIdThreshold) max_counter_update_freq: int = 1000 From 54eeae214af0834cc07632eb916879d71468a4cd Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 17 Mar 2023 19:37:29 -0700 Subject: [PATCH 12/34] Remove magic numbers from fbgemm/Types.h (#1629) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1629 Replaces magic numbers with constexpr variables Reviewed By: sryap Differential Revision: D43776442 fbshipit-source-id: 5cef7566816f8730f5daa08948ee3260367787aa --- include/fbgemm/Types.h | 189 +++++++++++++++++++++++++---------------- 1 file changed, 114 insertions(+), 75 deletions(-) diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index be8ac4ec8b..e5daa28d8b 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -15,145 +15,184 @@ namespace fbgemm { using float16 = std::uint16_t; using bfloat16 = std::uint16_t; +// The IEEE754 standard species a binary16 as having the following format: +// SEEEEEMMMMMMMMMM +// 0432109876543210 +// That is: +// * 1 sign bit +// * 5 exponent bits +// * 10 mantissa/significand bits (an 11th bit is implicit) +constexpr uint32_t f16_num_bits = 16; +constexpr uint32_t f16_num_exponent_bits = 5; +constexpr uint32_t f16_num_mantissa_bits = 10; +constexpr uint32_t f16_num_non_sign_bits = + f16_num_exponent_bits + f16_num_mantissa_bits; +constexpr uint32_t f16_exponent_mask = 0x1F; // 5 bits +constexpr uint32_t f16_sign_bit = 1u + << (f16_num_exponent_bits + f16_num_mantissa_bits); +constexpr uint32_t f16_exponent_bits = f16_exponent_mask + << f16_num_mantissa_bits; +constexpr uint32_t f16_mantissa_mask = 0x3FF; // 10 bits +constexpr uint32_t f16_exponent_bias = 15; +constexpr uint32_t f16_nan = 0x7FFF; + +// The IEEE754 standard specifies a binary32 as having: +// SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM +// That is: +// * 1 sign bit +// * 8 exponent bits +// * 23 mantissa/significand bits (a 24th bit is implicit) +constexpr uint32_t f32_num_exponent_bits = 8; +constexpr uint32_t f32_num_mantissa_bits = 23; +constexpr uint32_t f32_exponent_mask = 0xFF; // 8 bits +constexpr uint32_t 
f32_mantissa_mask = 0x7FFFFF; // 23 bits
+constexpr uint32_t f32_exponent_bias = 127;
+constexpr uint32_t f32_all_non_sign_mask = 0x7FFFFFFF; // 31 bits
+constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit
+constexpr uint32_t f32_num_non_sign_bits =
+    f32_num_exponent_bits + f32_num_mantissa_bits;
+
 // Round to nearest even
 static inline float16 cpu_float2half_rn(float f) {
-  float16 ret;
-
   static_assert(
-      sizeof(unsigned int) == sizeof(float),
-      "Programming error sizeof(unsigned int) != sizeof(float)");
+      sizeof(uint32_t) == sizeof(float),
+      "Programming error sizeof(uint32_t) != sizeof(float)");
 
-  unsigned* xp = reinterpret_cast<unsigned*>(&f);
-  unsigned x = *xp;
-  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
-  unsigned sign, exponent, mantissa;
+  uint32_t* xp = reinterpret_cast<uint32_t*>(&f);
+  uint32_t x = *xp;
+  uint32_t u = (x & f32_all_non_sign_mask);
 
   // Get rid of +NaN/-NaN case first.
   if (u > 0x7f800000) {
-    ret = 0x7fffU;
-    return ret;
+    return static_cast<float16>(f16_nan);
   }
 
-  sign = ((x >> 16) & 0x8000);
+  uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit);
 
   // Get rid of +Inf/-Inf, +0/-0.
   if (u > 0x477fefff) {
-    ret = static_cast<float16>(sign | 0x7c00U);
-    return ret;
+    return static_cast<float16>(sign | f16_exponent_bits);
   }
   if (u < 0x33000001) {
-    ret = static_cast<float16>(sign | 0x0000);
-    return ret;
+    return static_cast<float16>(sign | 0x0000);
   }
 
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
+  uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask);
+  uint32_t mantissa = (u & f32_mantissa_mask);
 
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
+  uint32_t shift;
+  if (exponent > f32_exponent_bias - f16_exponent_bias) {
+    shift = f32_num_mantissa_bits - f16_num_mantissa_bits;
+    exponent -= f32_exponent_bias - f16_exponent_bias;
   } else {
-    shift = 0x7e - exponent;
+    shift = (f32_exponent_bias - 1) - exponent;
     exponent = 0;
-    mantissa |= 0x800000;
+    mantissa |=
+        (1u
+         << f32_num_mantissa_bits); // Bump the least significant exponent bit
   }
-  lsb = (1 << shift);
-  lsb_s1 = (lsb >> 1);
-  lsb_m1 = (lsb - 1);
+  const uint32_t lsb = (1u << shift);
+  const uint32_t lsb_s1 = (lsb >> 1);
+  const uint32_t lsb_m1 = (lsb - 1);
 
   // Round to nearest even.
-  remainder = (mantissa & lsb_m1);
+  const uint32_t remainder = (mantissa & lsb_m1);
   mantissa >>= shift;
   if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
     ++mantissa;
-    if (!(mantissa & 0x3ff)) {
+    if (!(mantissa & f16_mantissa_mask)) {
       ++exponent;
       mantissa = 0;
     }
   }
 
-  ret = static_cast<float16>(sign | (exponent << 10) | mantissa);
-
-  return ret;
+  return static_cast<float16>(
+      sign | (exponent << f16_num_mantissa_bits) | mantissa);
 }
 
 // Round to zero
 static inline float16 cpu_float2half_rz(float f) {
-  float16 ret;
-
   static_assert(
-      sizeof(unsigned int) == sizeof(float),
-      "Programming error sizeof(unsigned int) != sizeof(float)");
+      sizeof(uint32_t) == sizeof(float),
+      "Programming error sizeof(uint32_t) != sizeof(float)");
 
-  unsigned* xp = reinterpret_cast<unsigned*>(&f);
-  unsigned x = *xp;
-  unsigned u = (x & 0x7fffffff);
-  unsigned shift, sign, exponent, mantissa;
+  const uint32_t* xp = reinterpret_cast<const uint32_t*>(&f);
+  const uint32_t x = *xp;
+  const uint32_t u = (x & f32_all_non_sign_mask);
 
   // Get rid of +NaN/-NaN case first.
   if (u > 0x7f800000) {
-    ret = static_cast<float16>(0x7fffU);
-    return ret;
+    return static_cast<float16>(f16_nan);
   }
 
-  sign = ((x >> 16) & 0x8000);
+  uint32_t sign = ((x >> f16_num_bits) & f16_sign_bit);
 
   // Get rid of +Inf/-Inf, +0/-0.
   if (u > 0x477fefff) {
-    ret = static_cast<float16>(sign | 0x7c00U);
-    return ret;
+    return static_cast<float16>(sign | f16_exponent_bits);
   }
   if (u < 0x33000001) {
-    ret = static_cast<float16>(sign | 0x0000);
-    return ret;
+    return static_cast<float16>(sign | 0x0000);
   }
 
-  exponent = ((u >> 23) & 0xff);
-  mantissa = (u & 0x7fffff);
+  uint32_t exponent = ((u >> f32_num_mantissa_bits) & f32_exponent_mask);
+  uint32_t mantissa = (u & f32_mantissa_mask);
 
-  if (exponent > 0x70) {
-    shift = 13;
-    exponent -= 0x70;
+  uint32_t shift;
+  if (exponent > f32_exponent_bias - f16_exponent_bias) {
+    shift = f32_num_mantissa_bits - f16_num_mantissa_bits;
+    exponent -= f32_exponent_bias - f16_exponent_bias;
   } else {
-    shift = 0x7e - exponent;
+    shift = (f32_exponent_bias - 1) - exponent;
     exponent = 0;
-    mantissa |= 0x800000;
+    mantissa |=
+        (1u
+         << f32_num_mantissa_bits); // Bump the least significant exponent bit
   }
 
   // Round to zero.
   mantissa >>= shift;
 
-  ret = static_cast<float16>(sign | (exponent << 10) | mantissa);
-
-  return ret;
+  return static_cast<float16>(
+      sign | (exponent << f16_num_mantissa_bits) | mantissa);
 }
 
-static inline float cpu_half2float(float16 h) {
-  unsigned sign = ((h >> 15) & 1);
-  unsigned exponent = ((h >> 10) & 0x1f);
-  unsigned mantissa = ((h & 0x3ff) << 13);
-
-  if (exponent == 0x1f) { /* NaN or Inf */
-    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-    exponent = 0xff;
-  } else if (!exponent) { /* Denorm or Zero */
+// Converts a 16-bit unsigned integer representation of a IEEE754 half-precision
+// float into an IEEE754 32-bit single-precision float
+static inline float cpu_half2float(const float16 h) {
+  // Get sign and exponent alone by themselves
+  uint32_t sign_bit = (h >> f16_num_non_sign_bits) & 1;
+  uint32_t exponent = (h >> f16_num_mantissa_bits) & f16_exponent_mask;
+  // Shift mantissa so that it fills the most significant bits of a float32
+  uint32_t mantissa = (h & f16_mantissa_mask)
+      << (f32_num_mantissa_bits - f16_num_mantissa_bits);
+
+  if (exponent == f16_exponent_mask) { // NaN or Inf
     if (mantissa) {
-      unsigned int msb;
-      exponent = 0x71;
+      mantissa = f32_mantissa_mask;
+      sign_bit = 0;
+    }
+    exponent = f32_exponent_mask;
+  } else if (!exponent) { // Denorm or Zero
+    if (mantissa) {
+      uint32_t msb;
+      exponent = f32_exponent_bias - f16_exponent_bias + 1;
       do {
-        msb = (mantissa & 0x400000);
-        mantissa <<= 1; /* normalize */
+        msb = mantissa & f32_most_significant_bit;
+        mantissa <<= 1; // normalize
         --exponent;
       } while (!msb);
-      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
+      mantissa &= f32_mantissa_mask; // 1.mantissa is implicit
     }
   } else {
-    exponent += 0x70;
+    exponent += f32_exponent_bias - f16_exponent_bias;
   }
 
-  unsigned i = ((sign << 31) | (exponent << 23) | mantissa);
+  const uint32_t i = (sign_bit << f32_num_non_sign_bits) |
+      (exponent << f32_num_mantissa_bits) | mantissa;
+
   float ret;
-  memcpy(&ret, &i, sizeof(i));
+  std::memcpy(&ret, &i, sizeof(float));
   return ret;
 }
@@ -161,14 +200,14 @@
 static inline float cpu_bf162float(bfloat16 src) {
   float ret;
   uint32_t val_fp32 =
       static_cast<uint32_t>(reinterpret_cast<const uint16_t*>(&src)[0]) << 16;
-  memcpy(&ret, &val_fp32, sizeof(ret));
+  memcpy(&ret, &val_fp32, sizeof(float));
   return ret;
 }
 
 static inline bfloat16 cpu_float2bfloat16(float src) {
   uint32_t temp;
-  memcpy(&temp, &src, sizeof(temp));
+  memcpy(&temp, &src, sizeof(uint32_t));
-  return (temp + (1 << 15)) >> 16;
+  return (temp + (1u << 15)) >> 16;
 }
 
 } // namespace fbgemm

From 35bdd402608b552c44087d2a68de7ddb6e488d3a Mon Sep 17 00:00:00 2001
From: Xiao Sun
Date: Sat, 18 Mar 2023 14:44:44 -0700
Subject: [PATCH 13/34] added
check to avoid div 0 errors in cache report (#1645) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1645 as in title Reviewed By: jianyuh Differential Revision: D44096435 fbshipit-source-id: a7a87a14ffecc2fb6e0be74d199d385357946672 --- .../split_table_batched_embeddings_ops.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index 8120cdcb03..ff8ce4d094 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -979,10 +979,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) def prefetch(self, indices: Tensor, offsets: Tensor) -> None: self.timestep += 1 @@ -2347,10 +2348,11 @@ def print_uvm_cache_stats(self) -> None: f"N_conflict_unique_misses: {uvm_cache_stats[4]}\n" f"N_conflict_misses: {uvm_cache_stats[5]}\n" ) - logging.info( - f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" - f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" - ) + if uvm_cache_stats[1]: + logging.info( + f"unique indices / requested indices: {uvm_cache_stats[2]/uvm_cache_stats[1]}\n" + f"unique misses / requested indices: {uvm_cache_stats[3]/uvm_cache_stats[1]}\n" + ) @torch.jit.export def prefetch(self, indices: Tensor, offsets: Tensor) -> None: From 125ce44718023e0025d0cc154f17f04b072b73f1 Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Mon, 20 Mar 2023 11:10:46 -0700 Subject: [PATCH 14/34] jagged_dense_bmm operator optimization (#1643) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1643 This diff optimizes the jagged_dense_bmm operator with the following optimizations: * tiling across thread blocks, and use GPU shared memory for thread block * tiling across threads within a thread block, and use registers for each thread Reviewed By: brad-mengchi Differential Revision: D43674845 fbshipit-source-id: 85f0abf89fa958f79636ef59c3070a1c569b73c2 --- .../include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 5 + fbgemm_gpu/src/jagged_tensor_ops.cu | 183 +++++++++++++++--- 2 files changed, 158 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 5ce7d4f5d1..c21057ac49 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -62,6 +62,11 @@ static constexpr int32_t kWarpSize = 32; #endif // Max thread num in one thread block static constexpr int32_t kMaxThreads = 1024; +// Max block size in Y dimension of a grid +static constexpr int32_t kMaxBlockYDim = 65535; +// Max block size in Z dimension of a grid +static constexpr int32_t kMaxBlockZDim = 65535; + static constexpr float kQParamEps = 1e-8f; /* For rowwise int8 quantization, two quantization parameters (qparams) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu 
b/fbgemm_gpu/src/jagged_tensor_ops.cu
index 4e93d08a65..860a83ecd6 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops.cu
@@ -2071,36 +2071,135 @@ Tensor jagged_jagged_bmm_forward(
   return output;
 }
 
-template <typename index_t, typename scalar_t>
+template <
+    const int BLOCK_TILE_M, // tile height of C that each thread block
+                            // calculates
+    const int BLOCK_TILE_N, // tile width of C that each thread block
+                            // calculates
+    const int BLOCK_TILE_K, // tile width of A that each thread block calculates
+    const int THREAD_TILE_M, // tile height of C that each thread
+                             // calculates
+    const int THREAD_TILE_N, // tile width of C that each thread calculates
+    typename index_t,
+    typename scalar_t>
 __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
-    const at::PackedTensorAccessor32<scalar_t, 2> x_values,
-    const at::PackedTensorAccessor32<index_t, 1> x_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 3> y,
-    at::PackedTensorAccessor32<scalar_t, 2> output,
+    const at::PackedTensorAccessor32<scalar_t, 2> __restrict__ x_values,
+    const at::PackedTensorAccessor32<index_t, 1> __restrict__ x_offsets,
+    const at::PackedTensorAccessor32<scalar_t, 3> __restrict__ y,
+    at::PackedTensorAccessor32<scalar_t, 2> __restrict__ output,
     const int max_L) {
   const int B = x_offsets.size(0) - 1;
   const int K = x_values.size(1);
   const int N = y.size(2);
 
-  const int b_l_begin = blockIdx.x * blockDim.y + threadIdx.y;
-  const int b_l_step = gridDim.x * blockDim.y;
-  for (int b_l = b_l_begin; b_l < B * max_L; b_l += b_l_step) {
-    const int b = b_l / max_L;
-    const int l = b_l % max_L;
+  const auto block_row = blockIdx.y;
+  const auto block_col = blockIdx.x;
+
+  const int THREADS_X_PER_BLOCK = BLOCK_TILE_N / THREAD_TILE_N;
+  const int THREADS_Y_PER_BLOCK = BLOCK_TILE_M / THREAD_TILE_M;
+  const int THREADS_PER_BLOCK = THREADS_X_PER_BLOCK * THREADS_Y_PER_BLOCK;
+  const auto thread_row = threadIdx.x / THREADS_X_PER_BLOCK;
+  const auto thread_col = threadIdx.x % THREADS_X_PER_BLOCK;
+  const auto NUM_K_BLOCKS = (K + BLOCK_TILE_K - 1) / BLOCK_TILE_K;
+
+  __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K];
+  __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N];
+
+  for (auto b = blockIdx.z; b < B; b += gridDim.z) {
+    const index_t row_start = x_offsets[b];
+    const index_t row_end = x_offsets[b + 1];
+    const auto length = min(row_end - row_start, (index_t)max_L);
+
+    // the indices that this current thread will load into shared mem
+    const auto inner_row_a = threadIdx.x / BLOCK_TILE_K;
+    const auto inner_col_a = threadIdx.x % BLOCK_TILE_K;
+    // the number of rows of As that will be loaded per step by a thread block
+    const auto A_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_K;
+
+    const auto inner_row_b = threadIdx.x / BLOCK_TILE_N;
+    const auto inner_col_b = threadIdx.x % BLOCK_TILE_N;
+    const auto B_TILE_ROW_STRIDE = THREADS_PER_BLOCK / BLOCK_TILE_N;
+
+    // registers for C
+    scalar_t accum[THREAD_TILE_M][THREAD_TILE_N] = {0};
+
+    // registers for As and Bs
+    scalar_t fragment_a[THREAD_TILE_M] = {0};
+    scalar_t fragment_b[THREAD_TILE_N] = {0};
+
+    // loop for block tiles in K dimension
+    for (auto block = 0; block < NUM_K_BLOCKS; block++) {
+// load a block of x_values from global memory to shared memory
+// apply tiling for threads in a block
+#pragma unroll
+      for (auto offset = 0; offset < BLOCK_TILE_M;
+           offset += A_TILE_ROW_STRIDE) {
+        auto x_row_offset = block_row * BLOCK_TILE_M + inner_row_a + offset;
+        auto x_col_offset = block * BLOCK_TILE_K + inner_col_a;
+        if ((x_row_offset < length) && (x_col_offset < K)) {
+          As[inner_row_a + offset][inner_col_a] =
+              x_values[row_start + x_row_offset][x_col_offset];
+        } else {
+          As[inner_row_a + offset][inner_col_a] = 0;
+        }
+      }
 
-    const int row_start = x_offsets[b];
-    const int row_end = x_offsets[b + 1];
-    const int length = min(row_end - row_start, max_L);
-    if (length == 0 || l >= length) {
-      return;
-    } else {
-      // TODO: use shared memory and better reduction
-      for (int n = threadIdx.x; n < N; n += blockDim.x) {
-        at::acc_type<scalar_t, true> acc = 0;
-        for (int k = 0; k < K; ++k) {
-          acc += x_values[row_start + l][k] * y[b][k][n];
+// load a block of y from global memory to shared memory
+// apply tiling for threads in a block
+#pragma unroll
+      for (auto offset = 0; offset < BLOCK_TILE_K;
+           offset += B_TILE_ROW_STRIDE) {
+        auto y_row_offset = block * BLOCK_TILE_K + inner_row_b + offset;
+        auto y_col_offset = block_col * BLOCK_TILE_N + inner_col_b;
+        if ((y_row_offset < K) && (y_col_offset < N)) {
+          Bs[inner_row_b + offset][inner_col_b] =
+              y[b][y_row_offset][y_col_offset];
+        } else {
+          Bs[inner_row_b + offset][inner_col_b] = 0;
+        }
+      }
+
+      __syncthreads();
+
+// calculate the results per thread
+#pragma unroll
+      for (auto k = 0; k < BLOCK_TILE_K; k++) {
+        // load values from shared memory to registers for x_values
+        for (auto row = 0; row < THREAD_TILE_M; row++) {
+          fragment_a[row] = As[thread_row * THREAD_TILE_M + row][k];
+        }
+
+// load values from shared memory to registers for y
+#pragma unroll
+        for (auto col = 0; col < THREAD_TILE_N; col++) {
+          fragment_b[col] = Bs[k][thread_col * THREAD_TILE_N + col];
+        }
+
+// each thread calculates THREAD_TILE_M * THREAD_TILE_N elements
+#pragma unroll
+        for (auto row = 0; row < THREAD_TILE_M; row++) {
+#pragma unroll
+          for (auto col = 0; col < THREAD_TILE_N; col++) {
+            accum[row][col] += fragment_a[row] * fragment_b[col];
+          }
+        }
+      }
+
+      __syncthreads();
+    }
+
+// write the result to the output
+#pragma unroll
+    for (auto row = 0; row < THREAD_TILE_M; row++) {
+#pragma unroll
+      for (auto col = 0; col < THREAD_TILE_N; col++) {
+        auto out_row_offset =
+            block_row * BLOCK_TILE_M + thread_row * THREAD_TILE_M + row;
+        auto out_col_offset =
+            block_col * BLOCK_TILE_N + thread_col * THREAD_TILE_N + col;
+        if ((out_row_offset < length) && (out_col_offset < N)) {
+          output[row_start + out_row_offset][out_col_offset] = accum[row][col];
         }
-        output[row_start + l][n] = acc;
       }
     }
   }
@@ -2124,9 +2223,29 @@ Tensor jagged_dense_bmm_forward(
   const int total_L = x_values.size(0);
   auto output = at::zeros({total_L, N}, x_values.options());
   if (B > 0 && M > 0 && N > 0) {
-    const int block_dim_x =
-        std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads);
-    const int block_dim_y = kMaxThreads / block_dim_x;
+    // The shared memory size is (BLOCK_TILE_M + BLOCK_TILE_N) * BLOCK_TILE_K
+    // BLOCK_TILE_M needs to be multiple of THREAD_TILE_M, and
+    // BLOCK_TILE_N needs to be multiple of THREAD_TILE_N
+    // The setting of these parameters needs to balance the hardware's shared
+    // memory size limit and occupancy
+    // TODO: autotune these parameters based on max_L and input and output
+    // tensor sizes
+    constexpr int BLOCK_TILE_M = 64;
+    constexpr int BLOCK_TILE_N = 8;
+    constexpr int BLOCK_TILE_K = 8;
+    constexpr int THREAD_TILE_M = 4;
+    constexpr int THREAD_TILE_N = 4;
+
+    const dim3 block(
+        (BLOCK_TILE_M * BLOCK_TILE_N) / (THREAD_TILE_M * THREAD_TILE_N));
+    const auto grid_dim_x = div_round_up(N, BLOCK_TILE_N);
+    const auto grid_dim_y = div_round_up(max_L, BLOCK_TILE_M);
+    TORCH_CHECK(
+        grid_dim_y <= kMaxBlockYDim,
+        "max_L cannot be larger than",
+        grid_dim_y * BLOCK_TILE_M + 1 - BLOCK_TILE_M);
+    const auto grid_dim_z = std::min(B, kMaxBlockZDim);
+    const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z);
 
     AT_DISPATCH_INDEX_TYPES(
         x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] {
@@ -2136,11 +2255,15 @@ Tensor jagged_dense_bmm_forward(
               x_values.scalar_type(),
               "jagged_dense_bmm_kernel_2",
               [&] {
-                jagged_dense_bmm_kernel<index_t, scalar_t>
-                    <<<div_round_up(B * max_L, block_dim_y),
-                       dim3(block_dim_x, block_dim_y),
-                       0,
-                       at::cuda::getCurrentCUDAStream()>>>(
+                jagged_dense_bmm_kernel<
+                    BLOCK_TILE_M,
+                    BLOCK_TILE_N,
+                    BLOCK_TILE_K,
+                    THREAD_TILE_M,
+                    THREAD_TILE_N,
+                    index_t,
+                    scalar_t>
+                    <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                         x_values.packed_accessor32<scalar_t, 2>(),
                         x_offsets.packed_accessor32<index_t, 1>(),
                         y.packed_accessor32<scalar_t, 3>(),

From f3af571783d80bb23acbb1d3b9584320bb23e4db Mon Sep 17 00:00:00 2001
From: siwasaki
Date: Tue, 21 Mar 2023 20:37:59 -0700
Subject: [PATCH 15/34] jagged_dense_bmm: fix ROCm test failures (#1655)

Summary:
This patch fixes test failures on AMD GPUs.

1. Remove `__restrict__`. I don't think it is needed even for CUDA, but it
confuses HIPCC.
2. Use `uint32_t` instead of `auto`: old ROCm (including ROCm <= 5.3) does not
have a `+=` operator for the type of `blockIdx.z`, causing a compilation
error. We observed that this issue is fixed in ROCm 5.4.3, but let's use
`uint32_t` for now. We should revisit and use `auto` later. See this for
details:
https://github.com/ROCm-Developer-Tools/hipamd/commit/86a1634c642daeda1e984d4124bcc2aeba5c4e19

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1655

Test Plan: GitHub Actions' AMD CI

Reviewed By: q10, brad-mengchi

Differential Revision: D44242622

Pulled By: shintaro-iwasaki

fbshipit-source-id: c9b88155ebf1ed881b2d03e3be0e8991b4b30174
---
 fbgemm_gpu/src/jagged_tensor_ops.cu | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu
index 860a83ecd6..0282fa9f19 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops.cu
@@ -2083,10 +2083,10 @@ template <
     typename index_t,
     typename scalar_t>
 __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
-    const at::PackedTensorAccessor32<scalar_t, 2> __restrict__ x_values,
-    const at::PackedTensorAccessor32<index_t, 1> __restrict__ x_offsets,
-    const at::PackedTensorAccessor32<scalar_t, 3> __restrict__ y,
-    at::PackedTensorAccessor32<scalar_t, 2> __restrict__ output,
+    const at::PackedTensorAccessor32<scalar_t, 2> x_values,
+    const at::PackedTensorAccessor32<index_t, 1> x_offsets,
+    const at::PackedTensorAccessor32<scalar_t, 3> y,
+    at::PackedTensorAccessor32<scalar_t, 2> output,
     const int max_L) {
   const int B = x_offsets.size(0) - 1;
   const int K = x_values.size(1);
@@ -2105,7 +2105,9 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(
   __shared__ scalar_t As[BLOCK_TILE_M][BLOCK_TILE_K];
   __shared__ scalar_t Bs[BLOCK_TILE_K][BLOCK_TILE_N];
 
-  for (auto b = blockIdx.z; b < B; b += gridDim.z) {
+  // Once we remove ROCm<=5.3 support, we should replace uint32_t with auto.
+  // See #1655
+  for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) {
     const index_t row_start = x_offsets[b];
     const index_t row_end = x_offsets[b + 1];
     const auto length = min(row_end - row_start, (index_t)max_L);

From 22c97d54f0d76385750401cef7d986f9554c0643 Mon Sep 17 00:00:00 2001
From: Jianyu Huang
Date: Wed, 22 Mar 2023 13:11:40 -0700
Subject: [PATCH 16/34] Support embedding dim 1024 ~ 2048 (#1656)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1656

wushirong reported the failure on https://fburl.com/code/hae91ra7.
- The embedding config is from f418615450.
- `max_int8_128b_rows` is 10 --> D = 1280.

Our embedding dim has grown past 1024.
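As a hedged back-of-the-envelope check of the shared-memory footprint (the
buffer shape is quoted from the kernel, see the snippet later in this message;
the `smem_bytes` helper and the concrete parameter values are illustrative,
not the exact failing instantiation):

```
#include <cstddef>

constexpr std::size_t kUint4Bytes = 16; // sizeof(uint4) on CUDA

// Static shared-memory footprint of
//   __shared__ uint4 buffers[WarpsPerBlock][OutputRowsPerThread]
//                           [InputRowsInFlight][NumUint4LoadsPerRow];
constexpr std::size_t smem_bytes(
    std::size_t warps,
    std::size_t out_rows,
    std::size_t in_rows,
    std::size_t loads_per_row) {
  return warps * out_rows * in_rows * loads_per_row * kUint4Bytes;
}

// Wider rows (larger D => larger NumUint4LoadsPerRow) blow the 48 KB static
// limit unless InputRowsInFlight shrinks to compensate:
static_assert(smem_bytes(4, 4, 32, 8) > 48 * 1024, "over budget");
static_assert(smem_bytes(4, 4, 16, 8) <= 48 * 1024, "halved rows-in-flight fits");
```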
Note that the static shared memory can only go up to 48 KB; from https://docs.nvidia.com/cuda/cuda-c-programming-guide/: > Kernels relying on shared memory allocations over 48 KB per block are architecture-specific, as such they must use dynamic shared memory (rather than statically sized arrays) The resulting ptxas shared memory error: ``` [2023-03-21T22:04:33.899-07:00] ptxas error : Entry function '_ZN4nbit60INT8_split_embedding_codegen_forward_weighted_kernel_small_LIiN3c104HalfELm2ELm4ELm4E Lm8ELm16ELb1EEEvN2at27GenericPackedTensorAccessorIhLm1ENS3_17RestrictPtrTraitsElEES6_NS4_IiLm1ES5_iEENS4_IlLm1ES5_iEENS4_IhLm1ES5_iEES7_N10fbgemm_gpu12FixedDiv isorENS4_IT_Lm1ES5_iEESD_llNS4_IfLm1ES5_iEENS4_IT0_Lm2ES5_iEENS4_IhLm2ES5_lEES7_' uses too much shared data (0x10080 bytes, 0xc000 max) ``` Currently we reduce `InputRowsInFlight` to bypass the issue; the static shared memory used in the kernel is ``` typedef uint4 AllBuffers[WarpsPerBlock][OutputRowsPerThread][InputRowsInFlight][NumUint4LoadsPerRow]; __shared__ AllBuffers buffers; ``` Long term, we can change the static shared memory to dynamic shared memory and increase the shared memory size to 64 KB+. Reviewed By: wushirong Differential Revision: D44270081 fbshipit-source-id: 367ae838ea073dfe58d859ea3c0e6c7190beca6a --- ...edding_forward_quantized_split_template.cu | 30 +++++++++++++++---- .../split_table_batched_embeddings_test.py | 4 +-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 6ac2b2d3c0..4b4345f1cc 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -737,13 +737,16 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int2_D > 0) { auto max_int2_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int2_D, SparseType::INT2, row_alignment), 128); - TORCH_CHECK(max_int2_128b_rows <= 2); + TORCH_CHECK(max_int2_128b_rows <= 4); if (max_int2_128b_rows > 0) { Y(2, 16, 0, 1); } if (max_int2_128b_rows > 1) { Y(2, 8, 1, 2); } + if (max_int2_128b_rows > 2) { + Y(2, 8, 2, 4); + } } })); #undef X @@ -783,7 +786,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int4_D > 0) { auto max_int4_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int4_D, SparseType::INT4, row_alignment), 128); - TORCH_CHECK(max_int4_128b_rows <= 4); + TORCH_CHECK(max_int4_128b_rows <= 8); if (max_int4_128b_rows > 0) { Y(4, 8, 0, 1); } @@ -793,6 +796,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int4_128b_rows > 2) { Y(1, 4, 2, 4); } + if (max_int4_128b_rows > 4) { + Y(1, 4, 4, 8); + } } })); #undef X @@ -831,7 +837,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "int8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_int8_D > 0) { auto max_int8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_int8_D, SparseType::INT8, row_alignment), 128); - TORCH_CHECK(max_int8_128b_rows <= 8); +
TORCH_CHECK(max_int8_128b_rows <= 16); if (max_int8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -844,6 +850,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_int8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_int8_128b_rows > 8) { + Y(2, 2, 8, 16); + } } })); #undef X @@ -884,7 +893,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float8_D > 0) { auto max_fp8_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float8_D, SparseType::FP8, row_alignment), 128); - TORCH_CHECK(max_fp8_128b_rows <= 8); + TORCH_CHECK(max_fp8_128b_rows <= 16); if (max_fp8_128b_rows > 0) { Y(2, 8, 0, 1); } @@ -897,6 +906,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp8_128b_rows > 4) { Y(2, 4, 4, 8); } + if (max_fp8_128b_rows > 8) { + Y(2, 2, 4, 8); + } } })); #undef X @@ -935,7 +947,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float16_D > 0) { auto max_fp16_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float16_D, SparseType::FP16, row_alignment), 128); - TORCH_CHECK(max_fp16_128b_rows <= 16); + TORCH_CHECK(max_fp16_128b_rows <= 32); if (max_fp16_128b_rows > 0) { Y(2, 8, 0, 2); } @@ -948,6 +960,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp16_128b_rows > 8) { Y(2, 2, 8, 16); } + if (max_fp16_128b_rows > 16) { + Y(2, 1, 16, 32); + } } })); #undef X @@ -986,7 +1001,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ DISPATCH_OUTPUT_TYPES(output.scalar_type(), "fp32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_kernel", ([&] { if (max_float32_D > 0) { auto max_fp32_128b_rows = nbit::div_round_up(nbit::padded_row_size_in_bytes(max_float32_D, SparseType::FP32, row_alignment), 128); - TORCH_CHECK(max_fp32_128b_rows <= 32); + TORCH_CHECK(max_fp32_128b_rows <= 64); if (max_fp32_128b_rows > 0) { Y(2, 4, 0, 4); } @@ -996,6 +1011,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{ if (max_fp32_128b_rows > 16) { Y(1, 1, 16, 32); } + if (max_fp32_128b_rows > 32) { + Y(1, 1, 32, 64); + } } })); #undef X diff --git a/fbgemm_gpu/test/split_table_batched_embeddings_test.py b/fbgemm_gpu/test/split_table_batched_embeddings_test.py index 6a4d299b80..ddab386bf0 100644 --- a/fbgemm_gpu/test/split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/split_table_batched_embeddings_test.py @@ -3579,7 +3579,7 @@ def test_nbit_forward_cpu( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False @@ -3660,7 +3660,7 @@ def test_nbit_forward_gpu_no_cache( T = random.randint(1, 50) B = random.randint(0, 128) L = random.randint(0, 32) - D = random.randint(2, 1024) + D = random.randint(2, 2048) log_E = random.randint(2, 4) use_cache = False From db9eee1c532135a17e83450162d82d371ca94c8a Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 22 Mar 2023 13:23:22 -0700 Subject: [PATCH 17/34] Containerize the remaining FBGEMM_GPU CI jobs (#1658) Summary: - Containerize the remaining FBGEMM_GPU CI jobs - Add Conda cleanups to make 
PyTorch and CUDA installs more reliable - Update post-install checks for PyTorch to work with ROCm - Update the CI to continue running on jobs that fail on just a few variants - Use PIP to install PyTorch GPU nightly as the nightly packages show up in PIP more reliably than in Conda Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1658 Reviewed By: shintaro-iwasaki Differential Revision: D44306708 Pulled By: q10 fbshipit-source-id: 5f0862f18eca7151759d9983aa97849222539d7d --- .github/scripts/setup_env.bash | 83 ++++++++++++------- .github/workflows/fbgemm_gpu_ci.yml | 15 +++- .github/workflows/fbgemm_nightly_build.yml | 32 ++++--- .../workflows/fbgemm_nightly_build_cpu.yml | 17 ++-- .github/workflows/fbgemm_release_build.yml | 27 ++++-- .../workflows/fbgemm_release_build_cpu.yml | 17 ++-- fbgemm_gpu/docs/BuildInstructions.md | 8 +- 7 files changed, 127 insertions(+), 72 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index a22a09b19e..8329d661cc 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -212,8 +212,10 @@ run_python_test () { if print_exec conda run -n "${env_name}" python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" + echo "" else echo "[TEST] Python test suite FAILED: ${python_test_file}" + echo "" return 1 fi } @@ -366,6 +368,12 @@ print_glibc_info () { # Miniconda Setup Functions ################################################################################ +__conda_cleanup () { + echo "[SETUP] Cleaning up Conda packages ..." + (print_exec conda clean --packages --tarball -y) || return 1 + (print_exec conda clean --all -y) || return 1 +} + setup_miniconda () { local miniconda_prefix="$1" if [ "$miniconda_prefix" == "" ]; then @@ -399,7 +407,10 @@ setup_miniconda () { print_exec . ~/.bashrc echo "[SETUP] Updating Miniconda base packages ..." 
- (exec_with_retries conda update -n base -c defaults -y conda) || return 1 + (exec_with_retries conda update -n base -c defaults --update-deps -y conda) || return 1 + + # Clean up packages + __conda_cleanup # Print Conda info print_exec conda info @@ -463,14 +474,14 @@ create_conda_environment () { install_pytorch_conda () { local env_name="$1" local pytorch_version="$2" - local pytorch_cpu="$3" + local pytorch_variant_type="$3" if [ "$pytorch_version" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_VERSION [CPU]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" - echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" - echo " ${FUNCNAME[0]} build_env test # Install the pre-release" - echo " ${FUNCNAME[0]} build_env nightly 1 # Install the CPU variant of the nightly" + echo " ${FUNCNAME[0]} build_env 1.11.0 # Install a specific version" + echo " ${FUNCNAME[0]} build_env latest # Install the latest stable release" + echo " ${FUNCNAME[0]} build_env test # Install the pre-release" + echo " ${FUNCNAME[0]} build_env nightly cpu # Install the CPU variant of the nightly" return 1 else echo "################################################################################" @@ -481,11 +492,11 @@ install_pytorch_conda () { echo "" fi - # Install cpuonly if needed - if [ "$pytorch_cpu" != "" ]; then - pytorch_cpu=1 + # Install the cpuonly package if needed + if [ "$pytorch_variant_type" == "cpu" ]; then local pytorch_package="cpuonly pytorch" else + pytorch_variant_type="cuda" local pytorch_package="pytorch" fi @@ -499,15 +510,25 @@ install_pytorch_conda () { local pytorch_channel="pytorch" fi + # Clean up packages before installation + __conda_cleanup + # Install PyTorch packages # NOTE: Installation of large package might fail due to corrupt package download # Use --force-reinstall to address this on retries - https://datascience.stackexchange.com/questions/41732/conda-verification-failed - echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, CPU=${pytorch_cpu:-0}) through Conda using channel '${pytorch_channel}' ..." + echo "[INSTALL] Attempting to install '${pytorch_package}' (${pytorch_version}, variant = ${pytorch_variant_type}) through Conda using channel '${pytorch_channel}' ..." # shellcheck disable=SC2086 (exec_with_retries conda install --force-reinstall -n "${env_name}" -y ${pytorch_package} -c "${pytorch_channel}") || return 1 + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + # Run check for GPU variant - if [ "$pytorch_cpu" == "" ]; then + if [ "$pytorch_variant_type" == "cuda" ]; then # Ensure that the PyTorch build is the GPU variant (i.e. 
contains cuDNN reference) # This test usually applies to the PyTorch nightly builds if conda list -n "${env_name}" pytorch | grep cudnn; then @@ -526,13 +547,7 @@ install_pytorch_conda () { (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 - - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through Conda" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through Conda" } install_pytorch_pip () { @@ -591,30 +606,31 @@ install_pytorch_pip () { # shellcheck disable=SC2086 (exec_with_retries conda run -n "${env_name}" pip install ${pytorch_package} --extra-index-url ${pytorch_channel}) || return 1 - if [ "$pytorch_variant_type" != "cpu" ]; then - if [ "$pytorch_variant_type" == "cuda" ]; then - # Ensure that the PyTorch-CUDA headers are properly installed - (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 - fi + # Check that PyTorch is importable + (test_python_import "${env_name}" torch.distributed) || return 1 + # Print out the actual installed PyTorch version + installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") + echo "[CHECK] NOTE: The installed version is: ${installed_pytorch_version}" + + if [ "$pytorch_variant_type" != "cpu" ]; then # Ensure that the PyTorch build is of the correct variant # This test usually applies to the PyTorch nightly builds - if conda run -n build_binary pip list torch | grep torch | grep "${pytorch_variant}"; then + if conda run -n "${env_name}" pip list torch | grep torch | grep "${pytorch_variant}"; then echo "[CHECK] The installed PyTorch ${pytorch_version} is the correct variant (${pytorch_variant})" else echo "[CHECK] The installed PyTorch ${pytorch_version} appears to be an incorrect variant as it is missing references to ${pytorch_variant}!" - echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA presently installed on the system has not been published yet." + echo "[CHECK] This can happen if the variant of PyTorch (e.g. GPU, nightly) for the MAJOR.MINOR version of CUDA or ROCm presently installed on the system is not available." return 1 fi fi - # Check that PyTorch is importable - (test_python_import "${env_name}" torch.distributed) || return 1 + if [ "$pytorch_variant_type" == "cuda" ]; then + # Ensure that the PyTorch-CUDA headers are properly installed + (test_filepath "${env_name}" cuda_cmake_macros.h) || return 1 + fi - # Print out the actual installed PyTorch version - installed_pytorch_version=$(conda run -n "${env_name}" python -c "import torch; print(torch.__version__)") - echo "[INSTALL] Installed PyTorch through PIP" - echo "[INSTALL] NOTE: The installed version is: ${installed_pytorch_version}" + echo "[INSTALL] Successfully installed PyTorch through PIP" } @@ -664,6 +680,9 @@ install_cuda () { return 1 fi + # Clean up packages before installation + __conda_cleanup + # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." 
(exec_with_retries conda install --force-reinstall -n "${env_name}" -y cuda -c "nvidia/label/cuda-${cuda_version}") || return 1 @@ -970,7 +989,7 @@ create_conda_pytorch_environment () { if [ "${cuda_version}" == "" ]; then # Install the CPU variant of PyTorch - install_pytorch_conda "${env_name}" "${pytorch_version}" 1 + install_pytorch_conda "${env_name}" "${pytorch_version}" cpu else # Install CUDA and the GPU variant of PyTorch install_cuda "${env_name}" "${cuda_version}" diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index adf8443eae..b7dea4093a 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -6,13 +6,22 @@ name: FBGEMM_GPU CI on: - push: + # PR Trigger + # + pull_request: branches: - main - pull_request: + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: branches: - main + # Manual Trigger (for testing only) + # + workflow_dispatch: + concurrency: # Cancel previous runs in the PR if a new commit is pushed group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -35,7 +44,7 @@ jobs: matrix: container-image: [ "ubuntu:20.04" ] python-version: [ "3.8", "3.9", "3.10" ] - rocm-version: [ "5.3" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - name: Setup Build Container diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_nightly_build.yml index b0ac76900c..0d9257d554 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_nightly_build.yml @@ -48,6 +48,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -85,8 +86,9 @@ jobs: - name: Install CUDA run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Install cuDNN run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }} @@ -106,7 +108,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -117,7 +122,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish @@ -125,11 +130,22 @@ jobs: needs: build_artifact steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -145,21 +161,17 @@ jobs: - name: Install CUDA run: . 
$PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }} + # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }} - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU Nightly run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_nightly_build_cpu.yml index d99c3f73ee..8d1d39805f 100644 --- a/.github/workflows/fbgemm_nightly_build_cpu.yml +++ b/.github/workflows/fbgemm_nightly_build_cpu.yml @@ -49,6 +49,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -83,7 +84,7 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -125,6 +126,11 @@ jobs: with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -138,20 +144,15 @@ jobs: run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Nightly - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV nightly cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_nightly_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU Nightly (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . 
install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_release_build.yml index 75d5235b69..b909cec274 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_release_build.yml @@ -40,6 +40,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -98,7 +99,10 @@ jobs: # Download the built artifact from GHA, test on GPU, and push to PyPI test_and_publish_artifact: - runs-on: ${{ matrix.os }} + runs-on: linux.g5.4xlarge.nvidia.gpu + container: + image: ${{ matrix.container-image }} + options: --user root --gpus all defaults: run: shell: bash @@ -109,18 +113,30 @@ jobs: strategy: fail-fast: false matrix: - os: [ linux.g5.4xlarge.nvidia.gpu ] + container-image: [ "nvidia/cuda:11.8.0-base-ubuntu20.04" ] python-version: [ "3.8", "3.9", "3.10", "3.11" ] cuda-version: [ "11.7.1", "11.8.0" ] # Specify exactly ONE CUDA version for artifact publish cuda-version-publish: [ "11.7.1" ] needs: build_artifact + steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils curl git sudo wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -142,15 +158,10 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_${{ matrix.python-version }}_cuda${{ matrix.cuda-version }}.whl - - name: Install FBGEMM_GPU run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_release_build_cpu.yml index f13ebd32c9..577f0b5e88 100644 --- a/.github/workflows/fbgemm_release_build_cpu.yml +++ b/.github/workflows/fbgemm_release_build_cpu.yml @@ -40,6 +40,7 @@ jobs: env: PRELUDE: .github/scripts/setup_env.bash BUILD_ENV: build_binary + continue-on-error: true strategy: # Don't fast-fail all the other builds if one of the them fails fail-fast: false @@ -74,7 +75,7 @@ jobs: run: . $PRELUDE; install_build_tools $BUILD_ENV - name: Install PyTorch-CPU Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV @@ -116,6 +117,11 @@ jobs: with: submodules: true + - name: Download Wheel Artifact from GHA + uses: actions/download-artifact@v3 + with: + name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl + - name: Display System Info run: . $PRELUDE; print_system_info; print_ec2_info @@ -129,20 +135,15 @@ jobs: run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} - name: Install PyTorch Test - run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpuonly + run: . $PRELUDE; install_pytorch_conda $BUILD_ENV test cpu - name: Prepare FBGEMM_GPU Build run: . 
$PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Download Wheel Artifact from GHA - uses: actions/download-artifact@v3 - with: - name: fbgemm_gpu_cpu_${{ matrix.python-version }}.whl - - name: Install FBGEMM_GPU (CPU version) run: | . $PRELUDE - ls . + pwd; ls -la . install_fbgemm_gpu_package $BUILD_ENV *.whl - name: Test with PyTest diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index 56aa780fe3..c50bd50d03 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -59,9 +59,11 @@ conda run -n "${env_name}" python -m pip install pyOpenSSL>22.1.0 ### C/C++ Compiler -Install the GCC toolchain. Note that GCC (as opposed to Clang for example) is -required for GPU (CUDA) builds because NVIDIA's `nvcc` relies on `gcc` and `g++` -in the path. +Install a version of the GCC toolchain that supports **C++17**. Note that GCC +(as opposed to Clang for example) is required for GPU (CUDA) builds because +NVIDIA's `nvcc` relies on `gcc` and `g++` in the path. The `sysroot` package +will also need to be installed to avoid issues with missing versioned symbols +when compiling FBGEMM_CPU: ```sh conda install -n "${env_name}" -y gxx_linux-64=9.3.0 From 7dc393295c70126bb8f14f5c64538e12e92ecaca Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Wed, 22 Mar 2023 19:06:59 -0700 Subject: [PATCH 18/34] Add tbe_input_combine_with_length for GPU (#1647) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1647 Implement `tbe_input_combine_with_length` for GPU. The operator takes 3 lists of tensors (`indices`, `lengths`, and `per_sample_weights`) and concatenates each one into a single tensor. Implicit type casting is also performed if the input types are different from the output types. `indices` and `lengths` tensors can be of type `int32_t` or `int64_t`. The outputs for `indices` concatenation and `lengths` concatenation are fixed to `int32_t`. `per_sample_weights` must be `float`. 
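To make the interface concrete, here is a minimal usage sketch, assuming the operator is exposed to Python as `torch.ops.fbgemm.tbe_input_combine_with_length` (the name registered in this diff); the tensor values below are made up for illustration:

```
import torch
import fbgemm_gpu  # noqa: F401  -- assumed; loads the fbgemm operator library

# Per-list TBE inputs with mixed int32/int64 types; all tensors on the same GPU.
indices_list = [
    torch.tensor([1, 2, 3], dtype=torch.int64, device="cuda"),
    torch.tensor([4, 5, 6, 7], dtype=torch.int32, device="cuda"),
]
lengths_list = [
    torch.tensor([2, 1], dtype=torch.int32, device="cuda"),
    torch.tensor([1, 3], dtype=torch.int64, device="cuda"),
]
# Each weights tensor must be float and match its indices tensor's numel.
per_sample_weights = [
    torch.tensor([1.0, 2.0, 1.0], device="cuda"),
    torch.tensor([1.0, 2.0, 1.0, 3.0], device="cuda"),
]

combined_indices, combined_lengths, combined_weights = (
    torch.ops.fbgemm.tbe_input_combine_with_length(
        indices_list, lengths_list, per_sample_weights
    )
)
# combined_indices and combined_lengths are int32 regardless of the input
# dtypes; combined_weights is float32.
```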
Reviewed By: bangshengtang Differential Revision: D44076452 fbshipit-source-id: f6ce8628e7345093bb55835f9523870c2914516f --- fbgemm_gpu/CMakeLists.txt | 6 +- fbgemm_gpu/include/fbgemm_gpu/input_combine.h | 15 ++ fbgemm_gpu/src/input_combine.cu | 160 +++++++++++++ fbgemm_gpu/src/input_combine_gpu.cpp | 226 ++++++++++++++++++ fbgemm_gpu/test/input_combine_test.py | 61 +++-- 5 files changed, 446 insertions(+), 22 deletions(-) create mode 100644 fbgemm_gpu/src/input_combine.cu create mode 100644 fbgemm_gpu/src/input_combine_gpu.cpp diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 036470adf2..51348505c4 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -318,7 +318,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_utils.cpp src/split_table_batched_embeddings.cpp src/metric_ops_host.cpp - src/embedding_inplace_update_gpu.cpp) + src/embedding_inplace_update_gpu.cpp + src/input_combine_gpu.cpp) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") @@ -352,7 +353,8 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache_cuda.cu src/split_embeddings_utils.cu src/metric_ops.cu - src/embedding_inplace_update.cu) + src/embedding_inplace_update.cu + src/input_combine.cu) set_source_files_properties( ${fbgemm_gpu_sources_gpu} PROPERTIES COMPILE_OPTIONS diff --git a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h index 348e0bebfc..c329d6c9d9 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/input_combine.h +++ b/fbgemm_gpu/include/fbgemm_gpu/input_combine.h @@ -30,4 +30,19 @@ padding_fused_tbe_input_combine_cpu( const at::Tensor& include_last_offsets, int64_t batch_size); +std::tuple +tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine.cu b/fbgemm_gpu/src/input_combine.cu new file mode 100644 index 0000000000..040ca14bbf --- /dev/null +++ b/fbgemm_gpu/src/input_combine.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh" +#include "fbgemm_gpu/input_combine.h" + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +template +DEVICE_INLINE void vec_copy_with_implicit_type_cast( + dst_t* const __restrict__ dst, + const uint64_t src_addr, + const uint64_t src_offset, + const uint64_t dst_offset, + const uint64_t src_bound) { + // TODO: Use vector load/store if address aligns with the vector type + const src_t* const src = reinterpret_cast(src_addr); +#pragma unroll + for (uint64_t i = 0; i < VEC_WIDTH && src_offset + i < src_bound; i++) { + dst[dst_offset + i] = src[src_offset + i]; + } +} + +template +__global__ +__launch_bounds__(kMaxThreads) void tbe_input_combine_with_length_kernel( + int32_t* const __restrict__ combined_indices, + int32_t* const __restrict__ combined_lengths, + float* const __restrict__ combined_weights, + const uint64_t* const __restrict__ indices_addrs, + const uint64_t* const __restrict__ lengths_addrs, + const uint64_t* const __restrict__ per_sample_weights_addrs, + const uint32_t* const __restrict__ indices_is_long, + const uint32_t* const __restrict__ lengths_is_long, + const uint64_t* const __restrict__ indices_offsets, + const uint64_t* const __restrict__ lengths_offsets, + const uint64_t num_lists, + const FixedDivisor fd_num_warps_per_list) { + const auto global_warp_id = blockIdx.x * blockDim.y + threadIdx.y; + uint32_t list_id; + uint32_t warp_id; + fd_num_warps_per_list.DivMod( + global_warp_id, + reinterpret_cast(&list_id), + reinterpret_cast(&warp_id)); + + if (list_id >= num_lists) { + return; + } + + // IS_LONG_NUM_BITS is power of 2 (default = 32); div and mod should be cheap + const uint32_t is_long_idx = list_id / IS_LONG_NUM_BITS; + const uint32_t is_long_mask = 1u << (list_id % IS_LONG_NUM_BITS); + const uint64_t src_idx = (warp_id * kWarpSize + threadIdx.x) * VEC_WIDTH; + const auto indices_start = indices_offsets[list_id]; + const auto indices_end = indices_offsets[list_id + 1]; + const auto lengths_start = lengths_offsets[list_id]; + const auto lengths_end = lengths_offsets[list_id + 1]; + + // Invoke a function based on the indices type + ((indices_is_long[is_long_idx] & is_long_mask) + ? vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_indices, indices_addrs[list_id], src_idx, indices_start + src_idx, indices_end - indices_start); + + // Invoke a function based on the lengths type + ((lengths_is_long[is_long_idx] & is_long_mask) + ? 
vec_copy_with_implicit_type_cast + : vec_copy_with_implicit_type_cast< + int32_t, + int32_t, + VEC_WIDTH>)(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start + src_idx, lengths_end - lengths_start); + + if (per_sample_weights_addrs) { + vec_copy_with_implicit_type_cast( + combined_weights, + per_sample_weights_addrs[list_id], + src_idx, + indices_start + src_idx, + indices_end - indices_start); + } +} + +std::tuple tbe_input_combine_with_length_cuda( + const uint64_t* const indices_addrs, + const uint64_t* const lengths_addrs, + const uint64_t* const per_sample_weights_addrs, + const uint32_t* const indices_is_long, + const uint32_t* const lengths_is_long, + const uint64_t* const indices_offsets, + const uint64_t* const lengths_offsets, + const uint64_t num_lists, + const uint64_t total_indices, + const uint64_t total_lengths, + const uint64_t max_list_size, + const c10::DeviceIndex& device) { + constexpr uint32_t IS_LONG_NUM_BITS = 32; + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(device); + + // combined_indices and combined_lengths are int tensors + const auto int_options = at::TensorOptions().dtype(at::kInt).device( + at::kCUDA, at::cuda::current_device()); + Tensor combined_indices = + at::empty({static_cast(total_indices)}, int_options); + Tensor combined_lengths = + at::empty({static_cast(total_lengths)}, int_options); + // combined_weights is a float tensor + Tensor combined_weights = at::empty( + {per_sample_weights_addrs ? static_cast(total_indices) + : static_cast(0)}, + at::TensorOptions() + .dtype(at::kFloat) + .device(at::kCUDA, at::cuda::current_device())); + + // Each thread loads 4 elements (rule of thumb; should work well with 32-bit + // inputs) + constexpr uint32_t VEC_WIDTH = 4; + constexpr uint32_t NUM_WARPS_PER_BLOCK = kMaxThreads / kWarpSize; + const auto num_warps_per_list = + div_round_up(max_list_size, kWarpSize * VEC_WIDTH); + const auto num_blocks = + div_round_up(num_warps_per_list * num_lists, NUM_WARPS_PER_BLOCK); + + tbe_input_combine_with_length_kernel + <<>>( + combined_indices.data_ptr(), + combined_lengths.data_ptr(), + per_sample_weights_addrs ? combined_weights.data_ptr() + : nullptr, + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + FixedDivisor(num_warps_per_list)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return { + std::move(combined_indices), + std::move(combined_lengths), + std::move(combined_weights)}; +} + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/input_combine_gpu.cpp b/fbgemm_gpu/src/input_combine_gpu.cpp new file mode 100644 index 0000000000..482cabd963 --- /dev/null +++ b/fbgemm_gpu/src/input_combine_gpu.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include "fbgemm_gpu/input_combine.h" +#include "fbgemm_gpu/sparse_ops_utils.h" + +#include +#include +#include + +using Tensor = at::Tensor; + +namespace fbgemm_gpu { + +constexpr uint32_t IS_LONG_NUM_BITS = 32; +constexpr uint32_t NUM_ARGS = 7; +enum args_pos { + P_indices_prts = 0, + P_lengths_addrs = 1, + P_indices_offsets = 2, + P_lengths_offsets = 3, + P_per_sample_weight = 4, + P_indices_is_long = 5, + P_lengths_is_long = 6 +}; + +template +uint64_t compute_num_uint64s(const uint64_t num_elements) { + const uint64_t ratio = sizeof(uint64_t) / sizeof(T); + return (num_elements + ratio - 1) / ratio; +} + +void offset_tbe_input_combine_with_length_args( + uint64_t** indices_addrs, + uint64_t** lengths_addrs, + uint64_t** indices_offsets, + uint64_t** lengths_offsets, + uint64_t** per_sample_weights_addrs, + uint32_t** indices_is_long, + uint32_t** lengths_is_long, + uint64_t* base_addr, + const uint64_t* const ptr_offsets, + const bool need_weights) { + *indices_addrs = base_addr + ptr_offsets[P_indices_prts]; + *lengths_addrs = base_addr + ptr_offsets[P_lengths_addrs]; + *indices_offsets = base_addr + ptr_offsets[P_indices_offsets]; + *lengths_offsets = base_addr + ptr_offsets[P_lengths_offsets]; + *per_sample_weights_addrs = + need_weights ? (base_addr + ptr_offsets[P_per_sample_weight]) : nullptr; + *indices_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_indices_is_long]); + *lengths_is_long = + reinterpret_cast(base_addr + ptr_offsets[P_lengths_is_long]); +} + +std::tuple tbe_input_combine_with_length_gpu( + const std::vector& indices_list, + const std::vector& lengths_list, + const std::vector& per_sample_weights) { + const auto num_lists = indices_list.size(); + TORCH_CHECK(num_lists > 0); + TORCH_CHECK(lengths_list.size() == num_lists); + TORCH_CHECK(per_sample_weights.size() == num_lists); + const bool need_weights = std::any_of( + per_sample_weights.begin(), per_sample_weights.end(), [](const auto& x) { + return x.numel() > 0; + }); + + // Store is_longs in 32-bit variables. i-th bit (LSB) indicates if + // list i-th is long. + const uint64_t num_is_longs = + (num_lists + IS_LONG_NUM_BITS - 1) / IS_LONG_NUM_BITS; + const uint64_t num_is_longs_64 = compute_num_uint64s(num_is_longs); + // args_tensor stores kernel arguments: + // - indices_prts (num_lists uint64_t elements) + // - lengths_addrs (num_lists uint64_t elements) + // - indices_offsets (num_lists + 1 uint64_t elements) + // - lengths_offsets (num_lists + 1 uint64_t elements) + // - per_sample_weight (num_lists uint64_t elements; optional) + // - indices_is_long (num_is_longs uint32_t elements) + // - lengths_is_long (num_is_longs uint32_t elements) + uint64_t args_offsets[NUM_ARGS + 1]; + // Initialize offsets with lengths first + args_offsets[P_indices_prts] = num_lists; + args_offsets[P_lengths_addrs] = num_lists; + args_offsets[P_indices_offsets] = num_lists + 1; + args_offsets[P_lengths_offsets] = num_lists + 1; + args_offsets[P_per_sample_weight] = need_weights ? 
num_lists : 0; + args_offsets[P_indices_is_long] = num_is_longs_64; + args_offsets[P_lengths_is_long] = num_is_longs_64; + + // Compute offsets + uint64_t offset = 0; + auto next = args_offsets[0]; + for (uint32_t i = 0; i < NUM_ARGS; i++) { + args_offsets[i] = offset; + offset += next; + next = args_offsets[i + 1]; + } + args_offsets[NUM_ARGS] = offset; // total number of uint64_t elements required + + Tensor args_tensor = at::empty( + {static_cast(args_offsets[NUM_ARGS] * sizeof(uint64_t))}, + at::TensorOptions().dtype(at::kByte).pinned_memory(true)); + + uint64_t* indices_addrs = nullptr; + uint64_t* lengths_addrs = nullptr; + uint64_t* indices_offsets = nullptr; + uint64_t* lengths_offsets = nullptr; + uint64_t* per_sample_weights_addrs = nullptr; + uint32_t* indices_is_long = nullptr; + uint32_t* lengths_is_long = nullptr; + + // Offset host pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + &lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + const auto& indices_0 = indices_list[0]; + uint64_t total_indices = 0; + uint64_t total_lengths = 0; + uint64_t max_list_size = 0; + for (uint64_t i = 0; i < num_lists; i++) { + const uint64_t is_long_idx = i / IS_LONG_NUM_BITS; + auto& indices_is_long_ = indices_is_long[is_long_idx]; + auto& lengths_is_long_ = lengths_is_long[is_long_idx]; + if (i % IS_LONG_NUM_BITS == 0) { + indices_is_long_ = 0; + lengths_is_long_ = 0; + } + const auto& indices = indices_list[i]; + const auto& lengths = lengths_list[i]; + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(indices); + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(lengths); + TENSORS_ON_SAME_DEVICE(indices, indices_0); + TENSORS_ON_SAME_DEVICE(lengths, indices_0); + TORCH_CHECK(indices.dtype() == c10::kInt || indices.dtype() == c10::kLong); + TORCH_CHECK(lengths.dtype() == c10::kInt || lengths.dtype() == c10::kLong); + TENSOR_NDIM_EQUALS(indices, 1); + TENSOR_NDIM_EQUALS(lengths, 1); + + const auto indices_numel = indices.numel(); + const auto lengths_numel = lengths.numel(); + indices_offsets[i] = total_indices; + lengths_offsets[i] = total_lengths; + total_indices += indices_numel; + total_lengths += lengths_numel; + max_list_size = + std::max(max_list_size, static_cast(indices_numel)); + max_list_size = + std::max(max_list_size, static_cast(lengths_numel)); + + // Store pointers in args_tensor + indices_addrs[i] = reinterpret_cast(indices.data_ptr()); + lengths_addrs[i] = reinterpret_cast(lengths.data_ptr()); + indices_is_long_ |= static_cast(indices.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + lengths_is_long_ |= static_cast(lengths.dtype() == c10::kLong) + << (i % IS_LONG_NUM_BITS); + + const auto& weights = per_sample_weights[i]; + if (weights.numel() > 0) { + TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU(weights); + TENSORS_ON_SAME_DEVICE(weights, indices_0); + TENSOR_TYPE_MUST_BE(weights, c10::kFloat); + TENSOR_NDIM_EQUALS(weights, 1); + TENSORS_HAVE_SAME_NUMEL(weights, indices); + + per_sample_weights_addrs[i] = + reinterpret_cast(weights.data_ptr()); + } + } + indices_offsets[num_lists] = total_indices; + lengths_offsets[num_lists] = total_lengths; + + const auto& device = indices_0.device(); + // Transfer args_tensor from host to device + args_tensor = args_tensor.to(device, /*non_blocking=*/true); + + // Offset device pointers + offset_tbe_input_combine_with_length_args( + &indices_addrs, + &lengths_addrs, + &indices_offsets, + 
&lengths_offsets, + &per_sample_weights_addrs, + &indices_is_long, + &lengths_is_long, + reinterpret_cast(args_tensor.data_ptr()), + args_offsets, + need_weights); + + return tbe_input_combine_with_length_cuda( + indices_addrs, + lengths_addrs, + per_sample_weights_addrs, + indices_is_long, + lengths_is_long, + indices_offsets, + lengths_offsets, + num_lists, + total_indices, + total_lengths, + max_list_size, + device.index()); +} + +TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) { + DISPATCH_TO_CUDA( + "tbe_input_combine_with_length", + fbgemm_gpu::tbe_input_combine_with_length_gpu); +}; + +} // namespace fbgemm_gpu diff --git a/fbgemm_gpu/test/input_combine_test.py b/fbgemm_gpu/test/input_combine_test.py index 74f7581576..07102aec90 100644 --- a/fbgemm_gpu/test/input_combine_test.py +++ b/fbgemm_gpu/test/input_combine_test.py @@ -11,12 +11,20 @@ from typing import List, Optional, Tuple import torch +from hypothesis import given, settings try: # pyre-ignore[21] from fbgemm_gpu import open_source # noqa: F401 + + # pyre-ignore[21] + from test_utils import cpu_and_maybe_gpu except Exception: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine") torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:input_combine_cpu") + from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu + +DEFAULT_DEVICE = torch.device("cpu") class TBEInputPrepareReference(torch.nn.Module): @@ -120,23 +128,23 @@ def forward( class InputCombineTest(unittest.TestCase): - def _get_inputs(self, dtypes): + def _get_inputs(self, dtypes, device=DEFAULT_DEVICE): indices_list = [ - torch.tensor([1, 2, 3], dtype=dtypes[0]), - torch.tensor([1, 2, 3, 4], dtype=dtypes[1]), + torch.tensor([1, 2, 3], dtype=dtypes[0], device=device), + torch.tensor([1, 2, 3, 4], dtype=dtypes[1], device=device), ] offsets_list = [ - torch.tensor([0, 2], dtype=dtypes[0]), - torch.tensor([0, 1, 4], dtype=dtypes[1]), + torch.tensor([0, 2], dtype=dtypes[0], device=device), + torch.tensor([0, 1, 4], dtype=dtypes[1], device=device), ] include_last_offsets = [False, True] per_sample_weights = [ - torch.tensor([1, 2, 1], dtype=torch.float), - torch.tensor([1, 2, 1, 3], dtype=torch.float), + torch.tensor([1, 2, 1], dtype=torch.float, device=device), + torch.tensor([1, 2, 1, 3], dtype=torch.float, device=device), ] empty_per_sample_weights = [ - torch.tensor([], dtype=torch.float), - torch.tensor([], dtype=torch.float), + torch.tensor([], dtype=torch.float, device=device), + torch.tensor([], dtype=torch.float, device=device), ] return ( indices_list, @@ -226,27 +234,34 @@ def _run_padding_fused_test(self, dtypes, batch_size) -> None: self.assertTrue(outputs[1].dtype == torch.int32) self.assertTrue(outputs[-1].size(0) == 0) - def _offsets_to_lengths(self, offsets, indices, include_last_offsets): + def _offsets_to_lengths( + self, offsets, indices, include_last_offsets, device=DEFAULT_DEVICE + ): if include_last_offsets: offsets_complete = offsets else: offsets_complete = torch.cat( - [offsets, torch.tensor([indices.numel()], dtype=offsets.dtype)] + [ + offsets, + torch.tensor([indices.numel()], dtype=offsets.dtype, device=device), + ] ) return offsets_complete[1:] - offsets_complete[:-1] - def _run_test_with_length(self, dtypes) -> None: + def _run_test_with_length(self, dtypes, device=DEFAULT_DEVICE) -> None: ( indices_list, offsets_list, per_sample_weights, empty_per_sample_weights, include_last_offsets, - ) = self._get_inputs(dtypes) + ) = self._get_inputs(dtypes, device=device) ref_mod = TBEInputPrepareReference(include_last_offsets) lengths_list 
= [ - self._offsets_to_lengths(offsets, indices, include_last_offsets) + self._offsets_to_lengths( + offsets, indices, include_last_offsets, device=device + ) for offsets, indices, include_last_offsets in zip( offsets_list, indices_list, include_last_offsets ) @@ -307,14 +322,20 @@ def test_input_combine_int32(self) -> None: def test_input_combined_mix(self) -> None: self._run_test((torch.int64, torch.int32)) - def test_input_combine_int64_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int64_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int64), device=device) - def test_input_combine_int32_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int64)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_int32_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int32, torch.int32), device=device) - def test_input_combined_mix_with_length(self) -> None: - self._run_test_with_length((torch.int64, torch.int32)) + @given(device=cpu_and_maybe_gpu()) + @settings(deadline=None) + def test_input_combine_mix_with_length(self, device: torch.device) -> None: + self._run_test_with_length((torch.int64, torch.int32), device=device) def test_padding_fused_input_combine_int64(self) -> None: self._run_padding_fused_test((torch.int64, torch.int64), 64) From c960b4595bdb52d3fc3b5ca02a976705fb48132f Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 22 Mar 2023 22:47:28 -0700 Subject: [PATCH 19/34] jagged_jagged_bmm operator optimization (#1644) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1644 This diff optimizes the jagged_jagged_bmm operator using tiling across thread blocks and GPU shared memory. 
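For reference, below is a minimal pure-PyTorch sketch of the semantics the tiled kernel implements: for each batch b, the output is X_b^T @ Y_b computed over that batch's jagged segment, truncated to max_L. The function name and variable names here are ours, inferred from the kernel in the diff that follows:

```
import torch

def jagged_jagged_bmm_ref(x_values, y_values, offsets, max_L):
    # x_values: (total_L, M), y_values: (total_L, N), offsets: (B + 1,)
    B = offsets.numel() - 1
    M, N = x_values.size(1), y_values.size(1)
    out = torch.zeros(B, M, N, dtype=x_values.dtype, device=x_values.device)
    for b in range(B):
        start, end = int(offsets[b]), int(offsets[b + 1])
        L = min(end - start, max_L)  # each jagged segment is truncated to max_L
        if L > 0:
            # (M, L) @ (L, N) -> (M, N) for this batch's segment
            out[b] = x_values[start:start + L].t() @ y_values[start:start + L]
    return out
```

The tiled kernel computes the same result, but stages BLOCK_SIZE x BLOCK_SIZE tiles of X_b and Y_b in shared memory so that each element fetched from global memory is reused across a whole tile of output elements.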
Reviewed By: brad-mengchi Differential Revision: D44029528 fbshipit-source-id: fa5cd5a26893f935427bce5efb7dfcc731c3f47d --- fbgemm_gpu/src/jagged_tensor_ops.cu | 85 +++++++++++++++++++---------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 0282fa9f19..e646d28be2 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1986,7 +1986,7 @@ Tensor jagged_softmax_backward( return grad_input; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const at::PackedTensorAccessor32 x_values, const at::PackedTensorAccessor32 y_values, @@ -1997,30 +1997,53 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_jagged_bmm_kernel( const int M = x_values.size(1); const int N = y_values.size(1); - const int b_m_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_m_step = gridDim.x * blockDim.y; - for (int b_m = b_m_begin; b_m < B * M; b_m += b_m_step) { - const int b = b_m / M; - const int m = b_m % M; + const auto block_row = blockIdx.y; + const auto block_col = blockIdx.x; + const auto row = threadIdx.y; + const auto col = threadIdx.x; + __shared__ scalar_t Xs[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ scalar_t Ys[BLOCK_SIZE][BLOCK_SIZE]; - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length == 0) { - for (int n = threadIdx.x; n < N; n += blockDim.x) { - output[b][m][n] = 0; + for (uint32_t b = blockIdx.z; b < B; b += gridDim.z) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + auto num_l_blocks = (length + BLOCK_SIZE - 1) / BLOCK_SIZE; + + at::acc_type acc = 0; + + const auto row_offset = block_row * BLOCK_SIZE + row; + const auto col_offset = block_col * BLOCK_SIZE + col; + + // for loop block tile in length dimension + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + Xs[row][col] = 0; + Ys[row][col] = 0; + const auto bk_offset = bk_l * BLOCK_SIZE; + + // load data from global memory to shared memory + const auto l_x = bk_offset + col; + if (row_offset < M && l_x < length) { + Xs[row][col] = x_values[row_start + l_x][row_offset]; } - } else { - // TODO: use shared memory and better reduction - for (int n = threadIdx.x; n < N; n += blockDim.x) { - at::acc_type acc = - x_values[row_start][m] * y_values[row_start][n]; - for (int l = 1; l < length; ++l) { - acc += x_values[row_start + l][m] * y_values[row_start + l][n]; - } - output[b][m][n] = acc; + + const auto l_y = bk_offset + row; + if (l_y < length && col_offset < N) { + Ys[row][col] = y_values[row_start + l_y][col_offset]; } + + __syncthreads(); + +#pragma unroll + for (auto e = 0; e < BLOCK_SIZE; e++) { + acc += Xs[row][e] * Ys[e][col]; + } + __syncthreads(); } + + // write the result to the output + if ((row_offset < M) && (col_offset < N)) + output[b][row_offset][col_offset] = acc; } } @@ -2042,9 +2065,16 @@ Tensor jagged_jagged_bmm_forward( auto output = at::zeros({B, M, N}, x_values.options()); if (B > 0 && M > 0 && N > 0) { - const int block_dim_x = - std::min(div_round_up(N, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int BLOCK_SIZE = 16; + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const auto grid_dim_x = div_round_up(N, BLOCK_SIZE); + const auto grid_dim_y = div_round_up(M, BLOCK_SIZE); + TORCH_CHECK( + grid_dim_y 
<= kMaxBlockYDim, + "M cannot be larger than", + grid_dim_y * BLOCK_SIZE + 1 - BLOCK_SIZE); + const auto grid_dim_z = std::min(B, kMaxBlockZDim); + const dim3 grid(grid_dim_x, grid_dim_y, grid_dim_z); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] { @@ -2054,11 +2084,8 @@ Tensor jagged_jagged_bmm_forward( x_values.scalar_type(), "jagged_jagged_bmm_kernel_2", [&] { - jagged_jagged_bmm_kernel - <<>>( + jagged_jagged_bmm_kernel + <<>>( x_values.packed_accessor32(), y_values.packed_accessor32(), offsets.packed_accessor32(), From edc23dc13c261f4d296788f86cc7e7f3311762b7 Mon Sep 17 00:00:00 2001 From: Doe Hyun Yoon Date: Thu, 23 Mar 2023 14:40:53 -0700 Subject: [PATCH 20/34] Specify device to emulate_cache_miss kernel (#1660) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1660 When emulate cache miss was enabled, it caused an illegal memory access if more than one GPU was in use. It turns out that the previous diff didn't specify the device within the emulate_cache_miss kernel. This diff fixes it. In addition, this diff cleans things up a bit (e.g., there is no need to use an index_t-based kernel launch for the emulate_cache_miss kernel, as lxu_cache_locations is always int32_t). Reviewed By: sryap, YuzeDaiMeta Differential Revision: D44340131 fbshipit-source-id: d99ba2364e9030cbca6c1166e578d24d99646bb1 --- fbgemm_gpu/src/split_embeddings_cache_cuda.cu | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu index e5930ab745..513f32cf8e 100644 --- a/fbgemm_gpu/src/split_embeddings_cache_cuda.cu +++ b/fbgemm_gpu/src/split_embeddings_cache_cuda.cu @@ -507,9 +507,8 @@ std::tuple> get_unique_indices_cuda( namespace { -template __global__ __launch_bounds__(kMaxThreads) void emulate_cache_miss_kernel( - at::PackedTensorAccessor32 + at::PackedTensorAccessor32 lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, @@ -541,8 +540,11 @@ Tensor emulate_cache_miss( TENSOR_ON_CUDA_GPU(lxu_cache_locations); TENSOR_ON_CUDA_GPU(uvm_cache_stats); + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(lxu_cache_locations.get_device()); + const auto N = lxu_cache_locations.numel(); - if (lxu_cache_locations.numel() == 0) { + if (N == 0) { // nothing to do return lxu_cache_locations; } @@ -551,21 +553,17 @@ Tensor emulate_cache_miss( div_round_up(N, kMaxThreads), get_max_thread_blocks_for_cache_kernels_())); - AT_DISPATCH_INDEX_TYPES( - lxu_cache_locations.scalar_type(), "emulate_cache_miss", [&] { - emulate_cache_miss_kernel<<< - blocks, - kMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - lxu_cache_locations - .packed_accessor32(), - enforced_misses_per_256, - gather_cache_stats, - uvm_cache_stats - .packed_accessor32()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); + emulate_cache_miss_kernel<<< + blocks, + kMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + lxu_cache_locations + .packed_accessor32(), + enforced_misses_per_256, + gather_cache_stats, + uvm_cache_stats.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return lxu_cache_locations; } From d62b5cf5a311578fd47485b9fe3cbb6e66640e19 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Fri, 24 Mar 2023 12:33:10 -0700 Subject: [PATCH 21/34] Add C++17 Support to FBGEMM and FBGEMM_GPU OSS builds (#1652) Summary: - Add C++17 support for the entire FBGEMM_GPU build - Add C++17 support for the entire FBGEMM build - Update FBGEMM tests and benchmarks to be
C++17-compatible - Make FBGEMM builds output more logging - Cherry-pick code changes from D43776442 v4 now that C++17 is fully supported Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1652 Reviewed By: shintaro-iwasaki Differential Revision: D44287321 Pulled By: q10 fbshipit-source-id: 4bf2bcf66d528939865d42b6deafc470bee55d17 --- .bazelrc | 48 +++++++++++++++ .github/scripts/setup_env.bash | 41 +++++++++++++ .github/workflows/fbgemm_ci.yml | 38 +++++++----- BUILD.bazel | 20 +++---- CMakeLists.txt | 35 ++++++----- WORKSPACE.bazel | 4 +- bench/CMakeLists.txt | 15 +++-- bench/EmbeddingSpMDM8BitBenchmark.cc | 2 +- bench/EmbeddingSpMDMBenchmark.cc | 2 +- bench/EmbeddingSpMDMNBitBenchmark.cc | 2 +- ...mbeddingSpMDMNBitRowWiseSparseBenchmark.cc | 2 +- bench/RowwiseAdagradFusedBenchmark.cc | 2 +- fbgemm_gpu/CMakeLists.txt | 59 +++++++++++-------- fbgemm_gpu/docs/BuildInstructions.md | 19 +++--- include/fbgemm/Types.h | 12 ++-- test/CMakeLists.txt | 19 +++--- third_party/asmjit.BUILD | 2 - 17 files changed, 226 insertions(+), 96 deletions(-) create mode 100644 .bazelrc diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 0000000000..1e5dbcfcb7 --- /dev/null +++ b/.bazelrc @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +################################################################################ +# FBGEMM Bazel configuration file +# +# Based on MozoLM build options: +# https://github.com/google-research/mozolm/blob/main/.bazelrc +# +# Documentation for Bazel configuration options can be found in: +# https://bazel.build/reference/command-line-reference +################################################################################ + +# Automatically picks up host-OS-specific config lines from bazelrc files +# Enabling this is equivalent to auto-calling --config=linux on Linux, --config=windows, etc +build --enable_platform_specific_config + +# Print logs for all tests +test --test_output=all + +# Build with verbose logging +build --verbose_explanations --verbose_failures +test --verbose_explanations --verbose_failures + +# Build with optimization mode turned on +build --compilation_mode opt +test --compilation_mode opt + +# Build FBGEMM with C17 and C++17 +build:linux --cxxopt=-std=c++17 +build:linux --host_cxxopt=-std=c++17 +build:linux --conlyopt=-std=c17 +build:linux --host_conlyopt=-std=c17 +build:macos --cxxopt=-std=c++17 +build:macos --host_cxxopt=-std=c++17 +build:macos --conlyopt=-std=c17 +build:macos --host_conlyopt=-std=c17 +build:windows --cxxopt=/std:c++17 +build:windows --host_cxxopt=/std:c++17 +build:windows --conlyopt=/std:c17 +build:windows --host_conlyopt=/std:c17 + +# Generation of `runfiles` directories on Windows has to be explicitly enabled. +# See https://github.com/bazelbuild/bazel/issues/8843. 
+build:windows --enable_runfiles +test:windows --enable_runfiles diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 8329d661cc..f998bdba3f 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -364,6 +364,38 @@ print_glibc_info () { } +################################################################################ +# Bazel Setup Functions +################################################################################ + +setup_bazel () { + echo "################################################################################" + echo "# Setup Bazel" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + local bazel_version="6.1.1" + + if [[ $OSTYPE == 'darwin'* ]]; then + local bazel_variant="darwin-$(uname -m)" + else + local bazel_variant="linux-x86_64" + fi + + echo "[SETUP] Downloading installer Bazel ${bazel_version} (${bazel_variant}) ..." + print_exec wget -q "https://github.com/bazelbuild/bazel/releases/download/${bazel_version}/bazel-${bazel_version}-installer-${bazel_variant}.sh" -O install-bazel.sh + + echo "[SETUP] Installing Bazel ..." + print_exec bash install-bazel.sh + print_exec rm -f install-bazel.sh + + print_exec bazel --version + echo "[SETUP] Successfully set up Bazel" +} + + ################################################################################ # Miniconda Setup Functions ################################################################################ @@ -915,6 +947,15 @@ install_cxx_compiler () { # Print out the C++ version print_exec conda run -n "${env_name}" c++ --version + + # https://stackoverflow.com/questions/2324658/how-to-determine-the-version-of-the-c-standard-used-by-the-compiler + echo "[INSTALL] Printing the default version of the C++ standard used by the compiler ..." + print_exec conda run -n "${env_name}" c++ -x c++ /dev/null -E -dM | grep __cplusplus + + # https://stackoverflow.com/questions/4991707/how-to-find-my-current-compilers-standard-like-if-it-is-c90-etc + echo "[INSTALL] Printing the default version of the C standard used by the compiler ..." + print_exec conda run -n "${env_name}" cc -dM -E - < /dev/null | grep __STDC_VERSION__ + echo "[INSTALL] Successfully installed C/C++ compilers" } diff --git a/.github/workflows/fbgemm_ci.yml b/.github/workflows/fbgemm_ci.yml index 9b18dfb884..79561102af 100644 --- a/.github/workflows/fbgemm_ci.yml +++ b/.github/workflows/fbgemm_ci.yml @@ -56,8 +56,9 @@ jobs: run: | set -e mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DPYTHON_EXECUTABLE=/usr/bin/python3 .. - make -j + make -j VERBOSE=1 - name: Test FBGEMM Library (${{ matrix.library-type }}) run: | @@ -94,23 +95,34 @@ jobs: run: | set -e mkdir $BUILD_DIR; cd $BUILD_DIR + cmake --version cmake -DUSE_SANITIZER=address -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} .. 
- make -j + make -j VERBOSE=1 build-bazel: - runs-on: ${{ matrix.os }} + runs-on: linux.12xlarge + container: + image: ${{ matrix.container-image }} + options: --user root defaults: run: shell: bash env: PRELUDE: .github/scripts/setup_env.bash + DEBIAN_FRONTEND: noninteractive strategy: fail-fast: false matrix: - os: [ ubuntu-latest ] + container-image: [ "ubuntu:20.04" ] steps: + - name: Setup Build Container + run: | + apt update -y + apt install -y binutils build-essential cmake git libblas-dev python3 sudo unzip wget + git config --global --add safe.directory '*' + - name: Checkout the Repository uses: actions/checkout@v3 with: @@ -120,18 +132,13 @@ jobs: run: . $PRELUDE; print_system_info - name: Download bazel - run: | - set -e - wget https://github.com/bazelbuild/bazel/releases/download/2.2.0/bazel-2.2.0-linux-x86_64 -O bazel - # verify content - echo 'b2f002ea0e6194a181af6ac84cd94bd8dc797722eb2354690bebac92dda233ff bazel' | sha256sum --quiet -c - chmod +x bazel + run: . $PRELUDE; setup_bazel - - name: Build FBGEMM with bazel - run: ./bazel build --verbose_explanations --verbose_failures --compilation_mode opt :* + - name: Build FBGEMM Library + run: bazel build -s :* - - name: Test FBGEMM bazel build - run: ./bazel test --test_output=all --verbose_explanations --verbose_failures --compilation_mode opt :* + - name: Test FBGEMM Library + run: bazel test -s :* build-windows: @@ -168,8 +175,9 @@ jobs: mkdir %BUILD_DIR% cd %BUILD_DIR% echo "STARTING CMAKE" + cmake --version cmake -G Ninja -DFBGEMM_BUILD_BENCHMARKS=OFF -DFBGEMM_LIBRARY_TYPE=${{ matrix.library-type }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER="cl.exe" -DCMAKE_CXX_COMPILER="cl.exe" .. - ninja all + ninja -v all echo "Build Success" - name: Test FBGEMM Library (${{ matrix.library-type }}) diff --git a/BUILD.bazel b/BUILD.bazel index e998487255..12e05c4522 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -159,14 +159,14 @@ cc_library( ) [ - cc_test( - name = paths.split_extension(paths.basename(filename))[0], - size = "medium", - srcs = [ - filename, - ], - deps = [ - ":test_utils", - ], - ) for filename in get_fbgemm_tests() + cc_test( + name = paths.split_extension(paths.basename(filename))[0], + size = "medium", + srcs = [ + filename, + ], + deps = [ + ":test_utils", + ], + ) for filename in get_fbgemm_tests() ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 58dcb9aeb0..32920d1d48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,19 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) + +# Set the default C standard to C11 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") @@ -114,17 +129,11 @@ add_dependencies(fbgemm_generic defs.bzl) add_dependencies(fbgemm_avx2 defs.bzl) add_dependencies(fbgemm_avx512 defs.bzl) -set_target_properties(fbgemm_generic fbgemm_avx2 fbgemm_avx512 PROPERTIES - CXX_STANDARD 14 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO - CXX_VISIBILITY_PRESET hidden) - -#On Windows: -#1) Adding definition of 
ASMJIT_STATIC to avoid generating asmjit function -#calls with _dllimport attribute -#2) MSVC uses /MD in default cxx compiling flags, -#need to change it to /MT in static case +# On Windows: +# 1) Adding definition of ASMJIT_STATIC to avoid generating asmjit function +# calls with _dllimport attribute +# 2) MSVC uses /MD in default cxx compiling flags, +# Need to change it to /MT in static case if(MSVC) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4305 /wd4309") if(FBGEMM_LIBRARY_TYPE STREQUAL "static") @@ -267,8 +276,6 @@ elseif(FBGEMM_LIBRARY_TYPE STREQUAL "shared") set_property(TARGET fbgemm_generic PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx2 PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET fbgemm_avx512 PROPERTY POSITION_INDEPENDENT_CODE ON) - set_target_properties(fbgemm PROPERTIES - CXX_VISIBILITY_PRESET hidden) elseif(FBGEMM_LIBRARY_TYPE STREQUAL "static") add_library(fbgemm STATIC $ diff --git a/WORKSPACE.bazel b/WORKSPACE.bazel index 30b1a80424..aff61b2b94 100644 --- a/WORKSPACE.bazel +++ b/WORKSPACE.bazel @@ -16,9 +16,9 @@ http_archive( http_archive( name = "com_google_googletest", - strip_prefix = "googletest-cd6b9ae3243985d4dc725abd513a874ab4161f3e", + strip_prefix = "googletest-1.13.0", urls = [ - "https://github.com/google/googletest/archive/cd6b9ae3243985d4dc725abd513a874ab4161f3e.tar.gz", + "https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz", ], ) diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index b4fad7510a..49f9e38fa2 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) find_package(MKL) if (NOT ${MKL_FOUND}) @@ -21,15 +29,12 @@ if (${BLAS_FOUND}) message(STATUS "BLAS_LIBRARIES= ${BLAS_LIBRARIES}") endif() -#benchmarks +# Benchmarks macro(add_benchmark BENCHNAME) add_executable(${BENCHNAME} ${ARGN} BenchUtils.cc ../test/QuantizationHelpers.cc ../test/EmbeddingSpMDMTestUtils.cc) - set_target_properties(${BENCHNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) target_compile_options(${BENCHNAME} PRIVATE "-m64" "-mavx2" "-mfma" "-masm=intel") target_link_libraries(${BENCHNAME} fbgemm) diff --git a/bench/EmbeddingSpMDM8BitBenchmark.cc b/bench/EmbeddingSpMDM8BitBenchmark.cc index 1fcf4607de..17934b6101 100644 --- a/bench/EmbeddingSpMDM8BitBenchmark.cc +++ b/bench/EmbeddingSpMDM8BitBenchmark.cc @@ -111,7 +111,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMBenchmark.cc b/bench/EmbeddingSpMDMBenchmark.cc index b987586aac..246549f6a7 100644 --- a/bench/EmbeddingSpMDMBenchmark.cc +++ b/bench/EmbeddingSpMDMBenchmark.cc @@ -104,7 +104,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), 
container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc index ed5485ae29..fff665babb 100644 --- a/bench/EmbeddingSpMDMNBitBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitBenchmark.cc @@ -116,7 +116,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc index d1b28f54b5..c50500768d 100644 --- a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc +++ b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc @@ -131,7 +131,7 @@ int run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/bench/RowwiseAdagradFusedBenchmark.cc b/bench/RowwiseAdagradFusedBenchmark.cc index 6f1203e6ab..a0524afaa5 100644 --- a/bench/RowwiseAdagradFusedBenchmark.cc +++ b/bench/RowwiseAdagradFusedBenchmark.cc @@ -90,7 +90,7 @@ void run_benchmark( // please note we generate unique indices for (int i = 0; i < batch_size; ++i) { iota(container.begin(), container.end(), 0); - random_shuffle(container.begin(), container.end()); + shuffle(container.begin(), container.end(), generator); copy( container.begin(), container.begin() + (offsets[i + 1] - offsets[i]), diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 51348505c4..1fb8f397e0 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -1,15 +1,34 @@ -cmake_minimum_required(VERSION 3.11.0 FATAL_ERROR) - -option(FBGEMM_CPU_ONLY "Build fbgemm_gpu without GPU support" OFF) - -set(message_line - "-------------------------------------------------------------") -message("${message_line}") +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +# Set the default C++ standard to C++17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/CXX_STANDARD.html +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Set the default C standard to C17 +# Individual targets can have this value overridden; see +# https://cmake.org/cmake/help/latest/prop_tgt/C_STANDARD.html +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) + +function(BLOCK_PRINT) + message("================================================================================") + foreach(ARG IN LISTS ARGN) + message("${ARG}") + endforeach() + message("================================================================================") + message("") +endfunction() if(SKBUILD) - message("The project is built using scikit-build") + BLOCK_PRINT("The project is built using scikit-build") endif() +# Build options +option(FBGEMM_CPU_ONLY "Build FBGEMM_GPU without GPU support" OFF) option(USE_CUDA "Use CUDA" ON) option(USE_ROCM "Use ROCm" OFF) @@ -21,11 +40,10 @@ if(((EXISTS "/opt/rocm/") OR (EXISTS $ENV{ROCM_PATH})) endif() if(FBGEMM_CPU_ONLY) - message("Building for CPU-only") + BLOCK_PRINT("Building the CPU-only variant 
of FBGEMM-GPU") endif() -message("${message_line}") -message(STATUS "USE_ROCM ${USE_ROCM}") +BLOCK_PRINT("USE_ROCM: ${USE_ROCM}") if(FBGEMM_CPU_ONLY OR USE_ROCM) project( @@ -46,12 +64,16 @@ set(THIRDPARTY ${FBGEMM}/third_party) if(DEFINED GLIBCXX_USE_CXX11_ABI) if(${GLIBCXX_USE_CXX11_ABI} EQUAL 1) - set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() - message("${CMAKE_CXX_FLAGS}") + BLOCK_PRINT( + "Default C++ compiler flags" + "(values may be overridden by CMAKE_CXX_STANDARD and CXX_STANDARD):" + "" + "${CMAKE_CXX_FLAGS}" + ) endif() # @@ -72,8 +94,7 @@ if(USE_ROCM) include(Hip) include(Hipify) - message("${message_line}") - message(STATUS "hip found ${HIP_FOUND}") + BLOCK_PRINT("HIP found: ${HIP_FOUND}") endif() # @@ -414,13 +435,6 @@ if(USE_ROCM) else() add_library(fbgemm_gpu_py MODULE ${fbgemm_gpu_sources} ${gen_source_files} ${cpp_asmjit_files} ${cpp_fbgemm_files}) - set_property(TARGET fbgemm_gpu_py PROPERTY CUDA_ARCHITECTURES - "${cuda_architectures}") - - # FBGEMM_CUB_USE_NAMESPACE will cause compilation errors on CUB for CUDA 12+ - # if(NOT FBGEMM_CPU_ONLY) - # target_compile_definitions(fbgemm_gpu_py PRIVATE FBGEMM_CUB_USE_NAMESPACE) - # endif() endif() set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") @@ -430,7 +444,6 @@ if(NVML_LIB_PATH) target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) endif() target_include_directories(fbgemm_gpu_py PRIVATE ${TORCH_INCLUDE_DIRS}) -set_property(TARGET fbgemm_gpu_py PROPERTY CXX_STANDARD 17) install(TARGETS fbgemm_gpu_py DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/docs/BuildInstructions.md b/fbgemm_gpu/docs/BuildInstructions.md index c50bd50d03..4f2c9c142b 100644 --- a/fbgemm_gpu/docs/BuildInstructions.md +++ b/fbgemm_gpu/docs/BuildInstructions.md @@ -66,18 +66,23 @@ will also need to be installed to avoid issues with missing versioned symbols when compiling FBGEMM_CPU: ```sh -conda install -n "${env_name}" -y gxx_linux-64=9.3.0 +conda install -n "${env_name}" -y gxx_linux-64=10.4.0 sysroot_linux-64=2.17 -c conda-forge ``` -Note that while newer versions of GCC can be used, binaries compiled under newer -versions of GCC will not be compatible with older systems such as Ubuntu 20.04 -or CentOS Stream 8, because the compiled library will reference symbols from -versions of `GLIBCXX` that the system's `libstdc++.so.6` will not support. To -see what versions of GLIBCXX the available `libstdc++.so.6` supports: +While newer versions of GCC can be used, binaries compiled under newer versions +of GCC will not be compatible with older systems such as Ubuntu 20.04 or CentOS +Stream 8, because the compiled library will reference symbols from versions of +`GLIBCXX` that the system's `libstdc++.so.6` will not support. 
To see what +versions of GLIBC and GLIBCXX the available `libstdc++.so.6` supports: ```sh libcxx_path=/path/to/libstdc++.so.6 -objdump -TC "${libcxx_path}" | grep GLIBCXX | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat + +# Print supported for GLIBC versions +objdump -TC "${libcxx_path}" | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/GLIBC_\1/g' | sort -Vu | cat + +# Print supported for GLIBCXX versions +objdump -TC "${libcxx_path}" | grep GLIBCXX_ | sed 's/.*GLIBCXX_\([.0-9]*\).*/GLIBCXX_\1/g' | sort -Vu | cat ``` ### Other Build Tools diff --git a/include/fbgemm/Types.h b/include/fbgemm/Types.h index e5daa28d8b..e7d8278464 100644 --- a/include/fbgemm/Types.h +++ b/include/fbgemm/Types.h @@ -27,14 +27,14 @@ constexpr uint32_t f16_num_exponent_bits = 5; constexpr uint32_t f16_num_mantissa_bits = 10; constexpr uint32_t f16_num_non_sign_bits = f16_num_exponent_bits + f16_num_mantissa_bits; -constexpr uint32_t f16_exponent_mask = 0x1F; // 5 bits +constexpr uint32_t f16_exponent_mask = 0b1'1111; // 5 bits constexpr uint32_t f16_sign_bit = 1u << (f16_num_exponent_bits + f16_num_mantissa_bits); constexpr uint32_t f16_exponent_bits = f16_exponent_mask << f16_num_mantissa_bits; -constexpr uint32_t f16_mantissa_mask = 0x3FF; // 10 bits +constexpr uint32_t f16_mantissa_mask = 0b11'1111'1111; // 10 bits constexpr uint32_t f16_exponent_bias = 15; -constexpr uint32_t f16_nan = 0x7FFF; +constexpr uint32_t f16_nan = 0x7F'FF; // The IEEE754 standard specifies a binary32 as having: // SEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMM @@ -44,10 +44,10 @@ constexpr uint32_t f16_nan = 0x7FFF; // * 23 mantissa/significand bits (a 24th bit is implicit) constexpr uint32_t f32_num_exponent_bits = 8; constexpr uint32_t f32_num_mantissa_bits = 23; -constexpr uint32_t f32_exponent_mask = 0xFF; // 8 bits -constexpr uint32_t f32_mantissa_mask = 0x7FFFFF; // 23 bits +constexpr uint32_t f32_exponent_mask = 0b1111'1111; // 8 bits +constexpr uint32_t f32_mantissa_mask = 0x7F'FF'FF; // 23 bits constexpr uint32_t f32_exponent_bias = 127; -constexpr uint32_t f32_all_non_sign_mask = 0x7FFFFFFF; // 31 bits +constexpr uint32_t f32_all_non_sign_mask = 0x7F'FF'FF'FF; // 31 bits constexpr uint32_t f32_most_significant_bit = 1u << 22; // Turn on 23rd bit constexpr uint32_t f32_num_non_sign_bits = f32_num_exponent_bits + f32_num_mantissa_bits; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a30735354a..1e996256bf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,12 @@ -cmake_minimum_required(VERSION 3.5 FATAL_ERROR) +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_C_STANDARD_REQUIRED ON) if(FBGEMM_BUILD_TESTS AND NOT TARGET gtest) #Download Googletest framework from github if @@ -38,12 +46,9 @@ macro(add_gtest TESTNAME) EmbeddingSpMDMTestUtils.cc QuantizationHelpers.cc TestUtils.cc) - set_target_properties(${TESTNAME} PROPERTIES - CXX_STANDARD 11 - CXX_EXTENSIONS NO) - #To compile test files with AVX2 turned on - #For static build, defining FBGEMM_STATIC to avoid generating - #functions with _dllimport attributes. + # To compile test files with AVX2 turned on + # For static build, defining FBGEMM_STATIC to avoid generating + # functions with _dllimport attributes. 
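Two of the mechanical C++17-era changes in this patch are easy to verify in isolation: the benchmarks swap the removed std::random_shuffle for std::shuffle with an explicit engine, and Types.h rewrites its masks with binary literals and digit separators (both C++14 features, usable project-wide now that the baseline is C++17). A condensed, compilable sketch of both, for illustration only:

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// The separator-based literals are bit-for-bit identical to the old hex forms.
static_assert(0b1'1111 == 0x1F, "f16 exponent mask unchanged");
static_assert(0b11'1111'1111 == 0x3FF, "f16 mantissa mask unchanged");
static_assert(0x7F'FF == 0x7FFF, "f16 NaN pattern unchanged");
static_assert(0x7F'FF'FF == 0x7FFFFF, "f32 mantissa mask unchanged");
static_assert(0x7F'FF'FF'FF == 0x7FFFFFFF, "f32 non-sign mask unchanged");

int main() {
  // std::random_shuffle was removed in C++17; std::shuffle takes the engine
  // explicitly, which also makes the benchmarks' index shuffling seedable.
  std::vector<int> container(256);
  std::iota(container.begin(), container.end(), 0);
  std::mt19937 generator(1234);
  std::shuffle(container.begin(), container.end(), generator);
  return 0;
}
```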
if(MSVC)
     target_compile_options(${TESTNAME}
       PRIVATE "/arch:AVX2" "/wd4244" "/wd4267" "/wd4305" "/wd4309")
diff --git a/third_party/asmjit.BUILD b/third_party/asmjit.BUILD
index 71dc5c7e6c..c2764a97c4 100644
--- a/third_party/asmjit.BUILD
+++ b/third_party/asmjit.BUILD
@@ -16,9 +16,7 @@ cc_library(
     copts = [
         "-DASMJIT_STATIC",
         "-fno-tree-vectorize",
-        "-std=c++17",
         "-fmerge-all-constants",
-        "-std=gnu++11",
         "-DTH_BLAS_MKL",
     ],
     includes = [

From 277677039bae25b2570a73013b03bfaa9d2a523e Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Mon, 27 Mar 2023 09:10:01 -0700
Subject: [PATCH 22/34] Prune CPU/GPU TBE optimizer codegen (#1659)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1659

This diff aims to reduce the build time and library size of `//deeplearning/fbgemm/fbgemm_gpu/codegen:embedding_ops`.

The diff modifies the build target to generate and compile only the necessary files. This is based on the fact that CPU and GPU do not support all optimizers in `SplitTBE`. (Before this diff, all optimizers were generated and compiled for both CPU and GPU.)

The following is the list of supported optimizers:

|OptimType|Generated optimizer|Supported on CPU|Supported on GPU|
|---|---|---|---|
|EXACT_ADAGRAD|adagrad|x|x|
|EXACT_ROWWISE_ADAGRAD|rowwise_adagrad_with_counter|x|x|
||rowwise_adagrad|x|x|
|EXACT_ROWWISE_WEIGHTED_ADAGRAD|rowwise_weighted_adagrad|x|x|
|EXACT_SGD|sgd|x|x|
|SGD|approx_sgd|x|x|
|ROWWISE_ADAGRAD|approx_rowwise_adagrad_with_counter|x||
||approx_rowwise_adagrad|x||
|ADAM|adam||x|
|LAMB|lamb||x|
|LARS_SGD|lars_sgd||x|
|PARTIAL_ROWWISE_ADAM|partial_rowwise_adam||x|
|PARTIAL_ROWWISE_LAMB|partial_rowwise_lamb||x|
|-|rowwise_adagrad_with_weight_decay|||
|-|approx_rowwise_adagrad_with_weight_decay|||

Note: x = supported

Reviewed By: jianyuh

Differential Revision: D44326540

fbshipit-source-id: 02413256b4a675f13ada8e8820820cb5112cb405
---
 fbgemm_gpu/CMakeLists.txt                     |  38 +--
 .../embedding_backward_code_generator.py      | 109 ++++++---
 ..._embedding_codegen_lookup_invoker.template | 224 +++++++++---------
 3 files changed, 215 insertions(+), 156 deletions(-)

diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 1fb8f397e0..b30bc1eab4 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -103,21 +103,27 @@ endif()

 set(OPTIMIZERS
     adagrad
-    adam
-    approx_rowwise_adagrad
-    approx_rowwise_adagrad_with_weight_decay
-    approx_rowwise_adagrad_with_counter
     approx_sgd
-    lamb
-    lars_sgd
-    partial_rowwise_adam
-    partial_rowwise_lamb
     rowwise_adagrad
-    rowwise_adagrad_with_weight_decay
     rowwise_adagrad_with_counter
     rowwise_weighted_adagrad
     sgd)

+set(CPU_ONLY_OPTIMIZERS
+    approx_rowwise_adagrad
+    approx_rowwise_adagrad_with_counter)
+
+set(GPU_ONLY_OPTIMIZERS
+    adam
+    lamb
+    lars_sgd
+    partial_rowwise_adam
+    partial_rowwise_lamb)
+
+set(CPU_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS})
+set(GPU_OPTIMIZERS ${OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS})
+set(ALL_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS})
+
 set(gen_gpu_source_files
     "gen_embedding_forward_dense_weighted_codegen_cuda.cu"
     "gen_embedding_forward_dense_unweighted_codegen_cuda.cu"
@@ -137,16 +143,16 @@ set(gen_cpu_source_files

 set(gen_python_files ${CMAKE_BINARY_DIR}/__init__.py)

-foreach(optimizer ${OPTIMIZERS})
-  list(APPEND gen_gpu_host_source_files
-      "gen_embedding_backward_split_${optimizer}.cpp")
-
+foreach(optimizer ${CPU_OPTIMIZERS})
   list(APPEND gen_cpu_source_files
       "gen_embedding_backward_split_${optimizer}_cpu.cpp")
   list(APPEND gen_cpu_source_files
"gen_embedding_backward_${optimizer}_split_cpu.cpp") +endforeach() - list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") +foreach(optimizer ${GPU_OPTIMIZERS}) + list(APPEND gen_gpu_host_source_files + "gen_embedding_backward_split_${optimizer}.cpp") foreach(weight weighted unweighted) list(APPEND gen_gpu_source_files @@ -154,6 +160,10 @@ foreach(optimizer ${OPTIMIZERS}) endforeach() endforeach() +foreach(optimizer ${ALL_OPTIMIZERS}) + list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") +endforeach() + set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) set(codegen_dependencies diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index fd69a22f6e..aa832947c3 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -127,53 +127,60 @@ def int_arg(name: str, default: int = 0) -> str: def generate(**kwargs: Any) -> None: gen_args = kwargs["args"] - # Generates CUDA variants. kwargs["args"] = gen_args["cuda"] + if kwargs.get("has_gpu_support"): + # Generates CUDA variants. + template = env.get_template("embedding_backward_split_template.cu") + src_cu = template.render(weighted=False, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", + src_cu, + ) + src_cu = template.render(weighted=True, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", + src_cu, + ) + if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_template.cpp") + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp + ) - template = env.get_template("embedding_backward_split_template.cu") - src_cu = template.render(weighted=False, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", - src_cu, - ) - src_cu = template.render(weighted=True, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", - src_cu, - ) if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_template.cpp") - src_cpp = template.render(**kwargs) - write(f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp) - # Generates Python invoker for CUDA + CPU template = env.get_template("split_embedding_codegen_lookup_invoker.template") src_py = template.render(is_fbcode=args.is_fbcode, **kwargs) write(f"lookup_{kwargs.get('optimizer')}.py", src_py) - # Generates CPU variants. - kwargs["args"] = gen_args["cpu"] + if kwargs.get("has_cpu_support"): + # Generates CPU variants. 
+ kwargs["args"] = gen_args["cpu"] - is_approx = "approx" in kwargs.get("optimizer") - template = ( - env.get_template("embedding_backward_split_cpu_approx_template.cpp") - if is_approx - else env.get_template("embedding_backward_split_cpu_template.cpp") - ) - - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", - src_cpp, - ) + is_approx = "approx" in kwargs.get("optimizer") + template = ( + env.get_template("embedding_backward_split_cpu_approx_template.cpp") + if is_approx + else env.get_template("embedding_backward_split_cpu_template.cpp") + ) - if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_cpu_template.cpp") src_cpp = template.render(**kwargs) write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", src_cpp + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", + src_cpp, ) + if not kwargs.get("dense"): + template = env.get_template( + "embedding_backward_split_host_cpu_template.cpp" + ) + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", + src_cpp, + ) + @dataclass class Args: @@ -369,6 +376,8 @@ def adagrad() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -490,6 +499,8 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ @@ -512,6 +523,8 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=False, ) @@ -611,6 +624,9 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + # Disable both CPU and GPU support + has_cpu_support=False, + has_gpu_support=False, ) approx_split_weight_update = """ @@ -633,6 +649,9 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + # Disable both CPU and GPU support + has_cpu_support=False, + has_gpu_support=False, ) @@ -771,6 +790,8 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ @@ -804,6 +825,8 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=False, ) @@ -874,6 +897,8 @@ def rowwise_weighted_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -893,6 +918,8 @@ def sgd() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) approx_split_weight_update = """ 
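The has_cpu_support / has_gpu_support flags threaded through generate() above decide, per optimizer, which backend files get generated at all, and the generated Python invoker (further down in this diff) fails loudly when a CPU-only optimizer is invoked without host weights. As a C++-level analogy only (the real pruning happens in the Python codegen and CMake, not in C++), the same shape of compile-time gating looks like this:

```cpp
// Compile-time gating analogous to the per-optimizer support flags above.
// Illustrative sketch, not FBGEMM code.
#include <stdexcept>
#include <string>

template <bool kHasCpu, bool kHasGpu>
void lookup(bool host_weights_present, const std::string& optimizer) {
  if constexpr (kHasCpu) {
    if (host_weights_present) {
      // ... call the generated CPU kernel ...
      return;
    }
  }
  if constexpr (kHasGpu) {
    // ... call the generated GPU kernel ...
  } else {
    // Mirrors the generated `assert False, "... has only CPU support"`.
    throw std::runtime_error(optimizer + " has only CPU support");
  }
}

int main() {
  lookup<true, true>(true, "rowwise_adagrad");          // CPU and GPU
  lookup<true, false>(true, "approx_rowwise_adagrad");  // CPU-only
  lookup<false, true>(false, "adam");                   // GPU-only
  return 0;
}
```

Unsupported backends are discarded branches rather than compiled code, which is the same effect the patch achieves by never emitting the corresponding source files in the first place.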
@@ -908,6 +935,8 @@ def sgd() -> None: split_precomputation="", split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=True, + has_gpu_support=True, ) @@ -978,6 +1007,8 @@ def lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1064,6 +1095,8 @@ def partial_rowwise_lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1114,6 +1147,8 @@ def adam() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1174,6 +1209,8 @@ def partial_rowwise_adam() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1232,6 +1269,8 @@ def lars_sgd() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, + has_cpu_support=False, + has_gpu_support=True, ) @@ -1296,6 +1335,8 @@ def backward_dense() -> None: (FLOAT, "unused"), ] ), + has_cpu_support=True, + has_gpu_support=True, ) @@ -1323,7 +1364,7 @@ def emb_codegen( partial_rowwise_adam() partial_rowwise_lamb() rowwise_adagrad() - rowwise_adagrad_with_weight_decay() + # rowwise_adagrad_with_weight_decay() # Disabled rowwise_adagrad_with_counter() rowwise_weighted_adagrad() sgd() diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index bd406d39fa..844f04782b 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -49,6 +49,7 @@ def invoke( max_counter: float, {% endif %} ) -> torch.Tensor: + {% if has_cpu_support %} if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( # common_args @@ -147,112 +148,119 @@ def invoke( max_counter=max_counter, {% endif %} ) + {% if not has_gpu_support %} else: - return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( - # common_args - {% if not dense %} - placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, - {% endif %} - dev_weights=common_args.dev_weights, - uvm_weights=common_args.uvm_weights, - lxu_cache_weights=common_args.lxu_cache_weights, - weights_placements=common_args.weights_placements, - weights_offsets=common_args.weights_offsets, - D_offsets=common_args.D_offsets, - total_D=common_args.total_D, - max_D=common_args.max_D, - hash_size_cumsum=common_args.hash_size_cumsum, - total_hash_size_bits=common_args.total_hash_size_bits, - indices=common_args.indices, - offsets=common_args.offsets, - pooling_mode=common_args.pooling_mode, - indice_weights=common_args.indice_weights, - feature_requires_grad=common_args.feature_requires_grad, - lxu_cache_locations=common_args.lxu_cache_locations, - # optimizer_args - gradient_clipping = optimizer_args.gradient_clipping, - max_gradient=optimizer_args.max_gradient, - stochastic_rounding=optimizer_args.stochastic_rounding, - {% if "learning_rate" in 
args.split_function_arg_names %} - learning_rate=optimizer_args.learning_rate, - {% endif %} - {% if "eps" in args.split_function_arg_names %} - eps=optimizer_args.eps, - {% endif %} - {% if "beta1" in args.split_function_arg_names %} - beta1=optimizer_args.beta1, - {% endif %} - {% if "beta2" in args.split_function_arg_names %} - beta2=optimizer_args.beta2, - {% endif %} - {% if "weight_decay" in args.split_function_arg_names %} - weight_decay=optimizer_args.weight_decay, - {% endif %} - {% if "weight_decay_mode" in args.split_function_arg_names %} - weight_decay_mode=optimizer_args.weight_decay_mode, - {% endif %} - {% if "eta" in args.split_function_arg_names %} - eta=optimizer_args.eta, - {% endif %} - {% if "momentum" in args.split_function_arg_names %} - momentum=optimizer_args.momentum, - {% endif %} - {% if "counter_halflife" in args.split_function_arg_names %} - counter_halflife=optimizer_args.counter_halflife, - {% endif %} - {% if "adjustment_iter" in args.split_function_arg_names %} - adjustment_iter=optimizer_args.adjustment_iter, - {% endif %} - {% if "adjustment_ub" in args.split_function_arg_names %} - adjustment_ub=optimizer_args.adjustment_ub, - {% endif %} - {% if "learning_rate_mode" in args.split_function_arg_names %} - learning_rate_mode=optimizer_args.learning_rate_mode, - {% endif %} - {% if "grad_sum_decay" in args.split_function_arg_names %} - grad_sum_decay=optimizer_args.grad_sum_decay, - {% endif %} - {% if "tail_id_threshold" in args.split_function_arg_names %} - tail_id_threshold=optimizer_args.tail_id_threshold, - {% endif %} - {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} - is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, - {% endif %} - # momentum1 - {% if "momentum1_dev" in args.split_function_arg_names %} - momentum1_dev=momentum1.dev, - momentum1_uvm=momentum1.uvm, - momentum1_offsets=momentum1.offsets, - momentum1_placements=momentum1.placements, - {% endif %} - # momentum2 - {% if "momentum2_dev" in args.split_function_arg_names %} - momentum2_dev=momentum2.dev, - momentum2_uvm=momentum2.uvm, - momentum2_offsets=momentum2.offsets, - momentum2_placements=momentum2.placements, - {% endif %} - # prev_iter - {% if "prev_iter_dev" in args.split_function_arg_names %} - prev_iter_dev=prev_iter.dev, - prev_iter_uvm=prev_iter.uvm, - prev_iter_offsets=prev_iter.offsets, - prev_iter_placements=prev_iter.placements, - {% endif %} - # row_counter - {% if "row_counter_dev" in args.split_function_arg_names %} - row_counter_dev=row_counter.dev, - row_counter_uvm=row_counter.uvm, - row_counter_offsets=row_counter.offsets, - row_counter_placements=row_counter.placements, - {% endif %} - # iter - {% if "iter" in args.split_function_arg_names %} - iter=iter, - {% endif %} - # max counter - {% if "max_counter" in args.split_function_arg_names %} - max_counter=max_counter, - {% endif %} - output_dtype=common_args.output_dtype, - ) + assert False, "{{ optimizer }} has only CPU support. host_weights.numel() must be greater than 0." 
+ {% endif %} + {% endif %} + + {% if has_gpu_support %} + return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( + # common_args + {% if not dense %} + placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, + {% endif %} + dev_weights=common_args.dev_weights, + uvm_weights=common_args.uvm_weights, + lxu_cache_weights=common_args.lxu_cache_weights, + weights_placements=common_args.weights_placements, + weights_offsets=common_args.weights_offsets, + D_offsets=common_args.D_offsets, + total_D=common_args.total_D, + max_D=common_args.max_D, + hash_size_cumsum=common_args.hash_size_cumsum, + total_hash_size_bits=common_args.total_hash_size_bits, + indices=common_args.indices, + offsets=common_args.offsets, + pooling_mode=common_args.pooling_mode, + indice_weights=common_args.indice_weights, + feature_requires_grad=common_args.feature_requires_grad, + lxu_cache_locations=common_args.lxu_cache_locations, + # optimizer_args + gradient_clipping = optimizer_args.gradient_clipping, + max_gradient=optimizer_args.max_gradient, + stochastic_rounding=optimizer_args.stochastic_rounding, + {% if "learning_rate" in args.split_function_arg_names %} + learning_rate=optimizer_args.learning_rate, + {% endif %} + {% if "eps" in args.split_function_arg_names %} + eps=optimizer_args.eps, + {% endif %} + {% if "beta1" in args.split_function_arg_names %} + beta1=optimizer_args.beta1, + {% endif %} + {% if "beta2" in args.split_function_arg_names %} + beta2=optimizer_args.beta2, + {% endif %} + {% if "weight_decay" in args.split_function_arg_names %} + weight_decay=optimizer_args.weight_decay, + {% endif %} + {% if "weight_decay_mode" in args.split_function_arg_names %} + weight_decay_mode=optimizer_args.weight_decay_mode, + {% endif %} + {% if "eta" in args.split_function_arg_names %} + eta=optimizer_args.eta, + {% endif %} + {% if "momentum" in args.split_function_arg_names %} + momentum=optimizer_args.momentum, + {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} + # momentum1 + {% if "momentum1_dev" in args.split_function_arg_names %} + momentum1_dev=momentum1.dev, + momentum1_uvm=momentum1.uvm, + momentum1_offsets=momentum1.offsets, + momentum1_placements=momentum1.placements, + {% endif %} + # momentum2 + {% if "momentum2_dev" in args.split_function_arg_names %} + momentum2_dev=momentum2.dev, + momentum2_uvm=momentum2.uvm, + momentum2_offsets=momentum2.offsets, + momentum2_placements=momentum2.placements, + {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + 
prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} + # iter + {% if "iter" in args.split_function_arg_names %} + iter=iter, + {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} + output_dtype=common_args.output_dtype, + ) + {% endif %} From dde6d13814a8323fd690af3d42842c53f3acd862 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 28 Mar 2023 16:41:50 -0700 Subject: [PATCH 23/34] Fix the Documentation Build Job (#1673) Summary: - Rewrite the documentation builds job to use the build infrastructure tooling - Rename workflow files for consistency Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1673 Reviewed By: shintaro-iwasaki Differential Revision: D44472660 Pulled By: q10 fbshipit-source-id: 60434c1f7098b7efa8c750133bb22f14fc98d5dc --- .github/scripts/setup_env.bash | 70 +++++++++++++- .github/workflows/fbgemm_docs.yml | 91 ------------------- .github/workflows/fbgemm_gpu_ci.yml | 2 +- ...ild_cpu.yml => fbgemm_gpu_cpu_nightly.yml} | 0 ...ild_cpu.yml => fbgemm_gpu_cpu_release.yml} | 0 ..._build.yml => fbgemm_gpu_cuda_nightly.yml} | 2 +- ..._build.yml => fbgemm_gpu_cuda_release.yml} | 2 +- .github/workflows/fbgemm_gpu_docs.yml | 89 ++++++++++++++++++ 8 files changed, 159 insertions(+), 97 deletions(-) delete mode 100644 .github/workflows/fbgemm_docs.yml rename .github/workflows/{fbgemm_nightly_build_cpu.yml => fbgemm_gpu_cpu_nightly.yml} (100%) rename .github/workflows/{fbgemm_release_build_cpu.yml => fbgemm_gpu_cpu_release.yml} (100%) rename .github/workflows/{fbgemm_nightly_build.yml => fbgemm_gpu_cuda_nightly.yml} (99%) rename .github/workflows/{fbgemm_release_build.yml => fbgemm_gpu_cuda_release.yml} (99%) create mode 100644 .github/workflows/fbgemm_gpu_docs.yml diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index f998bdba3f..57da549463 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -369,6 +369,7 @@ print_glibc_info () { ################################################################################ setup_bazel () { + local bazel_version="${1:-6.1.1}" echo "################################################################################" echo "# Setup Bazel" echo "#" @@ -376,9 +377,8 @@ setup_bazel () { echo "################################################################################" echo "" - local bazel_version="6.1.1" - if [[ $OSTYPE == 'darwin'* ]]; then + # shellcheck disable=SC2155 local bazel_variant="darwin-$(uname -m)" else local bazel_variant="linux-x86_64" @@ -999,6 +999,31 @@ install_build_tools () { echo "[INSTALL] Successfully installed all the build tools" } +install_docs_tools () { + local env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" + return 1 + else + echo "################################################################################" + echo "# Install Documentation Tools" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + echo "[INSTALL] Installing docs tools ..." 
+ (exec_with_retries conda install -n "${env_name}" -c conda-forge -y \ + doxygen) || return 1 + + # Check binaries are visible in the PAATH + (test_binpath "${env_name}" doxygen) || return 1 + + echo "[INSTALL] Successfully installed all the build tools" +} ################################################################################ # Combination Functions @@ -1087,12 +1112,16 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Proceeding to build CPU variant" + # Update the package name and build args depending on if CUDA is specified echo "[BUILD] Applying CPU-only build args ..." build_args=(--cpu_only) package_name="${package_name}-cpu" elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Proceeding to build ROCm variant" + (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 echo "[BUILD] Applying ROCm build args ..." @@ -1102,6 +1131,7 @@ __build_fbgemm_gpu_common_pre_steps () { else # Set to the default variant fbgemm_variant="gpu" + echo "[BUILD] Proceeding to build GPU variant (default)" # Check nvcc is visible (test_binpath "${env_name}" nvcc) || return 1 @@ -1247,7 +1277,7 @@ build_fbgemm_gpu_install () { fi # Run all the common FBGEMM-GPU build pre-steps (set up variables) - __build_fbgemm_gpu_common_pre_steps + __build_fbgemm_gpu_common_pre_steps || return 1 # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits @@ -1258,9 +1288,43 @@ build_fbgemm_gpu_install () { # Run checks on the built libraries (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + echo "[INSTALL] Checking imports ..." + # Exit this directory to prevent import clashing, since there is an + # fbgemm_gpu/ subdirectory present + cd - || return 1 + (test_python_import "${env_name}" fbgemm_gpu) || return 1 + echo "[BUILD] FBGEMM-GPU build + install completed" } +build_fbgemm_gpu_docs () { + env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env # Build the docs" + return 1 + else + echo "################################################################################" + echo "# Build FBGEMM-GPU Documentation" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + echo "[BUILD] Installing docs-build dependencies ..." + (exec_with_retries conda run -n "${env_name}" python -m pip install -r requirements.txt) || return 1 + + echo "[BUILD] Running Doxygen build ..." + (exec_with_retries conda run -n "${env_name}" doxygen Doxyfile.in) || return 1 + + echo "[BUILD] Building HTML pages ..." + (exec_with_retries conda run -n "${env_name}" make html) || return 1 + + echo "[INSTALL] FBGEMM-GPU documentation build completed" +} + install_fbgemm_gpu_package () { local env_name="$1" local package_name="$2" diff --git a/.github/workflows/fbgemm_docs.yml b/.github/workflows/fbgemm_docs.yml deleted file mode 100644 index 06e2045a03..0000000000 --- a/.github/workflows/fbgemm_docs.yml +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -name: FBGEMM Documentation -on: - push: - branches: - - main -jobs: - build_docs_job: - runs-on: linux.2xlarge - steps: - # Checkout the repository to the GitHub Actions runner - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - # Update references - # TODO: update the git submodule sync after we fixed the auto-sync part - - name: Git Sumbodule Update - run: | - git submodule init - git submodule update --remote --recursive - git log - - name: Update pip - run: | - sudo yum update -y - sudo yum -y install git python3-pip - sudo pip3 install --upgrade pip - - name: Setup conda - run: | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh - bash ~/miniconda.sh -b -p $HOME/miniconda -u - - name: setup Path - run: | - echo "/home/ec2-user/miniconda/bin" >> $GITHUB_PATH - echo "CONDA=/home/ec2-user/miniconda" >> $GITHUB_PATH - - name: create conda env - run: | - conda create --name build_binary python=3.9 - conda info - - name: check python version - run: | - conda run -n build_binary python --version - - name: Install gcc - shell: bash - run: | - sudo yum group install -y "Development Tools" - - name: Setup Path - run: | - echo /usr/local/bin >> $GITHUB_PATH - - name: Install PyTorch - shell: bash - run: | - conda run -n build_binary python -m pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - - name: Test PyTorch Installation - run: | - conda run -n build_binary python -c "import torch.distributed" - echo "torch.distributed succeeded" - - name: Install fbgemm_gpu nightly - run: | - cd ./fbgemm_gpu - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary python setup.py install --cpu_only - - name: Test fbgemm_gpu installation - shell: bash - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary \ - python -c "import fbgemm_gpu" - - name: Install Doxygen - run: | - conda install -n build_binary -c conda-forge doxygen - which doxygen - - name: Build the docset - run: | - cd ./fbgemm_gpu/docs - conda run -n build_binary python -m pip install -r requirements.txt - conda run -n build_binary doxygen Doxyfile.in - conda run -n build_binary make html - cd .. - - name: Get output time - run: echo "The time was ${{ steps.build.outputs.time }}" - - name: Deploy - uses: JamesIves/github-pages-deploy-action@releases/v3 - with: - ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: fbgemm_gpu/docs/build/html # The folder the action should deploy. diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index b7dea4093a..646c9de168 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -200,7 +200,7 @@ jobs: - name: Prepare FBGEMM_GPU Build run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - - name: Build and Install FBGEMM_GPU (CPU version) + - name: Build + Install FBGEMM_GPU (CPU version) run: . 
$PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - name: Test with PyTest diff --git a/.github/workflows/fbgemm_nightly_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_nightly.yml similarity index 100% rename from .github/workflows/fbgemm_nightly_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_nightly.yml diff --git a/.github/workflows/fbgemm_release_build_cpu.yml b/.github/workflows/fbgemm_gpu_cpu_release.yml similarity index 100% rename from .github/workflows/fbgemm_release_build_cpu.yml rename to .github/workflows/fbgemm_gpu_cpu_release.yml diff --git a/.github/workflows/fbgemm_nightly_build.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml similarity index 99% rename from .github/workflows/fbgemm_nightly_build.yml rename to .github/workflows/fbgemm_gpu_cuda_nightly.yml index 0d9257d554..7ccdbcbf3e 100644 --- a/.github/workflows/fbgemm_nightly_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Nightly Build +name: FBGEMM_GPU-CUDA Nightly Build on: # PR Trigger (enabled only for debugging) diff --git a/.github/workflows/fbgemm_release_build.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml similarity index 99% rename from .github/workflows/fbgemm_release_build.yml rename to .github/workflows/fbgemm_gpu_cuda_release.yml index b909cec274..7516e6a021 100644 --- a/.github/workflows/fbgemm_release_build.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -3,7 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -name: FBGEMM_GPU Release Build +name: FBGEMM_GPU-CUDA Release Build on: # PR Trigger (enabled only for debugging) diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml new file mode 100644 index 0000000000..fb63995752 --- /dev/null +++ b/.github/workflows/fbgemm_gpu_docs.yml @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +name: FBGEMM_GPU Documentation + +on: + # PR Trigger + # + pull_request: + branches: + - main + + # Push Trigger (enable to catch errors coming out of multiple merges) + # + push: + branches: + - main + + # Manual Trigger (for testing only) + # + workflow_dispatch: + +jobs: + build-docs: + runs-on: linux.2xlarge + container: + image: amazonlinux:2023 + options: --user root + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + strategy: + fail-fast: false + matrix: + python-version: [ "3.11" ] + + steps: + - name: Setup Build Container + run: yum update -y; yum install -y binutils findutils git rsync sudo tar wget which + + - name: Checkout the Repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install C/C++ Compilers + run: . $PRELUDE; install_cxx_compiler $BUILD_ENV + + - name: Install Build Tools + run: . 
$PRELUDE; install_build_tools $BUILD_ENV + + - name: Install Documentation Tools + run: . $PRELUDE; install_docs_tools $BUILD_ENV + + - name: Install PyTorch-CPU Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cpu + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build + Install FBGEMM_GPU (CPU version) + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu + + - name: Build FBGEMM_GPU Documentation + run: . $PRELUDE; cd fbgemm_gpu/docs; build_fbgemm_gpu_docs $BUILD_ENV + + - name: Deploy FBGEMM_GPU Documentation + if: ${{ github.event_name != 'pull_request' }} + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + branch: gh-pages # The branch the action should deploy to + folder: fbgemm_gpu/docs/build/html # The folder the action should deploy From 7ed2a096af1cac33aeb16cadf7a367fdd5b85def Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Tue, 28 Mar 2023 19:54:47 -0700 Subject: [PATCH 24/34] Back out "Prune CPU/GPU TBE optimizer codegen" (#1675) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1675 Original commit changeset: 02413256b4a6 Original Phabricator Diff: D44326540 Reviewed By: q10, jianyuh Differential Revision: D44475251 fbshipit-source-id: 5be66944a833e03a2737fc6d1baaa5c351455b2c --- fbgemm_gpu/CMakeLists.txt | 38 ++- .../embedding_backward_code_generator.py | 109 +++------ ..._embedding_codegen_lookup_invoker.template | 224 +++++++++--------- 3 files changed, 156 insertions(+), 215 deletions(-) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index b30bc1eab4..1fb8f397e0 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -103,27 +103,21 @@ endif() set(OPTIMIZERS adagrad + adam + approx_rowwise_adagrad + approx_rowwise_adagrad_with_weight_decay + approx_rowwise_adagrad_with_counter approx_sgd + lamb + lars_sgd + partial_rowwise_adam + partial_rowwise_lamb rowwise_adagrad + rowwise_adagrad_with_weight_decay rowwise_adagrad_with_counter rowwise_weighted_adagrad sgd) -set(CPU_ONLY_OPTIMIZERS - approx_rowwise_adagrad - approx_rowwise_adagrad_with_counter) - -set(GPU_ONLY_OPTIMIZERS - adam - lamb - lars_sgd - partial_rowwise_adam - partial_rowwise_lamb) - -set(CPU_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS}) -set(GPU_OPTIMIZERS ${OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS}) -set(ALL_OPTIMIZERS ${OPTIMIZERS} ${CPU_ONLY_OPTIMIZERS} ${GPU_ONLY_OPTIMIZERS}) - set(gen_gpu_source_files "gen_embedding_forward_dense_weighted_codegen_cuda.cu" "gen_embedding_forward_dense_unweighted_codegen_cuda.cu" @@ -143,16 +137,16 @@ set(gen_cpu_source_files set(gen_python_files ${CMAKE_BINARY_DIR}/__init__.py) -foreach(optimizer ${CPU_OPTIMIZERS}) +foreach(optimizer ${OPTIMIZERS}) + list(APPEND gen_gpu_host_source_files + "gen_embedding_backward_split_${optimizer}.cpp") + list(APPEND gen_cpu_source_files "gen_embedding_backward_split_${optimizer}_cpu.cpp") list(APPEND gen_cpu_source_files "gen_embedding_backward_${optimizer}_split_cpu.cpp") -endforeach() -foreach(optimizer ${GPU_OPTIMIZERS}) - list(APPEND gen_gpu_host_source_files - "gen_embedding_backward_split_${optimizer}.cpp") + list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") foreach(weight weighted unweighted) list(APPEND gen_gpu_source_files @@ -160,10 +154,6 @@ foreach(optimizer ${GPU_OPTIMIZERS}) endforeach() endforeach() -foreach(optimizer ${ALL_OPTIMIZERS}) - list(APPEND gen_python_files "${CMAKE_BINARY_DIR}/lookup_${optimizer}.py") 
-endforeach() - set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) set(codegen_dependencies diff --git a/fbgemm_gpu/codegen/embedding_backward_code_generator.py b/fbgemm_gpu/codegen/embedding_backward_code_generator.py index aa832947c3..fd69a22f6e 100644 --- a/fbgemm_gpu/codegen/embedding_backward_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_backward_code_generator.py @@ -127,60 +127,53 @@ def int_arg(name: str, default: int = 0) -> str: def generate(**kwargs: Any) -> None: gen_args = kwargs["args"] + # Generates CUDA variants. kwargs["args"] = gen_args["cuda"] - if kwargs.get("has_gpu_support"): - # Generates CUDA variants. - template = env.get_template("embedding_backward_split_template.cu") - src_cu = template.render(weighted=False, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", - src_cu, - ) - src_cu = template.render(weighted=True, **kwargs) - write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", - src_cu, - ) - if not kwargs.get("dense"): - template = env.get_template("embedding_backward_split_host_template.cpp") - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp - ) + template = env.get_template("embedding_backward_split_template.cu") + src_cu = template.render(weighted=False, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_unweighted_cuda.cu", + src_cu, + ) + src_cu = template.render(weighted=True, **kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_weighted_cuda.cu", + src_cu, + ) if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_template.cpp") + src_cpp = template.render(**kwargs) + write(f"gen_embedding_backward_split_{kwargs.get('optimizer')}.cpp", src_cpp) + # Generates Python invoker for CUDA + CPU template = env.get_template("split_embedding_codegen_lookup_invoker.template") src_py = template.render(is_fbcode=args.is_fbcode, **kwargs) write(f"lookup_{kwargs.get('optimizer')}.py", src_py) - if kwargs.get("has_cpu_support"): - # Generates CPU variants. - kwargs["args"] = gen_args["cpu"] + # Generates CPU variants. 
+ kwargs["args"] = gen_args["cpu"] - is_approx = "approx" in kwargs.get("optimizer") - template = ( - env.get_template("embedding_backward_split_cpu_approx_template.cpp") - if is_approx - else env.get_template("embedding_backward_split_cpu_template.cpp") - ) + is_approx = "approx" in kwargs.get("optimizer") + template = ( + env.get_template("embedding_backward_split_cpu_approx_template.cpp") + if is_approx + else env.get_template("embedding_backward_split_cpu_template.cpp") + ) + + src_cpp = template.render(**kwargs) + write( + f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", + src_cpp, + ) + if not kwargs.get("dense"): + template = env.get_template("embedding_backward_split_host_cpu_template.cpp") src_cpp = template.render(**kwargs) write( - f"gen_embedding_backward_{kwargs.get('optimizer')}_split_cpu.cpp", - src_cpp, + f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", src_cpp ) - if not kwargs.get("dense"): - template = env.get_template( - "embedding_backward_split_host_cpu_template.cpp" - ) - src_cpp = template.render(**kwargs) - write( - f"gen_embedding_backward_split_{kwargs.get('optimizer')}_cpu.cpp", - src_cpp, - ) - @dataclass class Args: @@ -376,8 +369,6 @@ def adagrad() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -499,8 +490,6 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ @@ -523,8 +512,6 @@ def rowwise_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=False, ) @@ -624,9 +611,6 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - # Disable both CPU and GPU support - has_cpu_support=False, - has_gpu_support=False, ) approx_split_weight_update = """ @@ -649,9 +633,6 @@ def rowwise_adagrad_with_weight_decay() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - # Disable both CPU and GPU support - has_cpu_support=False, - has_gpu_support=False, ) @@ -790,8 +771,6 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ @@ -825,8 +804,6 @@ def rowwise_adagrad_with_counter() -> None: split_precomputation=split_precomputation, split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=False, ) @@ -897,8 +874,6 @@ def rowwise_weighted_adagrad() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -918,8 +893,6 @@ def sgd() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) approx_split_weight_update = """ 
@@ -935,8 +908,6 @@ def sgd() -> None: split_precomputation="", split_weight_update=approx_split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=True, - has_gpu_support=True, ) @@ -1007,8 +978,6 @@ def lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1095,8 +1064,6 @@ def partial_rowwise_lamb() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1147,8 +1114,6 @@ def adam() -> None: split_precomputation="", split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1209,8 +1174,6 @@ def partial_rowwise_adam() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1269,8 +1232,6 @@ def lars_sgd() -> None: split_precomputation=split_precomputation, split_weight_update=split_weight_update, split_weight_update_cpu=split_weight_update_cpu, - has_cpu_support=False, - has_gpu_support=True, ) @@ -1335,8 +1296,6 @@ def backward_dense() -> None: (FLOAT, "unused"), ] ), - has_cpu_support=True, - has_gpu_support=True, ) @@ -1364,7 +1323,7 @@ def emb_codegen( partial_rowwise_adam() partial_rowwise_lamb() rowwise_adagrad() - # rowwise_adagrad_with_weight_decay() # Disabled + rowwise_adagrad_with_weight_decay() rowwise_adagrad_with_counter() rowwise_weighted_adagrad() sgd() diff --git a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template index 844f04782b..bd406d39fa 100644 --- a/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template +++ b/fbgemm_gpu/codegen/split_embedding_codegen_lookup_invoker.template @@ -49,7 +49,6 @@ def invoke( max_counter: float, {% endif %} ) -> torch.Tensor: - {% if has_cpu_support %} if (common_args.host_weights.numel() > 0): return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function_cpu( # common_args @@ -148,119 +147,112 @@ def invoke( max_counter=max_counter, {% endif %} ) - {% if not has_gpu_support %} else: - assert False, "{{ optimizer }} has only CPU support. host_weights.numel() must be greater than 0." 
- {% endif %} - {% endif %} - - {% if has_gpu_support %} - return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( - # common_args - {% if not dense %} - placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, - {% endif %} - dev_weights=common_args.dev_weights, - uvm_weights=common_args.uvm_weights, - lxu_cache_weights=common_args.lxu_cache_weights, - weights_placements=common_args.weights_placements, - weights_offsets=common_args.weights_offsets, - D_offsets=common_args.D_offsets, - total_D=common_args.total_D, - max_D=common_args.max_D, - hash_size_cumsum=common_args.hash_size_cumsum, - total_hash_size_bits=common_args.total_hash_size_bits, - indices=common_args.indices, - offsets=common_args.offsets, - pooling_mode=common_args.pooling_mode, - indice_weights=common_args.indice_weights, - feature_requires_grad=common_args.feature_requires_grad, - lxu_cache_locations=common_args.lxu_cache_locations, - # optimizer_args - gradient_clipping = optimizer_args.gradient_clipping, - max_gradient=optimizer_args.max_gradient, - stochastic_rounding=optimizer_args.stochastic_rounding, - {% if "learning_rate" in args.split_function_arg_names %} - learning_rate=optimizer_args.learning_rate, - {% endif %} - {% if "eps" in args.split_function_arg_names %} - eps=optimizer_args.eps, - {% endif %} - {% if "beta1" in args.split_function_arg_names %} - beta1=optimizer_args.beta1, - {% endif %} - {% if "beta2" in args.split_function_arg_names %} - beta2=optimizer_args.beta2, - {% endif %} - {% if "weight_decay" in args.split_function_arg_names %} - weight_decay=optimizer_args.weight_decay, - {% endif %} - {% if "weight_decay_mode" in args.split_function_arg_names %} - weight_decay_mode=optimizer_args.weight_decay_mode, - {% endif %} - {% if "eta" in args.split_function_arg_names %} - eta=optimizer_args.eta, - {% endif %} - {% if "momentum" in args.split_function_arg_names %} - momentum=optimizer_args.momentum, - {% endif %} - {% if "counter_halflife" in args.split_function_arg_names %} - counter_halflife=optimizer_args.counter_halflife, - {% endif %} - {% if "adjustment_iter" in args.split_function_arg_names %} - adjustment_iter=optimizer_args.adjustment_iter, - {% endif %} - {% if "adjustment_ub" in args.split_function_arg_names %} - adjustment_ub=optimizer_args.adjustment_ub, - {% endif %} - {% if "learning_rate_mode" in args.split_function_arg_names %} - learning_rate_mode=optimizer_args.learning_rate_mode, - {% endif %} - {% if "grad_sum_decay" in args.split_function_arg_names %} - grad_sum_decay=optimizer_args.grad_sum_decay, - {% endif %} - {% if "tail_id_threshold" in args.split_function_arg_names %} - tail_id_threshold=optimizer_args.tail_id_threshold, - {% endif %} - {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} - is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, - {% endif %} - # momentum1 - {% if "momentum1_dev" in args.split_function_arg_names %} - momentum1_dev=momentum1.dev, - momentum1_uvm=momentum1.uvm, - momentum1_offsets=momentum1.offsets, - momentum1_placements=momentum1.placements, - {% endif %} - # momentum2 - {% if "momentum2_dev" in args.split_function_arg_names %} - momentum2_dev=momentum2.dev, - momentum2_uvm=momentum2.uvm, - momentum2_offsets=momentum2.offsets, - momentum2_placements=momentum2.placements, - {% endif %} - # prev_iter - {% if "prev_iter_dev" in args.split_function_arg_names %} - prev_iter_dev=prev_iter.dev, - prev_iter_uvm=prev_iter.uvm, - prev_iter_offsets=prev_iter.offsets, - 
prev_iter_placements=prev_iter.placements, - {% endif %} - # row_counter - {% if "row_counter_dev" in args.split_function_arg_names %} - row_counter_dev=row_counter.dev, - row_counter_uvm=row_counter.uvm, - row_counter_offsets=row_counter.offsets, - row_counter_placements=row_counter.placements, - {% endif %} - # iter - {% if "iter" in args.split_function_arg_names %} - iter=iter, - {% endif %} - # max counter - {% if "max_counter" in args.split_function_arg_names %} - max_counter=max_counter, - {% endif %} - output_dtype=common_args.output_dtype, - ) - {% endif %} + return torch.ops.fbgemm.split_embedding_codegen_lookup_{{ optimizer }}_function( + # common_args + {% if not dense %} + placeholder_autograd_tensor=common_args.placeholder_autograd_tensor, + {% endif %} + dev_weights=common_args.dev_weights, + uvm_weights=common_args.uvm_weights, + lxu_cache_weights=common_args.lxu_cache_weights, + weights_placements=common_args.weights_placements, + weights_offsets=common_args.weights_offsets, + D_offsets=common_args.D_offsets, + total_D=common_args.total_D, + max_D=common_args.max_D, + hash_size_cumsum=common_args.hash_size_cumsum, + total_hash_size_bits=common_args.total_hash_size_bits, + indices=common_args.indices, + offsets=common_args.offsets, + pooling_mode=common_args.pooling_mode, + indice_weights=common_args.indice_weights, + feature_requires_grad=common_args.feature_requires_grad, + lxu_cache_locations=common_args.lxu_cache_locations, + # optimizer_args + gradient_clipping = optimizer_args.gradient_clipping, + max_gradient=optimizer_args.max_gradient, + stochastic_rounding=optimizer_args.stochastic_rounding, + {% if "learning_rate" in args.split_function_arg_names %} + learning_rate=optimizer_args.learning_rate, + {% endif %} + {% if "eps" in args.split_function_arg_names %} + eps=optimizer_args.eps, + {% endif %} + {% if "beta1" in args.split_function_arg_names %} + beta1=optimizer_args.beta1, + {% endif %} + {% if "beta2" in args.split_function_arg_names %} + beta2=optimizer_args.beta2, + {% endif %} + {% if "weight_decay" in args.split_function_arg_names %} + weight_decay=optimizer_args.weight_decay, + {% endif %} + {% if "weight_decay_mode" in args.split_function_arg_names %} + weight_decay_mode=optimizer_args.weight_decay_mode, + {% endif %} + {% if "eta" in args.split_function_arg_names %} + eta=optimizer_args.eta, + {% endif %} + {% if "momentum" in args.split_function_arg_names %} + momentum=optimizer_args.momentum, + {% endif %} + {% if "counter_halflife" in args.split_function_arg_names %} + counter_halflife=optimizer_args.counter_halflife, + {% endif %} + {% if "adjustment_iter" in args.split_function_arg_names %} + adjustment_iter=optimizer_args.adjustment_iter, + {% endif %} + {% if "adjustment_ub" in args.split_function_arg_names %} + adjustment_ub=optimizer_args.adjustment_ub, + {% endif %} + {% if "learning_rate_mode" in args.split_function_arg_names %} + learning_rate_mode=optimizer_args.learning_rate_mode, + {% endif %} + {% if "grad_sum_decay" in args.split_function_arg_names %} + grad_sum_decay=optimizer_args.grad_sum_decay, + {% endif %} + {% if "tail_id_threshold" in args.split_function_arg_names %} + tail_id_threshold=optimizer_args.tail_id_threshold, + {% endif %} + {% if "is_tail_id_thresh_ratio" in args.split_function_arg_names %} + is_tail_id_thresh_ratio=optimizer_args.is_tail_id_thresh_ratio, + {% endif %} + # momentum1 + {% if "momentum1_dev" in args.split_function_arg_names %} + momentum1_dev=momentum1.dev, + momentum1_uvm=momentum1.uvm, + 
momentum1_offsets=momentum1.offsets, + momentum1_placements=momentum1.placements, + {% endif %} + # momentum2 + {% if "momentum2_dev" in args.split_function_arg_names %} + momentum2_dev=momentum2.dev, + momentum2_uvm=momentum2.uvm, + momentum2_offsets=momentum2.offsets, + momentum2_placements=momentum2.placements, + {% endif %} + # prev_iter + {% if "prev_iter_dev" in args.split_function_arg_names %} + prev_iter_dev=prev_iter.dev, + prev_iter_uvm=prev_iter.uvm, + prev_iter_offsets=prev_iter.offsets, + prev_iter_placements=prev_iter.placements, + {% endif %} + # row_counter + {% if "row_counter_dev" in args.split_function_arg_names %} + row_counter_dev=row_counter.dev, + row_counter_uvm=row_counter.uvm, + row_counter_offsets=row_counter.offsets, + row_counter_placements=row_counter.placements, + {% endif %} + # iter + {% if "iter" in args.split_function_arg_names %} + iter=iter, + {% endif %} + # max counter + {% if "max_counter" in args.split_function_arg_names %} + max_counter=max_counter, + {% endif %} + output_dtype=common_args.output_dtype, + ) From a49926789619fbb864ecf49f4e3a9e81315149c3 Mon Sep 17 00:00:00 2001 From: Sarunya Pumma Date: Tue, 28 Mar 2023 22:53:26 -0700 Subject: [PATCH 25/34] Prepare bounds_check_indices for VBE (#1633) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1633 Prepare `bounds_check_indices` for variable batch size TBE (VBE). - Update the frontend API to accept VBE args - Update the backend logic to process VBE data Reviewed By: jianyuh Differential Revision: D43253703 fbshipit-source-id: 2870f0c41a96265650281a9b6362d4e6dc48009b --- fbgemm_gpu/codegen/embedding_bounds_check.cu | 146 +++++++++++------- .../codegen/embedding_bounds_check_host.cpp | 4 +- .../embedding_bounds_check_host_cpu.cpp | 11 +- 3 files changed, 103 insertions(+), 58 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_bounds_check.cu b/fbgemm_gpu/codegen/embedding_bounds_check.cu index 4d77d2b508..bc18695ece 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check.cu +++ b/fbgemm_gpu/codegen/embedding_bounds_check.cu @@ -23,31 +23,52 @@ __device__ void adjust_offset_kernel( *offset_acc_end = indices_end; } -template +template __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( const at::PackedTensorAccessor32 rows_per_table, at::PackedTensorAccessor32 indices, at::PackedTensorAccessor32 offsets, + const int32_t* const vbe_metadata, const int64_t bounds_check_mode_, at::PackedTensorAccessor32 warning, FixedDivisor fd) { int32_t T = rows_per_table.size(0); - int32_t B = (offsets.size(0) - 1) / T; - int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y; - int32_t b; // = b_t % B; - int32_t t; // = b_t / B; - fd.DivMod(b_t, &t, &b); - if (t >= T) { + int32_t b; + int32_t t; + int32_t B = 0; + int32_t total_B = offsets.size(0) - 1; + + if (!vbe && b_t >= total_B) { return; } - auto bounds_check_mode = static_cast(bounds_check_mode_); - auto num_rows = rows_per_table[t]; - auto indices_start = offsets[t * B + b]; - auto indices_end = offsets[t * B + b + 1]; - index_t num_indices = indices.size(0); + fd.DivMod(b_t, &t, &b); + + if (vbe) { + // Check if t is valid + if (t >= T) { + return; + } + const auto B_start = vbe_metadata[t]; + B = vbe_metadata[t + 1] - B_start; + // Check if b is valid + if (b >= B) { + return; + } + // Update b_t value + b_t = B_start + b; + } else { + B = total_B / T; + } + + const auto bounds_check_mode = + static_cast(bounds_check_mode_); + const auto num_rows = rows_per_table[t]; + auto indices_start = offsets[b_t]; + 
auto indices_end = offsets[b_t + 1]; + const index_t num_indices = indices.size(0); if (bounds_check_mode == BoundsCheckMode::FATAL) { CUDA_KERNEL_ASSERT(indices_start >= 0); @@ -58,12 +79,13 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_end > num_indices) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for " - "batch: %lld, table: %lld, indices_start: %lld, indices_end: %lld," + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for " + "batch: %d, table: %d, indices_start: %lld, indices_end: %lld," " num_indices: %lld. Setting indices_start and indices_end within " "the range.\n", - static_cast(b), - static_cast(t), + vbe ? "true" : "false", + b, + t, static_cast(indices_start), static_cast(indices_end), static_cast(num_indices)); @@ -72,16 +94,16 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { adjust_offset_kernel( indices_start, indices_end, num_indices, - &offsets[t * B + b], - &offsets[t * B + b + 1]); + &offsets[b_t], + &offsets[b_t + 1]); } const auto L = indices_end - indices_start; @@ -100,9 +122,10 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( if (idx < 0 || idx >= num_rows) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: (at least one) Out of bounds access for batch: %lld, table: %lld, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", - static_cast(b), - static_cast(t), + "EmbeddingBoundsCheck (VBE %s): (at least one) Out of bounds access for batch: %d, table: %d, bag element: %lld, idx: %lld, num_rows: %lld, indices_start: %lld, indices_end: %lld, T: %d, B: %d, b_t: %d. Setting idx to zero.\n", + vbe ? "true" : "false", + b, + t, static_cast(i), static_cast(idx), num_rows, @@ -122,25 +145,27 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel( } if (bounds_check_mode == BoundsCheckMode::FATAL) { - CUDA_KERNEL_ASSERT(num_indices == offsets[B * T]); + CUDA_KERNEL_ASSERT(num_indices == offsets[total_B]); } else if (bounds_check_mode == BoundsCheckMode::WARNING) { - if (num_indices != offsets[B * T]) { + if (num_indices != offsets[total_B]) { if (gpuAtomicIncrement(&warning[0]) == 0) { printf( - "EmbeddingBoundsCheck: the last element in offsets is incorrect for " - "total batch size B: %lld, total table num T: %lld, " + "EmbeddingBoundsCheck (VBE %s): the last element in offsets is incorrect for " + "total batch size %s: %d, total table num T: %d, " " last element in offsets: %lld, indices size: %lld. " " Setting the last element in offsets to be indices size.\n", - static_cast(B), - static_cast(T), - static_cast(offsets[B * T]), + vbe ? "true" : "false", + vbe ? "total_B" : "B", + vbe ? 
total_B : B, + T, + static_cast(offsets[total_B]), static_cast(num_indices)); } - offsets[B * T] = num_indices; + offsets[total_B] = num_indices; } } else if (bounds_check_mode == BoundsCheckMode::IGNORE) { - if (num_indices != offsets[B * T]) { - offsets[B * T] = num_indices; + if (num_indices != offsets[total_B]) { + offsets[total_B] = num_indices; } } } @@ -151,19 +176,23 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B) { TENSOR_ON_CUDA_GPU(rows_per_table); TENSOR_ON_CUDA_GPU(indices); TENSOR_ON_CUDA_GPU(offsets); TENSOR_ON_CUDA_GPU(warning); TENSOR_EMPTY_OR_ON_CUDA_GPU(weights); + TENSOR_EMPTY_OR_ON_CUDA_GPU(vbe_metadata); at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(rows_per_table.get_device()); const int32_t T = rows_per_table.size(0); - const int32_t B = (offsets.size(0) - 1) / T; - if (B == 0 || T == 0) { + const int32_t total_B = offsets.size(0) - 1; + const int32_t B = (total_B) / T; + if (total_B == 0 || T == 0) { return; } const auto bounds_check_mode = @@ -172,12 +201,17 @@ void bounds_check_indices_cuda( warning.zero_(); } const int64_t num_indices = indices.size(0); + const auto vbe = vbe_metadata.has_value(); - TORCH_CHECK( - offsets.size(0) == B * T + 1, - "offsets size " + std::to_string(offsets.size(0)) + - " is not equal to B (" + std::to_string(B) + ") * T (" + - std::to_string(T) + ") + 1"); + if (vbe) { + TORCH_CHECK(max_B >= 0); + } else { + TORCH_CHECK( + offsets.size(0) == B * T + 1, + "offsets size " + std::to_string(offsets.size(0)) + + " is not equal to B (" + std::to_string(B) + ") * T (" + + std::to_string(T) + ") + 1"); + } if (weights.has_value()) { TORCH_CHECK( weights.value().size(0) == num_indices, @@ -186,20 +220,24 @@ void bounds_check_indices_cuda( } constexpr size_t kNumThreads = 256; + const auto max_B_ = vbe ? max_B : B; AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "bounds_check_indices", [&] { - bounds_check_indices_kernel - <<>>( - rows_per_table - .packed_accessor32(), - indices.packed_accessor32(), - offsets.packed_accessor32(), - bounds_check_mode_, - warning.packed_accessor32(), - FixedDivisor(B)); + const auto bounds_check_kernel = + (vbe ? bounds_check_indices_kernel + : bounds_check_indices_kernel); + bounds_check_kernel<<< + div_round_up(max_B_ * T, kNumThreads / fbgemm_gpu::kWarpSize), + dim3(fbgemm_gpu::kWarpSize, kNumThreads / fbgemm_gpu::kWarpSize), + 0, + at::cuda::getCurrentCUDAStream()>>>( + rows_per_table.packed_accessor32(), + indices.packed_accessor32(), + offsets.packed_accessor32(), + vbe ? vbe_metadata.value().data_ptr() : nullptr, + bounds_check_mode_, + warning.packed_accessor32(), + FixedDivisor(max_B_)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp index 84575a3361..87e3cd7521 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp @@ -23,7 +23,9 @@ void bounds_check_indices_cuda( Tensor& offsets, int64_t bounds_check_mode, Tensor& warning, - c10::optional weights); + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t max_B); // Deprecated for fb namespace! Please use fbgemm namespace instead! 
TORCH_LIBRARY_FRAGMENT(fb, m) { diff --git a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp index a2dd19a75e..a33e02e164 100644 --- a/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp @@ -42,7 +42,12 @@ void bounds_check_indices_cpu( Tensor& offsets, int64_t bounds_check_mode_, Tensor& warning, - c10::optional weights) { + const c10::optional& weights, + const c10::optional& vbe_metadata, + const int64_t /*max_B*/) { + TORCH_CHECK( + !vbe_metadata.has_value(), + "bounds_check_indices on CPU does not support variable length (batch size)"); auto bounds_check_mode = static_cast(bounds_check_mode_); if (bounds_check_mode == BoundsCheckMode::WARNING) { warning.zero_(); @@ -163,7 +168,7 @@ TORCH_LIBRARY_FRAGMENT(fb, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } @@ -171,6 +176,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { // The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd // or DCE'd, etc. m.def( - "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None) -> ()"); + "bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(b!) offsets, int bounds_check_mode, Tensor(c!) warning, Tensor(d!)? weights=None, Tensor? vbe_metadata=None, int max_B=-1) -> ()"); DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu); } From c2d6c5e0d16425b77ba15106fdd0be0f47878cac Mon Sep 17 00:00:00 2001 From: Yue Dong Date: Wed, 29 Mar 2023 05:29:26 -0700 Subject: [PATCH 26/34] Move pruning/index_remapping support to embedding inplace update files (#1667) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1667 As title. This diff moves pruning/index_remapping support to embedding inplace update files. 
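For reference, the remapping semantics that move with this diff: each table owns a slice of the concatenated `index_remappings` buffer, delimited by `index_remappings_offsets`, and a table whose slice is empty is treated as unpruned, passing row indices through unchanged. A minimal Python sketch of the lookup (illustrative only; it mirrors the CPU kernel relocated below and is not a drop-in implementation):

    import torch

    def pruned_array_lookup_from_row_idx_ref(
        update_row_indices: torch.Tensor,        # row index of every new row
        update_table_indices: torch.Tensor,      # table index of every new row
        index_remappings: torch.Tensor,          # concatenated per-table remappings
        index_remappings_offsets: torch.Tensor,  # slice boundaries, length T + 1
    ) -> torch.Tensor:
        dense_indices = torch.empty_like(update_row_indices)
        for idx in range(update_row_indices.numel()):
            t = int(update_table_indices[idx])
            row = int(update_row_indices[idx])
            start = int(index_remappings_offsets[t])
            end = int(index_remappings_offsets[t + 1])
            # Non-empty slice: the table is pruned, so remap the row index.
            # Empty slice: the table is unpruned, so pass the index through.
            dense_indices[idx] = index_remappings[start + row] if end > start else row
        return dense_indices
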
Reviewed By: jianyuh Differential Revision: D44409419 fbshipit-source-id: 93fc91d83502eb95cb0feca2a8a03b003c336078 --- ...bedding_forward_quantized_cpu_template.cpp | 39 -------- .../embedding_forward_quantized_host.cpp | 10 -- .../embedding_forward_quantized_host_cpu.cpp | 13 --- ...edding_forward_quantized_split_template.cu | 77 --------------- .../fbgemm_gpu/embedding_inplace_update.h | 24 +++++ fbgemm_gpu/src/embedding_inplace_update.cu | 94 +++++++++++++++++++ .../src/embedding_inplace_update_cpu.cpp | 58 ++++++++++++ .../src/embedding_inplace_update_gpu.cpp | 6 ++ 8 files changed, 182 insertions(+), 139 deletions(-) diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp index 9caaacbfb8..829249b297 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp @@ -534,44 +534,5 @@ Tensor pruned_array_lookup_cpu( return dense_indices; } -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - TENSOR_ON_CPU(update_row_indices); - TENSOR_ON_CPU(update_table_indices); - TENSOR_ON_CPU(index_remappings); - TENSOR_ON_CPU(index_remappings_offsets); - - int32_t T = index_remappings_offsets.size(0) - 1; - auto dense_indices = empty_like(update_row_indices); - const auto num_indices = update_row_indices.numel(); - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "pruned_array_lookup_from_row_idx_cpu_kernel", [&] { - const auto update_row_indices_acc = update_row_indices.accessor(); - auto dense_indices_acc = dense_indices.accessor(); - const auto update_table_indices_acc = update_table_indices.accessor(); - - const auto index_remappings_acc = index_remappings.accessor(); - const auto index_remappings_offsets_acc = index_remappings_offsets.accessor(); - - for (int64_t idx = 0; idx < num_indices; idx++) { - const int table_idx = update_table_indices_acc[idx]; - const auto row_idx = update_row_indices_acc[idx]; - int64_t index_remappings_start = index_remappings_offsets_acc[table_idx]; - int64_t index_remappings_end = index_remappings_offsets_acc[table_idx + 1]; - int64_t capacity = index_remappings_end - index_remappings_start; - if (capacity > 0) { - dense_indices_acc[idx] = index_remappings_acc[index_remappings_start + row_idx]; - } else { - dense_indices_acc[idx] = row_idx; - } - } - }); - return dense_indices; -} - {% endif %} // clang-format on diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp index 43a182b6b1..01c054f818 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp @@ -558,13 +558,6 @@ Tensor pruned_array_lookup_cuda( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cuda -Tensor pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "int_nbit_split_embedding_codegen_lookup_function", @@ -576,7 +569,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { "pruned_hashmap_lookup", pruned_hashmap_lookup_unweighted_cuda); DISPATCH_TO_CUDA("pruned_array_lookup", pruned_array_lookup_cuda); - DISPATCH_TO_CUDA( - "pruned_array_lookup_from_row_idx", - 
pruned_array_lookup_from_row_idx_cuda); } diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp index 93db44ac76..a43671f880 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp @@ -240,13 +240,6 @@ Tensor pruned_array_lookup_cpu( Tensor index_remappings, Tensor index_remappings_offsets); -///@ingroup embedding-cpu -Tensor pruned_array_lookup_from_row_idx_cpu( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets); - TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int total_D, int max_int2_D, int max_int4_D, int max_int8_D, int max_float16_D, int max_float32_D, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, int output_dtype=1, Tensor? lxu_cache_weights=None, Tensor? lxu_cache_locations=None, int? row_alignment = None, int? max_float8_D=0, int? fp8_exponent_bits=-1, int? fp8_exponent_bias=-1) -> Tensor"); @@ -278,12 +271,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { m.def( "pruned_array_lookup(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); DISPATCH_TO_CPU("pruned_array_lookup", pruned_array_lookup_cpu); - - // GPU version of array lookup. - m.def( - "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); - DISPATCH_TO_CPU( - "pruned_array_lookup_from_row_idx", pruned_array_lookup_from_row_idx_cpu); } class PrunedMapCPU : public torch::jit::CustomClassHolder { diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu index 4b4345f1cc..e0a2f04ee8 100644 --- a/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu +++ b/fbgemm_gpu/codegen/embedding_forward_quantized_split_template.cu @@ -552,36 +552,6 @@ __global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_ } {% endif %} -{% if not weighted %} -template -__global__ __launch_bounds__(kMaxThreads) void int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel( - const at::PackedTensorAccessor32 update_row_indices, - const at::PackedTensorAccessor32 update_table_indices, - const at::PackedTensorAccessor32 index_remappings, - const at::PackedTensorAccessor32 index_remappings_offsets, - at::PackedTensorAccessor32 dense_indices) { - - const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= update_row_indices.size(0)) { - return; - } - const int table_idx = update_table_indices[idx]; - const auto row_idx = update_row_indices[idx]; - - const int64_t index_remappings_start = index_remappings_offsets[table_idx]; - const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; - const int64_t capacity = index_remappings_end - index_remappings_start; - - if (capacity > 0) { - dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; - } else { - dense_indices[idx] = row_idx; - } -} -{% endif %} - - - } {% for nobag in [True, False] %} @@ -1107,53 +1077,6 @@ Tensor pruned_array_lookup_cuda( C10_CUDA_KERNEL_LAUNCH_CHECK(); return dense_indices; } - -Tensor 
pruned_array_lookup_from_row_idx_cuda( - Tensor update_row_indices, - Tensor update_table_indices, - Tensor index_remappings, - Tensor index_remappings_offsets) { - - TENSOR_ON_CUDA_GPU(update_row_indices); - TENSOR_ON_CUDA_GPU(update_table_indices); - TENSOR_ON_CUDA_GPU(index_remappings); - TENSOR_ON_CUDA_GPU(index_remappings_offsets); - - at::cuda::OptionalCUDAGuard device_guard; - device_guard.set_index(update_table_indices.get_device()); - auto dense_indices = at::empty_like(update_row_indices); - const int32_t T = index_remappings_offsets.size(0) - 1; - - const auto num_indices = update_row_indices.numel(); - if (num_indices == 0) { - return dense_indices; - } - - TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); - TORCH_CHECK(update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); - TORCH_CHECK(update_table_indices.dim() == 1, "Tensor dim: ", update_table_indices.dim()); - TORCH_CHECK(index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); - TORCH_CHECK(index_remappings_offsets.dim() == 1, "Tensor dim: ", index_remappings_offsets.dim()); - TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); - constexpr size_t kForwardMaxThreads = 256; - - AT_DISPATCH_INDEX_TYPES( - update_row_indices.scalar_type(), "embedding_inplace_update_kernel", [&] { - nbit::int_nbit_split_embedding_codegen_forward_pruned_array_lookup_from_row_idx_kernel<<< - nbit::div_round_up(num_indices, kForwardMaxThreads), - kForwardMaxThreads, - 0, - at::cuda::getCurrentCUDAStream()>>>( - update_row_indices.packed_accessor32(), - update_table_indices.packed_accessor32(), - index_remappings.packed_accessor32(), - index_remappings_offsets.packed_accessor32(), - dense_indices.packed_accessor32() - ); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); - return dense_indices; -} {% endif %} // clang-format on diff --git a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h index 10670b48d4..cfa457d04b 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h +++ b/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h @@ -75,4 +75,28 @@ void embedding_inplace_update_cpu( c10::nullopt // Not used, to match cache interface for CUDA op ); +/** + * Index remapping function that returns the remapped indices. + * + * Args: + * update_row_indices: row indices for every new row + * update_table_indices: table indices for every new row + * index_remappings: concated index remapping for every embedding table + * index_remappings_offsets: offset for each embedding table + * + * Returns: + * remapped indices for each new row. 
+ */ +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets); + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update.cu b/fbgemm_gpu/src/embedding_inplace_update.cu index 1d0e394919..f301576a49 100644 --- a/fbgemm_gpu/src/embedding_inplace_update.cu +++ b/fbgemm_gpu/src/embedding_inplace_update.cu @@ -186,4 +186,98 @@ void embedding_inplace_update_cuda( }); } +template +__global__ +__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel( + const at::PackedTensorAccessor32 + update_row_indices, + const at::PackedTensorAccessor32 + update_table_indices, + const at::PackedTensorAccessor32 + index_remappings, + const at::PackedTensorAccessor32 + index_remappings_offsets, + at::PackedTensorAccessor32 + dense_indices) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= update_row_indices.size(0)) { + return; + } + const auto row_idx = update_row_indices[idx]; + if (idx >= update_table_indices.size(0)) { + return; + } + const int table_idx = update_table_indices[idx]; + + const int64_t index_remappings_start = index_remappings_offsets[table_idx]; + const int64_t index_remappings_end = index_remappings_offsets[table_idx + 1]; + const int64_t capacity = index_remappings_end - index_remappings_start; + + if (capacity > 0) { + dense_indices[idx] = index_remappings[index_remappings_start + row_idx]; + } else { + dense_indices[idx] = row_idx; + } +} + +Tensor pruned_array_lookup_from_row_idx_cuda( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CUDA_GPU(update_row_indices); + TENSOR_ON_CUDA_GPU(update_table_indices); + TENSOR_ON_CUDA_GPU(index_remappings); + TENSOR_ON_CUDA_GPU(index_remappings_offsets); + + at::cuda::OptionalCUDAGuard device_guard; + device_guard.set_index(update_table_indices.get_device()); + auto dense_indices = at::empty_like(update_row_indices); + const int32_t T = index_remappings_offsets.size(0) - 1; + + const auto num_indices = update_row_indices.numel(); + if (num_indices == 0) { + return dense_indices; + } + + TORCH_CHECK(index_remappings.size(0) < std::numeric_limits::max()); + TORCH_CHECK( + update_row_indices.dim() == 1, "Tensor dim: ", update_row_indices.dim()); + TORCH_CHECK( + update_table_indices.dim() == 1, + "Tensor dim: ", + update_table_indices.dim()); + TORCH_CHECK( + index_remappings.dim() == 1, "Tensor dim: ", index_remappings.dim()); + TORCH_CHECK( + index_remappings_offsets.dim() == 1, + "Tensor dim: ", + index_remappings_offsets.dim()); + TORCH_CHECK(dense_indices.dim() == 1, "Tensor dim: ", dense_indices.dim()); + constexpr size_t kForwardMaxThreads = 256; + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_kernel", + [&] { + pruned_array_lookup_from_row_idx_kernel<<< + nbit::div_round_up(num_indices, kForwardMaxThreads), + kForwardMaxThreads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + update_row_indices + .packed_accessor32(), + update_table_indices + .packed_accessor32(), + index_remappings + .packed_accessor32(), + index_remappings_offsets + .packed_accessor32(), + dense_indices + 
.packed_accessor32()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + return dense_indices; +} + } // namespace fbgemm_gpu diff --git a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp index bd1315e023..5f3a648872 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_cpu.cpp @@ -116,6 +116,53 @@ void embedding_inplace_update_cpu( }); } +Tensor pruned_array_lookup_from_row_idx_cpu( + const Tensor& update_row_indices, + const Tensor& update_table_indices, + const Tensor& index_remappings, + const Tensor& index_remappings_offsets) { + TENSOR_ON_CPU(update_row_indices); + TENSOR_ON_CPU(update_table_indices); + TENSOR_ON_CPU(index_remappings); + TENSOR_ON_CPU(index_remappings_offsets); + + auto dense_indices = empty_like(update_row_indices); + const auto num_indices = update_row_indices.numel(); + + AT_DISPATCH_INDEX_TYPES( + update_row_indices.scalar_type(), + "pruned_array_lookup_from_row_idx_cpu_kernel", + [&] { + const auto update_row_indices_acc = + update_row_indices.accessor(); + auto dense_indices_acc = dense_indices.accessor(); + const auto update_table_indices_acc = + update_table_indices.accessor(); + + const auto index_remappings_acc = + index_remappings.accessor(); + const auto index_remappings_offsets_acc = + index_remappings_offsets.accessor(); + + for (int64_t idx = 0; idx < num_indices; idx++) { + const int table_idx = update_table_indices_acc[idx]; + const auto row_idx = update_row_indices_acc[idx]; + int64_t index_remappings_start = + index_remappings_offsets_acc[table_idx]; + int64_t index_remappings_end = + index_remappings_offsets_acc[table_idx + 1]; + int64_t capacity = index_remappings_end - index_remappings_start; + if (capacity > 0) { + dense_indices_acc[idx] = + index_remappings_acc[index_remappings_start + row_idx]; + } else { + dense_indices_acc[idx] = row_idx; + } + } + }); + return dense_indices; +} + } // namespace fbgemm_gpu TORCH_LIBRARY_FRAGMENT(fbgemm, m) { @@ -127,3 +174,14 @@ TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { DISPATCH_TO_CPU( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cpu); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + m.def( + "pruned_array_lookup_from_row_idx(Tensor update_row_indices, Tensor update_table_indices, Tensor index_remappings, Tensor index_remappings_offsets) -> Tensor"); +} + +TORCH_LIBRARY_IMPL(fbgemm, CPU, m) { + DISPATCH_TO_CPU( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cpu); +} diff --git a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp index 743a902b68..cfb48c2427 100644 --- a/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp +++ b/fbgemm_gpu/src/embedding_inplace_update_gpu.cpp @@ -14,3 +14,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) { DISPATCH_TO_CUDA( "emb_inplace_update", fbgemm_gpu::embedding_inplace_update_cuda); } + +TORCH_LIBRARY_FRAGMENT(fbgemm, m) { + DISPATCH_TO_CUDA( + "pruned_array_lookup_from_row_idx", + fbgemm_gpu::pruned_array_lookup_from_row_idx_cuda); +} From 92305da6dfa1b9845b55fad85edbaf9374092eef Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 29 Mar 2023 22:05:47 -0700 Subject: [PATCH 27/34] jagged_softmax forward optimization (#1661) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1661 This diff optimizes jagged_softmax forward with more efficient reduction from cub library. 
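For context, the computation is a per-bag softmax over a jagged tensor: for each bag b, the rows offsets[b]..offsets[b+1] (capped at max_L) are normalized independently for each of the D feature columns. A minimal PyTorch reference of the forward semantics (illustrative; the kernel change below replaces the naive per-thread reduction loops with cub::BlockReduce block-wide reductions, the math itself is unchanged):

    import torch

    def jagged_softmax_forward_ref(values, offsets, max_L):
        # values: (total_rows, D), offsets: (B + 1,)
        output = torch.empty_like(values)
        for b in range(offsets.numel() - 1):
            start = int(offsets[b])
            length = min(int(offsets[b + 1]) - start, max_L)
            if length > 0:
                seg = values[start : start + length]
                # Numerically stable softmax along the jagged (row) dimension:
                # subtract the per-feature max before exponentiating.
                seg = seg - seg.max(dim=0, keepdim=True).values
                e = seg.exp()
                output[start : start + length] = e / e.sum(dim=0, keepdim=True)
        return output
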
Reviewed By: brad-mengchi Differential Revision: D44161021 fbshipit-source-id: bf2e059d14ef4d7ad311edac65155a463ba653ff --- fbgemm_gpu/src/jagged_tensor_ops.cu | 122 +++++++++++++++++++++------- 1 file changed, 92 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index e646d28be2..94400a58c6 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -12,6 +12,7 @@ #include #include #include +#include // clang-format off #include "fbgemm_gpu/cub_namespace_prefix.cuh" @@ -1824,39 +1825,101 @@ std::tuple batched_dense_vec_jagged_2d_mul_backward( return {v_grad, a_values_grad}; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const at::PackedTensorAccessor32 values, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 output, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += blockDim.x) { - scalar_t max_value = values[row_start][d]; - for (int l = 1; l < length; ++l) { - max_value = max(max_value, values[row_start + l][d]); + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t max_value; + __shared__ scalar_t exp_sum; + + const auto tid = threadIdx.x; + for (auto b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (auto d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + max_value = values[row_start][d]; + exp_sum = 0; } - at::acc_type acc = - exp(values[row_start][d] - max_value); - for (int l = 1; l < length; ++l) { - acc += exp(values[row_start + l][d] - max_value); + // Loop through all blocks to calculate the max value + // Each block has its own max value block_max_value, and + // max_value is the max value across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = values[row_start][d]; + if (l < length) { + thread_val = values[row_start + l][d]; + } + + // Collectively compute the block-wide max reduction + scalar_t block_max_value = + BlockReduceT(temp_storage).Reduce(thread_val, cub::Max()); + __syncthreads(); + + if (tid == 0) { + max_value = max(max_value, block_max_value); + } } - for (int l = 0; l < length; ++l) { - output[row_start + l][d] = - exp(values[row_start + l][d] - max_value) / acc; + // The max_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated max_value + __syncthreads(); + + // Loop through all blocks to calculate the sum of exp + // Each block has its own sum block_exp_acc, and + // exp_sum is the sum across all blocks + for (auto bk_l = 0; bk_l < num_l_blocks; 
bk_l++) { + auto l = bk_l * blockDim.x + tid; + + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + } + + // Collectively compute the block-wide sum reduction + scalar_t block_exp_sum = BlockReduceT(temp_storage).Sum(thread_exp); + __syncthreads(); + + if (tid == 0) { + exp_sum += block_exp_sum; + } } + + // The exp_sum was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated exp_sum + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + auto l = bk_l * blockDim.x + tid; + scalar_t thread_exp = 0; + if (l < length) { + thread_exp = std::exp(values[row_start + l][d] - max_value); + output[row_start + l][d] = thread_exp / exp_sum; + } + } + + // The max_value and exp_sum will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced values before reinitialization + __syncthreads(); } } } @@ -1872,14 +1935,13 @@ Tensor jagged_softmax_forward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(values.get_device()); - const int B = offsets.numel() - 1; - const int D = values.size(1); + const auto B = offsets.numel() - 1; + const auto D = values.size(1); auto output = at::empty_like(values); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_kernel_1", [&] { @@ -1889,9 +1951,9 @@ Tensor jagged_softmax_forward( values.scalar_type(), "jagged_softmax_kernel_2", [&] { - jagged_softmax_kernel - << + <<>>( values.packed_accessor32(), From 802b8dc454c9248b79bad51b91b291202d82106b Mon Sep 17 00:00:00 2001 From: Rengan Xu Date: Wed, 29 Mar 2023 22:05:47 -0700 Subject: [PATCH 28/34] jagged_softmax backward optimization (#1662) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1662 This diff optimizes jagged_softmax backward with more efficient reduction from cub library Reviewed By: brad-mengchi Differential Revision: D44205819 fbshipit-source-id: cd1d7a886d6ba68201dc1ad782c2e8cde7ff706b --- fbgemm_gpu/src/jagged_tensor_ops.cu | 97 +++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 94400a58c6..4e249d9553 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1968,35 +1968,76 @@ Tensor jagged_softmax_forward( return output; } -template +template __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const at::PackedTensorAccessor32 grad_output, const at::PackedTensorAccessor32 output, const at::PackedTensorAccessor32 offsets, at::PackedTensorAccessor32 grad_input, const int max_L) { - const int B = offsets.size(0) - 1; - const int D = grad_output.size(1); + const auto B = offsets.size(0) - 1; + const auto D = grad_output.size(1); - const int b_begin = blockIdx.x * blockDim.y + threadIdx.y; - const int b_step = gridDim.x * blockDim.y; - for (int b = b_begin; b < B; b += b_step) { - const int row_start = offsets[b]; - const int row_end = offsets[b + 1]; - const int length = min(row_end - row_start, max_L); - if (length != 0) { - // TODO: use shared memory and better reduction - for (int d = threadIdx.x; d < D; d += 
blockDim.x) { - scalar_t sum_value = grad_output[row_start][d] * output[row_start][d]; - for (int l = 1; l < length; ++l) { - sum_value += grad_output[row_start + l][d] * output[row_start + l][d]; + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + + // Allocate shared memory for BlockReduce + __shared__ typename BlockReduceT::TempStorage temp_storage; + + __shared__ scalar_t sum_value; + + const auto tid = threadIdx.x; + for (auto b = blockIdx.y; b < B; b += gridDim.y) { + const index_t row_start = offsets[b]; + const index_t row_end = offsets[b + 1]; + const auto length = min(row_end - row_start, (index_t)max_L); + + if (length > 0) { + const auto num_l_blocks = + (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + for (auto d = blockIdx.x; d < D; d += gridDim.x) { + if (tid == 0) { + sum_value = 0; + } + + // Loop through all blocks to calculate the sum value + // Each block has its own sum, and sum_value is the sum value across all + // blocks + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + scalar_t thread_val = 0; + if (l < length) { + thread_val = + grad_output[row_start + l][d] * output[row_start + l][d]; + } + + // Collectively compute the block-wide sum reduction + scalar_t block_sum_value = BlockReduceT(temp_storage).Sum(thread_val); + __syncthreads(); + + if (tid == 0) { + sum_value += block_sum_value; + } } - for (int l = 0; l < length; ++l) { - grad_input[row_start + l][d] = - (grad_output[row_start + l][d] - sum_value) * - output[row_start + l][d]; + // The sum_value was updated by thread 0 in the last loop, sync here to + // make sure the next loop uses the updated sum_value + __syncthreads(); + + for (auto bk_l = 0; bk_l < num_l_blocks; bk_l++) { + const auto l = bk_l * blockDim.x + tid; + if (l < length) { + grad_input[row_start + l][d] = + (grad_output[row_start + l][d] - sum_value) * + output[row_start + l][d]; + } } + + // The sum_value will be reinitialized by thread 0 in the + // next d iteration, sync here to make sure the last loop still uses the + // reduced value before reinitialization + __syncthreads(); } } } @@ -2014,14 +2055,13 @@ Tensor jagged_softmax_backward( at::cuda::OptionalCUDAGuard device_guard; device_guard.set_index(grad_output.get_device()); - const int B = offsets.numel() - 1; - const int D = grad_output.size(1); + const auto B = offsets.numel() - 1; + const auto D = grad_output.size(1); auto grad_input = at::empty_like(grad_output); if (B > 0 && D > 0) { - const int block_dim_x = - std::min(div_round_up(D, kWarpSize) * kWarpSize, kMaxThreads); - const int block_dim_y = kMaxThreads / block_dim_x; + constexpr int THREADS_PER_BLOCK = 128; + const dim3 grid(D, std::min((int32_t)B, (int32_t)kMaxBlockYDim), 1); AT_DISPATCH_INDEX_TYPES( offsets.scalar_type(), "jagged_softmax_backward_kernel_1", [&] { @@ -2031,9 +2071,12 @@ Tensor jagged_softmax_backward( grad_output.scalar_type(), "jagged_softmax_backward_kernel_2", [&] { - jagged_softmax_backward_kernel - << + <<>>( grad_output.packed_accessor32(), From b74d407863c30b7e4eb7048cce1d270139d1154b Mon Sep 17 00:00:00 2001 From: Geet Sethi Date: Thu, 30 Mar 2023 02:26:36 -0700 Subject: [PATCH 29/34] multi-gpu all_to_one improvements (#1674) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1674 improved multi-gpu all_to_one with: 1. new intermediate hop selection taking advantage of distinct NVLinks 2. 
overlapping of intermediate hop transfers with each-other and with direct-peer transfers Reviewed By: doehyun Differential Revision: D44285941 fbshipit-source-id: 0202083f04388b5ba60b8155809433f334993ef4 --- .../src/merge_pooled_embeddings_gpu.cpp | 219 ++++++++++++------ .../test/merge_pooled_embeddings_test.py | 2 +- 2 files changed, 154 insertions(+), 67 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index ed3c075bd0..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -23,72 +23,85 @@ using Tensor = at::Tensor; namespace { -// Hilariously unoptimized, but algorithmic correctness matters more here, and -// we only do it once. -AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { - auto world_size = at::cuda::getNumGPUs(); - auto intermediate_node = [&](Node i, Node j) { - if (i == j) { - return std::vector{-1}; - } - if (links(i, j) != 0) { - return std::vector{-1}; - } +struct DirectConnectedPeer { + int64_t num_peer_links; + int64_t peer_id; + // number of transfers from peer + int32_t peer_transfers; +}; - std::vector> paths; - for (const auto k : c10::irange(world_size)) { - if (k != i && k != j && links(i, k) != 0 && links(k, j) != 0) { - paths.push_back({k, links(i, k) + links(k, j)}); - } - } - if (paths.empty()) { - LOG(WARNING) - << "Expect very bad performance for p2p copies, we are going via sys path for GPU " - << i << " -> GPU " << j; - return std::vector{-1}; - } - auto mp = std::max_element( - paths.begin(), - paths.end(), - [](std::pair a, std::pair b) { - return a.second < b.second; - }) - ->second; - std::vector candidates; - for (const auto& p : paths) { - if (p.second == mp) { - candidates.push_back(p.first); - } - } - return candidates; - }; +struct TwoHopTransferContainer { + Tensor intermediate_tensor; + uint64_t output_idx; + std::unique_ptr transfer_cuda_event; +}; - std::vector assignments(world_size * world_size); - // Use a two-phase assignment protocol as the greedy approach - // can lead to unbalanced usage. 
- std::unordered_map uses; +AdjacencyMatrix get_intermediate_node( + const AdjacencyMatrix& links) { + const auto world_size = at::cuda::getNumGPUs(); + std::vector link_vec(static_cast(world_size * world_size)); for (const auto i : c10::irange(world_size)) { for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() == 1) { - auto v = ims.front(); - if (v != -1) { - uses[v] += 1; - } - assignments[i * world_size + j] = v; - } + link_vec[i * world_size + j] = links(i, j); } } + auto link_tensor = at::from_blob( + link_vec.data(), + {world_size, world_size}, + at::TensorOptions().dtype(at::kLong)); + LOG(INFO) << "NVLink Topology Matrix: \n" << link_tensor; + std::vector assignments( + static_cast(world_size * world_size), -1); + for (const auto dst_rank_id : c10::irange(world_size)) { + std::vector non_direct_src_ids; + non_direct_src_ids.reserve(world_size); + std::vector direct_connected_peers; + direct_connected_peers.reserve(world_size); + for (const auto src_rank_id : c10::irange(world_size)) { + if (dst_rank_id == src_rank_id) { + continue; + } - for (const auto i : c10::irange(world_size)) { - for (const auto j : c10::irange(world_size)) { - auto ims = intermediate_node(i, j); - if (ims.size() > 1) { - auto v = *std::min_element(ims.begin(), ims.end(), [&](Node a, Node b) { - return uses[a] < uses[b]; - }); - uses[v] += 1; - assignments[i * world_size + j] = v; + const auto num_peer_links = links(dst_rank_id, src_rank_id); + if (num_peer_links > 0) { + direct_connected_peers.push_back( + {.num_peer_links = num_peer_links, + .peer_id = src_rank_id, + .peer_transfers = 1}); + } else { + non_direct_src_ids.push_back(src_rank_id); + } + } + + // Assign intermediate hop ranks for non-directly connected peers. + // Assigns intermediate hops based on the number of links from the + // potential intermediate rank to target rank, as well as + // the number of two_hop connections already assigned to the + // intermediate rank. + for (const auto i : c10::irange(non_direct_src_ids.size())) { + std::sort( + direct_connected_peers.begin(), + direct_connected_peers.end(), + [](const auto& a, const auto& b) { + if (a.num_peer_links > b.num_peer_links) { + return true; + } else if (a.num_peer_links == b.num_peer_links) { + return a.peer_transfers < b.peer_transfers; + } else { + return false; + } + }); + const auto non_direct_src_id = non_direct_src_ids.at(i); + for (auto& j : direct_connected_peers) { + const auto potential_hop_id = j.peer_id; + const auto potential_hop_peer_links = + links(potential_hop_id, non_direct_src_id); + if (potential_hop_peer_links > 0) { + assignments[dst_rank_id * world_size + non_direct_src_id] = + potential_hop_id; + j.peer_transfers += 1; + break; + } } } } @@ -100,7 +113,8 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { {world_size, world_size}, at::TensorOptions().dtype(at::kLong)); LOG(INFO) << "Detected a multi-hop NVLink configuration: \n" << tensor; - return [=](Node i, Node j) { return assignments[i * world_size + j]; }; + return + [=](Node src, Node dst) { return assignments[dst * world_size + src]; }; } else { return [](Node, Node) { return -1; }; } @@ -111,7 +125,7 @@ AdjacencyMatrix get_intermediate_node(AdjacencyMatrix links) { // tensor in `input_tensors` is already in the `target_device`, we will skip // copy it if `skip_if_same_device` is true. 
 void all_to_one(
-    std::vector<Tensor>& input_tensors,
+    const std::vector<Tensor>& input_tensors,
     std::vector<Tensor>& output_tensors,
     at::Device target_device,
     bool skip_if_same_device) {
@@ -119,19 +133,48 @@ void all_to_one(
   std::vector<at::cuda::CUDAEvent> copy_begin_events(num_gpus);
   std::vector<at::cuda::CUDAEvent> copy_completion_events(num_gpus);
 
+  std::vector<TwoHopTransferContainer> two_hop_transfers;
+  two_hop_transfers.reserve(input_tensors.size());
+  std::vector<bool> is_two_hop_transfer;
+  is_two_hop_transfer.reserve(input_tensors.size());
+
   static auto intermediate_nodes =
       get_intermediate_node(fbgemm_gpu::get_nvlink_matrix());
-  for (auto& ten : input_tensors) {
-    Node src_device_id = ten.get_device();
+  for (const auto i : c10::irange(input_tensors.size())) {
+    const auto& src = input_tensors.at(i);
+    Node src_device_id = src.get_device();
     auto intermediate_node =
         intermediate_nodes(src_device_id, target_device.index());
     if (intermediate_node != -1) {
-      ten = ten.to(at::Device(at::kCUDA, intermediate_node));
+      two_hop_transfers.push_back(
+          {.intermediate_tensor = at::empty(
+               src.sizes(),
+               src.options().device(at::Device(at::kCUDA, intermediate_node))),
+           .output_idx = i,
+           .transfer_cuda_event =
+               std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming)});
+      auto& dst = two_hop_transfers.back().intermediate_tensor;
+      at::cuda::CUDAStream copy_stream =
+          at::cuda::getCurrentCUDAStream(src_device_id);
+      AT_CUDA_CHECK(cudaMemcpy2DAsync(
+          dst.data_ptr(),
+          dst.stride(0) * dst.element_size(),
+          src.data_ptr(),
+          src.stride(0) * src.element_size(),
+          src.size(1) * src.element_size(),
+          src.size(0),
+          cudaMemcpyDeviceToDevice,
+          copy_stream));
+      two_hop_transfers.back().transfer_cuda_event->record(copy_stream);
+      is_two_hop_transfer.push_back(true);
+    } else {
+      is_two_hop_transfer.push_back(false);
     }
   }
 
-  // For each source device, we sync its current stream and launch all the
-  // copies that are from that device.
+  // For each source device directly connected to the destination device, we
+  // sync its current stream and launch all the copies that are from that
+  // device.
for (const auto device_id : c10::irange(num_gpus)) { auto src_device = at::Device(at::kCUDA, device_id); if (src_device == target_device) { @@ -160,6 +203,13 @@ void all_to_one( device_guard.set_device(src_device); dst_ready.block(copy_stream); for (const auto i : c10::irange(input_tensors.size())) { + const auto metadata = is_two_hop_transfer.at(i); + // Initiate all transfer for tensors with direct + // NVLink connection to target rank + if (metadata) { + continue; + } + auto& src = input_tensors[i]; if (src.device() != src_device) { continue; @@ -179,6 +229,43 @@ void all_to_one( } } + // Complete 2-hop transfers to target rank + for (auto& two_hop_transfer : two_hop_transfers) { + const auto& src = two_hop_transfer.intermediate_tensor; + const auto src_device_id = src.get_device(); + const auto src_device = at::Device(at::kCUDA, src_device_id); + if (src_device == target_device) { + continue; + } + + // intermediate rank + at::cuda::CUDAGuard device_guard(src_device); + // intermediate rank stream + at::cuda::CUDAStream copy_stream = + at::cuda::getCurrentCUDAStream(src_device_id); + // wait on first hop transfer + two_hop_transfer.transfer_cuda_event->block(copy_stream); + // synchronize with target rank + auto& dst_ready = copy_begin_events[src_device_id]; + device_guard.set_device(target_device); + dst_ready.record(at::cuda::getCurrentCUDAStream(target_device.index())); + device_guard.set_device(src_device); + dst_ready.block(copy_stream); + // originating tensor output position + const auto output_index = two_hop_transfer.output_idx; + auto& dst = output_tensors.at(output_index); + // on source device, launch memcpy. + AT_CUDA_CHECK(cudaMemcpy2DAsync( + dst.data_ptr(), + dst.stride(0) * dst.element_size(), + src.data_ptr(), + src.stride(0) * src.element_size(), + src.size(1) * src.element_size(), + src.size(0), + cudaMemcpyDeviceToDevice, + copy_stream)); + } + // Do the same-GPU cases. if (!skip_if_same_device) { for (const auto i : c10::irange(input_tensors.size())) { diff --git a/fbgemm_gpu/test/merge_pooled_embeddings_test.py b/fbgemm_gpu/test/merge_pooled_embeddings_test.py index de7c80b79b..98e1ede2ee 100644 --- a/fbgemm_gpu/test/merge_pooled_embeddings_test.py +++ b/fbgemm_gpu/test/merge_pooled_embeddings_test.py @@ -100,7 +100,7 @@ def ref(pooled_ad_embeddings, batch_indices): r=st.randoms(use_true_random=False), ) # Can instantiate 8 contexts which takes a long time. - @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) + @settings(verbosity=Verbosity.verbose, max_examples=40, deadline=None) def test_all_to_one_device( self, num_inputs, From 177ba08dbc3bae56d11e4c89bdf19b04974374dc Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Thu, 30 Mar 2023 07:33:20 -0700 Subject: [PATCH 30/34] Extract and export weights offsets/placements initialization functions (#1669) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1669 Extract portions initializing the weights_placements/offsets tensors into separate functions and jit.export them. SplitState is converted to a NamedTuple since we can't jit.script a dataclass that also holds an enum. 
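
For readers skimming the patch: the NamedTuple keeps the same field names and
access pattern as the dataclass it replaces, so call sites are unchanged; only
the construction form changes so TorchScript can handle it. A minimal
standalone sketch, editorially added with illustrative sizes (the
EmbeddingLocation values mirror the DEVICE/MANAGED/MANAGED_CACHING/HOST
mapping used elsewhere in this file):

    import enum
    from typing import List, NamedTuple

    class EmbeddingLocation(enum.IntEnum):
        DEVICE = 0
        MANAGED = 1
        MANAGED_CACHING = 2
        HOST = 3

    # Functional NamedTuple form, same shape as the patch's SplitState.
    SplitState = NamedTuple(
        "SplitState",
        [
            ("dev_size", int),
            ("host_size", int),
            ("uvm_size", int),
            ("placements", List[EmbeddingLocation]),
            ("offsets", List[int]),
        ],
    )

    # Fields are read exactly like the old dataclass attributes.
    state = SplitState(
        dev_size=128,
        host_size=0,
        uvm_size=64,
        placements=[EmbeddingLocation.DEVICE, EmbeddingLocation.MANAGED],
        offsets=[0, 128],
    )
    assert state.placements[1] == EmbeddingLocation.MANAGED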
Reviewed By: houseroad Differential Revision: D44338256 fbshipit-source-id: e1c12e5956f7217d51cd190958c3764d220e521d --- .../split_table_batched_embeddings_ops.py | 107 +++++++++++------- 1 file changed, 68 insertions(+), 39 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index ff8ce4d094..c327d359cc 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -115,14 +115,16 @@ class CounterBasedRegularizationDefinition: [("record_cache_miss_counter", bool), ("record_tablewise_cache_miss", bool)], ) - -@dataclass -class SplitState: - dev_size: int - host_size: int - uvm_size: int - placements: List[EmbeddingLocation] - offsets: List[int] +SplitState: NamedTuple = NamedTuple( + "SplitState", + [ + ("dev_size", int), + ("host_size", int), + ("uvm_size", int), + ("placements", List[EmbeddingLocation]), + ("offsets", List[int]), + ], +) def construct_split_state( @@ -132,11 +134,11 @@ def construct_split_state( precision: SparseType = SparseType.FP32, int8_emb_row_dim_offset: int = INT8_EMB_ROW_DIM_OFFSET, ) -> SplitState: - placements = [] - offsets = [] - dev_size = 0 - host_size = 0 - uvm_size = 0 + placements: List[EmbeddingLocation] = [] + offsets: List[int] = [] + dev_size: int = 0 + host_size: int = 0 + uvm_size: int = 0 for num_embeddings, embedding_dim, location, _ in embedding_specs: assert ( embedding_dim % 4 == 0 @@ -1935,8 +1937,8 @@ def nbit_construct_split_state( scale_bias_size_in_bytes: int = DEFAULT_SCALE_BIAS_SIZE_IN_BYTES, cacheline_alignment: bool = True, ) -> SplitState: - placements = [] - offsets = [] + placements = torch.jit.annotate(List[EmbeddingLocation], []) + offsets = torch.jit.annotate(List[int], []) dev_size = 0 host_size = 0 uvm_size = 0 @@ -1984,6 +1986,8 @@ class IntNBitTableBatchedEmbeddingBagsCodegen(nn.Module): cache_miss_counter: torch.Tensor uvm_cache_stats: torch.Tensor local_uvm_cache_stats: torch.Tensor + weights_offsets: torch.Tensor + weights_placements: torch.Tensor def __init__( self, @@ -2165,21 +2169,7 @@ def max_ty_D(ty: SparseType) -> int: ] self.max_D_cache: int = max(cached_dims) if len(cached_dims) > 0 else 0 - weight_split: SplitState = nbit_construct_split_state( - self.embedding_specs, - cacheable=True, - row_alignment=self.row_alignment, - scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, - cacheline_alignment=cacheline_alignment, - ) - - self.weights_physical_placements: List[int] = [ - t.value for t in weight_split.placements - ] - self.weights_physical_offsets: List[int] = weight_split.offsets - self.host_size: int = weight_split.host_size - self.dev_size: int = weight_split.dev_size - self.uvm_size: int = weight_split.uvm_size + self.initialize_physical_weights_placements_and_offsets(cacheline_alignment) self.enforce_hbm: bool = enforce_hbm # Assign weights after weights and weights_offsets are initialized. @@ -2192,7 +2182,8 @@ def max_ty_D(ty: SparseType) -> int: self.weights_physical_offsets, self.enforce_hbm, ) - self.assign_embedding_weights(weight_lists) # type: ignore + # pyre-fixme [6]: In call `IntNBitTableBatchedEmbeddingBagsCodegen.assign_embedding_weights`, for 1st positional argument, expected `List[Tuple[Tensor, Optional[Tensor]]]` but got `List[Tuple[Tensor, Tensor]]`. + self.assign_embedding_weights(weight_lists) # Handle index remapping for embedding pruning. 
self.register_buffer( @@ -2654,6 +2645,51 @@ def forward( fp8_exponent_bias=self.fp8_exponent_bias, ) + def initialize_logical_weights_placements_and_offsets( + self, + ) -> None: + assert len(self.weights_physical_offsets) == len(self.embedding_specs) + assert len(self.weights_physical_offsets) == len( + self.weights_physical_placements + ) + offsets = [self.weights_physical_offsets[t] for t in self.feature_table_map] + placements = [ + self.weights_physical_placements[t] for t in self.feature_table_map + ] + self.weights_offsets = torch.tensor( + offsets, device=self.current_device, dtype=torch.int64 + ) + self.weights_placements = torch.tensor( + placements, device=self.current_device, dtype=torch.int32 + ) + + def initialize_physical_weights_placements_and_offsets( + self, + cacheline_alignment: bool = True, + ) -> None: + # Initialize physical weights placements and offsets + # and host/dev/uvm sizes + weight_split: SplitState = nbit_construct_split_state( + self.embedding_specs, + cacheable=True, + row_alignment=self.row_alignment, + scale_bias_size_in_bytes=self.scale_bias_size_in_bytes, + cacheline_alignment=cacheline_alignment, + ) + self.weights_physical_placements = [t.value for t in weight_split.placements] + self.weights_physical_offsets = weight_split.offsets + self.host_size = weight_split.host_size + self.dev_size = weight_split.dev_size + self.uvm_size = weight_split.uvm_size + + @torch.jit.export + def reset_weights_placements_and_offsets( + self, + ) -> None: + # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor + self.initialize_physical_weights_placements_and_offsets() + self.initialize_logical_weights_placements_and_offsets() + def _apply_split( self, dev_size: int, @@ -2672,14 +2708,7 @@ def _apply_split( self.dev_size = dev_size self.uvm_size = uvm_size - offsets = [offsets[t] for t in self.feature_table_map] - placements = [placements[t] for t in self.feature_table_map] - self.weights_offsets = torch.tensor( - offsets, device=self.current_device, dtype=torch.int64 - ) - self.weights_placements = torch.tensor( - placements, device=self.current_device, dtype=torch.int32 - ) + self.initialize_logical_weights_placements_and_offsets() if dev_size > 0: self.weights_dev = torch.zeros( From d559a109432222fb827f8cc462eb298d04900901 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 30 Mar 2023 22:55:25 -0700 Subject: [PATCH 31/34] Fix the ROCm Test Job (#1668) Summary: - Clean up the ROCm test job and re-enable ROCm testing on the rocm instances. - Update the build scripts framework to build FBGEMM_GPU against the correct hardware target that it is intended to be tested on. One thing that was discovered was that if FBGEMM_GPU was built with `PYTORCH_ROCM_ARCH=gfx90a` but run on `gfx908` target, the tests will fail with a segfault. While the failure is expected, the segfault can be unfriendly and confusing for users. 
- Enable correct compilation of `merge_pooled_embeddings` operator under ROCm - Fix existing code in `jagged_tensor_ops` from PR https://github.com/pytorch/FBGEMM/issues/1661 and https://github.com/pytorch/FBGEMM/issues/1662 that break its compilation under ROCm 5.3 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1668 Reviewed By: shintaro-iwasaki Differential Revision: D44453594 Pulled By: q10 fbshipit-source-id: 2030cd0e00c6ff9694c2783dfd62c31cf5543da2 --- .github/scripts/setup_env.bash | 273 +++++++++++++----- .github/workflows/fbgemm_gpu_ci.yml | 100 ++++--- .github/workflows/fbgemm_gpu_cuda_nightly.yml | 2 +- .github/workflows/fbgemm_gpu_cuda_release.yml | 2 +- .github/workflows/fbgemm_gpu_lint.yml | 14 +- fbgemm_gpu/CMakeLists.txt | 7 +- fbgemm_gpu/src/jagged_tensor_ops.cu | 8 +- .../src/merge_pooled_embeddings_gpu.cpp | 8 + 8 files changed, 287 insertions(+), 127 deletions(-) diff --git a/.github/scripts/setup_env.bash b/.github/scripts/setup_env.bash index 57da549463..9cf928883c 100755 --- a/.github/scripts/setup_env.bash +++ b/.github/scripts/setup_env.bash @@ -261,7 +261,6 @@ print_gpu_info () { echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" return 1 fi - else if which nvidia-smi; then # If nvidia-smi is installed on a machine without GPUs, this will return error @@ -270,6 +269,21 @@ print_gpu_info () { echo "[CHECK] nvidia-smi not found" fi fi + + if [[ "${ENFORCE_AMD_GPU}" ]]; then + # Ensure that rocm-smi is available and returns GPU entries + if ! rocm-smi; then + echo "[CHECK] AMD driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!" + return 1 + fi + else + if which rocm-smi; then + # If rocm-smi is installed on a machine without GPUs, this will return error + (print_exec rocm-smi) || true + else + echo "[CHECK] rocm-smi not found" + fi + fi } __print_system_info_linux () { @@ -1102,6 +1116,103 @@ prepare_fbgemm_gpu_build () { echo "[BUILD] Successfully ran git submodules update" } +__configure_fbgemm_gpu_build_cpu () { + # Update the package name and build args depending on if CUDA is specified + echo "[BUILD] Setting CPU-only build args ..." + build_args=(--cpu_only) +} + +__configure_fbgemm_gpu_build_rocm () { + local fbgemm_variant_targets="$1" + + # Fetch available ROCm architectures on the machine + if [ "$fbgemm_variant_targets" != "" ]; then + echo "[BUILD] ROCm targets have been manually provided: ${fbgemm_variant_targets}" + local arch_list="${fbgemm_variant_targets}" + else + if which rocminfo; then + # shellcheck disable=SC2155 + local arch_list=$(rocminfo | grep -o -m 1 'gfx.*') + echo "[BUILD] Architectures list from rocminfo: ${arch_list}" + + if [ "$arch_list" == "" ]; then + # By default, build for MI250 only to save time + local arch_list=gfx90a + fi + else + echo "[BUILD] rocminfo not found in PATH!" + fi + fi + + echo "[BUILD] Setting the following ROCm targets: ${arch_list}" + print_exec conda env config vars set -n "${env_name}" PYTORCH_ROCM_ARCH="${arch_list}" + + echo "[BUILD] Setting ROCm build args ..." 
+ build_args=() +} + +__configure_fbgemm_gpu_build_cuda () { + local fbgemm_variant_targets="$1" + + # Check nvcc is visible + (test_binpath "${env_name}" nvcc) || return 1 + + # Check that cuDNN environment variables are available + (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 + (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 + (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 + + local arch_list="${fbgemm_variant_targets:-7.0;8.0}" + echo "[BUILD] Setting the following CUDA targets: ${arch_list}" + + # Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI. + echo "[BUILD] Setting CUDA build args ..." + # shellcheck disable=SC2155 + local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) + build_args=( + --nvml_lib_path="${nvml_lib_path}" + -DTORCH_CUDA_ARCH_LIST="'${arch_list}'" + ) +} + +__configure_fbgemm_gpu_build () { + local fbgemm_variant="$1" + local fbgemm_variant_targets="$2" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" + echo "Example(s):" + echo " ${FUNCNAME[0]} cpu # CPU-only variant" + echo " ${FUNCNAME[0]} cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + else + echo "################################################################################" + echo "# Configure FBGEMM-GPU Build" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + fi + + if [ "$fbgemm_variant" == "cpu" ]; then + echo "[BUILD] Configuring build as CPU variant ..." + __configure_fbgemm_gpu_build_cpu + + elif [ "$fbgemm_variant" == "rocm" ]; then + echo "[BUILD] Configuring build as ROCm variant ..." + __configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}" + + else + echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..." + __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}" + fi + + # shellcheck disable=SC2145 + echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}" +} + __build_fbgemm_gpu_common_pre_steps () { # Private function that uses variables instantiated by its caller @@ -1112,43 +1223,12 @@ __build_fbgemm_gpu_common_pre_steps () { (test_binpath "${env_name}" g++) || return 1 if [ "$fbgemm_variant" == "cpu" ]; then - echo "[BUILD] Proceeding to build CPU variant" - - # Update the package name and build args depending on if CUDA is specified - echo "[BUILD] Applying CPU-only build args ..." - build_args=(--cpu_only) package_name="${package_name}-cpu" - elif [ "$fbgemm_variant" == "rocm" ]; then - echo "[BUILD] Proceeding to build ROCm variant" - - (test_env_var "${env_name}" PYTORCH_ROCM_ARCH) || return 1 - - echo "[BUILD] Applying ROCm build args ..." - build_args=() package_name="${package_name}-rocm" - else # Set to the default variant - fbgemm_variant="gpu" - echo "[BUILD] Proceeding to build GPU variant (default)" - - # Check nvcc is visible - (test_binpath "${env_name}" nvcc) || return 1 - - # Check that cuDNN environment variables are available - (test_env_var "${env_name}" CUDNN_INCLUDE_DIR) || return 1 - (test_env_var "${env_name}" CUDNN_LIBRARY) || return 1 - (test_env_var "${env_name}" NVML_LIB_PATH) || return 1 - - # Build only CUDA 7.0 and 8.0 (i.e. 
V100 and A100) because of 100 MB binary size limits from PyPI. - echo "[BUILD] Applying GPU build args ..." - # shellcheck disable=SC2155 - local nvml_lib_path=$(conda run -n "${env_name}" printenv NVML_LIB_PATH) - build_args=( - --nvml_lib_path="${nvml_lib_path}" - -DTORCH_CUDA_ARCH_LIST='7.0;8.0' - ) + fbgemm_variant="cuda" fi # Extract the Python tag @@ -1168,12 +1248,14 @@ __build_fbgemm_gpu_common_pre_steps () { print_exec git diff } -check_fbgemm_gpu_build () { +run_fbgemm_gpu_postbuild_checks () { local fbgemm_variant="$1" if [ "$fbgemm_variant" == "" ]; then echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT" echo "Example(s):" echo " ${FUNCNAME[0]} cpu" + echo " ${FUNCNAME[0]} cuda" + echo " ${FUNCNAME[0]} rocm" return 1 fi @@ -1194,7 +1276,13 @@ check_fbgemm_gpu_build () { ) # Add more symbols to check for if it's a non-CPU variant - if [ "${fbgemm_variant}" != "cpu" ]; then + if [ "${fbgemm_variant}" == "cuda" ]; then + lib_symbols_to_check+=( + fbgemm_gpu::asynchronous_inclusive_cumsum_gpu + fbgemm_gpu::merge_pooled_embeddings + ) + elif [ "${fbgemm_variant}" == "rocm" ]; then + # merge_pooled_embeddings is missing in ROCm builds bc it requires NVML lib_symbols_to_check+=( fbgemm_gpu::asynchronous_inclusive_cumsum_gpu fbgemm_gpu::merge_pooled_embeddings @@ -1218,27 +1306,32 @@ build_fbgemm_gpu_package () { env_name="$1" package_name="$2" fbgemm_variant="$3" - if [ "$package_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME [CPU_ONLY]" + fbgemm_variant_targets="$4" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME PACKAGE_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly # Build the full wheel package" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # Build the CPU-only variant of the wheel package" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu_nightly rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build FBGEMM-GPU Package (Wheel)" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build FBGEMM-GPU Package (Wheel)" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # manylinux1_x86_64 is specified for PyPI upload # Distribute Python extensions as wheels on Linux - echo "[BUILD] Building FBGEMM-GPU (VARIANT=${fbgemm_variant}) wheel ..." + echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..." 
print_exec conda run -n "${env_name}" \ python setup.py bdist_wheel \ --package_name="${package_name}" \ @@ -1247,7 +1340,7 @@ build_fbgemm_gpu_package () { "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[BUILD] Enumerating the built wheels ..." print_exec ls -lth dist/*.whl @@ -1261,32 +1354,37 @@ build_fbgemm_gpu_package () { build_fbgemm_gpu_install () { env_name="$1" fbgemm_variant="$2" - if [ "$env_name" == "" ]; then - echo "Usage: ${FUNCNAME[0]} ENV_NAME [CPU_ONLY]" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Build + install the package" - echo " ${FUNCNAME[0]} build_env cpu # Build + Install the CPU-only variant of the package" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" return 1 - else - echo "################################################################################" - echo "# Build + Install FBGEMM-GPU Package" - echo "#" - echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" - echo "################################################################################" - echo "" fi - # Run all the common FBGEMM-GPU build pre-steps (set up variables) + # Set up and configure the build __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" # Parallelism may need to be limited to prevent the build from being # canceled for going over ulimits - echo "[BUILD] Building and installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." print_exec conda run -n "${env_name}" \ python setup.py install "${build_args[@]}" # Run checks on the built libraries - (check_fbgemm_gpu_build "${fbgemm_variant}") || return 1 + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 echo "[INSTALL] Checking imports ..." 
# Exit this directory to prevent import clashing, since there is an @@ -1297,6 +1395,44 @@ build_fbgemm_gpu_install () { echo "[BUILD] FBGEMM-GPU build + install completed" } +build_fbgemm_gpu_develop () { + env_name="$1" + fbgemm_variant="$2" + fbgemm_variant_targets="$3" + if [ "$fbgemm_variant" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME VARIANT [TARGETS]" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env cpu # CPU-only variant" + echo " ${FUNCNAME[0]} build_env cuda # CUDA variant for default target(s)" + echo " ${FUNCNAME[0]} build_env cuda '7.0;8.0' # CUDA variant for custom target(s)" + echo " ${FUNCNAME[0]} build_env rocm # ROCm variant for default target(s)" + echo " ${FUNCNAME[0]} build_env rocm 'gfx906;gfx908;gfx90a' # ROCm variant for custom target(s)" + return 1 + fi + + # Set up and configure the build + __build_fbgemm_gpu_common_pre_steps || return 1 + __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1 + + echo "################################################################################" + echo "# Build + Install FBGEMM-GPU Package" + echo "#" + echo "# [TIMESTAMP] $(date --utc +%FT%T.%3NZ)" + echo "################################################################################" + echo "" + + # Parallelism may need to be limited to prevent the build from being + # canceled for going over ulimits + echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..." + print_exec conda run -n "${env_name}" \ + python setup.py build develop "${build_args[@]}" + + # Run checks on the built libraries + (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1 + + echo "[BUILD] FBGEMM-GPU build + develop completed" +} + build_fbgemm_gpu_docs () { env_name="$1" if [ "$env_name" == "" ]; then @@ -1357,7 +1493,7 @@ install_fbgemm_gpu_package () { ################################################################################ -# Test Functions +# FBGEMM_GPU Test Functions ################################################################################ run_fbgemm_gpu_tests () { @@ -1366,7 +1502,7 @@ run_fbgemm_gpu_tests () { if [ "$env_name" == "" ]; then echo "Usage: ${FUNCNAME[0]} ENV_NAME [FBGEMM_VARIANT]" echo "Example(s):" - echo " ${FUNCNAME[0]} build_env # Run all tests applicable to GPU (Nvidia)" + echo " ${FUNCNAME[0]} build_env # Run all tests applicable to CUDA" echo " ${FUNCNAME[0]} build_env cpu # Run all tests applicable to CPU" echo " ${FUNCNAME[0]} build_env rocm # Run all tests applicable to ROCm" return 1 @@ -1398,7 +1534,10 @@ run_fbgemm_gpu_tests () { uvm_test.py ) elif [ "$fbgemm_variant" == "rocm" ]; then - local ignored_tests=() + # https://github.com/pytorch/FBGEMM/issues/1559 + local ignored_tests=( + batched_unary_embeddings_test.py + ) else local ignored_tests=() fi @@ -1430,7 +1569,7 @@ run_fbgemm_gpu_tests () { ################################################################################ -# Publish Functions +# FBGEMM_GPU Publish Functions ################################################################################ publish_to_pypi () { diff --git a/.github/workflows/fbgemm_gpu_ci.yml b/.github/workflows/fbgemm_gpu_ci.yml index 646c9de168..50e7c3814b 100644 --- a/.github/workflows/fbgemm_gpu_ci.yml +++ b/.github/workflows/fbgemm_gpu_ci.yml @@ -86,13 +86,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU-ROCM Nightly - run: | - . $PRELUDE - cd fbgemm_gpu - - # Build for MI250 only to save time. 
- print_exec conda env config vars set -n $BUILD_ENV PYTORCH_ROCM_ARCH=gfx90a - print_exec conda run -n $BUILD_ENV python setup.py build develop + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a - name: Test FBGEMM_GPU-ROCM Nightly Installation timeout-minutes: 10 @@ -100,54 +94,66 @@ jobs: test_amd_gpu: - if: ${{ false }} # Disable the job for now runs-on: rocm + container: + image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete" + options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined + defaults: + run: + shell: bash + env: + PRELUDE: .github/scripts/setup_env.bash + BUILD_ENV: build_binary + ENFORCE_AMD_GPU: 1 strategy: fail-fast: false matrix: - os: [ubuntu-latest] + # ROCm machines are limited, so we only test against Python 3.10 + python-version: [ "3.10" ] + rocm-version: [ "5.3", "5.4.2" ] steps: - - name: pre-checkout - shell: bash + - name: Setup Build Container run: | - if [ -d ${{ github.workspace }} ] - then - sudo chown -R $USER:$USER ${{ github.workspace }} - fi - sudo add-apt-repository ppa:git-core/ppa - sudo apt update - sudo apt -y install --only-upgrade git - - - uses: actions/checkout@v3 + apt update -y + apt install -y git wget + git config --global --add safe.directory '*' + + - name: Checkout the Repository + uses: actions/checkout@v3 with: - ref: ${{ github.ref }} - submodules: 'true' + submodules: true - - name: build fbgemm_gpu and test - shell: bash - run: | - set -eux - env - ls -l - DOCKER_IMAGE=rocm/pytorch:rocm5.4_ubuntu20.04_py3.8_pytorch_staging_base - docker pull $DOCKER_IMAGE - JENKINS_REPO_DIR=fbgemm-private-jenkins - JENKINS_REPO_DIR_BAREMETAL=$PWD - JENKINS_REPO_DIR_DOCKER=/workspace/$JENKINS_REPO_DIR - DOCKER_OPTIONS="\ - --user 0 \ - --network=host \ - --ipc=host \ - --shm-size 16G \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device=/dev/kfd \ - --device=/dev/dri \ - -v $JENKINS_REPO_DIR_BAREMETAL:$JENKINS_REPO_DIR_DOCKER - " - docker run $DOCKER_OPTIONS $DOCKER_IMAGE $JENKINS_REPO_DIR_DOCKER/.jenkins/rocm/build_and_test.sh $JENKINS_REPO_DIR_DOCKER + - name: Display System Info + run: . $PRELUDE; print_system_info + + - name: Display GPU Info + run: . $PRELUDE; print_gpu_info + + - name: Free Disk Space + run: . $PRELUDE; free_disk_space + + - name: Setup Miniconda + run: . $PRELUDE; setup_miniconda $HOME/miniconda + + - name: Create Conda Environment + run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }} + + - name: Install Build Tools + run: . $PRELUDE; install_build_tools $BUILD_ENV + + - name: Install PyTorch-ROCm Nightly + run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }} + + - name: Prepare FBGEMM_GPU Build + run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV + + - name: Build FBGEMM_GPU-ROCM Nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm + + - name: Test FBGEMM_GPU-ROCM Nightly Installation + timeout-minutes: 15 + run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm build_and_test_cpu: @@ -203,6 +209,6 @@ jobs: - name: Build + Install FBGEMM_GPU (CPU version) run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_install $BUILD_ENV cpu - - name: Test with PyTest + - name: Test FBGEMM_GPU-CPU Nightly Installation timeout-minutes: 10 run: . 
$PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu diff --git a/.github/workflows/fbgemm_gpu_cuda_nightly.yml b/.github/workflows/fbgemm_gpu_cuda_nightly.yml index 7ccdbcbf3e..c08d088991 100644 --- a/.github/workflows/fbgemm_gpu_cuda_nightly.yml +++ b/.github/workflows/fbgemm_gpu_cuda_nightly.yml @@ -97,7 +97,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU Nightly - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu_nightly cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 diff --git a/.github/workflows/fbgemm_gpu_cuda_release.yml b/.github/workflows/fbgemm_gpu_cuda_release.yml index 7516e6a021..3a41125170 100644 --- a/.github/workflows/fbgemm_gpu_cuda_release.yml +++ b/.github/workflows/fbgemm_gpu_cuda_release.yml @@ -88,7 +88,7 @@ jobs: run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV - name: Build FBGEMM_GPU - run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu + run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV fbgemm_gpu cuda - name: Upload Built Wheel as GHA Artifact uses: actions/upload-artifact@v3 diff --git a/.github/workflows/fbgemm_gpu_lint.yml b/.github/workflows/fbgemm_gpu_lint.yml index 1ff7203108..8a484e9844 100644 --- a/.github/workflows/fbgemm_gpu_lint.yml +++ b/.github/workflows/fbgemm_gpu_lint.yml @@ -6,10 +6,14 @@ name: FBGEMM_GPU Lint on: + # PR Trigger + # push: branches: - main + # Push Trigger (enable to catch errors coming out of multiple merges) + # pull_request: branches: - main @@ -20,11 +24,11 @@ concurrency: cancel-in-progress: true jobs: - run_pylint: + run-lint: runs-on: ubuntu-latest strategy: matrix: - python-version: [ "3.8" ] + python-version: [ "3.10" ] steps: - uses: actions/checkout@v3 @@ -38,7 +42,7 @@ jobs: python -m pip install --upgrade pip pip install click flake8 ufmt - - name: Analyzing the code with flake8 + - name: Analyzing the Code with flake8 run: | echo "::add-matcher::fbgemm_gpu/test/lint/flake8_problem_matcher.json" flake8 --ignore=E501,W503,E203 . 
@@ -46,13 +50,13 @@ jobs: # W503 = line break before binary operator (deprecated) # E203 = whitespace before ":" - - name: Analyzing the code with ufmt + - name: Analyzing the Code with ufmt run: | ufmt diff fbgemm_gpu/fbgemm_gpu ufmt diff fbgemm_gpu/test ufmt diff fbgemm_gpu/bench - - name: Check Meta copyright header + - name: Check Meta Copyright Header run: | python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/fbgemm_gpu --fixit=False python fbgemm_gpu/test/lint/check_meta_header.py --path=./fbgemm_gpu/test --fixit=False diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 1fb8f397e0..2276ca9ff2 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -344,6 +344,10 @@ if(NOT FBGEMM_CPU_ONLY) if(NVML_LIB_PATH) message(STATUS "Found NVML_LIB_PATH: ${NVML_LIB_PATH}") + endif() + + if(NVML_LIB_PATH OR USE_ROCM) + message(STATUS "Adding merge_pooled_embeddings sources") list( APPEND fbgemm_gpu_sources_cpu @@ -351,8 +355,7 @@ if(NOT FBGEMM_CPU_ONLY) src/merge_pooled_embeddings_gpu.cpp src/topology_utils.cpp) else() - message(STATUS - "Could not find NVML_LIB_PATH; skipping certain sources into the build") + message(STATUS "Skipping merge_pooled_embeddings sources") endif() endif() diff --git a/fbgemm_gpu/src/jagged_tensor_ops.cu b/fbgemm_gpu/src/jagged_tensor_ops.cu index 4e249d9553..62cef01113 100644 --- a/fbgemm_gpu/src/jagged_tensor_ops.cu +++ b/fbgemm_gpu/src/jagged_tensor_ops.cu @@ -1844,7 +1844,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( __shared__ scalar_t exp_sum; const auto tid = threadIdx.x; - for (auto b = blockIdx.y; b < B; b += gridDim.y) { + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { const index_t row_start = offsets[b]; const index_t row_end = offsets[b + 1]; const auto length = min(row_end - row_start, (index_t)max_L); @@ -1853,7 +1853,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_kernel( const auto num_l_blocks = (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - for (auto d = blockIdx.x; d < D; d += gridDim.x) { + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { if (tid == 0) { max_value = values[row_start][d]; exp_sum = 0; @@ -1987,7 +1987,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( __shared__ scalar_t sum_value; const auto tid = threadIdx.x; - for (auto b = blockIdx.y; b < B; b += gridDim.y) { + for (uint32_t b = blockIdx.y; b < B; b += gridDim.y) { const index_t row_start = offsets[b]; const index_t row_end = offsets[b + 1]; const auto length = min(row_end - row_start, (index_t)max_L); @@ -1996,7 +1996,7 @@ __global__ __launch_bounds__(kMaxThreads) void jagged_softmax_backward_kernel( const auto num_l_blocks = (length + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - for (auto d = blockIdx.x; d < D; d += gridDim.x) { + for (uint32_t d = blockIdx.x; d < D; d += gridDim.x) { if (tid == 0) { sum_value = 0; } diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index d03b961a79..8257faff9b 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -20,6 +20,14 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/topology_utils.h" +// For some reason, hipify fails to replace the macro names when compiling for +// ROCm, so we manually replace it here. 
Name mapping based on: +// https://github.com/pytorch/pytorch/blob/master/torch/utils/hipify/cuda_to_hip_mappings.py +#ifdef __HIP_PLATFORM_HCC__ +#define C10_CUDA_CLEAR_ERROR C10_HIP_CLEAR_ERROR +#define C10_CUDA_ERROR_HANDLED C10_HIP_ERROR_HANDLED +#endif + using Tensor = at::Tensor; namespace { From 1ac526f7935c432911f887a0a113f53934a1ea98 Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Fri, 31 Mar 2023 09:36:10 -0700 Subject: [PATCH 32/34] Use exported functions instead of calling initialize_weights in weights loading (#1676) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1676 Export a function to reset the embedding specs by target location Reviewed By: RoshanPAN, houseroad Differential Revision: D44338258 fbshipit-source-id: 502733e9f3a164450a02656d2822492fbf69f994 --- .../split_table_batched_embeddings_ops.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index c327d359cc..f8ad2ccaf2 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -2684,12 +2684,33 @@ def initialize_physical_weights_placements_and_offsets( @torch.jit.export def reset_weights_placements_and_offsets( - self, + self, device: torch.device, location: int ) -> None: + # Reset device/location denoted in embedding specs + self.reset_embedding_spec_location(device, location) # Initialize all physical/logical weights placements and offsets without initializing large dev weights tensor self.initialize_physical_weights_placements_and_offsets() self.initialize_logical_weights_placements_and_offsets() + def reset_embedding_spec_location( + self, device: torch.device, location: int + ) -> None: + # Overwrite location in embedding_specs with new location + # Use map since can't script enum call (ie. 
EmbeddingLocation(value)) + INT_TO_EMBEDDING_LOCATION = { + 0: EmbeddingLocation.DEVICE, + 1: EmbeddingLocation.MANAGED, + 2: EmbeddingLocation.MANAGED_CACHING, + 3: EmbeddingLocation.HOST, + } + target_location = INT_TO_EMBEDDING_LOCATION[location] + self.current_device = device + self.row_alignment = 1 if target_location == EmbeddingLocation.HOST else 16 + self.embedding_specs = [ + (spec[0], spec[1], spec[2], spec[3], target_location) + for spec in self.embedding_specs + ] + def _apply_split( self, dev_size: int, From 99edf260151a463d95f35d45897281ae25c0d65b Mon Sep 17 00:00:00 2001 From: Janet Yang Date: Fri, 31 Mar 2023 09:36:10 -0700 Subject: [PATCH 33/34] Extract index remappings array initialization and jit.export it (#1670) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/1670 ATT Reviewed By: RoshanPAN, houseroad Differential Revision: D44338257 fbshipit-source-id: c091666c7a4d294c283f5e3774d0494089fc3478 --- .../split_table_batched_embeddings_ops.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py index f8ad2ccaf2..2c7d99610f 100644 --- a/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py @@ -3111,6 +3111,49 @@ def assign_embedding_weights( else: assert dest_weight[1] is None + @torch.jit.export + def set_index_remappings_array( + self, + index_remapping: List[Tensor], + ) -> None: + rows: List[int] = [e[1] for e in self.embedding_specs] + index_remappings_array_offsets = [0] + original_feature_rows = torch.jit.annotate(List[int], []) + last_offset = 0 + for t, mapping in enumerate(index_remapping): + if mapping is not None: + current_original_row = mapping.numel() + last_offset += current_original_row + original_feature_rows.append(current_original_row) + else: + original_feature_rows.append(rows[t]) + index_remappings_array_offsets.append(last_offset) + + self.index_remappings_array_offsets = torch.tensor( + index_remappings_array_offsets, + device=self.current_device, + dtype=torch.int64, + ) + if len(original_feature_rows) == 0: + original_feature_rows = rows + self.original_rows_per_table = torch.tensor( + [original_feature_rows[t] for t in self.feature_table_map], + device=self.current_device, + dtype=torch.int64, + ) + if self.index_remappings_array_offsets[-1] == 0: + self.index_remappings_array = torch.empty( + 0, dtype=torch.int32, device=self.current_device + ) + else: + index_remappings_filter_nones = [] + for mapping in index_remapping: + if mapping is not None: + index_remappings_filter_nones.append(mapping) + self.index_remappings_array = torch.cat(index_remappings_filter_nones).to( + self.current_device + ) + def set_index_remappings( self, index_remapping: List[Tensor], @@ -3177,37 +3220,7 @@ def set_index_remappings( self.index_remapping_hash_table_cpu = None # Array mapping pruning else: - index_remappings_array_offsets = [0] - original_feature_rows = [] - last_offset = 0 - for t, mapping in enumerate(index_remapping): - if mapping is not None: - current_original_row = mapping.numel() - last_offset += current_original_row - original_feature_rows.append(current_original_row) - else: - original_feature_rows.append(rows[t]) - index_remappings_array_offsets.append(last_offset) - - self.index_remappings_array_offsets = torch.tensor( - index_remappings_array_offsets, - device=self.current_device, - 
dtype=torch.int64, - ) - if len(original_feature_rows) == 0: - original_feature_rows = rows - self.original_rows_per_table = torch.tensor( - [original_feature_rows[t] for t in self.feature_table_map], - device=self.current_device, - dtype=torch.int64, - ) - self.index_remappings_array = ( - torch.empty(0, dtype=torch.int32, device=self.current_device) - if self.index_remappings_array_offsets[-1] == 0 - else torch.cat( - [mapping for mapping in index_remapping if mapping is not None] - ).to(self.current_device) - ) + self.set_index_remappings_array(index_remapping) def _embedding_inplace_update_per_table( self, From 49c1fc811ec9131b9adcb00d4d7fdf59f50205a5 Mon Sep 17 00:00:00 2001 From: Li Li Date: Fri, 31 Mar 2023 18:13:47 +0000 Subject: [PATCH 34/34] update hipify_torch and remove the manually mapping of the C10 macros --- fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp | 8 -------- third_party/hipify_torch | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp index 8257faff9b..d03b961a79 100644 --- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp +++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp @@ -20,14 +20,6 @@ #include "fbgemm_gpu/sparse_ops_utils.h" #include "fbgemm_gpu/topology_utils.h" -// For some reason, hipify fails to replace the macro names when compiling for -// ROCm, so we manually replace it here. Name mapping based on: -// https://github.com/pytorch/pytorch/blob/master/torch/utils/hipify/cuda_to_hip_mappings.py -#ifdef __HIP_PLATFORM_HCC__ -#define C10_CUDA_CLEAR_ERROR C10_HIP_CLEAR_ERROR -#define C10_CUDA_ERROR_HANDLED C10_HIP_ERROR_HANDLED -#endif - using Tensor = at::Tensor; namespace { diff --git a/third_party/hipify_torch b/third_party/hipify_torch index 1840658c18..23f53b025b 160000 --- a/third_party/hipify_torch +++ b/third_party/hipify_torch @@ -1 +1 @@ -Subproject commit 1840658c184f3eeba787dae0f06c45756c1daaf5 +Subproject commit 23f53b025b466d8ec3c45d52290d3442f7fbe6b1
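
As a closing illustration of the array-based pruning-remap layout that PATCH 33
factors out into set_index_remappings_array: each table owns a slice of one
flat remapping array, delimited by a prefix-offset tensor, and tables without a
remapping own an empty slice while recording their original row counts. A
condensed editorial sketch of that layout (the function name and toy values
below are illustrative, not from the patch):

    from typing import List, Optional

    import torch

    def build_remappings(
        index_remapping: List[Optional[torch.Tensor]], rows: List[int]
    ):
        # Prefix offsets into the flat remapping array; tables whose entry
        # is None advance the offset by zero, i.e. they own an empty slice.
        offsets = [0]
        last = 0
        original_rows = []
        for t, mapping in enumerate(index_remapping):
            if mapping is not None:
                last += mapping.numel()
                original_rows.append(mapping.numel())
            else:
                original_rows.append(rows[t])
            offsets.append(last)
        # Concatenate only the tables that actually have a remapping.
        flat = (
            torch.cat([m for m in index_remapping if m is not None])
            if last > 0
            else torch.empty(0, dtype=torch.int32)
        )
        return torch.tensor(offsets, dtype=torch.int64), flat, original_rows

    # Table 0 is pruned from 4 rows to 2 (rows 1 and 3 map to -1);
    # table 1 keeps its 3 rows and needs no remapping.
    offsets, flat, original_rows = build_remappings(
        [torch.tensor([0, -1, 1, -1], dtype=torch.int32), None], rows=[4, 3]
    )
    assert offsets.tolist() == [0, 4, 4] and original_rows == [4, 3]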