diff --git a/flagai/model/glm_model.py b/flagai/model/glm_model.py
index fe08115a..3c4f9746 100644
--- a/flagai/model/glm_model.py
+++ b/flagai/model/glm_model.py
@@ -462,7 +462,7 @@ def forward(self,
         else:
             loss = F.cross_entropy(
-                logits_parallel.contiguous().float(), labels.long())
+                logits_parallel.reshape(-1, logits_parallel.shape[-1]).contiguous().float(), labels.reshape(-1).long())
         if self.parallel_output:
             # Put in different GPUs
             return {
diff --git a/flagai/trainer.py b/flagai/trainer.py
index 1b7d1afd..b9a6a1ba 100644
--- a/flagai/trainer.py
+++ b/flagai/trainer.py
@@ -309,12 +309,17 @@ def get_dataloader(self, dataset, collate_fn, shuffle=False):
                 shuffle=shuffle)
         else:
             if self.env_type == 'deepspeed+mpu':
-                num_replicas = self.world_size // mpu.get_model_parallel_world_size(
-                )
-                rank = self.rank // mpu.get_model_parallel_world_size()
+                # num_replicas = self.world_size // mpu.get_model_parallel_world_size(
+                # )
+                # rank = self.rank // mpu.get_model_parallel_world_size()
+                # rank = mpu.get_model_parallel_rank()
+                rank = mpu.get_model_parallel_src_rank()
+                print("*"*80)
+                print("local rank",self.rank, "model rank", rank)
+                print("*"*80)
                 sampler = torch.utils.data.distributed.DistributedSampler(
                     dataset,
-                    num_replicas=num_replicas,
+                    # num_replicas=num_replicas,
                     rank=rank,
                     shuffle=shuffle)
             else:
@@ -474,13 +479,12 @@ def train(self,
         for epoch in range(self.epochs):
             # log_dist('working on epoch {} ...'.format(epoch), [0])
             # Set the data loader epoch to shuffle the index iterator.
-            if self.env_type == 'deepspeed+mpu':
-                if mpu.get_model_parallel_rank() == 0:
-                    train_dataloader.sampler.set_epoch(epoch + self.world_size)
-            elif self.env_type != 'pytorch':
+            # if self.env_type == 'deepspeed+mpu':
+            #     if mpu.get_model_parallel_rank() == 0:
+            #         train_dataloader.sampler.set_epoch(epoch + self.world_size)
+            if self.env_type != 'pytorch':
                 train_dataloader.sampler.set_epoch(epoch + self.world_size)
-
             # For all the batches in the dataset.
             for iteration_, batch in enumerate(train_dataloader):
                 # Train for one step.
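
Reviewer notes (illustration only, not part of the patch):

The glm_model.py change flattens the [batch, seq_len, vocab] logits and the [batch, seq_len]
labels before calling F.cross_entropy, which expects [N, C] inputs and [N] class targets.
A minimal standalone sketch with made-up shapes (batch 2, 5 tokens, vocabulary of 7):

    import torch
    import torch.nn.functional as F

    # Hypothetical sizes for illustration only.
    logits_parallel = torch.randn(2, 5, 7)        # [batch, seq_len, vocab]
    labels = torch.randint(0, 7, (2, 5))          # [batch, seq_len]

    # Flatten batch and sequence dims so every token becomes one classification sample.
    loss = F.cross_entropy(
        logits_parallel.reshape(-1, logits_parallel.shape[-1]).contiguous().float(),
        labels.reshape(-1).long())

The trainer.py change passes mpu.get_model_parallel_src_rank(), which is the same value for
every process in a model-parallel group, to DistributedSampler, so those processes draw
identical batches. A small standalone sketch of that sampler behaviour, using a toy dataset
instead of the real trainer:

    import torch
    from torch.utils.data import TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    dataset = TensorDataset(torch.arange(16))

    # Two "processes" that hand the sampler the same rank see the same shard,
    # which is what ranks inside one model-parallel group need.
    sampler_a = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
    sampler_b = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
    sampler_a.set_epoch(3)   # set_epoch reseeds the shuffle, as train() does each epoch
    sampler_b.set_epoch(3)
    assert list(iter(sampler_a)) == list(iter(sampler_b))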