diff --git a/paddlevlp/examples/blip2/run_eval.py b/paddlevlp/examples/blip2/run_eval.py index 983f9cb6df729..b51003e6d499f 100644 --- a/paddlevlp/examples/blip2/run_eval.py +++ b/paddlevlp/examples/blip2/run_eval.py @@ -13,6 +13,9 @@ # limitations under the License. import sys +import os +sys.path.insert( + 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) import paddle.distributed as dist from paddle.distributed import fleet import os @@ -27,45 +30,17 @@ from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config import paddlevlp from paddlevlp.datasets import load_dataset -from paddlevlp.models.blip2.configuration import (Blip2Config, - Blip2QFormerConfig, - Blip2VisionConfig) +from paddlevlp.models.blip2.configuration import ( + Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration from paddlevlp.processors.blip_processing import Blip2Processor from paddlevlp.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlevlp.utils.log import logger from paddlenlp.transformers import AutoTokenizer from paddlevlp.models.blip2.eva_vit import interpolate_pos_embed -from paddlevlp.processors.blip_processing import BlipImageProcessor,BlipTextProcessor -class BlipCollator: - """ - Data collator that will dynamically pad the inputs to the longest sequence in the batch. - - Args: - processor (`paddlevlp.processors.ProcessorMixin`): - The processor used for pre-process the data. - """ - - def __init__(self, processor,mode="train"): - self.processor = processor - - def __call__(self, data_list): - images = [sample["image"] for sample in data_list] - if "text_input" not in data_list[0].keys(): - text=None - else: - text = [sample["text_input"] for sample in data_list] - image_id = [sample["image_id"] for sample in data_list] - batch = self.processor( - images=images, - text=text, - max_length=32, - return_tensors="pd", - return_attention_mask=True, - mode="train", - ) - batch.update({'image_id':image_id}) - return batch +from paddlevlp.processors.blip_processing import BlipImageProcessor, BlipTextProcessor +from paddlevlp.examples.blip2.utils import BlipCollator +from paddlevlp.examples.blip2.utils import load_pretrained_model @dataclass @@ -79,12 +54,13 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={"help": "The name of the task to use (via the datasets library)."}, - ) + metadata={ + "help": "The name of the task to use (via the datasets library)." + }, ) prompt: str = field( - default="a photo of ", metadata={"help": "The prompt of the image to be generated."} - ) # "Question: how many cats are there? Answer:" - + default="a photo of ", + metadata={"help": "The prompt of the image to be generated." + }) # "Question: how many cats are there? 
Answer:" @dataclass @@ -95,16 +71,13 @@ class ModelArguments: model_name_or_path: str = field( default="Salesforce/blip2-opt-2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, - ) + metadata={"help": "Path to pretrained model or model identifier"}, ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, - ) + metadata={"help": "The type of text model to use (OPT, T5)."}, ) image_size: int = field( - default=364, metadata={"help": " image size for evaluation."} - ) + default=364, metadata={"help": " image size for evaluation."}) @dataclass @@ -116,54 +89,67 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default="https://bj.bcebos.com/v1/paddlenlp/models/community/Salesforce/blip2-opt-2.7b/blip2_pretrained.pdparams", metadata={ - "help": "The path to pre-trained model that we will use for pretraining." - }, - ) + "help": + "The path to pre-trained model that we will use for pretraining." + }, ) weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."} - ) + default=0.05, metadata={"help": "Weight decay if we apply some."}) learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."} - ) + default=0.0001, metadata={"help": "The initial learning rate."}) num_train_epochs: float = field( - default=10.0, metadata={"help": "Total number of training epochs to perform."} - ) + default=10.0, + metadata={"help": "Total number of training epochs to perform."}) warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."} - ) + default=1e-6, metadata={"help": "Initial learning rate of warm up."}) eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."} - ) + default=1e-5, metadata={"help": "The minimum value of learning rate."}) warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."} - ) + default=2000, metadata={"help": "Number of warmup steps."}) lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."} - ) + default="CosineDecayWithWarmup", + metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, metadata={"help":"Batch size per GPU core/CPU for training. (default: 8)"} - ) - per_device_eval_batch_size : int = field( - default=1, metadata={"help": " Batch size per GPU core/CPU for evaluation. (default:8)"} - ) - warmup_start_lr : float = field( - default=1e-6, metadata={"help": " The initial learning rate of blip2."} - ) - output_dir : str = field( - default=".", metadata={"help": "The output path"} - ) - do_eval : bool = field(default=True, metadata={"help": "Whether to evaluation."}) - do_train : bool = field(default=True, metadata={"help": "Whether to train."}) - - logging_steps : int = field(default=50, metadata={"help": "Logging interval"}) - evaluation_strategy : str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + default=128, + metadata={ + "help": "Batch size per GPU core/CPU for training. (default: 8)" + }) + per_device_eval_batch_size: int = field( + default=1, + metadata={ + "help": " Batch size per GPU core/CPU for evaluation. 
(default:8)" + }) + warmup_start_lr: float = field( + default=1e-6, + metadata={"help": " The initial learning rate of blip2."}) + output_dir: str = field(default=".", metadata={"help": "The output path"}) + do_eval: bool = field( + default=True, metadata={"help": "Whether to evaluation."}) + do_train: bool = field(default=True, metadata={"help": "Whether to train."}) + + logging_steps: int = field( + default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field( + default="no", + metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + + fp16_opt_level: str = field( + default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field( + default=True, metadata={"help": "Whether to use mixed Precision"}) + gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Forward recompute for saving graphics memory"}) + tensor_parallel_degree: int = field( + default=1, + metadata={"help": "Set the number of tensor model parallel"}) + sharding_parallel_degree: int = field( + default=1, + metadata={ + "help": "Set the number of sharding, enable sharding parallel" + }) + pipeline_parallel_degree: int = field( + default=1, metadata={"help": "Enable pipeline parallel"}) - fp16_opt_level : str = field(default="O1", metadata={"help": "Mixed Precision Type"}) - fp16 : bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) - gradient_checkpointing : bool = field(default=False, metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree : int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) - sharding_parallel_degree : int = field(default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"}) - pipeline_parallel_degree : int = field(default=1, metadata={"help": "Enable pipeline parallel"}) def get_text_config(text_model_name_or_path): if "t5" in text_model_name_or_path: @@ -178,67 +164,51 @@ def get_text_config(text_model_name_or_path): def create_model(config): # blip2_config = Blip2ForConditionalGeneration(onfig.model_name_or_path) vision_config = Blip2VisionConfig.from_pretrained(config.model_name_or_path) - qformer_config = Blip2QFormerConfig.from_pretrained(config.model_name_or_path) + qformer_config = Blip2QFormerConfig.from_pretrained( + config.model_name_or_path) text_config = get_text_config(config.text_model_name_or_path) # add tensor_parallel_degree - vision_config.image_size= config.image_size - vision_config.mp_degree=config.mp_degree - qformer_config.mp_degree=config.mp_degree - text_config.mp_degree=config.mp_degree - vision_config.gradient_checkpointing=config.gradient_checkpointing - qformer_config.gradient_checkpointing=config.gradient_checkpointing - text_config.gradient_checkpointing=config.gradient_checkpointing + vision_config.image_size = config.image_size + vision_config.mp_degree = config.mp_degree + qformer_config.mp_degree = config.mp_degree + text_config.mp_degree = config.mp_degree + vision_config.gradient_checkpointing = config.gradient_checkpointing + qformer_config.gradient_checkpointing = config.gradient_checkpointing + text_config.gradient_checkpointing = config.gradient_checkpointing blip2_config = Blip2Config.from_vision_qformer_text_configs( - vision_config, qformer_config, text_config - ) + vision_config, qformer_config, text_config) model = Blip2ForConditionalGeneration(blip2_config) - paddle.device.cuda.empty_cache()# post_init_func(self, init_func, *args, **kwargs)吃显存 + paddle.device.cuda.empty_cache( 
+ ) # post_init_func(self, init_func, *args, **kwargs)吃显存 return model - -def load_pretrained_model(model, pretrained_model_path): - if pretrained_model_path is None: - return - - if not os.path.exists(pretrained_model_path): - ValueError( - "Cannot find pretrained model path: {}".format(pretrained_model_path) - ) - - state_dict = paddle.load(pretrained_model_path) - interpolate_pos_embed(model, state_dict) - model.set_state_dict(state_dict) - - def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser( + (ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") - training_args.prompt=data_args.prompt + training_args.prompt = data_args.prompt setdistenv(training_args) - model_args.data_world_rank = training_args.data_world_rank model_args.data_world_size = training_args.data_world_size paddle.set_device(training_args.device) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): + if (os.path.isdir(training_args.output_dir) and training_args.do_train and + not training_args.overwrite_output_dir): last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -247,23 +217,31 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained("facebook/opt-2.7b", use_fast=False) - image_processor = BlipImageProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipImageProcessor_stage2.json") - text_processor_class = BlipTextProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipTextProcessor_stage2.json") - processor = Blip2Processor(image_processor,text_processor_class,tokenizer_class) - image_processor_eval = BlipImageProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipImageEvalProcessor_stage2.json") - text_processor_class_eval = BlipTextProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipTextEvalProcessor_stage2.json") - eval_processor = Blip2Processor(image_processor_eval,text_processor_class_eval,tokenizer_class) + tokenizer_class = AutoTokenizer.from_pretrained( + "facebook/opt-2.7b", use_fast=False) + image_processor = BlipImageProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipImageProcessor_stage2.json") + text_processor_class = BlipTextProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipTextProcessor_stage2.json") + processor = Blip2Processor(image_processor, text_processor_class, + tokenizer_class) + image_processor_eval = BlipImageProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipImageEvalProcessor_stage2.json") + text_processor_class_eval = BlipTextProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipTextEvalProcessor_stage2.json") + eval_processor = 
Blip2Processor(image_processor_eval, + text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") - eval_dataset = {"test":load_dataset(data_args.task_name, splits="test")} + eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} # create model blip_collator = BlipCollator(processor) - blip_eval_collator = BlipCollator(eval_processor,mode="test") - model_args.mp_degree=training_args.tensor_parallel_degree - model_args.gradient_checkpointing=training_args.gradient_checkpointing + blip_eval_collator = BlipCollator(eval_processor, mode="test") + model_args.mp_degree = training_args.tensor_parallel_degree + model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format( + training_args.use_hybrid_parallel)) # create trainer load_pretrained_model(model, training_args.pretrained_model_path) trainer = Trainer( @@ -275,25 +253,25 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class - ) + tokenizer=tokenizer_class) eval_metrics = trainer.evaluate(eval_dataset) trainer.log_metrics("eval", eval_metrics) def setdistenv(args): - if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree!=1: - args.use_hybrid_parallel=True + if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: + args.use_hybrid_parallel = True args.dp_degree = dist.get_world_size() \ // (args.tensor_parallel_degree \ * args.sharding_parallel_degree * \ args.pipeline_parallel_degree) strategy = fleet.DistributedStrategy() - if args.tensor_parallel_degree>1: + if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True - args.data_parallel_degree=args.dp_degree + args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format( + args.sharding_parallel_degree)) # breakpoint() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -301,12 +279,12 @@ def setdistenv(args): "sharding_degree": args.sharding_parallel_degree, "pp_degree": args.pipeline_parallel_degree, } - BATCH_SIZE=128 - MICRO_BATCH_SIZE=32 + BATCH_SIZE = 128 + MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, "micro_batch_size": MICRO_BATCH_SIZE -} + } strategy.find_unused_parameters = True # set control in tensor parallel @@ -325,11 +303,13 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs( + args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) + def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() assert 'gpu' in device_id @@ -344,5 +324,7 @@ def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): tracker = get_rng_state_tracker() 
tracker.add("global_seed", global_seed) tracker.add("local_seed", local_seed) + + if __name__ == "__main__": main() diff --git a/paddlevlp/examples/blip2/run_predict.py b/paddlevlp/examples/blip2/run_predict.py index 51a95507ec32c..cade88841ad6b 100644 --- a/paddlevlp/examples/blip2/run_predict.py +++ b/paddlevlp/examples/blip2/run_predict.py @@ -11,7 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import sys +import os +sys.path.insert( + 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) from dataclasses import dataclass, field import paddle import requests @@ -21,6 +24,8 @@ from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration from paddlevlp.processors.blip_processing import Blip2Processor from paddlevlp.utils.log import logger +from paddlevlp.examples.blip2.utils import load_pretrained_model + @dataclass class DataArguments: @@ -31,12 +36,13 @@ class DataArguments: the command line. """ - input_image: str = field( - metadata={"help": "The name of input image."} - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + input_image: str = field(metadata={ + "help": "The name of input image." + }) # "http://images.cocodataset.org/val2017/000000039769.jpg" prompt: str = field( - default=None, metadata={"help": "The prompt of the image to be generated."} - ) # "Question: how many cats are there? Answer:" + default=None, + metadata={"help": "The prompt of the image to be generated." + }) # "Question: how many cats are there? Answer:" @dataclass @@ -47,36 +53,33 @@ class ModelArguments: model_name_or_path: str = field( default="Salesforce/blip2-opt-2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, - ) + metadata={"help": "Path to pretrained model or model identifier"}, ) pretrained_model_path: str = field( default=None, metadata={ - "help": "The path to pre-trained model that we will use for inference." - }, - ) + "help": + "The path to pre-trained model that we will use for inference." 
+ }, ) def main(): parser = PdArgumentParser((ModelArguments, DataArguments)) model_args, data_args = parser.parse_args_into_dataclasses() - url = ( - data_args.input_image - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + url = (data_args.input_image + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) prompt = data_args.prompt processor = Blip2Processor.from_pretrained( - model_args.model_name_or_path - ) # "Salesforce/blip2-opt-2.7b" + model_args.model_name_or_path) # "Salesforce/blip2-opt-2.7b" inputs = processor( images=image, text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", - ) - model = Blip2ForConditionalGeneration.from_pretrained(model_args.model_name_or_path) + mode="test", ) + model = Blip2ForConditionalGeneration.from_pretrained( + model_args.model_name_or_path) # load checkpoint if model_args.pretrained_model_path: @@ -86,9 +89,8 @@ def main(): model.eval() model.to("gpu") # doctest: +IGNORE_RESULT generated_ids, scores = model.generate(**inputs) - generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[ - 0 - ].strip() + generated_text = processor.batch_decode( + generated_ids, skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) diff --git a/paddlevlp/examples/blip2/run_pretrain_stage2.py b/paddlevlp/examples/blip2/run_pretrain_stage2.py index 3cefbaec40517..5f092d6813218 100644 --- a/paddlevlp/examples/blip2/run_pretrain_stage2.py +++ b/paddlevlp/examples/blip2/run_pretrain_stage2.py @@ -13,6 +13,9 @@ # limitations under the License. import sys +import os +sys.path.insert( + 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) import paddle.distributed as dist from paddle.distributed import fleet import os @@ -27,45 +30,17 @@ from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config import paddlevlp from paddlevlp.datasets import load_dataset -from paddlevlp.models.blip2.configuration import (Blip2Config, - Blip2QFormerConfig, - Blip2VisionConfig) +from paddlevlp.models.blip2.configuration import ( + Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) from paddlevlp.models.blip2.modeling import Blip2ForConditionalGeneration from paddlevlp.processors.blip_processing import Blip2Processor from paddlevlp.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlevlp.utils.log import logger from paddlenlp.transformers import AutoTokenizer from paddlevlp.models.blip2.eva_vit import interpolate_pos_embed -from paddlevlp.processors.blip_processing import BlipImageProcessor,BlipTextProcessor -class BlipCollator: - """ - Data collator that will dynamically pad the inputs to the longest sequence in the batch. - - Args: - processor (`paddlevlp.processors.ProcessorMixin`): - The processor used for pre-process the data. 
- """ - - def __init__(self, processor,mode="train"): - self.processor = processor - self.mode=mode - def __call__(self, data_list): - images = [sample["image"] for sample in data_list] - if "text_input" not in data_list[0].keys(): - text=None - else: - text = [sample["text_input"] for sample in data_list] - image_id = [sample["image_id"] for sample in data_list] - batch = self.processor( - images=images, - text=text, - max_length=32, - return_tensors="pd", - return_attention_mask=True, - mode=self.mode, - ) - batch.update({'image_id':image_id}) - return batch +from paddlevlp.processors.blip_processing import BlipImageProcessor, BlipTextProcessor +from paddlevlp.examples.blip2.utils import BlipCollator +from paddlevlp.examples.blip2.utils import load_pretrained_model @dataclass @@ -79,12 +54,13 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={"help": "The name of the task to use (via the datasets library)."}, - ) + metadata={ + "help": "The name of the task to use (via the datasets library)." + }, ) prompt: str = field( - default="a photo of ", metadata={"help": "The prompt of the image to be generated."} - ) # "Question: how many cats are there? Answer:" - + default="a photo of ", + metadata={"help": "The prompt of the image to be generated." + }) # "Question: how many cats are there? Answer:" @dataclass @@ -95,16 +71,14 @@ class ModelArguments: model_name_or_path: str = field( default="Salesforce/blip2-opt-2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, - ) + metadata={"help": "Path to pretrained model or model identifier"}, ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, - ) - image_size : int = field( - default=224, metadata={"help": " Image size for training. (default:224)"} - ) + metadata={"help": "The type of text model to use (OPT, T5)."}, ) + image_size: int = field( + default=224, + metadata={"help": " Image size for training. (default:224)"}) @dataclass @@ -116,54 +90,67 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default="https://bj.bcebos.com/v1/paddlenlp/models/community/Salesforce/blip2-opt-2.7b/blip2_pretrained.pdparams", metadata={ - "help": "The path to pre-trained model that we will use for pretraining." - }, - ) + "help": + "The path to pre-trained model that we will use for pretraining." 
+ }, ) weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."} - ) + default=0.05, metadata={"help": "Weight decay if we apply some."}) learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."} - ) + default=0.0001, metadata={"help": "The initial learning rate."}) num_train_epochs: float = field( - default=10.0, metadata={"help": "Total number of training epochs to perform."} - ) + default=10.0, + metadata={"help": "Total number of training epochs to perform."}) warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."} - ) + default=1e-6, metadata={"help": "Initial learning rate of warm up."}) eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."} - ) + default=1e-5, metadata={"help": "The minimum value of learning rate."}) warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."} - ) + default=2000, metadata={"help": "Number of warmup steps."}) lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."} - ) + default="CosineDecayWithWarmup", + metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, metadata={"help":"Batch size per GPU core/CPU for training. (default: 8)"} - ) - per_device_eval_batch_size : int = field( - default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. (default:8)"} - ) - warmup_start_lr : float = field( - default=1e-6, metadata={"help": " The initial learning rate of blip2."} - ) - output_dir : str = field( - default=".", metadata={"help": "The output path"} - ) - do_eval : bool = field(default=False, metadata={"help": "Whether to evaluation."}) - do_train : bool = field(default=True, metadata={"help": "Whether to train."}) - - logging_steps : int = field(default=50, metadata={"help": "Logging interval"}) - evaluation_strategy : str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + default=128, + metadata={ + "help": "Batch size per GPU core/CPU for training. (default: 8)" + }) + per_device_eval_batch_size: int = field( + default=128, + metadata={ + "help": " Batch size per GPU core/CPU for evaluation. 
(default:8)" + }) + warmup_start_lr: float = field( + default=1e-6, + metadata={"help": " The initial learning rate of blip2."}) + output_dir: str = field(default=".", metadata={"help": "The output path"}) + do_eval: bool = field( + default=False, metadata={"help": "Whether to evaluation."}) + do_train: bool = field(default=True, metadata={"help": "Whether to train."}) + + logging_steps: int = field( + default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field( + default="no", + metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + + fp16_opt_level: str = field( + default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field( + default=True, metadata={"help": "Whether to use mixed Precision"}) + gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Forward recompute for saving graphics memory"}) + tensor_parallel_degree: int = field( + default=1, + metadata={"help": "Set the number of tensor model parallel"}) + sharding_parallel_degree: int = field( + default=1, + metadata={ + "help": "Set the number of sharding, enable sharding parallel" + }) + pipeline_parallel_degree: int = field( + default=1, metadata={"help": "Enable pipeline parallel"}) - fp16_opt_level : str = field(default="O1", metadata={"help": "Mixed Precision Type"}) - fp16 : bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) - gradient_checkpointing : bool = field(default=False, metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree : int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) - sharding_parallel_degree : int = field(default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"}) - pipeline_parallel_degree : int = field(default=1, metadata={"help": "Enable pipeline parallel"}) def get_text_config(text_model_name_or_path): if "t5" in text_model_name_or_path: @@ -178,67 +165,51 @@ def get_text_config(text_model_name_or_path): def create_model(config): # blip2_config = Blip2ForConditionalGeneration(onfig.model_name_or_path) vision_config = Blip2VisionConfig.from_pretrained(config.model_name_or_path) - qformer_config = Blip2QFormerConfig.from_pretrained(config.model_name_or_path) + qformer_config = Blip2QFormerConfig.from_pretrained( + config.model_name_or_path) text_config = get_text_config(config.text_model_name_or_path) - vision_config.image_size= config.image_size + vision_config.image_size = config.image_size # add tensor_parallel_degree - vision_config.mp_degree=config.mp_degree - qformer_config.mp_degree=config.mp_degree - text_config.mp_degree=config.mp_degree - vision_config.gradient_checkpointing=config.gradient_checkpointing - qformer_config.gradient_checkpointing=config.gradient_checkpointing - text_config.gradient_checkpointing=config.gradient_checkpointing + vision_config.mp_degree = config.mp_degree + qformer_config.mp_degree = config.mp_degree + text_config.mp_degree = config.mp_degree + vision_config.gradient_checkpointing = config.gradient_checkpointing + qformer_config.gradient_checkpointing = config.gradient_checkpointing + text_config.gradient_checkpointing = config.gradient_checkpointing blip2_config = Blip2Config.from_vision_qformer_text_configs( - vision_config, qformer_config, text_config - ) + vision_config, qformer_config, text_config) model = Blip2ForConditionalGeneration(blip2_config) - paddle.device.cuda.empty_cache()# post_init_func(self, init_func, *args, **kwargs)吃显存 + paddle.device.cuda.empty_cache( 
+ ) # post_init_func(self, init_func, *args, **kwargs)吃显存 return model - -def load_pretrained_model(model, pretrained_model_path): - if pretrained_model_path is None: - return - - if not os.path.exists(pretrained_model_path): - ValueError( - "Cannot find pretrained model path: {}".format(pretrained_model_path) - ) - - state_dict = paddle.load(pretrained_model_path) - interpolate_pos_embed(model, state_dict) - model.set_state_dict(state_dict) - - def main(): - parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser( + (ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") - training_args.prompt=data_args.prompt + training_args.prompt = data_args.prompt setdistenv(training_args) - model_args.data_world_rank = training_args.data_world_rank model_args.data_world_size = training_args.data_world_size paddle.set_device(training_args.device) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if ( - os.path.isdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): + if (os.path.isdir(training_args.output_dir) and training_args.do_train and + not training_args.overwrite_output_dir): last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -247,23 +218,31 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained("facebook/opt-2.7b", use_fast=False) - image_processor = BlipImageProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipImageProcessor_stage2.json") - text_processor_class = BlipTextProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipTextProcessor_stage2.json") - processor = Blip2Processor(image_processor,text_processor_class,tokenizer_class) - image_processor_eval = BlipImageProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipImageEvalProcessor_stage2.json") - text_processor_class_eval = BlipTextProcessor.from_pretrained("paddlevlp/models/blip2/model_cfg/BlipTextEvalProcessor_stage2.json") - eval_processor = Blip2Processor(image_processor_eval,text_processor_class_eval,tokenizer_class) + tokenizer_class = AutoTokenizer.from_pretrained( + "facebook/opt-2.7b", use_fast=False) + image_processor = BlipImageProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipImageProcessor_stage2.json") + text_processor_class = BlipTextProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipTextProcessor_stage2.json") + processor = Blip2Processor(image_processor, text_processor_class, + tokenizer_class) + image_processor_eval = BlipImageProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipImageEvalProcessor_stage2.json") + text_processor_class_eval = BlipTextProcessor.from_pretrained( + "paddlevlp/models/blip2/model_cfg/BlipTextEvalProcessor_stage2.json") + eval_processor = 
Blip2Processor(image_processor_eval, + text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") - eval_dataset = {"test":load_dataset(data_args.task_name, splits="test")} + eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} # create model blip_collator = BlipCollator(processor) - blip_eval_collator = BlipCollator(eval_processor,mode="test") - model_args.mp_degree=training_args.tensor_parallel_degree - model_args.gradient_checkpointing=training_args.gradient_checkpointing + blip_eval_collator = BlipCollator(eval_processor, mode="test") + model_args.mp_degree = training_args.tensor_parallel_degree + model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format( + training_args.use_hybrid_parallel)) # create trainer load_pretrained_model(model, training_args.pretrained_model_path) trainer = Trainer( @@ -275,10 +254,9 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class - ) + tokenizer=tokenizer_class) # Training - checkpoint=None + checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint state_dict = paddle.load(checkpoint) @@ -294,18 +272,19 @@ def main(): def setdistenv(args): - if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree!=1: - args.use_hybrid_parallel=True + if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: + args.use_hybrid_parallel = True args.dp_degree = dist.get_world_size() \ // (args.tensor_parallel_degree \ * args.sharding_parallel_degree * \ args.pipeline_parallel_degree) strategy = fleet.DistributedStrategy() - if args.tensor_parallel_degree>1: + if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True - args.data_parallel_degree=args.dp_degree + args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format( + args.sharding_parallel_degree)) # breakpoint() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -313,12 +292,12 @@ def setdistenv(args): "sharding_degree": args.sharding_parallel_degree, "pp_degree": args.pipeline_parallel_degree, } - BATCH_SIZE=128 - MICRO_BATCH_SIZE=32 + BATCH_SIZE = 128 + MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, "micro_batch_size": MICRO_BATCH_SIZE -} + } strategy.find_unused_parameters = True # set control in tensor parallel @@ -334,11 +313,13 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs( + args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) + def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() assert 'gpu' in device_id 
@@ -353,5 +334,7 @@ def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0):
     tracker = get_rng_state_tracker()
     tracker.add("global_seed", global_seed)
     tracker.add("local_seed", local_seed)
+
+
 if __name__ == "__main__":
     main()
diff --git a/paddlevlp/examples/blip2/utils.py b/paddlevlp/examples/blip2/utils.py
index c6bb556c00c81..5596b7d2c1bfc 100644
--- a/paddlevlp/examples/blip2/utils.py
+++ b/paddlevlp/examples/blip2/utils.py
@@ -1,11 +1,49 @@
-
 import os
 from pycocoevalcap.eval import COCOEvalCap
 from pycocotools.coco import COCO
+from paddlevlp.utils.downloader import get_weights_path_from_url
+from paddlevlp.utils.downloader import is_url
+from paddlevlp.models.blip2.eva_vit import interpolate_pos_embed
+import paddle
+
+
+class BlipCollator:
+    """
+    Data collator that will dynamically pad the inputs to the longest sequence in the batch.
+
+    Args:
+        processor (`paddlevlp.processors.ProcessorMixin`):
+            The processor used to pre-process the data.
+    """
+
+    def __init__(self, processor, mode="train"):
+        self.processor = processor
+        self.mode = mode
+
+    def __call__(self, data_list):
+        images = [sample["image"] for sample in data_list]
+        if "text_input" not in data_list[0].keys():
+            text = None
+        else:
+            text = [sample["text_input"] for sample in data_list]
+        image_id = [sample["image_id"] for sample in data_list]
+        batch = self.processor(
+            images=images,
+            text=text,
+            max_length=32,
+            return_tensors="pd",
+            return_attention_mask=True,
+            mode=self.mode, )
+        batch.update({'image_id': image_id})
+        return batch
+
+
 def coco_caption_eval(coco_gt_root, results_file, split):
     urls = {
-        "val": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json",
-        "test": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json",
+        "val":
+        "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json",
+        "test":
+        "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json",
     }
     filenames = {
         "val": "coco_karpathy_val_gt.json",
@@ -27,3 +65,19 @@ def coco_caption_eval(coco_gt_root, results_file, split):
         print(f"{metric}: {score:.3f}")

     return coco_eval
+
+
+def load_pretrained_model(model, pretrained_model_path):
+    if pretrained_model_path is None:
+        return
+
+    if os.path.isfile(pretrained_model_path):
+        path = pretrained_model_path
+    elif is_url(pretrained_model_path):
+        path = get_weights_path_from_url(pretrained_model_path)
+    else:
+        raise ValueError("Cannot find pretrained model path: {}".format(
+            pretrained_model_path))
+    state_dict = paddle.load(path)
+    interpolate_pos_embed(model, state_dict)
+    model.set_state_dict(state_dict)
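
Note on the shared helpers: the BlipCollator and load_pretrained_model definitions that were previously duplicated inside run_eval.py and run_pretrain_stage2.py are now imported from paddlevlp/examples/blip2/utils.py. Below is a minimal usage sketch, not part of the diff; it assumes the stage-2 processor configs under paddlevlp/models/blip2/model_cfg that the scripts above already reference, and the variable names are illustrative only.

    # Sketch only: build a processor, wrap it in the shared collator, and load
    # pretrained weights from a local .pdparams file or a URL.
    from paddlenlp.transformers import AutoTokenizer
    from paddlevlp.examples.blip2.utils import BlipCollator, load_pretrained_model
    from paddlevlp.processors.blip_processing import (
        Blip2Processor, BlipImageProcessor, BlipTextProcessor)

    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b", use_fast=False)
    image_processor = BlipImageProcessor.from_pretrained(
        "paddlevlp/models/blip2/model_cfg/BlipImageProcessor_stage2.json")
    text_processor = BlipTextProcessor.from_pretrained(
        "paddlevlp/models/blip2/model_cfg/BlipTextProcessor_stage2.json")
    processor = Blip2Processor(image_processor, text_processor, tokenizer)

    # mode="train" keeps the caption text for the language-model loss;
    # mode="test" is what the evaluation collator uses in the scripts above.
    train_collator = BlipCollator(processor, mode="train")

    # load_pretrained_model accepts a local path or a URL; URLs are fetched via
    # get_weights_path_from_url, and interpolate_pos_embed adjusts the ViT
    # position embeddings before the state dict is loaded.
    # model = create_model(model_args)   # as in run_pretrain_stage2.main()
    # load_pretrained_model(model, training_args.pretrained_model_path)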