diff --git a/README.md b/README.md index a18a0f2..d754786 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ # VideoHallucer: Evaluating Intrinsic and Extrinsic Hallucinations in Large Video-Language Models -[![videohallucer-page](https://img.shields.io/badge/videohallucer-page-green)](https://videohallucer.github.io/) - +[![videohallucer-page](https://img.shields.io/badge/videohallucer-page-blue)](https://videohallucer.github.io/) +[![arXiv](https://img.shields.io/badge/arXiv-2406.16338-green.svg)](https://arxiv.org/abs/2406.16338) + @@ -22,7 +22,8 @@ - [Usage](#usage) - [Leaderboard](#leaderboard) -*We are still refining our paper, and it may take up to two weeks from June 19th due to my current workload.* + + ## VideoHallucer @@ -92,6 +93,10 @@ We offer a selection of case examples from our dataset for further elucidation: ## VideoHallucerKit + + *If you want to upload results from your models, feel free to submit a PR following one of these baselines, or send an email to me (flagwyx@gmail.com) to update your results on our page.* + + ### Installation @@ -107,6 +112,7 @@ We offer a selection of case examples from our dataset for further elucidation: - MiniGPT4-Video-7B - PLLaVA-7B/13B/34B - LLaVA-NeXT-Video-DPO-DPO-7B/34B +- ShareGPT4Video-8B - Gemini-1.5-pro - GPT4O (Azure) @@ -131,6 +137,11 @@ cd baselines python ../evaluations/evaluation.py --model_name Gemini-1.5-pro --eval_obj --eval_obj_rel --eval_temporal --eval_semantic --eval_fact --eval_nonfact ``` +evaluate "yes/no" bias +```bash +python ../evaluations/evaluation.py GPT4O Gemini-1.5-pro # ["VideoChatGPT", "Valley", "Video-LLaMA-2", "VideoChat2", "VideoLLaVA", "LLaMA-VID", "VideoLaVIT", "PLLaVA", "PLLaVA-13B", "PLLaVA-34B", "LLaVA-NeXT-Video", "LLaVA-NeXT-Video-34B", "Gemini-1.5-pro", "GPT4O", "GPT4V", "LLaVA"] +``` + ## Leaderboard @@ -152,6 +163,7 @@ more detailed results see `baselines/results` | LLaMA-VID | 43.5 | 21 | 17 | 2.5 | 21 | 21 | | VideoLaVIT | 35.5 | 25.5 | 10.5 | 4 | 19 | 18.9 | | 
VideoLLaVA | 34.5 | 13.5 | 12 | 3 | 26 | 17.8 | +| ShareGPT4Video | 16.5 | 39.5 | 8.5 | 0.5 | 14 | 15.8 | | Video-LLaMA-2 | 18 | 7.5 | 1 | 6.5 | 17 | 10 | | VideoChat2 | 10.5 | 7.5 | 9 | 7 | 0.5 | 7.8 | | VideoChatGPT | 6 | 0 | 2 | 7 | 17 | 6.4| diff --git a/baselines/share4video/__init__.py b/baselines/share4video/__init__.py new file mode 100644 index 0000000..39503f5 --- /dev/null +++ b/baselines/share4video/__init__.py @@ -0,0 +1 @@ +# from .model import LlavaLlamaForCausalLM diff --git a/baselines/share4video/constants.py b/baselines/share4video/constants.py new file mode 100644 index 0000000..374be09 --- /dev/null +++ b/baselines/share4video/constants.py @@ -0,0 +1,13 @@ +CONTROLLER_HEART_BEAT_EXPIRATION = 30 +WORKER_HEART_BEAT_INTERVAL = 15 + +LOGDIR = "." + +# Model Constants +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" +IMAGE_PLACEHOLDER = "" diff --git a/baselines/share4video/conversation.py b/baselines/share4video/conversation.py new file mode 100644 index 0000000..35a8dc6 --- /dev/null +++ b/baselines/share4video/conversation.py @@ -0,0 +1,428 @@ +import base64 +import dataclasses +from enum import Enum, auto +from io import BytesIO +from typing import Any, List, Tuple + +from PIL import Image +from transformers import AutoTokenizer + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + MPT = auto() + PLAIN = auto() + LLAMA_2 = auto() + LLAMA_3 = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + version: str = "Unknown" + + skip_next: bool = False + + def get_prompt(self): + messages = self.messages + if len(messages) > 0 and type(messages[0][1]) is tuple: + messages = 
self.messages.copy() + init_role, init_msg = messages[0].copy() + init_msg = init_msg[0].replace("", "").strip() + if 'mmtag' in self.version: + messages[0] = (init_role, init_msg) + messages.insert(0, (self.roles[0], "")) + messages.insert(1, (self.roles[1], "Received.")) + else: + messages[0] = (init_role, "\n" + init_msg) + + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + self.sep + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + elif self.sep_style == SeparatorStyle.MPT: + ret = self.system + self.sep + for role, message in messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + elif self.sep_style == SeparatorStyle.LLAMA_2: + def wrap_sys( + msg): return f"<>\n{msg}\n<>\n\n" if len(msg) > 0 else msg + + def wrap_inst(msg): return f"[INST] {msg} [/INST]" + ret = "" + + for i, (role, message) in enumerate(messages): + if i == 0: + assert message, "first message should not be none" + assert role == self.roles[0], "first message should come from user" + if message: + if type(message) is tuple: + message, _, _ = message + if i == 0: + message = wrap_sys(self.system) + message + if i % 2 == 0: + message = wrap_inst(message) + ret += self.sep + message + else: + ret += " " + message + " " + self.sep2 + else: + ret += "" + ret = ret.lstrip(self.sep) + elif self.sep_style == SeparatorStyle.PLAIN: + seps = [self.sep, self.sep2] + ret = self.system + for i, (role, message) in enumerate(messages): + if message: + if type(message) is tuple: + message, _, _ = 
message + ret += message + seps[i % 2] + else: + ret += "" + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + return ret + + def append_message(self, role, message): + self.messages.append([role, message]) + + def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672): + if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new( + pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new( + pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image) + elif image_process_mode in ["Default", "Crop"]: + pass + elif image_process_mode == "Resize": + image = image.resize((336, 336)) + else: + raise ValueError( + f"Invalid image_process_mode: {image_process_mode}") + if max(image.size) > max_len: + max_hw, min_hw = max(image.size), min(image.size) + aspect_ratio = max_hw / min_hw + shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) + longest_edge = int(shortest_edge * aspect_ratio) + W, H = image.size + if H > W: + H, W = longest_edge, shortest_edge + else: + H, W = shortest_edge, longest_edge + image = image.resize((W, H)) + if return_pil: + return image + else: + buffered = BytesIO() + image.save(buffered, format=image_format) + img_b64_str = base64.b64encode(buffered.getvalue()).decode() + return img_b64_str + + def get_images(self, return_pil=False): + images = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + image = self.process_image( + image, image_process_mode, return_pil=return_pil) + images.append(image) + return images + + def to_gradio_chatbot(self): + 
ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + if type(msg) is tuple: + msg, image, image_process_mode = msg + img_b64_str = self.process_image( + image, "Default", return_pil=False, + image_format='JPEG') + img_str = f'user upload image' + msg = img_str + msg.replace('', '').strip() + ret.append([msg, None]) + else: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + version=self.version) + + def dict(self): + if len(self.get_images()) > 0: + return { + "system": self.system, + "roles": self.roles, + "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages], + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + return { + "system": self.system, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + } + + +conv_vicuna_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ("Human", "What are the key differences between renewable and non-renewable energy sources?"), + ("Assistant", + "Renewable energy sources are those that can be replenished naturally in a relatively " + "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " + "Non-renewable energy sources, on the other hand, are finite and will eventually be " + "depleted, such as coal, oil, and natural gas. Here are some key differences between " + "renewable and non-renewable energy sources:\n" + "1. 
Availability: Renewable energy sources are virtually inexhaustible, while non-renewable " + "energy sources are finite and will eventually run out.\n" + "2. Environmental impact: Renewable energy sources have a much lower environmental impact " + "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, " + "and other negative effects.\n" + "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically " + "have lower operational costs than non-renewable sources.\n" + "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote " + "locations than non-renewable sources.\n" + "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " + "situations and needs, while non-renewable sources are more rigid and inflexible.\n" + "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") + ), + offset=2, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_vicuna_v1 = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the user's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llama_2 = Conversation( + system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.""", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_llava_llama_2 = Conversation( + system="You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_mpt = Conversation( + system="""<|im_start|>system +A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_llava_plain = Conversation( + system="", + roles=("", ""), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.PLAIN, + sep="\n", +) + +conv_llava_v0 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_llava_v0_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." 
+ "The visual content will be provided with the following format: visual content.", + roles=("Human", "Assistant"), + messages=( + ), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", + version="v0_mmtag", +) + +conv_llava_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("USER", "ASSISTANT"), + version="v1", + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + +conv_llava_v1_mmtag = Conversation( + system="A chat between a curious user and an artificial intelligence assistant. " + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", + roles=("USER", "ASSISTANT"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", + version="v1_mmtag", +) + +conv_mistral_instruct = Conversation( + system="", + roles=("USER", "ASSISTANT"), + version="llama_v2", + messages=(), + offset=0, + sep_style=SeparatorStyle.LLAMA_2, + sep="", + sep2="", +) + +conv_chatml_direct = Conversation( + system="""<|im_start|>system +Answer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="mpt", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", +) + +conv_yi = Conversation( + system="""<|im_start|>system\nAnswer the questions.""", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + version="yi", + messages=(), + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>\n", +) + +conv_llava_llama_3 = Conversation( + system="<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.", + roles=("<|start_header_id|>user<|end_header_id|>\n\n", + "<|start_header_id|>assistant<|end_header_id|>\n\n"), + version="llama3", + messages=[], + offset=0, + sep_style=SeparatorStyle.MPT, + sep="<|eot_id|>", +) + +default_conversation = conv_vicuna_v1 +conv_templates = { + "default": conv_vicuna_v0, + "v0": conv_vicuna_v0, + "v1": conv_vicuna_v1, + "vicuna_v1": conv_vicuna_v1, + "llama_2": conv_llama_2, + "mistral_instruct": conv_mistral_instruct, + "chatml_direct": conv_chatml_direct, + "mistral_direct": conv_chatml_direct, + + "plain": conv_llava_plain, + "v0_plain": conv_llava_plain, + "llava_v0": conv_llava_v0, + "v0_mmtag": conv_llava_v0_mmtag, + "llava_v1": conv_llava_v1, + "v1_mmtag": conv_llava_v1_mmtag, + "llava_llama_2": conv_llava_llama_2, + "llava_llama_3": conv_llava_llama_3, + + "mpt": conv_mpt, +} + + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/baselines/share4video/eval/evaluate_benchmark_1_correctness.py b/baselines/share4video/eval/evaluate_benchmark_1_correctness.py new file mode 100644 index 0000000..b463432 --- /dev/null +++ b/baselines/share4video/eval/evaluate_benchmark_1_correctness.py @@ -0,0 +1,207 @@ +import argparse +import ast +import json +import os +from multiprocessing.pool import Pool + +from openai import OpenAI + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get('OPENAIKEY'), + base_url=os.environ.get('OPENAIBASE') +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="question-answer-generation-using-gpt-3") + parser.add_argument("--pred_path", required=True, + help="The path to file containing prediction.") + parser.add_argument("--output_dir", required=True, + help="The path to save annotation json files.") + parser.add_argument("--output_json", required=True, + help="The path to save 
annotation final combined json file.") + parser.add_argument("--num_tasks", required=True, + type=int, help="Number of splits.") + parser.add_argument("--num_chunks", default=1, + type=int, help="Result splits") + parser.add_argument("--gpt_version", default="gpt-3.5-turbo-0125", + type=str, help="version of gpt") + args = parser.parse_args() + return args + + +def annotate(prediction_set, caption_files, output_dir, gpt_version): + """ + Evaluates question and answer pairs using GPT-3 + Returns a score for correctness. + """ + for file in caption_files: + key = file[:-5] # Strip file extension + qa_set = prediction_set[key] + question = qa_set['q'] + answer = qa_set['a'] + pred = qa_set['pred'] + try: + print(key, 'query') + # Compute the correctness score + completion = client.chat.completions.create( + model=gpt_version, + messages=[ + { + "role": "system", + "content": + "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n" + "- The predicted answer must be factually accurate and align with the video content.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Evaluate the factual accuracy of the prediction compared to the answer." 
+ }, + { + "role": "user", + "content": + "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}." + } + ] + ) + # Convert response to a Python dictionary. + response_message = completion.choices[0].message.content + response_dict = ast.literal_eval(response_message) + result_qa_pair = [response_dict, qa_set] + + print(key, 'done') + + # Save the question-answer pairs to a json file. + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f, indent=4) + + except Exception as e: + print(f"Error processing file '{key}': {e}") + + +def main(): + """ + Main function to control the flow of the program. + """ + # Parse arguments. 
+ args = parse_args() + + pred_contents = [] + for _idx in range(args.num_chunks): + file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json") + pred_contents += [json.loads(line) for line in open(file)] + + # Dictionary to store the count of occurrences for each video_id + video_id_counts = {} + new_pred_contents = [] + + # Iterate through each sample in pred_contents + for sample in pred_contents: + video_id = sample['video_name'] + if video_id in video_id_counts: + video_id_counts[video_id] += 1 + else: + video_id_counts[video_id] = 0 + + # Create a new sample with the modified key + new_sample = sample + new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" + new_pred_contents.append(new_sample) + + # Generating list of id's and corresponding files + id_list = [x['video_name'] for x in new_pred_contents] + caption_files = [f"{id}.json" for id in id_list] + + output_dir = args.output_dir + # Generate output directory if not exists. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Preparing dictionary of question-answer sets + prediction_set = {} + for sample in new_pred_contents: + id = sample['video_name'] + question = sample['Q'] + answer = sample['A'] + pred = sample['pred'] + qa_set = {"q": question, "a": answer, "pred": pred} + prediction_set[id] = qa_set + + num_tasks = args.num_tasks + + # While loop to ensure that all captions are processed. + while True: + try: + # Files that have not been processed yet. + completed_files = os.listdir(output_dir) + print(f"completed_files: {len(completed_files)}") + + # Files that have not been processed yet. + incomplete_files = [ + f for f in caption_files if f not in completed_files] + print(f"incomplete_files: {len(incomplete_files)}") + + # Break the loop when there are no incomplete files + if len(incomplete_files) == 0: + break + if len(incomplete_files) <= num_tasks: + num_tasks = 1 + + # Split tasks into parts. 
+ part_len = len(incomplete_files) // num_tasks + all_parts = [incomplete_files[i:i + part_len] + for i in range(0, len(incomplete_files), part_len)] + task_args = [(prediction_set, part, args.output_dir, args.gpt_version) + for part in all_parts] + print('Generate', len(all_parts), 'subprocess.') + + # Use a pool of workers to process the files in parallel. + with Pool() as pool: + pool.starmap(annotate, task_args) + + except Exception as e: + print(f"Error: {e}") + + # Combine all the processed files into one + combined_contents = {} + json_path = args.output_json + + # Iterate through json files + for file_name in os.listdir(output_dir): + if file_name.endswith(".json"): + file_path = os.path.join(output_dir, file_name) + with open(file_path, "r") as json_file: + content = json.load(json_file) + combined_contents[file_name[:-5]] = content + + # Write combined content to a json file + with open(json_path, "w") as json_file: + json.dump(combined_contents, json_file, indent=4) + print("All evaluation completed!") + + # Calculate average score + score_sum = 0 + count = 0 + for key, result in combined_contents.items(): + count += 1 + score_match = result[0]['score'] + score = int(score_match) + score_sum += score + average_score = score_sum / count + + print("Average score for correctness:", average_score) + + +if __name__ == "__main__": + main() diff --git a/baselines/share4video/eval/evaluate_benchmark_2_detailed_orientation.py b/baselines/share4video/eval/evaluate_benchmark_2_detailed_orientation.py new file mode 100644 index 0000000..9163ddf --- /dev/null +++ b/baselines/share4video/eval/evaluate_benchmark_2_detailed_orientation.py @@ -0,0 +1,210 @@ +import argparse +import ast +import json +import os +from multiprocessing.pool import Pool + +from openai import OpenAI + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get('OPENAIKEY'), + base_url=os.environ.get('OPENAIBASE') +) + + +def parse_args(): + parser = 
argparse.ArgumentParser( + description="question-answer-generation-using-gpt-3") + parser.add_argument("--pred_path", required=True, + help="The path to file containing prediction.") + parser.add_argument("--output_dir", required=True, + help="The path to save annotation json files.") + parser.add_argument("--output_json", required=True, + help="The path to save annotation final combined json file.") + parser.add_argument("--num_tasks", required=True, + type=int, help="Number of splits.") + parser.add_argument("--num_chunks", default=1, + type=int, help="Result splits") + parser.add_argument("--gpt_version", default="gpt-3.5-turbo-0125", + type=str, help="version of gpt") + args = parser.parse_args() + return args + + +def annotate(prediction_set, caption_files, output_dir, gpt_version): + """ + Evaluates question and answer pairs using GPT-3 and + returns a score for detailed orientation. + """ + for file in caption_files: + key = file[:-5] # Strip file extension + qa_set = prediction_set[key] + question = qa_set['q'] + answer = qa_set['a'] + pred = qa_set['pred'] + try: + print(key, 'query') + # Compute the detailed-orientation score + completion = client.chat.completions.create( + model=gpt_version, + messages=[ + { + "role": "system", + "content": + "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n" + "- Evaluate whether the predicted answer includes specific details rather than just generic points. 
It should provide comprehensive information that is tied to specific elements of the video.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity." + }, + { + "role": "user", + "content": + "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}." + } + ] + ) + # Convert response to a Python dictionary. + response_message = completion.choices[0].message.content + response_dict = ast.literal_eval(response_message) + result_qa_pair = [response_dict, qa_set] + + print(key, 'done') + + # Save the question-answer pairs to a json file. + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f, indent=4) + + except Exception as e: + print(f"Error processing file '{key}': {e}") + + +def main(): + """ + Main function to control the flow of the program. + """ + # Parse arguments. 
+ args = parse_args() + + pred_contents = [] + for _idx in range(args.num_chunks): + file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json") + pred_contents += [json.loads(line) for line in open(file)] + + # Dictionary to store the count of occurrences for each video_id + video_id_counts = {} + new_pred_contents = [] + + # Iterate through each sample in pred_contents + for sample in pred_contents: + video_id = sample['video_name'] + if video_id in video_id_counts: + video_id_counts[video_id] += 1 + else: + video_id_counts[video_id] = 0 + + # Create a new sample with the modified key + new_sample = sample + new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" + new_pred_contents.append(new_sample) + + # Generating list of id's and corresponding files + id_list = [x['video_name'] for x in new_pred_contents] + caption_files = [f"{id}.json" for id in id_list] + + output_dir = args.output_dir + # Generate output directory if not exists. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Preparing dictionary of question-answer sets + prediction_set = {} + for sample in new_pred_contents: + id = sample['video_name'] + question = sample['Q'] + answer = sample['A'] + pred = sample['pred'] + qa_set = {"q": question, "a": answer, "pred": pred} + prediction_set[id] = qa_set + + num_tasks = args.num_tasks + + # While loop to ensure that all captions are processed. + while True: + try: + # Files that have not been processed yet. + completed_files = os.listdir(output_dir) + print(f"completed_files: {len(completed_files)}") + + # Files that have not been processed yet. + incomplete_files = [ + f for f in caption_files if f not in completed_files] + print(f"incomplete_files: {len(incomplete_files)}") + + # Break the loop when there are no incomplete files + if len(incomplete_files) == 0: + break + if len(incomplete_files) <= num_tasks: + num_tasks = 1 + + # Split tasks into parts. 
+ part_len = len(incomplete_files) // num_tasks + all_parts = [incomplete_files[i:i + part_len] + for i in range(0, len(incomplete_files), part_len)] + task_args = [(prediction_set, part, args.output_dir, args.gpt_version) + for part in all_parts] + print('Generate', len(all_parts), 'subprocess.') + + # Use a pool of workers to process the files in parallel. + with Pool() as pool: + pool.starmap(annotate, task_args) + + except Exception as e: + print(f"Error: {e}") + + # Combine all the processed files into one + combined_contents = {} + json_path = args.output_json + + # Iterate through json files + for file_name in os.listdir(output_dir): + if file_name.endswith(".json"): + file_path = os.path.join(output_dir, file_name) + with open(file_path, "r") as json_file: + content = json.load(json_file) + combined_contents[file_name[:-5]] = content + + # Write combined content to a json file + with open(json_path, "w") as json_file: + json.dump(combined_contents, json_file, indent=4) + print("All evaluation completed!") + + # Calculate average score + score_sum = 0 + count = 0 + for key, result in combined_contents.items(): + count += 1 + try: + score_match = result[0]['score'] + except: + score_match = 0 + score = int(score_match) + score_sum += score + average_score = score_sum / count + + print("Average score for detailed orientation:", average_score) + + +if __name__ == "__main__": + main() diff --git a/baselines/share4video/eval/evaluate_benchmark_3_context.py b/baselines/share4video/eval/evaluate_benchmark_3_context.py new file mode 100644 index 0000000..b7e85ff --- /dev/null +++ b/baselines/share4video/eval/evaluate_benchmark_3_context.py @@ -0,0 +1,205 @@ +from openai import OpenAI +import os +import argparse +import json +import ast +from multiprocessing.pool import Pool + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get('OPENAIKEY'), + base_url=os.environ.get('OPENAIBASE') +) + +def parse_args(): + parser = 
argparse.ArgumentParser( + description="question-answer-generation-using-gpt-3") + parser.add_argument("--pred_path", required=True, + help="The path to file containing prediction.") + parser.add_argument("--output_dir", required=True, + help="The path to save annotation json files.") + parser.add_argument("--output_json", required=True, + help="The path to save annotation final combined json file.") + parser.add_argument("--num_tasks", required=True, + type=int, help="Number of splits.") + parser.add_argument("--num_chunks", default=1, + type=int, help="Result splits") + parser.add_argument("--gpt_version", default="gpt-3.5-turbo-0125", + type=str, help="version of gpt") + args = parser.parse_args() + return args + + +def annotate(prediction_set, caption_files, output_dir, gpt_version): + """ + Evaluates question and answer pairs using GPT-3 and + returns a score for contextual understanding. + """ + for file in caption_files: + key = file[:-5] # Strip file extension + qa_set = prediction_set[key] + question = qa_set['q'] + answer = qa_set['a'] + pred = qa_set['pred'] + try: + print(key, 'query') + # Compute the contextual understanding score + completion = client.chat.completions.create( + model=gpt_version, + messages=[ + { + "role": "system", + "content": + "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Evaluate whether the predicted answer aligns with the overall context of the video content. 
It should not provide information that is out of context or misaligned.\n" + "- The predicted answer must capture the main themes and sentiments of the video.\n" + "- Consider synonyms or paraphrases as valid matches.\n" + "- Provide your evaluation of the contextual understanding of the prediction compared to the answer." + }, + { + "role": "user", + "content": + "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}." + } + ] + ) + # Convert response to a Python dictionary. + response_message = completion.choices[0].message.content + response_dict = ast.literal_eval(response_message) + result_qa_pair = [response_dict, qa_set] + + print(key, 'done') + + # Save the question-answer pairs to a json file. + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f, indent=4) + + except Exception as e: + print(f"Error processing file '{key}': {e}") + + +def main(): + """ + Main function to control the flow of the program. + """ + # Parse arguments. 
+ args = parse_args() + + pred_contents = [] + for _idx in range(args.num_chunks): + file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json") + pred_contents += [json.loads(line) for line in open(file)] + + # Dictionary to store the count of occurrences for each video_id + video_id_counts = {} + new_pred_contents = [] + + # Iterate through each sample in pred_contents + for sample in pred_contents: + video_id = sample['video_name'] + if video_id in video_id_counts: + video_id_counts[video_id] += 1 + else: + video_id_counts[video_id] = 0 + + # Create a new sample with the modified key + new_sample = sample + new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" + new_pred_contents.append(new_sample) + + # Generating list of id's and corresponding files + id_list = [x['video_name'] for x in new_pred_contents] + caption_files = [f"{id}.json" for id in id_list] + + output_dir = args.output_dir + # Generate output directory if not exists. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Preparing dictionary of question-answer sets + prediction_set = {} + for sample in new_pred_contents: + id = sample['video_name'] + question = sample['Q'] + answer = sample['A'] + pred = sample['pred'] + qa_set = {"q": question, "a": answer, "pred": pred} + prediction_set[id] = qa_set + + num_tasks = args.num_tasks + + # While loop to ensure that all captions are processed. + while True: + try: + # Files that have not been processed yet. + completed_files = os.listdir(output_dir) + print(f"completed_files: {len(completed_files)}") + + # Files that have not been processed yet. + incomplete_files = [ + f for f in caption_files if f not in completed_files] + print(f"incomplete_files: {len(incomplete_files)}") + + # Break the loop when there are no incomplete files + if len(incomplete_files) == 0: + break + if len(incomplete_files) <= num_tasks: + num_tasks = 1 + + # Split tasks into parts. 
+ part_len = len(incomplete_files) // num_tasks + all_parts = [incomplete_files[i:i + part_len] + for i in range(0, len(incomplete_files), part_len)] + task_args = [(prediction_set, part, args.output_dir, args.gpt_version) + for part in all_parts] + print('Generate', len(all_parts), 'subprocess.') + + # Use a pool of workers to process the files in parallel. + with Pool() as pool: + pool.starmap(annotate, task_args) + + except Exception as e: + print(f"Error: {e}") + + # Combine all the processed files into one + combined_contents = {} + json_path = args.output_json + + # Iterate through json files + for file_name in os.listdir(output_dir): + if file_name.endswith(".json"): + file_path = os.path.join(output_dir, file_name) + with open(file_path, "r") as json_file: + content = json.load(json_file) + combined_contents[file_name[:-5]] = content + + # Write combined content to a json file + with open(json_path, "w") as json_file: + json.dump(combined_contents, json_file, indent=4) + print("All evaluation completed!") + + # Calculate average score + score_sum = 0 + count = 0 + for key, result in combined_contents.items(): + count += 1 + score_match = result[0]['score'] + score = int(score_match) + score_sum += score + average_score = score_sum / count + + print("Average score for contextual understanding:", average_score) + + +if __name__ == "__main__": + main() diff --git a/baselines/share4video/eval/evaluate_benchmark_4_temporal.py b/baselines/share4video/eval/evaluate_benchmark_4_temporal.py new file mode 100644 index 0000000..dfc4f22 --- /dev/null +++ b/baselines/share4video/eval/evaluate_benchmark_4_temporal.py @@ -0,0 +1,204 @@ +from openai import OpenAI +import os +import argparse +import json +import ast +from multiprocessing.pool import Pool + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get('OPENAIKEY'), + base_url=os.environ.get('OPENAIBASE') +) + +def parse_args(): + parser = argparse.ArgumentParser( + 
description="question-answer-generation-using-gpt-3") + parser.add_argument("--pred_path", required=True, + help="The path to file containing prediction.") + parser.add_argument("--output_dir", required=True, + help="The path to save annotation json files.") + parser.add_argument("--output_json", required=True, + help="The path to save annotation final combined json file.") + parser.add_argument("--num_tasks", required=True, + type=int, help="Number of splits.") + parser.add_argument("--num_chunks", default=1, + type=int, help="Result splits") + parser.add_argument("--gpt_version", default="gpt-3.5-turbo-0125", + type=str, help="version of gpt") + args = parser.parse_args() + return args + + +def annotate(prediction_set, caption_files, output_dir, gpt_version): + """ + Evaluates question and answer pairs using GPT-3 and + returns a score for temporal understanding. + """ + for file in caption_files: + key = file[:-5] # Strip file extension + qa_set = prediction_set[key] + question = qa_set['q'] + answer = qa_set['a'] + pred = qa_set['pred'] + try: + print(key, 'query') + # Compute the temporal understanding score + completion = client.chat.completions.create( + model=gpt_version, + messages=[ + { + "role": "system", + "content": + "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. " + "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the temporal consistency between the predicted answer and the correct answer. 
The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n" + "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n" + "- Evaluate the temporal accuracy of the prediction compared to the answer." + }, + { + "role": "user", + "content": + "Please evaluate the following video-based question-answer pair:\n\n" + f"Question: {question}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer: {pred}\n\n" + "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}." + } + ] + ) + # Convert response to a Python dictionary. + response_message = completion.choices[0].message.content + response_dict = ast.literal_eval(response_message) + result_qa_pair = [response_dict, qa_set] + + print(key, 'done') + + # Save the question-answer pairs to a json file. + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f, indent=4) + + except Exception as e: + print(f"Error processing file '{key}': {e}") + + +def main(): + """ + Main function to control the flow of the program. + """ + # Parse arguments. 
+ args = parse_args() + + pred_contents = [] + for _idx in range(args.num_chunks): + file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json") + pred_contents += [json.loads(line) for line in open(file)] + + # Dictionary to store the count of occurrences for each video_id + video_id_counts = {} + new_pred_contents = [] + + # Iterate through each sample in pred_contents + for sample in pred_contents: + video_id = sample['video_name'] + if video_id in video_id_counts: + video_id_counts[video_id] += 1 + else: + video_id_counts[video_id] = 0 + + # Create a new sample with the modified key + new_sample = sample + new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" + new_pred_contents.append(new_sample) + + # Generating list of id's and corresponding files + id_list = [x['video_name'] for x in new_pred_contents] + caption_files = [f"{id}.json" for id in id_list] + + output_dir = args.output_dir + # Generate output directory if not exists. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Preparing dictionary of question-answer sets + prediction_set = {} + for sample in new_pred_contents: + id = sample['video_name'] + question = sample['Q'] + answer = sample['A'] + pred = sample['pred'] + qa_set = {"q": question, "a": answer, "pred": pred} + prediction_set[id] = qa_set + + num_tasks = args.num_tasks + + # While loop to ensure that all captions are processed. + while True: + try: + # Files that have not been processed yet. + completed_files = os.listdir(output_dir) + print(f"completed_files: {len(completed_files)}") + + # Files that have not been processed yet. + incomplete_files = [ + f for f in caption_files if f not in completed_files] + print(f"incomplete_files: {len(incomplete_files)}") + + # Break the loop when there are no incomplete files + if len(incomplete_files) == 0: + break + if len(incomplete_files) <= num_tasks: + num_tasks = 1 + + # Split tasks into parts. 
+ part_len = len(incomplete_files) // num_tasks + all_parts = [incomplete_files[i:i + part_len] + for i in range(0, len(incomplete_files), part_len)] + task_args = [(prediction_set, part, args.output_dir, args.gpt_version) + for part in all_parts] + print('Generate', len(all_parts), 'subprocess.') + + # Use a pool of workers to process the files in parallel. + with Pool() as pool: + pool.starmap(annotate, task_args) + + except Exception as e: + print(f"Error: {e}") + + # Combine all the processed files into one + combined_contents = {} + json_path = args.output_json + + # Iterate through json files + for file_name in os.listdir(output_dir): + if file_name.endswith(".json"): + file_path = os.path.join(output_dir, file_name) + with open(file_path, "r") as json_file: + content = json.load(json_file) + combined_contents[file_name[:-5]] = content + + # Write combined content to a json file + with open(json_path, "w") as json_file: + json.dump(combined_contents, json_file, indent=4) + print("All evaluation completed!") + + # Calculate average score + score_sum = 0 + count = 0 + for key, result in combined_contents.items(): + count += 1 + score_match = result[0]['score'] + score = int(score_match) + score_sum += score + average_score = score_sum / count + + print("Average score temporal understanding:", average_score) + + +if __name__ == "__main__": + main() diff --git a/baselines/share4video/eval/evaluate_benchmark_5_consistency.py b/baselines/share4video/eval/evaluate_benchmark_5_consistency.py new file mode 100644 index 0000000..6699a97 --- /dev/null +++ b/baselines/share4video/eval/evaluate_benchmark_5_consistency.py @@ -0,0 +1,216 @@ +import argparse +import ast +import json +import os +from multiprocessing.pool import Pool + +from openai import OpenAI + +client = OpenAI( + # This is the default and can be omitted + api_key=os.environ.get('OPENAIKEY'), + base_url=os.environ.get('OPENAIBASE') +) + + +def parse_args(): + parser = argparse.ArgumentParser( + 
description="question-answer-generation-using-gpt-3") + parser.add_argument("--pred_path", required=True, + help="The path to file containing prediction.") + parser.add_argument("--output_dir", required=True, + help="The path to save annotation json files.") + parser.add_argument("--output_json", required=True, + help="The path to save annotation final combined json file.") + parser.add_argument("--num_tasks", required=True, + type=int, help="Number of splits.") + parser.add_argument("--num_chunks", default=1, + type=int, help="Result splits") + parser.add_argument("--gpt_version", default="gpt-3.5-turbo-0125", + type=str, help="version of gpt") + args = parser.parse_args() + return args + + +def annotate(prediction_set, caption_files, output_dir, gpt_version): + """ + Evaluates question and answer pairs using GPT-3 and + returns a score for consistency. + """ + for file in caption_files: + key = file[:-5] # Strip file extension + qa_set = prediction_set[key] + question1 = qa_set['q1'] + question2 = qa_set['q2'] + answer = qa_set['a'] + pred1 = qa_set['pred1'] + pred2 = qa_set['pred2'] + try: + print(key, 'query') + + # Compute the consistency score + completion = client.chat.completions.create( + model=gpt_version, + messages=[ + { + "role": "system", + "content": + "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. " + "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ." + "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:" + "------" + "##INSTRUCTIONS: " + "- Focus on the consistency between the two predicted answers and the correct answer. 
Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n" + "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n" + "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n" + "- Evaluate the consistency of the two predicted answers compared to the correct answer." + }, + { + "role": "user", + "content": + "Please evaluate the following video-based question-answer pair:\n\n" + f"Question 1: {question1}\n" + f"Question 2: {question2}\n" + f"Correct Answer: {answer}\n" + f"Predicted Answer to Question 1: {pred1}\n" + f"Predicted Answer to Question 2: {pred2}\n\n" + "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. " + "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING." + "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. " + "For example, your response should look like this: {''score': 4.8}." + } + ] + ) + # Convert response to a Python dictionary. + response_message = completion.choices[0].message.content + response_dict = ast.literal_eval(response_message) + result_qa_pair = [response_dict, qa_set] + + print(key, 'done') + + # Save the question-answer pairs to a json file. + with open(f"{output_dir}/{key}.json", "w") as f: + json.dump(result_qa_pair, f, indent=4) + + except Exception as e: + print(f"Error processing file '{key}': {e}") + + +def main(): + """ + Main function to control the flow of the program. + """ + # Parse arguments. 
+ args = parse_args() + + pred_contents = [] + for _idx in range(args.num_chunks): + file = os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json") + pred_contents += [json.loads(line) for line in open(file)] + + # Dictionary to store the count of occurrences for each video_id + video_id_counts = {} + new_pred_contents = [] + + # Iterate through each sample in pred_contents + for sample in pred_contents: + video_id = sample['video_name'] + if video_id in video_id_counts: + video_id_counts[video_id] += 1 + else: + video_id_counts[video_id] = 0 + + # Create a new sample with the modified key + new_sample = sample + new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}" + new_pred_contents.append(new_sample) + + # Generating list of id's and corresponding files + id_list = [x['video_name'] for x in new_pred_contents] + caption_files = [f"{id}.json" for id in id_list] + + output_dir = args.output_dir + # Generate output directory if not exists. + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Preparing dictionary of question-answer sets + prediction_set = {} + for sample in new_pred_contents: + id = sample['video_name'] + question1 = sample['Q1'] + question2 = sample['Q1'] + answer = sample['A'] + pred1 = sample['pred1'] + pred2 = sample['pred2'] + qa_set = {"q1": question1, "q2": question2, + "a": answer, "pred1": pred1, "pred2": pred2} + prediction_set[id] = qa_set + + num_tasks = args.num_tasks + + # While loop to ensure that all captions are processed. + while True: + try: + # Files that have not been processed yet. + completed_files = os.listdir(output_dir) + print(f"completed_files: {len(completed_files)}") + + # Files that have not been processed yet. 
+ incomplete_files = [ + f for f in caption_files if f not in completed_files] + print(f"incomplete_files: {len(incomplete_files)}") + + # Break the loop when there are no incomplete files + if len(incomplete_files) == 0: + break + if len(incomplete_files) <= num_tasks: + num_tasks = 1 + + # Split tasks into parts. + part_len = len(incomplete_files) // num_tasks + all_parts = [incomplete_files[i:i + part_len] + for i in range(0, len(incomplete_files), part_len)] + task_args = [(prediction_set, part, args.output_dir, args.gpt_version) + for part in all_parts] + print('Generate', len(all_parts), 'subprocess.') + + # Use a pool of workers to process the files in parallel. + with Pool() as pool: + pool.starmap(annotate, task_args) + + except Exception as e: + print(f"Error: {e}") + + # Combine all the processed files into one + combined_contents = {} + json_path = args.output_json + + # Iterate through json files + for file_name in os.listdir(output_dir): + if file_name.endswith(".json"): + file_path = os.path.join(output_dir, file_name) + with open(file_path, "r") as json_file: + content = json.load(json_file) + combined_contents[file_name[:-5]] = content + + # Write combined content to a json file + with open(json_path, "w") as json_file: + json.dump(combined_contents, json_file, indent=4) + print("All evaluation completed!") + + # Calculate average score + score_sum = 0 + count = 0 + for key, result in combined_contents.items(): + count += 1 + score_match = result[0]['score'] + score = int(score_match) + score_sum += score + average_score = score_sum / count + + print("Average score for consistency:", average_score) + + +if __name__ == "__main__": + main() diff --git a/baselines/share4video/eval/model_vqa_loader.py b/baselines/share4video/eval/model_vqa_loader.py new file mode 100644 index 0000000..2e10eb5 --- /dev/null +++ b/baselines/share4video/eval/model_vqa_loader.py @@ -0,0 +1,146 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm 
+import shortuuid + +from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from torch.utils.data import Dataset, DataLoader + +from PIL import Image +import math + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +# Custom dataset class +class CustomDataset(Dataset): + def __init__(self, questions, image_folder, tokenizer, image_processor, model_config): + self.questions = questions + self.image_folder = image_folder + self.tokenizer = tokenizer + self.image_processor = image_processor + self.model_config = model_config + + def __getitem__(self, index): + line = self.questions[index] + image_file = line["image"] + qs = line["text"] + if self.model_config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB') + image_tensor = process_images([image], self.image_processor, self.model_config)[0] + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + + return input_ids, image_tensor, image.size + + def __len__(self): + return len(self.questions) + + +def collate_fn(batch): + input_ids, image_tensors, image_sizes = zip(*batch) + 
input_ids = torch.stack(input_ids, dim=0) + image_tensors = torch.stack(image_tensors, dim=0) + return input_ids, image_tensors, image_sizes + + +# DataLoader +def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4): + assert batch_size == 1, "batch_size must be 1" + dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config) + data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn) + return data_loader + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + answers_file = os.path.expanduser(args.answers_file) + os.makedirs(os.path.dirname(answers_file), exist_ok=True) + ans_file = open(answers_file, "w") + + if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: + args.conv_mode = args.conv_mode + '_mmtag' + print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') + + data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config) + + for (input_ids, image_tensor, image_sizes), line in tqdm(zip(data_loader, questions), total=len(questions)): + idx = line["question_id"] + cur_prompt = line["text"] + + input_ids = input_ids.to(device='cuda', non_blocking=True) + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token is not None else tokenizer.eos_token_id + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.to(dtype=torch.float16, 
device='cuda', non_blocking=True), + image_sizes=image_sizes, + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + max_new_tokens=args.max_new_tokens, + pad_token_id=pad_token_id, + use_cache=True) + + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + ans_id = shortuuid.uuid() + ans_file.write(json.dumps({"question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": model_name, + "metadata": {}}) + "\n") + # ans_file.flush() + ans_file.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--max_new_tokens", type=int, default=128) + args = parser.parse_args() + + eval_model(args) diff --git a/baselines/share4video/eval/model_vqa_tempcompass.py b/baselines/share4video/eval/model_vqa_tempcompass.py new file mode 100644 index 0000000..443b105 --- /dev/null +++ b/baselines/share4video/eval/model_vqa_tempcompass.py @@ -0,0 +1,220 @@ +import argparse +import json +import math +import os + +import numpy as np +import torch +from decord import VideoReader +from PIL import Image +from tqdm import tqdm + +from llava.constants import (DEFAULT_IM_END_TOKEN, 
DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +from llava.conversation import conv_templates +from llava.mm_utils import (get_model_name_from_path, process_images, + tokenizer_image_token) +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init + + +def create_frame_grid(img_array, interval_width): + n, h, w, c = img_array.shape + grid_size = int(np.ceil(np.sqrt(n))) + + horizontal_band = np.ones((h, interval_width, c), + dtype=img_array.dtype) * 255 + vertical_band = np.ones((interval_width, w + (grid_size - 1) + * (w + interval_width), c), dtype=img_array.dtype) * 255 + + rows = [] + for i in range(grid_size): + row_frames = [] + for j in range(grid_size): + idx = i * grid_size + j + if idx < n: + frame = img_array[idx] + else: + frame = np.ones_like(img_array[0]) * 255 + if j > 0: + row_frames.append(horizontal_band) + row_frames.append(frame) + combined_row = np.concatenate(row_frames, axis=1) + if i > 0: + rows.append(vertical_band) + rows.append(combined_row) + + final_grid = np.concatenate(rows, axis=0) + return final_grid + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + + +def get_seq_frames(total_num_frames, desired_num_frames): + seg_size = float(total_num_frames - 1) / desired_num_frames + seq = [] + for i in range(desired_num_frames): + start = int(np.round(seg_size * i)) + end = int(np.round(seg_size * (i + 1))) + seq.append((start + end) // 2) + + return seq + + +def load_video(vis_path, num_frm=8): + vr = VideoReader(vis_path) + total_frame_num = len(vr) + frame_idx = get_seq_frames(total_frame_num, num_frm) + img_array = vr.get_batch(frame_idx).asnumpy() # (n_clips*num_frm, H, W, 3) + img_grid = create_frame_grid(img_array, 50) + img_grid = 
Image.fromarray(img_grid).convert("RGB") + + return [img_grid] + + +def process_data(video_id, qs, model_config, image_folder, tokenizer, processor, num_grid=-1): + if num_grid != -1: + qs = "The provided image arranges keyframes from a video in a grid view, keyframes are separated with white bands. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the frames. Question: " + qs + if model_config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + \ + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image = os.path.join(image_folder, video_id) + image_grid = load_video(image, num_grid) + image_size = image_grid[0].size + image_tensor = process_images( + image_grid, processor, model_config)[0] + + input_ids = tokenizer_image_token( + prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + + return input_ids, image_tensor, image_size + + +def eval_dataset(args): + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model( + model_path, args.model_base, model_name, lora_alpha=args.lora_alpha) + + if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode: + args.conv_mode = args.conv_mode + '_mmtag' + print( + f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.') + + qa_json = args.Eval_QA_root + image_folder = args.image_folder + + with open(qa_json, 'r', encoding='utf-8') as f: + data = json.load(f) + key_list = list(data.keys()) + key_list.sort() + keys = get_chunk(key_list, args.num_chunks, args.chunk_idx) + + 
answer_prompt = { + # "multi-choice": "\nBest Option:", # The old version + "multi-choice": "\nPlease directly give the best option:", + "yes_no": "\nPlease answer yes or no:", + # "caption_matching": "\nBest Option:", #The old version + "caption_matching": "\nPlease directly give the best option:", + "captioning": "" # The answer "Generated Caption:" is already contained in the question + } + + eval_dict = {} + for v_id in tqdm(keys): + items = data[v_id] + for dim in items: + for item in items[dim]: + question = item['question'] + answer_prompt[args.task_type] + # =================================You need to change this code ========================= + # ...... + input_ids, image_tensor, image_size = process_data( + v_id+'.mp4', question, model.config, image_folder, tokenizer, image_processor, args.num_grid) + input_ids = input_ids.unsqueeze(0).to( + device='cuda', non_blocking=True) + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.unsqueeze(0).to( + dtype=torch.float16, device='cuda', non_blocking=True), + image_sizes=[image_size], + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + max_new_tokens=args.max_new_tokens, + pad_token_id=tokenizer.eos_token_id, + use_cache=True) + + outputs = tokenizer.batch_decode( + output_ids, skip_special_tokens=True)[0].strip() + + # ...... 
+ # ======================================================================================= + + if v_id not in eval_dict: + eval_dict[v_id] = {} + if dim not in eval_dict[v_id]: + eval_dict[v_id][dim] = [] + + pred = { + 'question': item['question'], + 'answer': item['answer'], + 'prediction': outputs + } + eval_dict[v_id][dim].append(pred) + + eval_dataset_json = args.chat_conversation_output_folder + os.makedirs(os.path.dirname(eval_dataset_json), exist_ok=True) + with open(eval_dataset_json, 'w', encoding='utf-8') as f: + json.dump(eval_dict, f, indent=2) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, + default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-grid", type=int, default=16) + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--task_type", type=str, default=None) + parser.add_argument("--num_beams", type=int, default=1) + parser.add_argument("--lora-alpha", default=None, + type=int, help="lora alpha for scaling weight") + parser.add_argument("--max_new_tokens", type=int, default=128) + parser.add_argument("--dataset_name", type=str, + default=None, help="The type of LLM") + parser.add_argument("--Eval_QA_root", type=str, + default='./', help="folder containing QA JSON files") + parser.add_argument("--Eval_Video_root", type=str, + default='./', help="folder containing video data") + 
def image_parser(args):
    """Split ``args.image_file`` on ``args.sep`` into a list of image locations."""
    return args.image_file.split(args.sep)


def load_image(image_file, timeout=30):
    """Open one image from a local path or an HTTP(S) URL and convert it to RGB.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``. Only real URL schemes are fetched, so a local file
            whose name merely begins with "http" is opened from disk.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            hang the evaluation forever.

    Returns:
        The image returned by ``Image.open(...).convert("RGB")``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if image_file.startswith(("http://", "https://")):
        response = requests.get(image_file, timeout=timeout)
        # Fail loudly here instead of handing an HTML error page to PIL.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    return Image.open(image_file).convert("RGB")


def load_images(image_files):
    """Load every entry of ``image_files`` with :func:`load_image`, keeping order."""
    return [load_image(image_file) for image_file in image_files]


# Checked in order against the lower-cased model name; first match wins.
_CONV_MODE_RULES = (
    ("llama-2", "llava_llama_2"),
    ("mistral", "mistral_instruct"),
    ("v1.6-34b", "chatml_direct"),
    ("v1", "llava_v1"),
    ("mpt", "mpt"),
)


def _infer_conv_mode(model_name):
    """Return the conversation template name implied by ``model_name``."""
    lowered = model_name.lower()
    for needle, mode in _CONV_MODE_RULES:
        if needle in lowered:
            return mode
    return "llava_v0"


def eval_model(args):
    """Run one image-conditioned generation and print the decoded answer.

    Reads ``model_path``, ``model_base``, ``query``, ``conv_mode``,
    ``image_file``, ``sep``, ``temperature``, ``top_p``, ``num_beams`` and
    ``max_new_tokens`` from ``args``. When ``args.conv_mode`` is ``None`` it is
    overwritten with the mode inferred from the model name.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    # Put the image token(s) where the placeholder is, or in front of the query.
    qs = args.query
    if model.config.mm_use_im_start_end:
        image_token = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    else:
        image_token = DEFAULT_IMAGE_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        qs = re.sub(IMAGE_PLACEHOLDER, image_token, qs)
    else:
        qs = image_token + "\n" + qs

    conv_mode = _infer_conv_mode(model_name)
    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    images = load_images(image_parser(args))
    image_sizes = [x.size for x in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    # Keep the token ids on the same device as the image tensor; a bare
    # ``.cuda()`` targets the default CUDA device, which may not be the model's.
    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=args.temperature > 0,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print(outputs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-file", type=str, required=True)
    parser.add_argument("--query", type=str, required=True)
    parser.add_argument("--conv-mode", type=str, default=None)
    parser.add_argument("--sep", type=str, default=",")
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=512)
    args = parser.parse_args()

    eval_model(args)
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def load_model_and_dataset(rank, world_size, args):
    """Load the model onto device ``rank`` and build this worker's MVBench shard.

    The checkpoint is loaded with ``device_map='cpu'`` and only afterwards moved
    to ``torch.device(rank)`` and switched to eval mode.

    Returns:
        ``(model, tokenizer, processor, dataset)``.
    """
    # Original author's note: very large checkpoints (30B+) can use up a lot of
    # memory while loading.
    disable_torch_init()
    checkpoint = os.path.expanduser(args.model_path)
    checkpoint_name = get_model_name_from_path(checkpoint)
    tokenizer, model, processor, _context_len = load_pretrained_model(
        checkpoint, args.model_base, checkpoint_name, device_map='cpu')
    logger.info('done loading llava')

    model = model.to(torch.device(rank)).eval()

    shard = MVBenchDataset(num_segments=args.num_frames)
    shard.set_rank_and_world_size(rank, world_size)
    return model, tokenizer, processor, shard


def infer_mvbench(
    model,
    processor,
    tokenizer,
    data_sample,
    conv_mode,
    pre_query_prompt=None,  # add in the head of question
    post_query_prompt=None,  # add in the end of question
    answer_prompt=None,  # add in the begining of answer
    return_prompt=None,  # add in the begining of return message
    print_res=False,
):
    """Answer one benchmark sample and return the model's text.

    The question from ``data_sample['question']`` is wrapped with the optional
    pre/post prompts. If ``answer_prompt`` is given, it is added as the start of
    the assistant turn. Generation is greedy and capped at 32 new tokens.
    ``return_prompt``, when given, is prepended to the returned text.
    """
    frames = data_sample["video_pils"]
    dialogue = conv_templates[conv_mode].copy()
    dialogue.user_query(data_sample['question'],
                        pre_query_prompt, post_query_prompt, is_mm=True)
    if answer_prompt is not None:
        dialogue.assistant_response(answer_prompt)

    reply, dialogue = video_answer(
        conv=dialogue,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
        img_grid=frames,
        max_new_tokens=32,
        do_sample=False,
        print_res=print_res
    )

    if return_prompt is None:
        return reply
    return return_prompt + reply
def single_test(model, processor, tokenizer, vid_path, num_frames=4, conv_mode="plain"):
    """Smoke-test the pipeline by asking the model to describe one video.

    ``num_frames`` frames are sampled from ``vid_path``, tiled into one grid
    image and passed to ``video_answer`` with printing enabled. With
    ``num_frames == 0`` no video is read and ``None`` is passed as the image.
    Nothing is returned.
    """
    def sample_positions(total_frames, num_segments):
        # One index per equal-width segment of the video.
        step = float(total_frames - 1) / num_segments
        first = int(step / 2)
        return np.array([
            first + int(np.round(step * k)) for k in range(num_segments)
        ])

    def build_grid(video_path, num_segments):
        reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        positions = sample_positions(len(reader), num_segments)
        frames = reader.get_batch(positions).asnumpy()
        grid = Image.fromarray(create_frame_grid(frames, 50)).convert("RGB")
        grid = resize_image_grid(grid)
        fps = float(reader.get_avg_fps())
        sec = ", ".join([str(round(f / fps, 1)) for f in positions])
        # " " should be added in the start and end
        note = f"The video contains {len(positions)} frames sampled at {sec} seconds."
        return grid, note

    if num_frames == 0:
        img_grid, _note = None, 'num_frames is 0, not inputing image'
    else:
        img_grid, _note = build_grid(vid_path, num_frames)

    conv = conv_templates[conv_mode].copy()
    conv.user_query("Describe the video in details.", is_mm=True)
    video_answer(conv=conv, model=model, processor=processor, tokenizer=tokenizer,
                 do_sample=False, img_grid=img_grid, max_new_tokens=256, print_res=True)
def run(rank, args, world_size):
    """Evaluate this rank's dataset shard and return one result dict per sample.

    Rank 0 also runs ``single_test`` first and drives the progress bar. Other
    ranks lower their log level to ERROR.

    Returns:
        List of dicts with keys ``pred``, ``gt``, ``task_type``, ``video_path``
        and ``question``.
    """
    is_main = rank == 0
    if not is_main:
        transformers.utils.logging.set_verbosity_error()
        logger.setLevel(transformers.logging.ERROR)

    print_res = True
    conv_mode = args.conv_mode

    pre_query_prompt = "The provided image arranges keyframes from a video in a grid view, keyframes are separated with white bands. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the frames."
    post_query_prompt = "\nOnly give the best option."

    logger.info(f'loading model and constructing dataset to gpu {rank}...')
    model, tokenizer, processor, dataset = load_model_and_dataset(
        rank, world_size, args)
    logger.info('done model and dataset...')
    logger.info('constructing dataset...')
    logger.info('single test...')

    vid_path = "images/104554.webm"
    if is_main:
        single_test(model,
                    processor,
                    tokenizer,
                    vid_path,
                    num_frames=args.num_frames,
                    conv_mode=args.conv_mode)
        logger.info('single test done...')
        tbar = tqdm(total=len(dataset))

    correct = 0
    total = 0
    result_list = []
    acc_dict = {}  # task_type -> [correct, total]
    done_count = 0

    for example in dataset:
        task_type = example['task_type']
        task_stats = acc_dict.setdefault(task_type, [0, 0])
        task_stats[1] += 1
        total += 1
        pred = infer_mvbench(
            model,
            processor,
            tokenizer,
            example,
            conv_mode=conv_mode,
            pre_query_prompt=pre_query_prompt,
            post_query_prompt=post_query_prompt,
            answer_prompt="Best option:(",
            return_prompt='(',
            print_res=print_res,
        )
        gt = example['answer']
        result_list.append({
            'pred': pred,
            'gt': gt,
            'task_type': task_type,
            'video_path': example['video_path'],
            'question': example['question'],
        })
        if check_ans(pred=pred, gt=gt):
            task_stats[0] += 1
            correct += 1
        if is_main:
            part_acc = task_stats[0] / task_stats[1] * 100
            total_acc = correct / total * 100
            tbar.update(len(result_list) - done_count)
            tbar.set_description_str(
                f"One Chunk--Task Type: {task_type}, Chunk Part Acc: {part_acc:.2f}%;"
                f" Chunk Total Acc: {total_acc:.2f}%"
            )
            done_count = len(result_list)
    return result_list
def main():
    """Run the MVBench evaluation, or reuse saved results, then save them.

    If ``load_results`` finds results at ``--save_path`` they are reused.
    Otherwise one worker per GPU is spawned when at least two GPUs are visible,
    and a single in-process run on rank 0 is used when fewer are.
    """
    gpu_count = torch.cuda.device_count()
    mp.set_start_method('spawn')
    args = parse_args()
    save_path = args.save_path
    result_list = load_results(save_path)
    if result_list is not None:
        logger.info(f'loaded results from {save_path}')
    elif gpu_count >= 2:
        logger.info(f'started benchmarking, saving to: {save_path}')
        with Pool(gpu_count) as pool:
            worker = functools.partial(run, args=args, world_size=gpu_count)
            per_rank = pool.map(worker, range(gpu_count))

        logger.info('finished running')
        result_list = list(itertools.chain.from_iterable(per_rank))
    else:
        result_list = run(0, world_size=1, args=args)  # debug
    save_results(result_list, save_path)


def parse_args():
    """Parse the command-line options of the MVBench evaluation script."""
    parser = ArgumentParser()
    options = (
        ("--model-path", str,
         'checkpoints/llava-v1.6-7b_vicuna-1.5-7b_clip-large-336_video-sft-mix294k_ft-mlp-llm-lora_lr-mlp-2e-5-llm-2e-4'),
        ("--model-base", str, None),
        ("--save_path", str, './playground/results/mvbench'),
        ("--num_frames", int, 16),
        ("--conv-mode", str, 'eval_mvbench'),
    )
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args()


if __name__ == "__main__":
    main()
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def load_model_and_dataset(rank, world_size, args):
    """Load the model onto device ``rank`` and build this worker's VBench shard.

    The checkpoint is loaded with ``device_map='cpu'`` and only afterwards moved
    to ``torch.device(rank)`` and switched to eval mode.

    Returns:
        ``(model, tokenizer, processor, dataset)``.
    """
    # Original author's note: very large checkpoints (30B+) can use up a lot of
    # memory while loading.
    disable_torch_init()
    checkpoint = os.path.expanduser(args.model_path)
    checkpoint_name = get_model_name_from_path(checkpoint)
    tokenizer, model, processor, _context_len = load_pretrained_model(
        checkpoint, args.model_base, checkpoint_name, device_map='cpu')
    logger.info('done loading llava')

    model = model.to(torch.device(rank)).eval()

    shard = VBenchDataset(num_segments=args.num_frames)
    shard.set_rank_and_world_size(rank, world_size)
    return model, tokenizer, processor, shard


def infer_mvbench(
    model,
    processor,
    tokenizer,
    data_sample,
    conv_mode,
    pre_query_prompt=None,  # add in the head of question
    post_query_prompt=None,  # add in the end of question
    answer_prompt=None,  # add in the begining of answer
    return_prompt=None,  # add in the begining of return message
    print_res=False,
):
    """Answer one benchmark sample and return the model's text.

    The question from ``data_sample['question']`` is wrapped with the optional
    pre/post prompts. If ``answer_prompt`` is given, it is added as the start of
    the assistant turn. Generation is greedy and capped at 32 new tokens.
    ``return_prompt``, when given, is prepended to the returned text.
    """
    frames = data_sample["video_pils"]
    dialogue = conv_templates[conv_mode].copy()
    dialogue.user_query(data_sample['question'],
                        pre_query_prompt, post_query_prompt, is_mm=True)
    if answer_prompt is not None:
        dialogue.assistant_response(answer_prompt)

    reply, dialogue = video_answer(
        conv=dialogue,
        model=model,
        processor=processor,
        tokenizer=tokenizer,
        img_grid=frames,
        max_new_tokens=32,
        do_sample=False,
        print_res=print_res
    )

    if return_prompt is None:
        return reply
    return return_prompt + reply
def single_test(model, processor, tokenizer, vid_path, num_frames=4, conv_mode="plain"):
    """Smoke-test the pipeline by asking the model to describe one video.

    ``num_frames`` frames are sampled from ``vid_path``, tiled into one grid
    image and passed to ``video_answer`` with printing enabled. With
    ``num_frames == 0`` no video is read and ``None`` is passed as the image.
    Nothing is returned.
    """
    def sample_positions(total_frames, num_segments):
        # One index per equal-width segment of the video.
        step = float(total_frames - 1) / num_segments
        first = int(step / 2)
        return np.array([
            first + int(np.round(step * k)) for k in range(num_segments)
        ])

    def build_grid(video_path, num_segments):
        reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        positions = sample_positions(len(reader), num_segments)
        frames = reader.get_batch(positions).asnumpy()
        grid = Image.fromarray(create_frame_grid(frames, 50)).convert("RGB")
        grid = resize_image_grid(grid)
        fps = float(reader.get_avg_fps())
        sec = ", ".join([str(round(f / fps, 1)) for f in positions])
        # " " should be added in the start and end
        note = f"The video contains {len(positions)} frames sampled at {sec} seconds."
        return grid, note

    if num_frames == 0:
        img_grid, _note = None, 'num_frames is 0, not inputing image'
    else:
        img_grid, _note = build_grid(vid_path, num_frames)

    conv = conv_templates[conv_mode].copy()
    conv.user_query("Describe the video in details.", is_mm=True)
    video_answer(conv=conv, model=model, processor=processor, tokenizer=tokenizer,
                 do_sample=False, img_grid=img_grid, max_new_tokens=256, print_res=True)
def run(rank, args, world_size):
    """Evaluate this rank's dataset shard and return one result dict per sample.

    Rank 0 also runs ``single_test`` first and drives the progress bar. Other
    ranks lower their log level to ERROR.

    Returns:
        List of dicts with keys ``pred``, ``gt``, ``task_type``, ``task_split``,
        ``video_path`` and ``question``.
    """
    is_main = rank == 0
    if not is_main:
        transformers.utils.logging.set_verbosity_error()
        logger.setLevel(transformers.logging.ERROR)

    print_res = True
    conv_mode = args.conv_mode

    pre_query_prompt = "The provided image arranges keyframes from a video in a grid view, keyframes are separated with white bands. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the frames."
    post_query_prompt = "\nOnly give the best option."

    logger.info(f'loading model and constructing dataset to gpu {rank}...')
    model, tokenizer, processor, dataset = load_model_and_dataset(
        rank, world_size, args)
    logger.info('done model and dataset...')
    logger.info('constructing dataset...')
    logger.info('single test...')

    vid_path = "images/104554.webm"
    if is_main:
        single_test(model,
                    processor,
                    tokenizer,
                    vid_path,
                    num_frames=args.num_frames,
                    conv_mode=args.conv_mode)
        logger.info('single test done...')
        tbar = tqdm(total=len(dataset))

    correct = 0
    total = 0
    result_list = []
    acc_dict = {}  # task_type -> [correct, total]
    done_count = 0

    for example in dataset:
        task_type = example['task_type']
        task_stats = acc_dict.setdefault(task_type, [0, 0])
        task_stats[1] += 1
        total += 1
        pred = infer_mvbench(
            model,
            processor,
            tokenizer,
            example,
            conv_mode=conv_mode,
            pre_query_prompt=pre_query_prompt,
            post_query_prompt=post_query_prompt,
            answer_prompt="Best option:(",
            return_prompt='(',
            print_res=print_res,
        )
        gt = example['answer']
        result_list.append({
            'pred': pred,
            'gt': gt,
            'task_type': task_type,
            'task_split': example['task_split'],
            'video_path': example['video_path'],
            'question': example['question'],
        })
        if check_ans(pred=pred, gt=gt):
            task_stats[0] += 1
            correct += 1
        if is_main:
            part_acc = task_stats[0] / task_stats[1] * 100
            total_acc = correct / total * 100
            tbar.update(len(result_list) - done_count)
            tbar.set_description_str(
                f"One Chunk--Task Type: {task_type}, Chunk Part Acc: {part_acc:.2f}%;"
                f" Chunk Total Acc: {total_acc:.2f}%"
            )
            done_count = len(result_list)
    return result_list
def main():
    """Run the VBench evaluation, or reuse saved results, then save them.

    If ``load_results`` finds results at ``--save_path`` they are reused.
    Otherwise one worker per GPU is spawned when at least two GPUs are visible,
    and a single in-process run on rank 0 is used when fewer are.
    """
    gpu_count = torch.cuda.device_count()
    mp.set_start_method('spawn')
    args = parse_args()
    save_path = args.save_path
    result_list = load_results(save_path)
    if result_list is not None:
        logger.info(f'loaded results from {save_path}')
    elif gpu_count >= 2:
        logger.info(f'started benchmarking, saving to: {save_path}')
        with Pool(gpu_count) as pool:
            worker = functools.partial(run, args=args, world_size=gpu_count)
            per_rank = pool.map(worker, range(gpu_count))

        logger.info('finished running')
        result_list = list(itertools.chain.from_iterable(per_rank))
    else:
        result_list = run(0, world_size=1, args=args)  # debug
    save_results(result_list, save_path)


def parse_args():
    """Parse the command-line options of the VBench evaluation script."""
    parser = ArgumentParser()
    options = (
        ("--model-path", str,
         'checkpoints/llava-v1.6-7b_vicuna-1.5-7b_clip-large-336_video-sft-mix294k_ft-mlp-llm-lora_lr-mlp-2e-5-llm-2e-4'),
        ("--model-base", str, None),
        ("--save_path", str, './playground/results/vbench'),
        ("--num_frames", int, 16),
        ("--conv-mode", str, 'eval_vbench'),
    )
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args()


if __name__ == "__main__":
    main()
def load_json(load_dir_path, json_file_name):
    """Return the parsed JSON file ``load_dir_path/json_file_name``, or ``None`` if it does not exist."""
    path = os.path.join(load_dir_path, json_file_name)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    return None


def dump_json(obj_serializable, save_dir_path, json_file_name):
    """Write ``obj_serializable`` as indented UTF-8 JSON, creating the directory if needed."""
    os.makedirs(save_dir_path, exist_ok=True)
    with open(os.path.join(save_dir_path, json_file_name), 'w', encoding='utf-8') as handle:
        json.dump(obj_serializable, handle, indent=4, ensure_ascii=False)


def create_frame_grid(img_array, interval_width=50):
    """Tile frames into a square grid separated by white bands.

    Args:
        img_array: Array unpacked as ``(n, h, w, c)``.
        interval_width: Width in pixels of the band between neighbouring cells.

    Returns:
        One array holding a ``ceil(sqrt(n))``-per-side grid, filled row by row.
        Cells beyond the ``n`` supplied frames are filled with 255.
    """
    count, height, width, channels = img_array.shape
    side = int(np.ceil(np.sqrt(count)))
    dtype = img_array.dtype

    # Band between two frames of the same row.
    column_gap = np.ones((height, interval_width, channels), dtype=dtype) * 255
    # Band between two rows; as wide as a full row including its column gaps.
    row_gap = np.ones(
        (interval_width, width + (side - 1) * (width + interval_width), channels),
        dtype=dtype) * 255

    stacked = []
    for row in range(side):
        pieces = []
        for col in range(side):
            position = row * side + col
            if position < count:
                cell = img_array[position]
            else:
                cell = np.ones_like(img_array[0]) * 255
            if col > 0:
                pieces.append(column_gap)
            pieces.append(cell)
        if row > 0:
            stacked.append(row_gap)
        stacked.append(np.concatenate(pieces, axis=1))

    return np.concatenate(stacked, axis=0)


def resize_image_grid(image, max_length=1920):
    """Scale ``image`` down so its longer side is at most ``max_length``.

    Images already within the limit are returned unchanged. Larger ones are
    resized with bilinear resampling, both sides using the same scale factor.
    """
    width, height = image.size
    if max(width, height) <= max_length:
        return image
    scale = max_length / width if width > height else max_length / height
    return image.resize((int(width * scale), int(height * scale)), Image.BILINEAR)
class EasyDict(dict):
    """A ``dict`` whose keys can also be read and written as attributes.

    Every assignment, by attribute or by item, stores the value both as an
    instance attribute and as a dict item. Nested plain dicts are converted to
    the same class, and lists/tuples are stored as lists whose dict elements
    are converted too. Public class attributes of a subclass are copied into
    each instance, so they also appear as items.

    >>> d = EasyDict({'foo': 3, 'bar': {'x': 1, 'y': 2}})
    >>> d.foo, d['foo'], d.bar.x
    (3, 3, 1)
    >>> d.baz = [{'k': 'v'}]
    >>> d.baz[0].k
    'v'
    >>> d.update(foo=4)
    >>> d.pop('foo')
    4
    >>> hasattr(d, 'foo')
    False
    """

    def __init__(self, d=None, **kwargs):
        """Populate from mapping ``d`` and/or keyword arguments (keywords win).

        The caller's mapping is copied first, so it is never modified.
        """
        merged = {} if d is None else dict(d)
        merged.update(kwargs)
        for k, v in merged.items():
            setattr(self, k, v)
        # Class attributes
        for k in self.__class__.__dict__.keys():
            if not (k.startswith("__") and k.endswith("__")) and k not in ("update", "pop"):
                setattr(self, k, getattr(self, k))

    def __setattr__(self, name, value):
        """Store ``value`` as both attribute and item, converting nested containers."""
        if isinstance(value, (list, tuple)):
            value = [self.__class__(x) if isinstance(
                x, dict) else x for x in value]
        elif isinstance(value, dict) and not isinstance(value, self.__class__):
            value = self.__class__(value)
        super(EasyDict, self).__setattr__(name, value)
        super(EasyDict, self).__setitem__(name, value)

    __setitem__ = __setattr__

    def update(self, e=None, **f):
        """Merge mapping ``e`` and keyword arguments into this object.

        ``e`` is copied before the keywords are merged in, so the caller's
        mapping is left untouched.
        """
        merged = dict(e) if e else {}
        merged.update(f)
        for k in merged:
            setattr(self, k, merged[k])

    def pop(self, k, d=None):
        """Remove key ``k`` (attribute and item) and return its value, or ``d`` if absent."""
        if hasattr(self, k):
            delattr(self, k)
        return super(EasyDict, self).pop(k, d)
class EvalDataset(Dataset):
    """Base class for the video benchmarks: frame sampling, media readers and sharding.

    Subclasses are expected to fill ``self.data_list`` and implement
    ``__getitem__``. ``__str__`` reads ``task_type`` and
    ``data['candidates']`` from each ``data_list`` entry.
    """

    def __init__(self, num_segments, test_ratio=None):
        super().__init__()
        self.num_segments = num_segments
        self.test_ratio = test_ratio
        # Media kind -> reader. Note that 'gif' is routed to read_clip_gif.
        self.decord_method = {
            'video': self.read_video,
            'gif': self.read_clip_gif,
            'frame': self.read_frame,
        }

    def __getitem__(self, index) -> Any:
        raise NotImplementedError('')

    @staticmethod
    def _percent(numerator, denominator):
        """Return ``numerator / denominator`` as a percentage, or 0.0 when the denominator is 0."""
        return numerator / denominator * 100 if denominator else 0.0

    def __str__(self):
        """Summarise sample and option counts per task, with the chance-level accuracy."""
        len_list = {}
        option_list = {}
        for data in self.data_list:
            task = data['task_type']
            len_list[task] = len_list.get(task, 0) + 1
            option_list[task] = option_list.get(task, 0) + len(data['data']['candidates'])

        correct = 0
        total = 0
        res = f"There are {len(self.data_list)} videos as follow:\n"
        for k, v in len_list.items():
            # Chance level is questions over options. The previous code also
            # added 1 / option_list[k] per task, which skewed the total.
            correct += v
            total += option_list[k]
            res += f"{v} for {k} ({option_list[k]} options => {self._percent(v, option_list[k]):.2f}%)\n"
        res += f"Total random accuracy: {self._percent(correct, total):.2f}%"
        return res.rstrip()

    def __len__(self):
        return len(self.data_list)

    def get_index(self, bound, fps, max_frame, first_idx=0):
        """Return ``num_segments`` frame indices spread over the selected range.

        Args:
            bound: Optional ``(start, end)`` pair, multiplied by ``fps`` to get
                frame numbers. A falsy value selects the whole clip.
            fps: Frame rate used to convert ``bound`` to frame numbers.
            max_frame: Largest frame index allowed.
            first_idx: Smallest frame index allowed.
        """
        if bound:
            start, end = bound[0], bound[1]
        else:
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.num_segments
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def _pad_to_num_segments(self, images_group):
        """Repeat a too-short frame list until it has exactly ``num_segments`` entries."""
        if len(images_group) < self.num_segments:
            multiplier = int(self.num_segments / len(images_group)) + 1
            images_group = [image for _ in range(
                multiplier) for image in images_group][:self.num_segments]
            assert len(images_group) == self.num_segments
        return images_group

    def read_video(self, video_path, bound=None):
        """Decode the sampled frames of a video file into a list of PIL images."""
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())

        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        return [Image.fromarray(vr[frame_index].asnumpy())
                for frame_index in frame_indices]

    def read_gif(self, video_path, bound=None, fps=25):
        """Read the sampled frames of a gif through ``imageio``, padding short clips."""
        gif = imageio.get_reader(video_path)
        max_frame = len(gif) - 1

        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        for index, frame in enumerate(gif):
            if index in frame_indices:
                img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                img = Image.fromarray(img)
                images_group.append(img)
                if len(images_group) == len(frame_indices):
                    break

        # might be some really short videos in the gif datasets
        return self._pad_to_num_segments(images_group)

    def read_clip_gif(self, video_path, bound=None, fps=25):
        """Read the sampled frames of a gif through ``VideoFileClip``, padding short clips."""
        gif = VideoFileClip(video_path)
        frames = gif.iter_frames()
        max_frame = gif.reader.nframes - 1
        images_group = list()
        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        for index, frame in enumerate(frames):
            if index in frame_indices:
                img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
                img = Image.fromarray(img)
                images_group.append(img)

        # might be some really short videos in the gif datasets
        return self._pad_to_num_segments(images_group)

    def read_frame(self, video_path, bound=None, fps=3):
        """Open sampled frames from a directory of ``00001.jpg``-style images."""
        max_frame = len(os.listdir(video_path))
        frame_indices = self.get_index(
            bound, fps, max_frame, first_idx=1)  # frame_idx starts from 1
        return [Image.open(os.path.join(video_path, f"{frame_index:05d}.jpg"))
                for frame_index in frame_indices]

    def set_rank_and_world_size(self, rank, world_size):
        """Keep only the samples belonging to worker ``rank`` out of ``world_size``.

        Without ``test_ratio`` the list is split with stride ``world_size``.
        With it, the list is shuffled in place using a fixed seed (42) and only
        the first ``num_samples`` entries are split between the workers.
        """
        self.rank = rank
        self.world_size = world_size
        # self.data_list = self.data_list[::200] # debug
        if self.test_ratio is None:
            self.data_list = self.data_list[rank::world_size]
        else:
            np.random.RandomState(42).shuffle(self.data_list)
            # NOTE(review): an int test_ratio is also multiplied by the dataset
            # size; the else-branch only runs for non-numeric values. Confirm
            # whether ints were meant to be absolute sample counts.
            if isinstance(self.test_ratio, (float, int)):
                num_samples = int(len(self.data_list) * self.test_ratio)
            else:
                num_samples = int(self.test_ratio)
            self.data_list = self.data_list[rank:num_samples:world_size]


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()


class MultiModalConvStyle(Enum):
    """How the multimodal token is placed in a user turn (see ``Conversation.user_query``)."""
    MM_ALONE = 'mm_alone'
    MM_INTERLEAF = 'mm_inferleaf'
@dataclasses.dataclass
class Conversation(EasyDict):
    """A class that keeps all conversation history.

    Construction goes through the ``__init__`` defined below, which forwards to
    ``EasyDict.__init__``. A single string ``sep`` is expanded to one separator
    per role.
    """
    system: str
    roles: List[str]
    messages: List[List[str]]
    sep: List[str]
    mm_token: str

    mm_style: MultiModalConvStyle = MultiModalConvStyle.MM_INTERLEAF
    pre_query_prompt: str = None
    post_query_prompt: str = None
    answer_prompt: str = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if isinstance(self.sep, str):
            self.sep = [self.sep for _ in self.roles]

    def get_prompt(self):
        """Render the system text and all messages into one prompt string.

        Each message is written as ``role + message + separator``. The last
        message gets no separator when it belongs to the last role. Otherwise
        the last role's tag is appended so the model continues from there.
        """
        # if only one sep given, then both sep are the sames
        sep = [self.sep for _ in self.roles] if isinstance(
            self.sep, str) else self.sep
        sep = dict(zip(self.roles, sep))
        ret = self.system + sep[self.roles[0]] if self.system != "" else ""
        last = len(self.messages) - 1
        for i, (role, message) in enumerate(self.messages):
            if i != last:
                ret += role + message + sep[role]
            elif role != self.roles[-1]:  # last role is not the model
                ret += role + message + sep[role] + self.roles[-1]
            else:
                ret += role + message
        return ret

    def user_query(self, query=None, pre_query_prompt=None, post_query_prompt=None, is_mm=False, num_mm_token=1):
        """Append a user turn, optionally wrapped in prompts and a multimodal token.

        With ``is_mm`` and ``MM_ALONE`` the token string is appended as its own
        message before the query. With ``MM_INTERLEAF`` it is prefixed to the
        query unless ``mm_token`` already occurs in it.
        """
        if post_query_prompt is not None:
            query = f"{query} {post_query_prompt}"

        if pre_query_prompt is not None:
            query = f"{pre_query_prompt} {query}"
        role = self.roles[0]
        # TODO: remove the num_mm_token and hack the self.mm_token outside
        if is_mm:
            mm_str = num_mm_token*self.mm_token[:-1] + self.mm_token[-1]
            if self.mm_style == MultiModalConvStyle.MM_ALONE:
                self._append_message(role, mm_str)
            elif self.mm_style == MultiModalConvStyle.MM_INTERLEAF:
                if self.mm_token not in query:
                    query = f'{mm_str} {query}'
        self._append_message(role, query)

    def assistant_response(self, response, pre_query_prompt=None, post_query_prompt=None):
        """Append an assistant turn, optionally wrapped in pre/post prompts."""
        if post_query_prompt is not None:
            response = f"{response} {post_query_prompt}"

        if pre_query_prompt is not None:
            # Prepend the *pre* prompt; the previous code inserted
            # post_query_prompt here by mistake.
            response = f"{pre_query_prompt} {response}"

        role = self.roles[1]
        self._append_message(role, response)

    def _append_message(self, role, message):
        """Append ``[role, message]`` to the history; ``None`` becomes an empty string."""
        message = '' if message is None else message
        self.messages.append([role, message])

    def copy(self):
        """Return a deep copy, so templates can be reused without sharing history."""
        return copy.deepcopy(self)
def video_answer(conv: Conversation, model, processor, tokenizer, img_grid, do_sample=True, max_new_tokens=200, num_beams=1, top_p=0.9,
                 temperature=1.0, print_res=False, **kwargs):
    """Generate a reply for the current conversation and one or more frame grids.

    The prompt is rendered from ``conv``, the images are preprocessed with
    ``processor``, and ``model.generate`` is run under inference mode. The
    decoded text replaces the content of the last message in ``conv``.

    Args:
        conv: Conversation whose prompt is sent to the model.
        model: Model exposing ``config``, ``device`` and ``generate``.
        processor: Image processor forwarded to ``process_images``.
        tokenizer: Tokenizer used for encoding the prompt and decoding output.
        img_grid: A single image or a list/tuple of images.
        do_sample, max_new_tokens, num_beams, top_p, temperature: Generation
            settings forwarded to ``model.generate``.
        print_res: When true, print the prompt and the decoded output.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        Tuple ``(outputs, conv)`` with the decoded text and the updated
        conversation.
    """
    prompt = conv.get_prompt()

    frames = img_grid if isinstance(img_grid, (list, tuple)) else [img_grid]
    first_frame_size = frames[0].size
    pixel_values = process_images(frames, processor, model.config)[0]

    token_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
    token_ids = token_ids.unsqueeze(0).to(
        device=model.device, non_blocking=True)

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is not None:
        pad_token_id = tokenizer.pad_token_id
    else:
        pad_token_id = tokenizer.eos_token_id

    with torch.inference_mode():
        generation_args = dict(
            images=pixel_values.to(
                dtype=torch.float16, device=model.device, non_blocking=True),
            image_sizes=[first_frame_size],
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            pad_token_id=pad_token_id,
            use_cache=True,
        )
        output_ids = model.generate(token_ids, **generation_args, **kwargs)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    outputs = decoded[0].strip()

    if print_res:  # debug usage
        print('### PROMPTING LM WITH: ', prompt)
        print('### LM OUTPUT TEXT: ', outputs)

    conv.messages[-1][1] = outputs
    return outputs, conv
Based on your observations, select the best option that accurately addresses the question.\n" +conv_eval_mvbench = Conversation( + system=SYSTEM_MVBENCH, + roles=("USER: ", "ASSISTANT:"), + messages=[], + sep=[" ", ""], + mm_token='\n', + mm_style=MultiModalConvStyle.MM_INTERLEAF, +) +conv_eval_mvbench_llama3 = Conversation( + system=f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{SYSTEM_MVBENCH}""", + roles=("<|start_header_id|>user<|end_header_id|>\n\n", + "<|start_header_id|>assistant<|end_header_id|>\n\n"), + messages=[], + sep_style=SeparatorStyle.MPT, + sep="<|eot_id|>", + mm_token='\n', + mm_style=MultiModalConvStyle.MM_INTERLEAF, +) + +SYSTEM_VBENCH = "Carefully watch the video and pay attention to the cause and sequence of events, the detail and movement of objects, and the action and pose of persons. Based on your observations, select the best option that accurately addresses the question.\n" +conv_eval_vbench = Conversation( + system=SYSTEM_VBENCH, + roles=("USER: ", "ASSISTANT:"), + messages=[], + sep=[" ", ""], + mm_token='\n', + mm_style=MultiModalConvStyle.MM_INTERLEAF, +) +conv_eval_vbench_llama3 = Conversation( + system=f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{SYSTEM_VBENCH}""", + roles=("<|start_header_id|>user<|end_header_id|>\n\n", + "<|start_header_id|>assistant<|end_header_id|>\n\n"), + messages=[], + sep_style=SeparatorStyle.MPT, + sep="<|eot_id|>", + mm_token='\n', + mm_style=MultiModalConvStyle.MM_INTERLEAF, +) + + +conv_templates = { + "plain": conv_plain_v1, + "eval_mvbench": conv_eval_mvbench, + "eval_mvbench_llama3": conv_eval_mvbench_llama3, + "eval_cvrrbench": conv_eval_cvrrbench, + "eval_vbench": conv_eval_vbench, + "eval_vbench_llama3": conv_eval_vbench_llama3 +} diff --git a/baselines/share4video/eval/video/mvbench_utils.py b/baselines/share4video/eval/video/mvbench_utils.py new file mode 100644 index 0000000..0724838 --- /dev/null +++ 
def check_ans(pred, gt):
    """Return whether the predicted option matches the ground-truth option.

    Only the first space-separated token of each (lower-cased) string is
    compared, e.g. ``"(a)"`` from ``"(A) some text"``. The prediction counts
    as correct when its option token (with dots removed) is contained in the
    ground-truth option token, or vice versa.

    Args:
        pred: Model output text.
        gt: Ground-truth answer, expected to start with an option such as
            ``"(A)"``.

    Returns:
        ``True`` on a match, ``False`` otherwise (including when the
        prediction contains no option letter or ``gt`` has no option token).
    """
    pred_option = pred.lower().split(' ')[0]
    gt_option = gt.lower().split(' ')[0]

    # The token is lower-cased, so only lower-case letters can ever match.
    if not any(c in pred_option for c in 'abcdefg'):
        print(f"model doesn't follow instructions: {pred}")
        return False

    # An empty ground-truth option would be "contained" in any prediction;
    # treat it as a miss. (The former ``gt_content[-1]`` check raised
    # IndexError for ground truths without trailing text.)
    if not gt_option:
        return False

    return pred_option.replace('.', '') in gt_option or gt_option in pred_option
def __init__(self, *args, **kwargs):
    """Load every task file listed in ``data_list_info`` into ``self.data_list``.

    Each entry of ``data_list_info`` maps a task name to a tuple of
    (json file name, media prefix, data type, has-bound flag). One record is
    created per sample in the JSON file.
    """
    super().__init__(*args, **kwargs)

    task_specs = self.data_list_info
    json_root = self.data_dir

    self.data_list = []
    for task_name, spec in task_specs.items():
        with open(os.path.join(json_root, spec[0]), 'r') as handle:
            samples = json.load(handle)
        self.data_list.extend(
            {
                'task_type': task_name,
                'prefix': spec[1],
                'data_type': spec[2],
                'bound': spec[3],
                'data': sample,
            }
            for sample in samples
        )

    # Maps the ``data_type`` of a record to the reader used in ``__getitem__``.
    self.decord_method = {
        'video': self.read_video,
        'gif': self.read_gif,
        'frame': self.read_frame,
    }
def qa_template(self, data):
    """Render a multiple-choice sample as a question prompt and a lettered answer.

    Args:
        data: Mapping with ``'question'``, ``'candidates'`` and ``'answer'``.

    Returns:
        Tuple ``(question, answer)``. ``question`` lists the candidates as
        ``(A) ...``, ``(B) ...`` lines; ``answer`` is the ground-truth text
        prefixed with the letter of the last candidate equal to it.
    """
    pieces = [f"Question: {data['question']}\n", "Options:\n"]
    gold = data['answer']
    gold_pos = -1  # stays -1 when no candidate equals the answer
    for pos, candidate in enumerate(data['candidates']):
        pieces.append(f"({chr(ord('A') + pos)}) {candidate}\n")
        if candidate == gold:
            gold_pos = pos
    question = "".join(pieces).rstrip()
    answer = f"({chr(ord('A') + gold_pos)}) {gold}"
    return question, answer
def save_results(result_list, save_path):
    """Score predictions per task split and write the result files.

    Writes ``all_results.json`` (per-split ``[correct, total]`` counts plus
    the raw results) and ``upload_leaderboard.json`` (per-split accuracy in
    percent plus the overall ``'Avg'``) under ``save_path``.

    Args:
        result_list: Iterable of dicts with ``'task_split'``, ``'pred'`` and
            ``'gt'`` keys.
        save_path: Directory passed to ``dump_json``.
    """
    acc_dict = {}
    for res in result_list:
        stats = acc_dict.setdefault(res['task_split'], [0, 0])  # correct, total
        stats[1] += 1
        if check_ans(pred=res['pred'], gt=res['gt']):
            stats[0] += 1

    final_res = {split: hit / seen * 100
                 for split, (hit, seen) in acc_dict.items()}

    # Totals are derived once from the per-split counts; the previous code
    # accumulated them both per sample and per split (doubling both).
    correct = sum(hit for hit, _ in acc_dict.values())
    total = sum(seen for _, seen in acc_dict.values())
    # Avoid ZeroDivisionError when there is nothing to score.
    final_res['Avg'] = correct / total * 100 if total else 0.0

    all_results = {
        "acc_dict": acc_dict,
        "result_list": result_list
    }
    dump_json(all_results, save_path, 'all_results.json')
    dump_json(final_res, save_path, 'upload_leaderboard.json')
def __init__(self, *args, **kwargs):
    """Load every QA file listed in ``data_list_info`` into ``self.data_list``.

    Each entry of ``data_list_info`` maps a task name to a tuple whose first
    element is the JSON file name under ``data_dir``.
    """
    super().__init__(*args, **kwargs)

    task_specs = self.data_list_info
    json_root = self.data_dir

    self.data_list = []
    for task_name, spec in task_specs.items():
        with open(os.path.join(json_root, spec[0]), 'r') as handle:
            samples = json.load(handle)
        self.data_list.extend(
            {'task_type': task_name, 'data': sample} for sample in samples
        )

    # All samples of this benchmark are read with the video reader.
    self.decord_method = self.read_video
def select_best_resolution(original_size, possible_resolutions):
    """Pick the candidate resolution that best fits an image.

    A candidate wins when it keeps more of the image's pixels after an
    aspect-preserving fit (capped at the original pixel count); ties are
    broken by the smaller amount of unused canvas area.

    Args:
        original_size (tuple): Image size as (width, height).
        possible_resolutions (list): Candidates as [(width, height), ...].

    Returns:
        tuple: The chosen (width, height), or ``None`` if no candidate
        qualifies.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    chosen = None
    best_effective = 0
    least_waste = float('inf')

    for cand_w, cand_h in possible_resolutions:
        # Largest uniform scale at which the image still fits the candidate.
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * ratio) * int(src_h * ratio)
        effective = min(fitted_area, src_area)
        waste = cand_w * cand_h - effective

        improves = effective > best_effective
        ties_with_less_waste = effective == best_effective and waste < least_waste
        if improves or ties_with_less_waste:
            chosen = (cand_w, cand_h)
            best_effective = effective
            least_waste = waste

    return chosen
def divide_to_patches(image, patch_size):
    """Cut an image into square patches, row by row.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): Edge length of each patch, also used as the stride.

    Returns:
        list: The cropped patches, ordered top-to-bottom then left-to-right.
    """
    width, height = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
def expand2square(pil_img, background_color):
    """Pad an image to a square canvas, centring it along the shorter axis.

    Args:
        pil_img: Image to pad; returned unchanged when already square.
        background_color: Fill colour for the new canvas.

    Returns:
        The original image if it is square, otherwise a new square image of
        the same mode with the original pasted in the centre.
    """
    w, h = pil_img.size
    if w == h:
        return pil_img

    side = max(w, h)
    # Landscape images are shifted down, portrait images are shifted right.
    offset = (0, (w - h) // 2) if w > h else ((h - w) // 2, 0)

    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, offset)
    return canvas
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """Tokenize a prompt, replacing each ``<image>`` placeholder with a sentinel id.

    The prompt is split on the literal ``<image>`` placeholder, each text
    chunk is tokenized separately, and ``image_token_index`` is inserted
    between consecutive chunks. A leading BOS token, if the tokenizer emits
    one, is kept once at the start and dropped from the other chunks.

    Args:
        prompt: Prompt text, possibly containing ``<image>`` placeholders.
        tokenizer: Callable tokenizer returning an object with ``input_ids``
            and exposing ``bos_token_id``.
        image_token_index: Id inserted for every placeholder.
        return_tensors: ``None`` for a Python list, ``'pt'`` for a
            ``torch.long`` tensor.

    Returns:
        List of token ids, or a 1-D tensor when ``return_tensors == 'pt'``.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    # Fixed: the separator was an empty string, which makes ``str.split``
    # raise ``ValueError: empty separator``.
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        # Interleave ``sep`` between the elements of X (no trailing separator).
        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    # The separator is repeated ``offset + 1`` times so that slicing off
    # ``offset`` leading items (the BOS position) leaves exactly one sentinel.
    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids
# Best-effort import of the model classes: the package must stay importable
# even when an optional backend is missing or incompatible.
try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
except Exception as exc:
    # ``Exception`` (not a bare ``except``) so KeyboardInterrupt/SystemExit
    # still propagate; the warning records why the classes are unavailable
    # instead of failing later with an unexplained NameError.
    import warnings

    warnings.warn(f"Could not import LLaVA model classes: {exc!r}")
def apply_delta(base_model_path, target_model_path, delta_path):
    """Add base-model weights to a delta checkpoint and save the result.

    Every parameter of the delta model that also exists in the base model is
    incremented in place by the base weight. Parameters that exist only in
    the delta must be the multimodal projector; shape mismatches are only
    allowed for the embedding and LM head, where the base weight is added to
    the overlapping top-left slice.

    Args:
        base_model_path: Path or hub id of the base language model.
        target_model_path: Directory where the merged model and tokenizer
            are saved.
        delta_path: Path or hub id of the delta checkpoint.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    # Build the base state dict once; calling ``state_dict()`` per parameter
    # rebuilds the whole mapping on every access.
    base_state = base.state_dict()
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base_state:
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        bparam = base_state[name]
        if param.data.shape == bparam.shape:
            param.data += bparam
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {bparam.shape}'
            # The delta has extra rows/columns (e.g. added tokens); only the
            # region shared with the base is updated.
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, lora_alpha=None, **kwargs):
    """Load a tokenizer, model and (for multimodal models) image processor.

    A model is treated as multimodal when ``model_name`` contains ``llava``
    or ``sharegpt4video`` (case-insensitive); otherwise it is loaded as a
    plain causal language model and no image processor is returned.

    Args:
        model_path: Path or hub id of the checkpoint (or LoRA/projector
            weights when ``model_base`` is given).
        model_base: Optional base model the weights in ``model_path`` are
            applied on top of.
        model_name: Name used to select the loading strategy (``lora``,
            ``mpt``, ``mistral`` substrings are recognised).
        load_8bit: Load with 8-bit quantization.
        load_4bit: Load with 4-bit NF4 quantization (ignored if ``load_8bit``).
        device_map: Device map forwarded to ``from_pretrained``.
        device: Target device; anything other than ``"cuda"`` overrides
            ``device_map``.
        use_flash_attn: Request the ``flash_attention_2`` implementation.
        lora_alpha: Optional LoRA alpha override forwarded to PEFT.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        Tuple ``(tokenizer, model, image_processor, context_len)``;
        ``image_processor`` is ``None`` for non-multimodal models.
    """
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    lowered_name = model_name.lower()
    # Fixed: ``'llava' or 'sharegpt4video' in name`` is always truthy because
    # ``or`` binds looser than ``in``; test each substring explicitly.
    is_multimodal = 'llava' in lowered_name or 'sharegpt4video' in lowered_name

    if is_multimodal:
        # Load LLaVA model
        if 'lora' in lowered_name and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in lowered_name and model_base is not None:
            from llava.model.language_model.llava_llama import LlavaConfig
            lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(
                model_base, use_fast=False, model_max_length=lora_cfg_pretrained.tokenizer_model_max_length)
            print('Loading LLaVA from base model...')
            model = LlavaLlamaForCausalLM.from_pretrained(
                model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(
                    token_num, token_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(
                    token_num, token_dim, device=model.device, dtype=model.dtype))
            print('Loading additional LLaVA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(
                    model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download

                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(
                    model_path, 'non_lora_trainables.bin')
            # Strip the wrapper prefixes so the keys match the bare model.
            non_lora_trainables = {(k[11:] if k.startswith(
                'base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith(
                    'model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            if lora_alpha is not None:
                print("Lora Scaling:", lora_alpha/128)
                model = PeftModel.from_pretrained(
                    model, model_path, lora_alpha=lora_alpha, torch_device='cpu')
            else:
                model = PeftModel.from_pretrained(model, model_path, torch_device='cpu')
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA from base model...')
            if 'mpt' in lowered_name:
                if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
                    shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(
                        model_path, 'configuration_mpt.py'))
                tokenizer = AutoTokenizer.from_pretrained(
                    model_base, use_fast=True)
                cfg_pretrained = AutoConfig.from_pretrained(
                    model_path, trust_remote_code=True)
                model = LlavaMptForCausalLM.from_pretrained(
                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = LlavaLlamaForCausalLM.from_pretrained(
                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            mm_projector_weights = torch.load(os.path.join(
                model_path, 'mm_projector.bin'), map_location='cpu')
            mm_projector_weights = {k: v.to(torch.float16)
                                    for k, v in mm_projector_weights.items()}
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            if 'mpt' in lowered_name:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path, use_fast=True)
                model = LlavaMptForCausalLM.from_pretrained(
                    model_path, low_cpu_mem_usage=True, **kwargs)
            elif 'mistral' in lowered_name:
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                model = LlavaMistralForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
            else:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path, use_fast=False)
                model = LlavaLlamaForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(
                model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(
                model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print("Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            if 'mpt' in lowered_name:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path, use_fast=True)
                model = AutoModelForCausalLM.from_pretrained(
                    model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path, use_fast=False)
                model = AutoModelForCausalLM.from_pretrained(
                    model_path, low_cpu_mem_usage=True, **kwargs)

    image_processor = None

    if is_multimodal:
        mm_use_im_start_end = getattr(
            model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(
            model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens(
                [DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens(
                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model(device_map=device_map)
        if device_map != 'auto':
            vision_tower.to(device=device_map, dtype=torch.float16)
        image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
#    See the License for the specific language governing permissions and
#    limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn

from transformers import AutoConfig, AutoModelForCausalLM, \
                         LlamaConfig, LlamaModel, LlamaForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput

from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaConfig(LlamaConfig):
    """Llama configuration tagged with the ``llava_llama`` model type."""
    model_type = "llava_llama"


class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    """Llama backbone combined with the LLaVA vision mixin (``LlavaMetaModel``)."""
    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        super(LlavaLlamaModel, self).__init__(config)


class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    """Causal-LM head on top of ``LlavaLlamaModel`` that accepts image inputs."""
    config_class = LlavaConfig

    def __init__(self, config):
        # Start the MRO lookup *after* LlamaForCausalLM so that its own
        # __init__ is skipped; the backbone and head are built below instead.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaLlamaModel`` backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal embeddings when needed.

        When ``inputs_embeds`` is not supplied, the text/image inputs are first
        passed through ``prepare_inputs_labels_for_multimodal``; its outputs
        replace the corresponding arguments before the parent ``forward`` runs.
        Extra ``**kwargs`` are accepted but not forwarded to the parent.
        """

        if inputs_embeds is None:
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids plus optional images.

        ``position_ids`` and ``attention_mask`` are taken out of ``kwargs``;
        passing ``inputs_embeds`` raises ``NotImplementedError``. With images,
        embeddings come from ``prepare_inputs_labels_for_multimodal``; without,
        from the backbone's ``embed_tokens``. The parent ``generate`` is then
        called with ``inputs_embeds`` rather than token ids.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        """Extend the parent's per-step inputs with ``images`` / ``image_sizes``.

        Both keys are removed from ``kwargs`` before delegating to the parent
        and are added back to the returned dict only when they are not None.
        """
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            inputs['images'] = images
        if image_sizes is not None:
            inputs['image_sizes'] = image_sizes
        return inputs

# Register the config and model with the transformers Auto* factories under
# the "llava_llama" model type.
AutoConfig.register("llava_llama", LlavaConfig)
AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
# File: baselines/share4video/model/language_model/llava_mistral.py
#    Copyright 2023 Haotian Liu
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import AutoConfig, AutoModelForCausalLM, \
                         MistralConfig, MistralModel, MistralForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput

from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaMistralConfig(MistralConfig):
    """Mistral configuration tagged with the ``llava_mistral`` model type."""
    model_type = "llava_mistral"


class LlavaMistralModel(LlavaMetaModel, MistralModel):
    """Mistral backbone combined with the LLaVA vision mixin."""
    config_class = LlavaMistralConfig

    def __init__(self, config: MistralConfig):
        super(LlavaMistralModel, self).__init__(config)


class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    """Causal-LM head on top of ``LlavaMistralModel`` that accepts image inputs."""
    config_class = LlavaMistralConfig

    def __init__(self, config):
        # Start the MRO lookup after MistralForCausalLM so its __init__ is
        # skipped; the multimodal backbone and the head are built here instead.
        super(MistralForCausalLM, self).__init__(config)
        self.model = LlavaMistralModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaMistralModel`` backbone."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, building multimodal embeddings when needed.

        When ``inputs_embeds`` is not supplied, the text/image inputs are first
        passed through ``prepare_inputs_labels_for_multimodal``; its outputs
        replace the corresponding arguments before the parent ``forward`` runs.

        ``**kwargs`` mirrors the Llama variant of this class: any additional
        keys produced by the generation loop are accepted and ignored instead
        of raising ``TypeError`` for an unexpected keyword argument.
        """

        if inputs_embeds is None:
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids plus optional images.

        ``position_ids`` and ``attention_mask`` are taken out of ``kwargs``;
        passing ``inputs_embeds`` raises ``NotImplementedError``. With images,
        embeddings come from ``prepare_inputs_labels_for_multimodal``; without,
        from the backbone's ``embed_tokens``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        """Extend the parent's per-step inputs with ``images`` / ``image_sizes``.

        Both keys are removed from ``kwargs`` before delegating to the parent
        and are added back to the returned dict only when they are not None.
        """
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            inputs['images'] = images
        if image_sizes is not None:
            inputs['image_sizes'] = image_sizes
        return inputs

# Register the config and model with the transformers Auto* factories under
# the "llava_mistral" model type.
AutoConfig.register("llava_mistral", LlavaMistralConfig)
AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM)
# File: baselines/share4video/model/language_model/llava_mpt.py
#    Copyright 2023 Haotian Liu
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.


from typing import Optional, Tuple

import torch

from transformers import AutoConfig, AutoModelForCausalLM, \
                         MptConfig, MptForCausalLM, MptModel
# Relative import, matching the Llama and Mistral variants, so the mixins come
# from this vendored package rather than from a separately installed `llava`.
from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaMptConfig(MptConfig):
    """MPT configuration tagged with the ``llava_mpt`` model type."""
    model_type = "llava_mpt"


class LlavaMptModel(LlavaMetaModel, MptModel):
    """MPT backbone combined with the LLaVA vision mixin."""
    config_class = LlavaMptConfig

    def __init__(self, config: MptConfig):
        # The LLaVA mixin reads `config.hidden_size`; MPT names it `d_model`.
        config.hidden_size = config.d_model
        super(LlavaMptModel, self).__init__(config)

    def embed_tokens(self, x):
        """Embed token ids via the MPT word-embedding table (``wte``)."""
        return self.wte(x)


class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM):
    """Causal-LM head on top of ``LlavaMptModel`` that accepts image inputs."""
    config_class = LlavaMptConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        # Start the MRO lookup after MptForCausalLM so its __init__ is skipped;
        # the multimodal backbone and the head are built here instead.
        super(MptForCausalLM, self).__init__(config)

        self.transformer = LlavaMptModel(config)
        self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaMptModel`` backbone."""
        return self.transformer

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, LlavaMptModel):
            module.gradient_checkpointing = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        images=None):
        """Run the language model, building multimodal embeddings when needed.

        ``prepare_inputs_labels_for_multimodal`` takes
        ``(input_ids, position_ids, attention_mask, past_key_values, labels,
        images, image_sizes=None)`` and returns six values. This method has no
        ``position_ids`` parameter, so ``None`` is passed in that slot and the
        returned position ids are discarded.
        """

        if inputs_embeds is None:
            (
                input_ids,
                _,  # position_ids: not forwarded to the parent forward below
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                None,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        return super().forward(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Extend the parent's per-step inputs with ``images`` (possibly None)."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        _inputs['images'] = images
        return _inputs


# Register the config and model with the transformers Auto* factories under
# the "llava_mpt" model type.
AutoConfig.register("llava_mpt", LlavaMptConfig)
AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM)
+ + +from abc import ABC, abstractmethod + +import torch +import torch.nn as nn + +from .multimodal_encoder.builder import build_vision_tower +from .multimodal_projector.builder import build_vision_projector + +from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +from llava.mm_utils import get_anyres_image_grid_shape + + +class LlavaMetaModel: + + def __init__(self, config): + super(LlavaMetaModel, self).__init__(config) + if hasattr(config, "mm_vision_tower"): + self.vision_tower = build_vision_tower(config, delay_load=True) + self.mm_projector = build_vision_projector(config) + + if 'unpad' in getattr(config, 'mm_patch_merge_type', ''): + self.image_newline = nn.Parameter( + torch.empty(config.hidden_size, dtype=self.dtype) + ) + + def get_vision_tower(self): + vision_tower = getattr(self, 'vision_tower', None) + if type(vision_tower) is list: + vision_tower = vision_tower[0] + return vision_tower + + def initialize_vision_modules(self, model_args, fsdp=None): + vision_tower = model_args.vision_tower + mm_vision_select_layer = model_args.mm_vision_select_layer + mm_vision_select_feature = model_args.mm_vision_select_feature + pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter + mm_patch_merge_type = model_args.mm_patch_merge_type + + self.config.mm_vision_tower = vision_tower + + if self.get_vision_tower() is None: + vision_tower = build_vision_tower(model_args) + + if fsdp is not None and len(fsdp) > 0: + self.vision_tower = [vision_tower] + else: + self.vision_tower = vision_tower + else: + if fsdp is not None and len(fsdp) > 0: + vision_tower = self.vision_tower[0] + else: + vision_tower = self.vision_tower + vision_tower.load_model() + + self.config.use_mm_proj = True + self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear') + self.config.mm_hidden_size = vision_tower.hidden_size + self.config.mm_vision_select_layer = mm_vision_select_layer 
+ self.config.mm_vision_select_feature = mm_vision_select_feature + self.config.mm_patch_merge_type = mm_patch_merge_type + + if getattr(self, 'mm_projector', None) is None: + self.mm_projector = build_vision_projector(self.config) + + if 'unpad' in mm_patch_merge_type: + embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype)) + self.image_newline = nn.Parameter( + torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std + ) + else: + # In case it is frozen by LoRA + for p in self.mm_projector.parameters(): + p.requires_grad = True + + if pretrain_mm_mlp_adapter is not None: + mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu') + def get_w(weights, keyword): + return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k} + + self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector')) + + +def unpad_image(tensor, original_size): + """ + Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of PIL image (width, height). + + Returns: + torch.Tensor: The unpadded image tensor. 
+ """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor + + +class LlavaMetaForCausalLM(ABC): + + @abstractmethod + def get_model(self): + pass + + def get_vision_tower(self): + return self.get_model().get_vision_tower() + + def encode_images(self, images): + image_features = self.get_model().get_vision_tower()(images) + image_features = self.get_model().mm_projector(image_features) + return image_features + + def prepare_inputs_labels_for_multimodal( + self, input_ids, position_ids, attention_mask, past_key_values, labels, + images, image_sizes=None + ): + vision_tower = self.get_vision_tower() + if vision_tower is None or images is None or input_ids.shape[1] == 1: + return input_ids, position_ids, attention_mask, past_key_values, None, labels + + if type(images) is list or images.ndim == 5: + if type(images) is list: + images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] + concat_images = torch.cat([image for image in images], dim=0) + image_features = self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in images] + image_features = torch.split(image_features, split_sizes, dim=0) + mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat') + image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square') + if mm_patch_merge_type == 
'flat': + image_features = [x.flatten(0, 1) for x in image_features] + elif mm_patch_merge_type.startswith('spatial'): + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.get_vision_tower().num_patches_per_side + assert height * width == base_image_feature.shape[0] + if image_aspect_ratio == 'anyres': + num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + raise NotImplementedError + if 'unpad' in mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = unpad_image(image_feature, image_sizes[image_idx]) + image_feature = torch.cat(( + image_feature, + self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) + ), dim=-1) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if 'unpad' in mm_patch_merge_type: + image_feature = torch.cat(( + image_feature, + self.model.image_newline[None].to(image_feature.device) + ), dim=0) + new_image_features.append(image_feature) + image_features = new_image_features + else: + raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") + else: + image_features = self.encode_images(images) + + # TODO: image start / end is not implemented here to support pretraining. 
+ if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + raise NotImplementedError + + # Let's just add dummy tensors if they do not exist, + # it is a headache to deal with None all the time. + # But it is not ideal, and if you have a better idea, + # please open an issue / submit a PR, thanks. + _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, IGNORE_INDEX) + + # remove the padding using attention_mask -- FIXME + _input_ids = input_ids + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum() + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]] + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]]) + 
cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = image_features[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype)) + + cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds] + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + # Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = 
cur_new_embed.shape[0] + if getattr(self.config, 'tokenizer_padding_side', 'right') == "left": + new_input_embeds_padded.append(torch.cat(( + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), + cur_new_embed + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + else: + new_input_embeds_padded.append(torch.cat(( + cur_new_embed, + torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device) + ), dim=0)) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) + + if _labels is None: + new_labels = None + else: + new_labels = new_labels_padded + + if _attention_mask is None: + attention_mask = None + else: + attention_mask = attention_mask.to(dtype=_attention_mask.dtype) + + if _position_ids is None: + position_ids = None + + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels + + def initialize_vision_tokenizer(self, model_args, tokenizer): + if model_args.mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if model_args.mm_use_im_start_end: + num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + self.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = self.get_input_embeddings().weight.data + output_embeddings = self.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, 
keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = True + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False + + if model_args.pretrain_mm_mlp_adapter: + mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu') + embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight'] + assert num_new_tokens == 2 + if input_embeddings.shape == embed_tokens_weight.shape: + input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:] + elif embed_tokens_weight.shape[0] == num_new_tokens: + input_embeddings[-num_new_tokens:] = embed_tokens_weight + else: + raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. 
Numer of new tokens: {num_new_tokens}.") + elif model_args.mm_use_im_patch_token: + if model_args.tune_mm_mlp_adapter: + for p in self.get_input_embeddings().parameters(): + p.requires_grad = False + for p in self.get_output_embeddings().parameters(): + p.requires_grad = False diff --git a/baselines/share4video/model/make_delta.py b/baselines/share4video/model/make_delta.py new file mode 100644 index 0000000..4ae55d5 --- /dev/null +++ b/baselines/share4video/model/make_delta.py @@ -0,0 +1,52 @@ +""" +Usage: +python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta +""" +import argparse + +import torch +from tqdm import tqdm +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model.utils import auto_upgrade + + +def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id): + print("Loading base model") + base = AutoModelForCausalLM.from_pretrained( + base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Loading target model") + auto_upgrade(target_model_path) + target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + + print("Calculating delta") + for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): + if name not in base.state_dict(): + assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' + continue + if param.data.shape == base.state_dict()[name].shape: + param.data -= base.state_dict()[name] + else: + assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' + bparam = base.state_dict()[name] + param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam + + print("Saving delta") + if hub_repo_id: + kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} + else: + 
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the vision tower named by ``vision_tower_cfg``.

    The tower identifier is read from ``vision_tower_cfg.mm_vision_tower``,
    falling back to ``vision_tower_cfg.vision_tower`` when the former
    attribute is absent.

    Dispatch rules:
      * identifiers containing ``siglip`` (case-insensitive) build a
        ``SigLipVisionTower`` when they are an existing path or start with
        ``google`` / ``bczhou``;
      * every other identifier builds a CLIP tower when it is an existing
        path, starts with ``openai`` / ``laion`` or contains ``ShareGPT4V``;
        ``vision_tower_cfg.s2`` selects ``CLIPVisionTowerS2`` over
        ``CLIPVisionTower``.

    Args:
        vision_tower_cfg: Config object carrying the attributes above.
        **kwargs: Forwarded unchanged to the tower constructor.

    Returns:
        The constructed vision tower module.

    Raises:
        ValueError: If no tower is configured or the identifier matches none
            of the rules above.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower',
                           getattr(vision_tower_cfg, 'vision_tower', None))
    if vision_tower is None:
        # os.path.exists(None) would raise TypeError; report the missing
        # configuration with the same error this function uses for any other
        # unusable identifier.
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    is_absolute_path_exists = os.path.exists(vision_tower)
    use_s2 = getattr(vision_tower_cfg, 's2', False)

    if 'siglip' in vision_tower.lower():
        if is_absolute_path_exists or vision_tower.startswith(('google', 'bczhou')):
            return SigLipVisionTower(vision_tower, vision_tower_cfg, **kwargs)
    elif (is_absolute_path_exists
          or vision_tower.startswith(('openai', 'laion'))
          or 'ShareGPT4V' in vision_tower):
        tower_cls = CLIPVisionTowerS2 if use_s2 else CLIPVisionTower
        return tower_cls(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
class CLIPVisionTower(nn.Module):
    """CLIP vision encoder wrapper that exposes hidden states of one layer.

    Args:
        vision_tower: Name or path passed to the CLIP ``from_pretrained`` calls.
        args: Config object; ``mm_vision_select_layer`` is required,
            ``mm_vision_select_feature`` (default ``'patch'``) and
            ``unfreeze_mm_vision_tower`` are optional.
        delay_load: When true (and the tower is not marked as unfrozen) only
            the CLIP config is fetched and kept in ``cfg_only``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # `unfreeze_mm_vision_tower` forces loading even if delay_load is set.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load the image processor and the CLIP weights (once) and freeze them."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return
        print(f'Load vision tower from {self.vision_tower_name}')
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick ``hidden_states[self.select_layer]`` and apply the feature mode.

        ``'patch'`` drops index 0 along dim 1 (presumably the CLS token);
        ``'cls_patch'`` keeps every position.

        Raises:
            ValueError: For any other ``select_feature`` value.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixel_values, out_dtype):
        """Run the tower on a batched tensor and cast features to ``out_dtype``."""
        outputs = self.vision_tower(
            pixel_values.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        return self.feature_select(outputs).to(out_dtype)

    # @torch.no_grad()
    def forward(self, images):
        """Encode ``images``.

        A list is encoded image by image (a batch dimension is added to each
        entry) and yields a list of feature tensors; any other input is
        treated as one batched tensor. Features are cast back to the dtype of
        the corresponding input.
        """
        if isinstance(images, list):
            return [self._encode(image.unsqueeze(0), image.dtype) for image in images]
        return self._encode(images, images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the standalone config exists.
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2


class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower evaluated at several image scales through ``s2wrapper``.

    ``args.s2_scales`` is a comma separated list of image sizes (default
    ``'336,672,1008'``). The smallest scale is used as ``max_split_size`` and
    the largest one as the preprocessing resize/crop size.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        # The scale attributes must exist BEFORE the base constructor runs:
        # it may call self.load_model(), which is overridden below and reads
        # self.s2_image_size. Plain Python values can be assigned to an
        # nn.Module prior to nn.Module.__init__().
        scales = sorted(int(size) for size in getattr(args, 's2_scales', '336,672,1008').split(','))
        self.s2_scales = scales
        self.s2_split_size = scales[0]
        self.s2_image_size = scales[-1]

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError as err:
            raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git') from err
        self.multiscale_forward = multiscale_forward

        # load_model() (when triggered here) already switches the image
        # processor to the largest scale, so no extra patching is needed.
        super().__init__(vision_tower, args, delay_load)

    def load_model(self, device_map=None):
        """Load processor and weights, then resize/crop to the largest scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

        self.is_loaded = True

    # @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale encoding callback handed to ``multiscale_forward``."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        return self.feature_select(image_forward_outs).to(images.dtype)

    def _encode_multiscale(self, pixel_values):
        """Run ``multiscale_forward`` over all configured scales."""
        return self.multiscale_forward(
            self.forward_feature,
            pixel_values,
            img_sizes=self.s2_scales,
            max_split_size=self.s2_split_size,
        )

    # @torch.no_grad()
    def forward(self, images):
        """Multi-scale counterpart of ``CLIPVisionTower.forward``."""
        if isinstance(images, list):
            return [self._encode_multiscale(image.unsqueeze(0)) for image in images]
        return self._encode_multiscale(images)

    @property
    def hidden_size(self):
        # Reported width is the base width times the number of scales.
        return self.config.hidden_size * len(self.s2_scales)
b/baselines/share4video/model/multimodal_encoder/siglip_encoder.py new file mode 100644 index 0000000..2ac5fd2 --- /dev/null +++ b/baselines/share4video/model/multimodal_encoder/siglip_encoder.py @@ -0,0 +1,654 @@ +''' +# Adapted from https://huggingface.co/MILVLG/imp-v1-3b/blob/main/vision_encoder.py +''' + +from typing import Optional, Tuple, Union, Dict +from dataclasses import dataclass +from functools import partial, reduce +from PIL import Image +import torch +import torch.utils.checkpoint +from torch import nn +import os +from transformers.image_processing_utils import BatchFeature, get_size_dict +from transformers.image_transforms import ( + convert_to_rgb, normalize, rescale, resize, to_channel_dimension_format, ) +from transformers.image_utils import ( + ChannelDimension, PILImageResampling, to_numpy_array, ) +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel +from transformers import PretrainedConfig +from transformers.utils import ModelOutput +from transformers.image_processing_utils import BaseImageProcessor + + +class SigLipImageProcessor(BaseImageProcessor): + def __init__(self, + image_mean=(0.5, 0.5, 0.5), + image_std=(0.5, 0.5, 0.5), + size=(384, 384), + crop_size: Dict[str, int] = None, + resample=PILImageResampling.BICUBIC, + rescale_factor=1 / 255, + data_format=ChannelDimension.FIRST): + crop_size = crop_size if crop_size is not None else { + "height": 384, "width": 384} + crop_size = get_size_dict( + crop_size, default_to_square=True, param_name="crop_size") + + self.image_mean = image_mean + self.image_std = image_std + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.data_format = data_format + self.crop_size = crop_size + + def preprocess(self, images, return_tensors): + if isinstance(images, Image.Image): + images = [images] + else: + assert isinstance(images, 
class SigLipVisionConfig(PretrainedConfig):
    """Hyper-parameters of the SigLip vision transformer.

    Every constructor argument is stored as an attribute of the same name;
    extra keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=1152,
        image_mean=(0.5, 0.5, 0.5),
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        num_channels=3,
        image_size=384,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.image_mean = image_mean

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """Build the config from a checkpoint name or path.

        When the stored config has ``model_type == "siglip"`` only its
        ``vision_config`` sub-dictionary is used. A mismatch between the
        stored model type and ``cls.model_type`` is logged as a warning and
        loading proceeds.
        """
        # Function-scope import: no module-level logger is defined, and the
        # warning below previously raised NameError on `logger`.
        import logging

        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from SigLipConfig
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logging.getLogger(__name__).warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
class SigLipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    ``config`` must provide ``hidden_size``, ``num_attention_heads`` and
    ``attention_dropout``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by
            ``num_attention_heads``.
    """

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim ** -0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Self-attention over ``hidden_states`` (Batch x Time x Channel).

        Args:
            hidden_states: Tensor of shape ``(batch, seq_len, embed_dim)``.
            attention_mask: Optional additive mask of shape
                ``(batch, 1, seq_len, seq_len)``; it is added to the scaled
                scores before the softmax.
            output_attentions: Accepted for interface compatibility; the
                attention weights are returned regardless of its value.

        Returns:
            ``(attn_output, attn_weights)`` with shapes
            ``(batch, seq_len, embed_dim)`` and
            ``(batch, num_heads, seq_len, seq_len)``. The weights are the
            values after dropout.

        Raises:
            ValueError: If the scores, the mask or the attention output do
                not have the expected shape.
        """
        batch_size, q_len, _ = hidden_states.size()

        def split_heads(projected: torch.Tensor) -> torch.Tensor:
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        query_states = split_heads(self.q_proj(hidden_states))
        key_states = split_heads(self.k_proj(hidden_states))
        value_states = split_heads(self.v_proj(hidden_states))

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)) * self.scale

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(
            attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip
class SigLipEncoder(nn.Module):
    """
    Stack of `config.num_hidden_layers` [`SigLipEncoderLayer`] modules applied one after another.

    Args:
        config: SigLipVisionConfig
    """

    def __init__(self, config: SigLipVisionConfig):
        super().__init__()
        self.config = config
        layer_stack = [SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(layer_stack)
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Run every layer over `inputs_embeds`.

        Args:
            inputs_embeds: Embedded input handed to the first layer.
            attention_mask (*optional*): Passed unchanged to each layer.
            output_attentions (`bool`, *optional*):
                Collect the second output of every layer. Defaults to `config.output_attentions`.
            output_hidden_states (`bool`, *optional*):
                Collect the input of every layer plus the final output. Defaults to
                `config.output_hidden_states`.
            return_dict (`bool`, *optional*):
                Return a `BaseModelOutput` instead of a plain tuple. Defaults to `config.use_return_dict`.

        Returns:
            A `BaseModelOutput`, or a tuple holding the final hidden state followed by whichever of the
            collected hidden states / attentions were requested.
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Lists while iterating; converted to tuples (or left as None) below.
        state_log = [] if output_hidden_states else None
        attention_log = [] if output_attentions else None

        hidden_states = inputs_embeds
        for layer in self.layers:
            if state_log is not None:
                state_log.append(hidden_states)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]
            if attention_log is not None:
                attention_log.append(layer_outputs[1])

        if state_log is not None:
            state_log.append(hidden_states)

        encoder_states = None if state_log is None else tuple(state_log)
        all_attentions = None if attention_log is None else tuple(attention_log)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
class SigLipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling.

    A single learned probe vector attends over the input sequence; the
    attended vector then goes through LayerNorm + MLP with a residual
    connection.
    """

    def __init__(self, config: SigLipVisionConfig):
        super().__init__()

        width = config.hidden_size
        self.probe = nn.Parameter(torch.randn(1, 1, width))
        self.attention = torch.nn.MultiheadAttention(
            width, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.mlp = SigLipMLP(config)

    def forward(self, hidden_state):
        """Pool ``hidden_state`` (batch first) into one vector per batch item."""
        # One copy of the probe per batch element serves as the query.
        queries = self.probe.repeat(hidden_state.shape[0], 1, 1)
        pooled, _ = self.attention(queries, hidden_state, hidden_state)

        pooled = pooled + self.mlp(self.layernorm(pooled))

        # The query has length 1, so position 0 is the pooled vector.
        return pooled[:, 0]
class SigLipVisionTower(nn.Module):
    """SigLip vision encoder wrapper returning the last collected hidden state.

    Args:
        vision_tower: Name or path passed to ``SigLipVisionModel.from_pretrained``.
        vision_tower_cfg: Accepted for interface parity with the other
            towers; not read by this class.
        delay_load: When true the weights are not loaded in the constructor.
    """

    def __init__(self, vision_tower, vision_tower_cfg, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.config = SigLipVisionConfig()

        self.vision_tower_name = vision_tower

        self.image_processor = SigLipImageProcessor()

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = self.config

    def load_model(self, device_map=None):
        """Load the SigLip weights (once), freeze them and switch to eval mode."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(
                self.vision_tower_name))
            return

        self.vision_tower = SigLipVisionModel.from_pretrained(
            self.vision_tower_name, device_map=device_map)

        self.vision_tower.requires_grad_(False)
        self.vision_tower.eval()

        self.is_loaded = True

    def _encode(self, pixel_values, out_dtype):
        """Encode a batched tensor and return ``hidden_states[-1]`` in ``out_dtype``."""
        outputs = self.vision_tower(
            pixel_values.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        features = outputs.hidden_states[-1].to(out_dtype)
        # 729 == (384 // 14) ** 2, the patch count of the default
        # SigLipVisionConfig (image_size=384, patch_size=14).
        assert features.shape[-2] == 729
        return features

    # @torch.no_grad()
    def forward(self, images):
        """Encode ``images``.

        A list is encoded image by image (a batch dimension is added to each
        entry) and yields a list of feature tensors; any other input is
        treated as one batched tensor. Features are cast back to the dtype of
        the corresponding input.

        Raises:
            AssertionError: If the encoder does not produce 729 positions.
        """
        if type(images) is list:
            # The per-image check previously inspected the result *list*
            # (`image_features.shape`) and crashed with AttributeError; each
            # individual feature tensor is validated in `_encode` now.
            return [self._encode(image.unsqueeze(0), image.dtype) for image in images]
        return self._encode(images, images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        # dtype of the first parameter; None if the tower has no parameters.
        for p in self.vision_tower.parameters():
            return p.dtype

    @property
    def device(self):
        # device of the first parameter; None if the tower has no parameters.
        for p in self.vision_tower.parameters():
            return p.device

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2
class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP with a residual connection.

    The residual is taken from the *normalised* input, i.e. the block
    computes ``norm(x) + proj(norm(x))``.
    """

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        self.proj = nn.Sequential(
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels)
        )

    def forward(self, x):
        normed = self.pre_norm(x)
        return normed + self.proj(normed)


def build_vision_projector(config, delay_load=False, **kwargs):
    """Create the module that maps vision features to the LLM hidden size.

    ``config.mm_projector_type`` (default ``'linear'``) selects the module:

    * ``'linear'``      - one ``nn.Linear(mm_hidden_size, hidden_size)``;
    * ``'mlp<N>x_gelu'`` - that linear layer followed by ``N - 1`` pairs of
      ``GELU`` + ``nn.Linear(hidden_size, hidden_size)``;
    * ``'identity'``    - an ``IdentityMap``.

    ``delay_load`` and ``**kwargs`` are accepted but not used.

    Raises:
        ValueError: For any other projector type.
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    if kind == 'identity':
        return IdentityMap()

    if kind == 'linear':
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    depth_match = re.match(r'^mlp(\d+)x_gelu$', kind)
    if depth_match is None:
        raise ValueError(f'Unknown projector type: {kind}')

    depth = int(depth_match.group(1))
    layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
    for _ in range(depth - 1):
        layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
    return nn.Sequential(*layers)
[Y/N]") + if confirm.lower() in ["y", "yes"]: + print("Upgrading checkpoint...") + assert len(cfg.architectures) == 1 + setattr(cfg.__class__, "model_type", "llava") + cfg.architectures[0] = 'LlavaLlamaForCausalLM' + cfg.save_pretrained(config) + print("Checkpoint upgraded.") + else: + print("Checkpoint upgrade aborted.") + exit(1) diff --git a/baselines/share4video/serve/gradio_utils.py b/baselines/share4video/serve/gradio_utils.py new file mode 100644 index 0000000..0003094 --- /dev/null +++ b/baselines/share4video/serve/gradio_utils.py @@ -0,0 +1,185 @@ +import numpy as np +import torch +from decord import VideoReader, cpu +from PIL import Image + +from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX +from llava.mm_utils import (get_model_name_from_path, process_images, + tokenizer_image_token) +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init + +title_markdown = (""" +
+
+ + ShareGPT4Video🚀 + +
+
+

ShareGPT4Video: Improving Video Understanding and Generation with Better Captions

+
If you like our project, please give us a star ✨ on Github for the latest update.
+
[Project Page] [Code] [Paper] +
+
+""") + +block_css = """ +#buttons button { + min-width: min(120px,100%); +} +""" + + +learn_more_markdown = (""" +### License +The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation. +""") + + +def create_frame_grid(img_array, interval_width=50): + n, h, w, c = img_array.shape + grid_size = int(np.ceil(np.sqrt(n))) + + horizontal_band = np.ones((h, interval_width, c), + dtype=img_array.dtype) * 255 + vertical_band = np.ones((interval_width, w + (grid_size - 1) + * (w + interval_width), c), dtype=img_array.dtype) * 255 + + rows = [] + for i in range(grid_size): + row_frames = [] + for j in range(grid_size): + idx = i * grid_size + j + if idx < n: + frame = img_array[idx] + else: + frame = np.ones_like(img_array[0]) * 255 + if j > 0: + row_frames.append(horizontal_band) + row_frames.append(frame) + combined_row = np.concatenate(row_frames, axis=1) + if i > 0: + rows.append(vertical_band) + rows.append(combined_row) + + final_grid = np.concatenate(rows, axis=0) + return final_grid + + +def resize_image_grid(image, max_length=1920): + width, height = image.size + if max(width, height) > max_length: + if width > height: + scale = max_length / width + else: + scale = max_length / height + + new_width = int(width * scale) + new_height = int(height * scale) + + img_resized = image.resize((new_width, new_height), Image.BILINEAR) + else: + img_resized = image + return img_resized + + +def get_index(num_frames, num_segments): + seg_size = float(num_frames - 1) / num_segments + start = int(seg_size / 2) + offsets = np.array([ + start + int(np.round(seg_size * idx)) for idx in 
range(num_segments) + ]) + return offsets + + +def load_video(video_path, num_segments=8, return_msg=False, num_frames=4): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + num_frames = len(vr) + frame_indices = get_index(num_frames, num_segments) + img_array = vr.get_batch(frame_indices).asnumpy() + img_grid = create_frame_grid(img_array, 50) + img_grid = Image.fromarray(img_grid).convert("RGB") + img_grid = resize_image_grid(img_grid) + if return_msg: + fps = float(vr.get_avg_fps()) + sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) + # " " should be added in the start and end + msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." + return img_grid, msg + else: + return img_grid + + +def video_answer(prompt, model, processor, tokenizer, img_grid, do_sample=True, + max_new_tokens=200, num_beams=1, top_p=0.9, + temperature=1.0, print_res=False, **kwargs): + if not isinstance(img_grid, (list, tuple)): + img_grid = [img_grid] + image_size = img_grid[0].size + image_tensor = process_images(img_grid, processor, model.config)[0] + input_ids = tokenizer_image_token( + prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + input_ids = input_ids.unsqueeze(0).to( + device=model.device, non_blocking=True) + pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token is not None else tokenizer.eos_token_id + + with torch.inference_mode(): + output_ids = model.generate( + input_ids, + images=image_tensor.to( + dtype=torch.float16, device=model.device, non_blocking=True), + image_sizes=[image_size], + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + num_beams=num_beams, + max_new_tokens=max_new_tokens, + pad_token_id=pad_token_id, + use_cache=True, + **kwargs) + outputs = tokenizer.batch_decode( + output_ids, skip_special_tokens=True)[0].strip() + if print_res: # debug usage + print('### PROMPTING LM WITH: ', prompt) + print('### LM OUTPUT TEXT: ', outputs) + + return outputs + + +class Chat: + def 
__init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False, device='cuda', num_frames=16): + disable_torch_init() + model_name = get_model_name_from_path(model_path) + self.tokenizer, self.model, self.processor, context_len = load_pretrained_model( + model_path, model_base, model_name, + load_8bit, load_4bit, + device=device) + self.model.eval() + self.conv_mode = conv_mode + self.device = self.model.device + self.num_frames = num_frames + self.pre_query_prompt = "The provided image arranges keyframes from a video in a grid view, keyframes are separated with white bands. Answer concisely with overall content and context of the video, highlighting any significant events, characters, or objects that appear throughout the frames." + + def get_prompt(self, qs, state): + state.append_message(state.roles[0], qs) + state.append_message(state.roles[1], None) + return state + + @torch.inference_mode() + def generate(self, vid_path: list, prompt: str, first_run: bool, state): + if self.num_frames != 0: + vid, msg = load_video( + vid_path, num_segments=self.num_frames, return_msg=True) + else: + vid, msg = None, 'num_frames is 0, not inputing image' + img_grid = vid + if self.pre_query_prompt is not None: + prompt = DEFAULT_IMAGE_TOKEN + '\n' + self.pre_query_prompt + prompt + else: + prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + state = self.get_prompt(prompt, state) + prompt = state.get_prompt() + llm_response = video_answer(prompt, model=self.model, processor=self.processor, tokenizer=self.tokenizer, + do_sample=True, temperature=0.1, img_grid=img_grid, max_new_tokens=1024, print_res=True) + return llm_response, state diff --git a/baselines/share4video/train/llava_trainer.py b/baselines/share4video/train/llava_trainer.py new file mode 100644 index 0000000..4d7a3ae --- /dev/null +++ b/baselines/share4video/train/llava_trainer.py @@ -0,0 +1,301 @@ +import os +import torch +import torch.nn as nn + +from torch.utils.data import Sampler + +from 
transformers import Trainer +from transformers.trainer import ( + is_sagemaker_mp_enabled, + get_parameter_names, + has_length, + ALL_LAYERNORM_LAYERS, + logger, +) +from typing import List, Optional + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + print(name, 'no ignore status') + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()} + return to_return + + +def split_to_even_chunks(indices, lengths, num_chunks): + """ + Split a list of indices into `chunks` chunks of roughly equal lengths. + """ + + if len(indices) % num_chunks != 0: + return [indices[i::num_chunks] for i in range(num_chunks)] + + num_indices_per_chunk = len(indices) // num_chunks + + chunks = [[] for _ in range(num_chunks)] + chunks_lengths = [0 for _ in range(num_chunks)] + for index in indices: + shortest_chunk = chunks_lengths.index(min(chunks_lengths)) + chunks[shortest_chunk].append(index) + chunks_lengths[shortest_chunk] += lengths[index] + if len(chunks[shortest_chunk]) == num_indices_per_chunk: + chunks_lengths[shortest_chunk] = float("inf") + + return chunks + + +def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + assert all(l != 0 for l in lengths), "Should not have zero length." 
+ if all(l > 0 for l in lengths) or all(l < 0 for l in lengths): + # all samples are in the same modality + return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator) + mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0]) + lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0]) + + mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)] + lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)] + megabatch_size = world_size * batch_size + mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)] + lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)] + + last_mm = mm_megabatches[-1] + last_lang = lang_megabatches[-1] + additional_batch = last_mm + last_lang + megabatches = mm_megabatches[:-1] + lang_megabatches[:-1] + megabatch_indices = torch.randperm(len(megabatches), generator=generator) + megabatches = [megabatches[i] for i in megabatch_indices] + + if len(additional_batch) > 0: + megabatches.append(sorted(additional_batch)) + + return [i for megabatch in megabatches for i in megabatch] + + +def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True): + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. 
+ indices = torch.randperm(len(lengths), generator=generator) + megabatch_size = world_size * batch_size + megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches] + + return [i for megabatch in megabatches for batch in megabatch for i in batch] + + +class LengthGroupedSampler(Sampler): + r""" + Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while + keeping a bit of randomness. + """ + + def __init__( + self, + batch_size: int, + world_size: int, + lengths: Optional[List[int]] = None, + generator=None, + group_by_modality: bool = False, + ): + if lengths is None: + raise ValueError("Lengths must be provided.") + + self.batch_size = batch_size + self.world_size = world_size + self.lengths = lengths + self.generator = generator + self.group_by_modality = group_by_modality + + def __len__(self): + return len(self.lengths) + + def __iter__(self): + if self.group_by_modality: + indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + else: + indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator) + return iter(indices) + + +class LLaVATrainer(Trainer): + + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.group_by_modality_length: + lengths = self.train_dataset.modality_lengths + return LengthGroupedSampler( + self.args.train_batch_size, + world_size=self.args.world_size * self.args.gradient_accumulation_steps, + lengths=lengths, + group_by_modality=True, + ) + else: + return super()._get_train_sampler() + + def 
create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + if is_sagemaker_mp_enabled(): + return super().create_optimizer() + + opt_model = self.model + + if self.optimizer is None: + decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + if self.args.mm_projector_lr is not None: + projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name] + if self.args.mm_vision_tower_lr is not None: + vision_tower_parameters = [ + name for name, _ in opt_model.named_parameters() if "vision_tower" in name] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and n not in vision_tower_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and n in vision_tower_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + "lr": self.args.mm_vision_tower_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and n not in vision_tower_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and n in vision_tower_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + "lr": self.args.mm_vision_tower_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad) + ], + 
"weight_decay": self.args.weight_decay, + "lr": self.args.mm_projector_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + "lr": self.args.mm_projector_lr, + }, + ] + else: + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + "lr": self.args.mm_projector_lr, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + "lr": self.args.mm_projector_lr, + }, + ] + else: + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + 
logger.info(f"skipped {module}: {skipped/2**20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped/2**20}M params") + + return self.optimizer + + def _save_checkpoint(self, model, trial, metrics=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + + # Only save Adapter + keys_to_match = ['mm_projector', 'vision_resampler'] + if getattr(self.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match) + + if self.args.local_rank == 0 or self.args.local_rank == -1: + self.model.config.save_pretrained(output_dir) + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + else: + super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + if getattr(self.args, 'tune_mm_mlp_adapter', False): + pass + else: + super(LLaVATrainer, self)._save(output_dir, state_dict) diff --git a/baselines/share4video/train/train.py b/baselines/share4video/train/train.py new file mode 100644 index 0000000..4a9e81a --- /dev/null +++ b/baselines/share4video/train/train.py @@ -0,0 +1,1323 @@ +# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import json +import logging +import os +import pathlib +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Sequence + +import tokenizers +import torch +import transformers +from packaging import version +from PIL import Image +from torch.utils.data import Dataset + +from llava import conversation as conversation_lib +from llava.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, IGNORE_INDEX, + IMAGE_TOKEN_INDEX) +from llava.mm_utils import process_anyres_image, tokenizer_image_token +from llava.model import * +from llava.train.llava_trainer import LLaVATrainer + +local_rank = None + + +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +IS_TOKENIZER_GREATER_THAN_0_14 = version.parse( + tokenizers.__version__) >= version.parse('0.14') + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + version: Optional[str] = field(default="v0") + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + vision_tower: Optional[str] = field(default=None) + mm_vision_select_layer: Optional[int] = field( + default=-1) # default to the last layer + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_projector_type: Optional[str] = field(default='linear') + mm_use_im_start_end: bool = field(default=False) + mm_use_im_patch_token: bool = field(default=True) + mm_patch_merge_type: Optional[str] = field(default='flat') + mm_vision_select_feature: Optional[str] = 
field(default="patch") + + +@dataclass +class DataArguments: + data_path: str = field(default=None, + metadata={"help": "Path to the training data."}) + lazy_preprocess: bool = False + is_multimodal: bool = False + image_folder: Optional[str] = field(default=None) + image_aspect_ratio: str = 'square' + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + remove_unused_columns: bool = field(default=False) + freeze_mm_mlp_adapter: bool = field(default=False) + unfreeze_mm_vision_tower: bool = field(default=False) + mpt_attn_impl: Optional[str] = field(default="triton") + model_max_length: int = field( + default=512, + metadata={ + "help": + "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + double_quant: bool = field( + default=True, + metadata={ + "help": "Compress the quantization statistics through double quantization."} + ) + quant_type: str = field( + default="nf4", + metadata={ + "help": "Quantization data type to use. 
Should be one of `fp4` or `nf4`."} + ) + bits: int = field( + default=16, + metadata={"help": "How many bits to use."} + ) + lora_enable: bool = False + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_weight_path: str = "" + lora_bias: str = "none" + lora_qv_proj_only: bool = False + mm_projector_lr: Optional[float] = None + mm_vision_tower_lr: Optional[float] = None + group_by_modality_length: bool = field(default=False) + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + logging.warning( + f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, + t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v, ignore_status=True) + for k, v in to_return.items()} + return to_return + + +def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True): + to_return = {k: t for k, t in named_params if "lora_" not in k} + if require_grad_only: + to_return = {k: t for k, t in 
to_return.items() if t.requires_grad} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() + for k, v in to_return.items()} + return to_return + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any( + key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() + for k, v in to_return.items()} + return to_return + + +def get_vision_tower_state_maybe_zero_3(named_params, keys_to_match=['']): + to_return = {k: t for k, t in named_params if any( + key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() + for k, v in to_return.items()} + return to_return + + +def find_all_linear_names(model, qv_proj_only=False): + if qv_proj_only: + rank0_print('Only add LoRA to QV proj') + return ['q_proj', 'v_proj'] + cls = torch.nn.Linear + lora_module_names = set() + multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler'] + for name, module in model.named_modules(): + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + names = name.split('.') + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if 'lm_head' in lora_module_names: # needed for 16-bit + lora_module_names.remove('lm_head') + return list(lora_module_names) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, + output_dir: str): + """Collects the state dict and dump to disk.""" + + if getattr(trainer.args, "tune_mm_mlp_adapter", False): + # Only save Adapter + keys_to_match = ['mm_projector'] + if getattr(trainer.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3( + trainer.model.named_parameters(), keys_to_match) + trainer.model.config.save_pretrained(output_dir) + + current_folder = output_dir.split('/')[-1] + parent_folder = 
os.path.dirname(output_dir) + if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: + if current_folder.startswith('checkpoint-'): + mm_projector_folder = os.path.join( + parent_folder, "mm_projector") + os.makedirs(mm_projector_folder, exist_ok=True) + torch.save(weight_to_save, os.path.join( + mm_projector_folder, f'{current_folder}.bin')) + else: + torch.save(weight_to_save, os.path.join( + output_dir, f'mm_projector.bin')) + return + + if getattr(trainer.args, "unfreeze_mm_vision_tower", False): + if trainer.deepspeed: + torch.cuda.synchronize() + mm_vision_tower_folder = os.path.join(output_dir, 'vision_tower') + os.makedirs(mm_vision_tower_folder, exist_ok=True) + trainer.model.get_vision_tower().image_processor.save_pretrained(mm_vision_tower_folder) + trainer.model.get_vision_tower().vision_tower.vision_model.config.save_pretrained( + mm_vision_tower_folder) + weight_to_save = get_vision_tower_state_maybe_zero_3( + trainer.model.get_vision_tower().vision_tower.named_parameters()) + if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: + torch.save(weight_to_save, os.path.join( + mm_vision_tower_folder, 'pytorch_model.bin')) + + if getattr(trainer.model.model, 'vision_tower', None) is not None: + del trainer.model.model.vision_tower + + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = { + key: value.cpu() + for key, value in state_dict.items() + } + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: Dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 
+ """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def _tokenize_fn(strings: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) for text in strings + ] + input_ids = labels = [ + tokenized.input_ids[0] for tokenized in tokenized_list + ] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + +def _mask_targets(target, tokenized_lens, speakers): + # cur_idx = 0 + cur_idx = tokenized_lens[0] + tokenized_lens = tokenized_lens[1:] + target[:cur_idx] = IGNORE_INDEX + for tokenized_len, speaker in zip(tokenized_lens, speakers): + if speaker == "human": + target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX + cur_idx += tokenized_len + + +def _add_speaker_and_signal(header, source, get_conversation=True): + """Add speaker and start/end signal on each round.""" + BEGIN_SIGNAL = "### " + END_SIGNAL = "\n" + conversation = header + for sentence in source: + from_str = sentence["from"] + if from_str.lower() == "human": + from_str = conversation_lib.default_conversation.roles[0] + elif from_str.lower() == "gpt": + from_str = 
conversation_lib.default_conversation.roles[1] + else: + from_str = 'unknown' + sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + + sentence["value"] + END_SIGNAL) + if get_conversation: + conversation += sentence["value"] + conversation += BEGIN_SIGNAL + return conversation + + +def preprocess_multimodal( + sources: Sequence[str], + data_args: DataArguments +) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + for sentence in source: + if DEFAULT_IMAGE_TOKEN in sentence['value']: + sentence['value'] = sentence['value'].replace( + DEFAULT_IMAGE_TOKEN, '').strip() + sentence['value'] = DEFAULT_IMAGE_TOKEN + \ + '\n' + sentence['value'] + sentence['value'] = sentence['value'].strip() + if "mmtag" in conversation_lib.default_conversation.version: + sentence['value'] = sentence['value'].replace( + DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') + replace_token = DEFAULT_IMAGE_TOKEN + if data_args.mm_use_im_start_end: + replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + sentence["value"] = sentence["value"].replace( + DEFAULT_IMAGE_TOKEN, replace_token) + + return sources + + +def preprocess_llama_2( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token( + prompt, tokenizer, return_tensors='pt') 
for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 + + # Mask targets + sep = "[/INST] " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len( + tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_llama3( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + assert conv.sep_style == conversation_lib.SeparatorStyle.MPT + + # Mask targets + sep = conv.sep + conv.roles[1] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2])) + cur_len = 0 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(re_rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + 1 + instruction_len = len( + tokenizer_image_token(parts[0], tokenizer)) + else: + round_len = len(tokenizer(rou).input_ids) + 1 + instruction_len = len(tokenizer(parts[0]).input_ids) 
+ + if i > 0: + round_len -= 1 + instruction_len -= 1 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_yi( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + assert conv.sep_style == conversation_lib.SeparatorStyle.MPT + + # Mask targets + sep = conv.sep + conv.roles[1] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2])) + cur_len = 0 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(re_rounds): + if rou == "": + 
break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_v1( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token( + prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = 
IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len( + tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14: + round_len -= 1 + instruction_len -= 1 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." + f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_mpt( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token( + prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + assert conv.sep_style == 
conversation_lib.SeparatorStyle.MPT + + # Mask targets + sep = conv.sep + conv.roles[1] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join( + rounds[conv_idx:conv_idx+2])) # user + gpt + cur_len = 0 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(re_rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + # not included <|im_end|> + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len( + tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 1 + + # include <|im_end|> for all rounds + # if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14: + if getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14: + round_len += 1 + instruction_len += 1 + + target[cur_len: cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate together + conversations = [] + for source in sources: + assert len(source) == 2 + assert DEFAULT_IMAGE_TOKEN in source[0]['value'] + source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation = source[0]['value'] + source[1]['value'] + \ + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + input_ids = [tokenizer_image_token( + prompt, tokenizer, return_tensors='pt') for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token( + source[0]['value'], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + """ + Given a list of sources, each is a conversation list. This transform: + 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; + 2. Concatenate conversations together; + 3. Tokenize the concatenated conversation; + 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. 
+ """ + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: + return preprocess_plain(sources, tokenizer) + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2: + return preprocess_llama_2(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version.startswith("v1"): + return preprocess_v1(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == "mpt": + return preprocess_mpt(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version in ["llama3", "llava_llama_3"]: + return preprocess_llama3(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == "yi": + return preprocess_yi(sources, tokenizer, has_image=has_image) + # add end signal and concatenate together + conversations = [] + for source in sources: + header = f"{conversation_lib.default_conversation.system}\n\n" + conversation = _add_speaker_and_signal(header, source) + conversations.append(conversation) + # tokenize conversations + + def get_tokenize_len(prompts): + return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] + + if has_image: + input_ids = [tokenizer_image_token( + prompt, tokenizer, return_tensors='pt') for prompt in conversations] + else: + conversations_tokenized = _tokenize_fn(conversations, tokenizer) + input_ids = conversations_tokenized["input_ids"] + + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + if has_image: + tokenized_lens = get_tokenize_len( + [header] + [s["value"] for s in source]) + else: + tokenized_lens = _tokenize_fn( + [header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] + speakers = [sentence["from"] for sentence in source] + _mask_targets(target, tokenized_lens, speakers) + + return dict(input_ids=input_ids, labels=targets) + + +class LazySupervisedDataset(Dataset): + """Dataset 
for supervised fine-tuning.""" + + def __init__(self, data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments): + super(LazySupervisedDataset, self).__init__() + list_data_dict = json.load(open(data_path, "r")) + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.list_data_dict = list_data_dict + self.data_args = data_args + + def __len__(self): + return len(self.list_data_dict) + + @property + def lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if 'image' in sample else 0 + length_list.append(sum(len(conv['value'].split()) + for conv in sample['conversations']) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + cur_len = sum(len(conv['value'].split()) + for conv in sample['conversations']) + cur_len = cur_len if 'image' in sample else -cur_len + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + sources = self.list_data_dict[i] + if isinstance(i, int): + sources = [sources] + assert len( + sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'image' in sources[0]: + image_file = self.list_data_dict[i]['image'] + image_folder = self.data_args.image_folder + processor = self.data_args.image_processor + image = Image.open(os.path.join( + image_folder, image_file)).convert('RGB') + if self.data_args.image_aspect_ratio == 'pad': + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new( + pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new( + pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + + image = expand2square(image, tuple(int(x * 
255) + for x in processor.image_mean)) + image_size = image.size + image = processor.preprocess(image, return_tensors='pt')[ + 'pixel_values'][0] + elif self.data_args.image_aspect_ratio == "anyres": + image_size = image.size + image = process_anyres_image( + image, processor, self.data_args.image_grid_pinpoints) + else: + image_size = image.size + image = processor.preprocess(image, return_tensors='pt')[ + 'pixel_values'][0] + sources = preprocess_multimodal( + copy.deepcopy([e["conversations"] for e in sources]), + self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('image' in self.list_data_dict[i])) + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + # image exist in the data + if 'image' in self.list_data_dict[i]: + data_dict['image'] = image + data_dict['image_size'] = image_size + elif self.data_args.is_multimodal: + # image does not exist in the data, but the model is multimodal + crop_size = self.data_args.image_processor.crop_size + data_dict['image'] = torch.zeros( + 3, crop_size['height'], crop_size['width']) + data_dict['image_size'] = (crop_size['height'], crop_size['width']) + return data_dict + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] + for key in ("input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.tokenizer.pad_token_id) + labels = torch.nn.utils.rnn.pad_sequence(labels, + batch_first=True, + padding_value=IGNORE_INDEX) + input_ids = input_ids[:, :self.tokenizer.model_max_length] + labels = labels[:, :self.tokenizer.model_max_length] + batch 
= dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id), + ) + + if 'image' in instances[0]: + images = [instance['image'] for instance in instances] + image_sizes = [instance['image_size'] for instance in instances] + if all(x is not None and x.shape == images[0].shape for x in images): + batch['images'] = torch.stack(images) + else: + batch['images'] = images + batch['image_sizes'] = image_sizes + + return batch + + +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) + + +def unfreeze_vit(vision_tower): + for _, p in vision_tower.named_parameters(): + p.requires_grad = True + + +def format_bytes(size): + billion = 10**9 + million = 10**6 + + if size >= billion: + return f"{size / billion:.2f}B" + elif size >= million: + return f"{size / million:.2f}M" + else: + return f"{size} bytes" + + +def train(attn_implementation=None): + global local_rank + + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + compute_dtype = (torch.float16 if training_args.fp16 else ( + torch.bfloat16 if training_args.bf16 else torch.float32)) + + bnb_model_from_pretrained_args = {} + if training_args.bits in [4, 8]: + from transformers import BitsAndBytesConfig + bnb_model_from_pretrained_args.update(dict( + device_map={"": training_args.device}, + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + quantization_config=BitsAndBytesConfig( + 
load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + llm_int8_skip_modules=["mm_projector"], + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=training_args.double_quant, + bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} + ) + )) + model_max_length_args = {} + if 'llava-v1.6-8b' not in model_args.model_name_or_path: + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, trust_remote_code=True) + if config.max_position_embeddings < training_args.model_max_length: + rank0_print( + f'Set the max_position_embeddings from {config.max_position_embeddings} to {training_args.model_max_length}') + model_max_length_args.update( + {'max_position_embeddings': training_args.model_max_length}) + if model_args.vision_tower is not None: + if 'mpt' in model_args.model_name_or_path: + config = transformers.AutoConfig.from_pretrained( + model_args.model_name_or_path, trust_remote_code=True) + config.attn_config['attn_impl'] = training_args.mpt_attn_impl + model = LlavaMptForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + **bnb_model_from_pretrained_args + ) + else: + model = LlavaLlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args, + **model_max_length_args + ) + else: + model = transformers.LlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + model.config.use_cache = False + + if model_args.freeze_backbone: + model.model.requires_grad_(False) + + if training_args.bits in [4, 8]: + from 
peft import prepare_model_for_kbit_training + model.config.torch_dtype = (torch.float32 if training_args.fp16 else ( + torch.bfloat16 if training_args.bf16 else torch.float32)) + model = prepare_model_for_kbit_training( + model, use_gradient_checkpointing=training_args.gradient_checkpointing) + + if training_args.gradient_checkpointing: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if training_args.lora_enable: + from peft import LoraConfig, get_peft_model + lora_config = LoraConfig( + r=training_args.lora_r, + lora_alpha=training_args.lora_alpha, + target_modules=find_all_linear_names(model, training_args.lora_qv_proj_only), + lora_dropout=training_args.lora_dropout, + bias=training_args.lora_bias, + task_type="CAUSAL_LM", + ) + if training_args.bits == 16: + if training_args.bf16: + model.to(torch.bfloat16) + if training_args.fp16: + model.to(torch.float16) + rank0_print("Adding LoRA adapters...") + model = get_peft_model(model, lora_config) + + if 'mpt' in model_args.model_name_or_path: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right" + ) + else: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + if model_args.version == "v0": + if tokenizer.pad_token is None: + smart_tokenizer_and_embedding_resize( + special_tokens_dict=dict(pad_token="[PAD]"), + tokenizer=tokenizer, + model=model, + ) + elif model_args.version == "v0.5": + tokenizer.pad_token = tokenizer.unk_token + else: + if tokenizer.pad_token is None: + 
rank0_print("Adding pad token as ''") + smart_tokenizer_and_embedding_resize( + special_tokens_dict=dict(pad_token=""), + tokenizer=tokenizer, + model=model, + ) + if model_args.version in conversation_lib.conv_templates: + conversation_lib.default_conversation = conversation_lib.conv_templates[ + model_args.version] + else: + conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"] + + if model_args.vision_tower is not None: + model.get_model().initialize_vision_modules( + model_args=model_args, + fsdp=training_args.fsdp + ) + + vision_tower = model.get_vision_tower() + vision_tower.to( + dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device) + + data_args.image_processor = vision_tower.image_processor + data_args.is_multimodal = True + + model.config.image_aspect_ratio = data_args.image_aspect_ratio + if data_args.image_aspect_ratio == 'anyres': + base_size = vision_tower.config.image_size + grids = [[1, 2], [2, 1], [2, 2], [3, 1], [1, 3]] + model.config.image_grid_pinpoints = data_args.image_grid_pinpoints = [ + [g[0]*base_size, g[1]*base_size] for g in grids] + model.config.tokenizer_padding_side = tokenizer.padding_side + model.config.tokenizer_model_max_length = tokenizer.model_max_length + + model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter + if model_args.tune_mm_mlp_adapter: + model.requires_grad_(False) + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = True + + model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter + if training_args.freeze_mm_mlp_adapter: + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = False + + model.config.unfreeze_mm_vision_tower = training_args.unfreeze_mm_vision_tower + if training_args.unfreeze_mm_vision_tower: + lr_of_vit = training_args.mm_vision_tower_lr if training_args.mm_vision_tower_lr is not None else training_args.learning_rate + 
lr_of_mlp = training_args.mm_projector_lr if training_args.mm_projector_lr is not None else training_args.learning_rate + training_args.mm_projector_lr = lr_of_mlp + unfreeze_vit(vision_tower) + rank0_print( + f'Tune the entire model! The LR of ViT is {lr_of_vit}. The LR of MLP is {lr_of_mlp}. The LR of LLM is {training_args.learning_rate}') + + # Calculate total parameters and trainable parameters + total_params = sum(p.numel() for p in model.get_model().parameters()) + trainable_params = sum( + p.numel() for p in model.get_model().parameters() if p.requires_grad) + + rank0_print(f"Total parameters: {format_bytes(total_params)}") + rank0_print(f"Trainable parameters: {format_bytes(trainable_params)}") + + if training_args.bits in [4, 8]: + model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) + + model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_projector_lr = training_args.mm_projector_lr + model.config.mm_vision_tower_lr = training_args.mm_vision_tower_lr + training_args.use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token + model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) + model.config.pad_token_id = tokenizer.pad_token_id + + if training_args.bits in [4, 8]: + from peft.tuners.lora import LoraLayer + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if training_args.bf16: + module = module.to(torch.bfloat16) + if 'norm' in name: + module = module.to(torch.float32) + if 'lm_head' in name or 'embed_tokens' in name: + if hasattr(module, 'weight'): + if training_args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) + + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args) + trainer = LLaVATrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) + + if 
list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + trainer.save_state() + + model.config.use_cache = True + + if training_args.lora_enable: + state_dict = get_peft_state_maybe_zero_3( + model.named_parameters(), training_args.lora_bias + ) + non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3( + model.named_parameters() + ) + if training_args.local_rank == 0 or training_args.local_rank == -1: + model.config.save_pretrained(training_args.output_dir) + model.save_pretrained( + training_args.output_dir, state_dict=state_dict) + torch.save(non_lora_state_dict, os.path.join( + training_args.output_dir, 'non_lora_trainables.bin')) + if training_args.unfreeze_mm_vision_tower: + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.model.get_vision_tower().image_processor.save_pretrained( + os.path.join(training_args.output_dir, 'vision_tower')) + trainer.model.get_vision_tower().vision_tower.vision_model.config.save_pretrained( + os.path.join(training_args.output_dir, 'vision_tower')) + weight_to_save = get_vision_tower_state_maybe_zero_3( + trainer.model.get_vision_tower().vision_tower.named_parameters()) + torch.save(weight_to_save, os.path.join( + training_args.output_dir, 'vision_tower/pytorch_model.bin')) + else: + safe_save_model_for_hf_trainer(trainer=trainer, + output_dir=training_args.output_dir) + + +if __name__ == "__main__": + train() diff --git a/baselines/share4video/train/train_mem.py b/baselines/share4video/train/train_mem.py new file mode 100644 index 0000000..29ea061 --- /dev/null +++ b/baselines/share4video/train/train_mem.py @@ -0,0 +1,4 @@ +from llava.train.train import train + +if __name__ == "__main__": + train(attn_implementation="flash_attention_2") diff --git a/baselines/share4video/utils.py b/baselines/share4video/utils.py new file mode 100644 index 0000000..4006cf9 --- /dev/null +++ b/baselines/share4video/utils.py @@ -0,0 +1,126 @@ +import 
datetime +import logging +import logging.handlers +import os +import sys + +import requests + +from llava.constants import LOGDIR + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." + +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. 
+ """ + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. + if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + import torch + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def violates_moderation(text): + """ + Check whether the text violates OpenAI moderation API. 
+ """ + url = "https://api.openai.com/v1/moderations" + headers = {"Content-Type": "application/json", + "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]} + text = text.replace("\n", "") + data = "{" + '"input": ' + f'"{text}"' + "}" + data = data.encode("utf-8") + try: + ret = requests.post(url, headers=headers, data=data, timeout=5) + flagged = ret.json()["results"][0]["flagged"] + except requests.exceptions.RequestException as e: + flagged = False + except KeyError as e: + flagged = False + + return flagged + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/baselines/share4video/video_utils.py b/baselines/share4video/video_utils.py new file mode 100644 index 0000000..6e9286e --- /dev/null +++ b/baselines/share4video/video_utils.py @@ -0,0 +1,151 @@ +import argparse +import io +import json +import os +import random +import tempfile +from multiprocessing import Manager, Pool, cpu_count + +import cv2 +import imageio +import numpy as np +from decord import VideoReader +from PIL import Image + + +def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1): + if sample in ["rand", "middle"]: # uniform sampling + acc_samples = min(num_frames, vlen) + # split the video into `acc_samples` intervals, and sample from each interval. 
+ intervals = np.linspace( + start=0, stop=vlen, num=acc_samples + 1).astype(int) + ranges = [] + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + if sample == 'rand': + try: + frame_indices = [random.choice( + range(x[0], x[1])) for x in ranges] + except Exception: + frame_indices = np.random.permutation(vlen)[:acc_samples] + frame_indices.sort() + frame_indices = list(frame_indices) + elif fix_start is not None: + frame_indices = [x[0] + fix_start for x in ranges] + elif sample == 'middle': + frame_indices = [(x[0] + x[1]) // 2 for x in ranges] + else: + raise NotImplementedError + + if len(frame_indices) < num_frames: # padded with last frame + padded_frame_indices = [frame_indices[-1]] * num_frames + padded_frame_indices[:len(frame_indices)] = frame_indices + frame_indices = padded_frame_indices + elif "fps" in sample: # fps0.5, sequentially sample frames at 0.5 fps + output_fps = float(sample[3:]) + duration = float(vlen) / input_fps + # gap between frames, this is also the clip length each frame represents + delta = 1 / output_fps + frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) + frame_indices = np.around(frame_seconds * input_fps).astype(int) + frame_indices = [e for e in frame_indices if e < vlen] + if max_num_frames > 0 and len(frame_indices) > max_num_frames: + frame_indices = frame_indices[:max_num_frames] + else: + raise ValueError + return frame_indices + + +def get_index(num_frames, bound, fps, max_frame, first_idx=0): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_frames + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(num_frames) + ]) + return frame_indices + + +def read_frames_gif( + video_path, num_frames, sample='rand', fix_start=None, + 
def read_diff_frames_decord(
    video_path, clip, client=None
):
    """Decode the first and last frame of a clip.

    Args:
        video_path: Path of the video. Paths starting with ``s3``, ``p2``,
            ``p_hdd``, ``cluster1`` or ``s_hdd`` are fetched through
            ``client``; anything else is opened directly.
        clip: Pair ``(start, end)``; each value is multiplied by the video's
            average fps to obtain a frame index (presumably seconds -
            confirm against callers).
        client: Object exposing ``get(path)`` that returns the video bytes.
            Required only for the prefixed paths listed above.

    Returns:
        The two decoded frames as a numpy array, (T, H, W, C) with T == 2.
    """
    remote_prefixes = ('s3', 'p2', 'p_hdd', 'cluster1', 's_hdd')
    if video_path.startswith(remote_prefixes):
        video_bytes = client.get(video_path)
        video_reader = VideoReader(io.BytesIO(video_bytes), num_threads=1)
    else:
        video_reader = VideoReader(video_path, num_threads=1)
    vlen = len(video_reader)
    fps = video_reader.get_avg_fps()

    # Valid frame indices are 0 .. vlen - 1; clamp both ends so a clip that
    # runs to (or past) the end of the video does not request frame `vlen`.
    last_idx = vlen - 1
    start_idx = min(max(round(clip[0] * fps), 0), last_idx)
    end_idx = min(max(round(clip[1] * fps), start_idx), last_idx)
    frame_indices = [start_idx, end_idx]

    frames = video_reader.get_batch(frame_indices).asnumpy()  # (T, H, W, C)
    return frames
def create_frame_grid(img_array, interval_width=50, fill_value=255):
    """Tile video frames into a square grid separated by solid bands.

    Frames are laid out row by row in a ``ceil(sqrt(n)) x ceil(sqrt(n))``
    grid. Cells beyond the last frame and the separating bands are filled
    with ``fill_value``.

    Args:
        img_array: Array of shape ``(n, h, w, c)`` holding ``n`` frames.
        interval_width: Thickness in pixels of the bands between cells.
        fill_value: Value written to the bands and to unused cells
            (255 is white for uint8 images).

    Returns:
        A single array of shape
        ``(g*h + (g-1)*interval_width, g*w + (g-1)*interval_width, c)`` with
        ``g = ceil(sqrt(n))`` and the same dtype as ``img_array``.

    Raises:
        ValueError: If ``img_array`` is not 4-dimensional or holds no frames.
    """
    img_array = np.asarray(img_array)
    if img_array.ndim != 4 or img_array.shape[0] == 0:
        raise ValueError(
            "img_array must have shape (n, h, w, c) with n >= 1, got "
            f"{img_array.shape}")

    n, h, w, c = img_array.shape
    grid_size = int(np.ceil(np.sqrt(n)))
    dtype = img_array.dtype
    row_width = w + (grid_size - 1) * (w + interval_width)

    # Vertical strip placed between neighbouring frames of one row.
    column_gap = np.full((h, interval_width, c), fill_value, dtype=dtype)
    # Full-width strip placed between consecutive rows.
    row_gap = np.full((interval_width, row_width, c), fill_value, dtype=dtype)
    # Placeholder for grid cells that have no frame.
    blank_cell = np.full((h, w, c), fill_value, dtype=dtype)

    rows = []
    for i in range(grid_size):
        cells = []
        for j in range(grid_size):
            idx = i * grid_size + j
            if j > 0:
                cells.append(column_gap)
            cells.append(img_array[idx] if idx < n else blank_cell)
        if i > 0:
            rows.append(row_gap)
        rows.append(np.concatenate(cells, axis=1))

    return np.concatenate(rows, axis=0)
def single_test(model, processor, tokenizer, vid_path, qs, pre_query_prompt=None, num_frames=16, conv_mode="plain"):
    """Answer one question about one video with a grid-image prompt.

    ``num_frames`` frames are sampled across the video, tiled into a single
    image with ``create_frame_grid``, resized with ``resize_image_grid`` and
    passed to ``video_answer`` together with the conversation prompt.

    Args:
        model, processor, tokenizer: Forwarded unchanged to ``video_answer``.
        vid_path: Path of the video file to read.
        qs: The question / instruction text.
        pre_query_prompt: Optional text inserted between the image token and
            ``qs``. It is concatenated verbatim, with no separator added.
        num_frames: Number of frames to sample; must be positive.
        conv_mode: Key into ``conv_templates`` selecting the prompt format.

    Returns:
        The text returned by ``video_answer``.

    Raises:
        ValueError: If ``num_frames`` is not positive. The prompt always
            contains the image token and ``video_answer`` reads the image
            size, so a run without frames cannot succeed.
    """
    def get_index(total_frames, num_segments):
        # Split the video into `num_segments` equal spans; the first index is
        # half a span in, and later ones step by a rounded span from there.
        seg_size = float(total_frames - 1) / num_segments
        start = int(seg_size / 2)
        return np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])

    def load_video(video_path, num_segments=8):
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        frame_indices = get_index(len(vr), num_segments)
        img_array = vr.get_batch(frame_indices).asnumpy()
        grid = create_frame_grid(img_array, 50)
        grid = Image.fromarray(grid).convert("RGB")
        return resize_image_grid(grid)

    if num_frames <= 0:
        raise ValueError(
            f"num_frames must be positive to build the image grid, got {num_frames}")

    img_grid = load_video(vid_path, num_segments=num_frames)

    conv = conv_templates[conv_mode].copy()
    if pre_query_prompt is not None:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + pre_query_prompt + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    llm_response = video_answer(prompt, model=model, processor=processor, tokenizer=tokenizer,
                                do_sample=False, img_grid=img_grid, max_new_tokens=512, print_res=False)
    return llm_response
"LLaVA-NeXT-Video", "ShareGPT4Video", "Gemini-1.5-pro", "GPT4O", "LLaVA", "GPT4V", "Video-LLaMA-2-13B", "LLaMA-VID-13B", "PLLaVA-13B", diff --git a/evaluations/evaluation_bias.py b/evaluations/evaluation_bias.py new file mode 100644 index 0000000..e60a753 --- /dev/null +++ b/evaluations/evaluation_bias.py @@ -0,0 +1,72 @@ +import json +import re +import argparse + +def main(models): + tps = ["obj_rel", "temporal", "semantic", "fact", "nonfact"] + + for model in models: + gt_yes = 0 + gt_no = 0 + n_yes = 0 + n_no = 0 + n = 0 + fp = 0 + tn = 0 + + for tp in tps: + res_filepath = f"results/{tp}_{model}.json" + + try: + with open(res_filepath, 'r') as f: + res = json.load(f) + except FileNotFoundError: + print(f"File not found: {res_filepath}") + continue + + for dct in res: + basic_pred = dct["basic"]["predict"] + basic_ans = dct["basic"]["answer"] + halluc_pred = dct["hallucination"]["predict"] + halluc_ans = dct["hallucination"]["answer"] + + assert basic_ans == 'yes' + assert halluc_ans == 'no' + + y_pattern = r'\b(' + basic_ans + r')\b' + n_pattern = r'\b(' + halluc_ans + r')\b' + + gt_yes += 1 + if re.search(y_pattern, basic_pred, re.IGNORECASE): + n_yes += 1 + else: + if re.search(n_pattern, halluc_pred, re.IGNORECASE): + n_no += 1 + n += 1 + + gt_no += 1 + if re.search(n_pattern, halluc_pred, re.IGNORECASE): + n_no += 1 + else: + if re.search(y_pattern, basic_pred, re.IGNORECASE): + n_yes += 1 + fp += 1 + n += 1 + + ydp = (n_yes - gt_yes) / (gt_yes * 2) if gt_yes > 0 else 0 + ndp = (n_no - gt_no) / (gt_no * 2) if gt_no > 0 else 0 + fpr = fp / n if n > 0 else 0 + + print(model) + print('yes difference percentage: ', ydp) + print('no difference percentage: ', ndp) + print('false positive ratio:', fpr) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process some models.") + parser.add_argument('models', metavar='M', type=str, nargs='+', + help='a list of models to process') + + args = parser.parse_args() + main(args.models) diff 
--git a/model_testing_zoo.py b/model_testing_zoo.py index c180bfe..3a901ca 100644 --- a/model_testing_zoo.py +++ b/model_testing_zoo.py @@ -16,7 +16,7 @@ parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, default="", - choices=["VideoChatGPT", "Valley2", "Video-LLaMA-2", "VideoChat2", "VideoLLaVA", "LLaMA-VID", "VideoLaVIT", "MiniGPT4-Video", "PLLaVA", "LLaVA-NeXT-Video", + choices=["VideoChatGPT", "Valley2", "Video-LLaMA-2", "VideoChat2", "VideoLLaVA", "LLaMA-VID", "VideoLaVIT", "MiniGPT4-Video", "PLLaVA", "LLaVA-NeXT-Video", "ShareGPT4Video", "Gemini-1.5-pro", "GPT4O", "LLaVA", "GPT4V", "Video-LLaMA-2-13B", "LLaMA-VID-13B", @@ -86,6 +86,10 @@ def load_model(TESTING_MODEL): from llavanext_modeling import LLaVANeXT ckpt_path = f"{CKPT_DIR}/LLaVA-NeXT-Video/LLaVA-NeXT-Video-34B-DPO" model = LLaVANeXT({"model_path": ckpt_path, "device": 0}) + elif TESTING_MODEL == "ShareGPT4Video": + from sharegpt4video_modeling import ShareGPT4Video + ckpt_path = f"{CKPT_DIR}/ShareGPT4Video/sharegpt4video-8b" + model = ShareGPT4Video({"model_path": ckpt_path, "device": 0}) elif TESTING_MODEL == "Gemini-1.5-pro": from gemini_modeling import Gemini model = Gemini({"model_path": None, "device": 0})