From 52070e2a586406b8a38da7e1fe5a0bd2c59b58a1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 10 Oct 2023 20:36:01 +0000 Subject: [PATCH 01/11] register a custom model --- fastchat/serve/gradio_block_arena_anony.py | 5 +++++ fastchat/serve/gradio_web_server.py | 3 +++ fastchat/serve/gradio_web_server_multi.py | 1 + 3 files changed, 9 insertions(+) diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 2b5bd7429..60286728b 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -221,6 +221,11 @@ def add_text( w = SAMPLING_WEIGHTS.get(a, 1.0) * SAMPLING_WEIGHTS.get(b, 1.0) if a in SAMPLING_BOOST_MODELS or b in SAMPLING_BOOST_MODELS: w *= 10 + if a in {"gpt-4", "deluxe-chat-v1"} and b in { + "gpt-4", + "deluxe-chat-v1", + }: + w *= 8 model_pairs.append((a, b)) model_pairs_weights.append(w) diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 532603b18..64d953359 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -146,6 +146,9 @@ def get_model_list( models += ["palm-2"] models = list(set(models)) + if "deluxe-chat-v1" in models: + del models[models.index("deluxe-chat-v1")] + priority = {k: f"___{i:02d}" for i, k in enumerate(model_info)} models.sort(key=lambda x: priority.get(x, x)) logger.info(f"Models: {models}") diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 92618d911..f1dbca963 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -87,6 +87,7 @@ def load_demo(url_params, request: gr.Request): models_anony += ["claude-2", "claude-instant-1"] if args.add_palm: models_anony += ["palm-2"] + models_anony.append("deluxe-chat-v1") side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params) side_by_side_named_updates = load_demo_side_by_side_named(models, url_params) From 1987dd5b7740179ace70c5958eae55171c982956 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Mon, 23 Oct 2023 01:21:20 +0000 Subject: [PATCH 02/11] sampling method --- fastchat/model/model_registry.py | 33 +++--- fastchat/serve/gradio_block_arena_anony.py | 112 ++++++++++++++------- fastchat/serve/gradio_web_server.py | 4 +- fastchat/serve/gradio_web_server_multi.py | 4 +- 4 files changed, 99 insertions(+), 54 deletions(-) diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 22d3013a1..4634b2038 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -67,6 +67,24 @@ def get_model_info(name: str) -> ModelInfo: "https://ai.meta.com/llama/", "open foundation and fine-tuned chat models by Meta", ) +register_model_info( + ["zephyr-7b-alpha"], + "Zephyr", + "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", + "a chatbot fine-tuned from Mistral by Hugging Face", +) +register_model_info( + ["qwen-14b-chat"], + "Qwen", + "https://huggingface.co/Qwen/Qwen-14B-Chat", + "a large language model by Alibaba Cloud", +) +register_model_info( + ["mistral-7b-instruct"], + "Mistral", + "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1", + "a large language model by Mistral AI team", +) register_model_info( ["codellama-34b-instruct", "codellama-13b-instruct", "codellama-7b-instruct"], "Code Llama", @@ -307,24 +325,11 @@ def get_model_info(name: str) -> ModelInfo: "Vigogne-Chat is a French large language model (LLM) optimized for 
instruction-following and multi-turn dialogues, developed by Bofeng Huang", ) register_model_info( - ["mistral-7b-instruct"], - "Mistral", - "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1", - "a large language model by Mistral AI team", -) -register_model_info( - ["deluxe-chat-v1"], + ["deluxe-chat-v1", "deluxe-chat-v1.1"], "DeluxeChat", "", "Deluxe Chat", ) - -register_model_info( - ["zephyr-7b-alpha"], - "Zephyr", - "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", - "a chatbot fine-tuned from Mistral by Hugging Face", -) register_model_info( [ "Xwin-LM-7B-V0.1", diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 60286728b..e6f01ba45 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -160,9 +160,10 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "gpt-4": 2, "gpt-3.5-turbo": 2, "claude-2": 2, + "claude-1": 2, "claude-instant-1": 2, - "deluxe-chat-v1": 4, # tire 1 + "deluxe-chat-v1.1": 0.1, "palm-2": 1.5, "llama-2-70b-chat": 1.5, "llama-2-13b-chat": 1.5, @@ -171,11 +172,13 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "vicuna-13b": 1.5, "wizardlm-70b": 1.5, "wizardlm-13b": 1.5, + "qwen-14b-chat": 1.5, + "zephyr-7b-alpha": 1.5, + "mistral-7b-instruct": 1.5, # tier 2 "vicuna-7b": 1.0, "llama-2-7b-chat": 1.0, "chatglm2-6b": 1.0, - "mistral-7b-instruct": 1.0, # deprecated "codellama-13b-instruct": 1.0, "mpt-30b-chat": 1.5, @@ -191,9 +194,77 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "dolly-v2-12b": 0.1, "llama-13b": 0.1, "chatglm-6b": 0.5, + "deluxe-chat-v1": 4, } -SAMPLING_BOOST_MODELS = [] +SAMPLING_BOOST_MODELS = ["zephyr-7b-alpha", "mistral-7b-instruct", "claude-1"] +OUTAGE_MODELS = ["deluxe-chat-v1.1", "falcon-180b-chat"] + + +def get_sample_weight(model): + if model in OUTAGE_MODELS: + return 0 + weight = SAMPLING_WEIGHTS.get(model, 1.0) + if model in SAMPLING_BOOST_MODELS: + weight *= 5 + return weight + + +def get_battle_pair(): + if len(models) == 1: + return models[0], models[0] + + targets = { + "gpt-4": {"claude-2"}, + "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2"}, + "claude-2": {"gpt-4", "gpt-3.5-turbo"}, + "claude-1": {"claude-2", "gpt-4", "gpt-3.5-turbo"}, + "claude-instant-1": {"gpt-3.5-turbo", "claude-2"}, + "deluxe-chat-v1.1": {"gpt-4"}, + "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, + "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, + "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, + "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"}, + "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"}, + "mistral-7b-instruct": {"llama-2-7b-chat", "llama-2-13b-chat", "llama-2-70b-chat"}, + "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"}, + "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, + "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, + "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, + "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"}, + } + model_weights = [] + for model in models: + weight = get_sample_weight(model) + model_weights.append(weight) + total_weight = np.sum(model_weights) + model_weights = model_weights / total_weight + chosen_idx = np.random.choice(len(models), p=model_weights) + chosen_model = models[chosen_idx] + + 
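The draw above is stage one of `get_battle_pair`: it picks one side of the battle in proportion to `get_sample_weight`, which zeroes out models in `OUTAGE_MODELS`, falls back to 1.0 for unlisted models, and multiplies `SAMPLING_BOOST_MODELS` entries by 5. A minimal, self-contained check of those semantics; the three tables here are hypothetical stand-ins for the module-level ones:

```python
SAMPLING_WEIGHTS = {"zephyr-7b-alpha": 1.5}  # stand-in table, not the real one
SAMPLING_BOOST_MODELS = ["zephyr-7b-alpha"]  # stand-in table
OUTAGE_MODELS = ["falcon-180b-chat"]         # stand-in table

def get_sample_weight(model):
    if model in OUTAGE_MODELS:
        return 0
    weight = SAMPLING_WEIGHTS.get(model, 1.0)
    if model in SAMPLING_BOOST_MODELS:
        weight *= 5
    return weight

assert get_sample_weight("falcon-180b-chat") == 0   # outage models are never sampled
assert get_sample_weight("zephyr-7b-alpha") == 7.5  # 1.5 base weight times the 5x boost
assert get_sample_weight("some-new-model") == 1.0   # unlisted models default to 1.0
```

The rival-selection loop below reuses the same helper, so an outage removes a model from both sides of a battle.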
rival_models = [] + rival_weights = [] + for model in models: + if model == chosen_model: + continue + weight = get_sample_weight(model) + if (weight != 0 and chosen_model in targets and + model in targets[chosen_model]): + # boost to 66% chance + weight = 2*total_weight / len(targets[chosen_model]) + rival_models.append(model) + rival_weights.append(weight) + # for p, w in zip(rival_models, rival_weights): + # print(p, w) + rival_weights = rival_weights / np.sum(rival_weights) + rival_idx = np.random.choice(len(rival_models), p=rival_weights) + rival_model = rival_models[rival_idx] + + swap = np.random.randint(2) + if swap == 0: + return chosen_model, rival_model + else: + return rival_model, chosen_model def add_text( @@ -207,41 +278,8 @@ def add_text( # Init states if necessary if states[0] is None: assert states[1] is None - model_pairs = [] - model_pairs_weights = [] - - # Pick two models - if len(model_pairs) == 0: - for i in range(len(models)): - for j in range(len(models)): - if i == j: - continue - a = models[i] - b = models[j] - w = SAMPLING_WEIGHTS.get(a, 1.0) * SAMPLING_WEIGHTS.get(b, 1.0) - if a in SAMPLING_BOOST_MODELS or b in SAMPLING_BOOST_MODELS: - w *= 10 - if a in {"gpt-4", "deluxe-chat-v1"} and b in { - "gpt-4", - "deluxe-chat-v1", - }: - w *= 8 - model_pairs.append((a, b)) - model_pairs_weights.append(w) - - model_pairs_weights = model_pairs_weights / np.sum(model_pairs_weights) - # for p, w in zip(model_pairs, model_pairs_weights): - # print(p, w) - - if len(model_pairs) >= 1: - # if len(model_pairs) != len(model_pairs_weights): - # print("model pairs", model_pairs, model_pairs_weights) - # print("#model pairs", len(model_pairs), len(model_pairs_weights)) - idx = np.random.choice(len(model_pairs), p=model_pairs_weights) - model_left, model_right = model_pairs[idx] - else: - model_left = model_right = models[0] + model_left, model_right = get_battle_pair() states = [ State(model_left), State(model_right), diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 64d953359..b2b92f861 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -148,6 +148,8 @@ def get_model_list( if "deluxe-chat-v1" in models: del models[models.index("deluxe-chat-v1")] + if "deluxe-chat-v1.1" in models: + del models[models.index("deluxe-chat-v1.1")] priority = {k: f"___{i:02d}" for i, k in enumerate(model_info)} models.sort(key=lambda x: priority.get(x, x)) @@ -338,7 +340,7 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) stream_iter = openai_api_stream_iter( model_name, prompt, temperature, top_p, max_new_tokens ) - elif model_name == "claude-2" or model_name == "claude-instant-1": + elif model_name in ["claude-2", "claude-1", "claude-instant-1"]: prompt = conv.get_prompt() stream_iter = anthropic_api_stream_iter( model_name, prompt, temperature, top_p, max_new_tokens diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index f1dbca963..70d62bbd2 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -84,10 +84,10 @@ def load_demo(url_params, request: gr.Request): if args.add_chatgpt: models_anony += ["gpt-4", "gpt-3.5-turbo"] if args.add_claude: - models_anony += ["claude-2", "claude-instant-1"] + models_anony += ["claude-2", "claude-1", "claude-instant-1"] if args.add_palm: models_anony += ["palm-2"] - models_anony.append("deluxe-chat-v1") + # models_anony.append("deluxe-chat-v1.1") 
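The `# boost to 66% chance` note in the rival loop above follows from a weight-mass argument: each of the chosen model's `k = len(targets[chosen_model])` preferred rivals receives weight `2 * total_weight / k`, so the targets jointly carry twice the total weight, while every other rival combined weighs at most the total. A worked check with hypothetical numbers:

```python
# k preferred rivals at 2*W/k each, versus non-target rivals summing to at most W:
# P(targeted matchup) >= 2W / (2W + W) = 2/3.
W = 10.0  # hypothetical total weight across all models
k = 3     # hypothetical len(targets[chosen_model])
per_target = 2 * W / k
p_target = k * per_target / (k * per_target + W)  # worst case for the bound
print(p_target)  # 0.666..., i.e. at least ~66%
```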
side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params) side_by_side_named_updates = load_demo_side_by_side_named(models, url_params) From 690153184940950ae45743b3417ae78e6f720d2b Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Mon, 23 Oct 2023 17:41:23 +0000 Subject: [PATCH 03/11] get ip through cloudflare --- fastchat/serve/gradio_block_arena_anony.py | 30 ++++++++------- fastchat/serve/gradio_block_arena_named.py | 30 ++++++++------- fastchat/serve/gradio_web_server.py | 43 +++++++++++++++------- fastchat/serve/gradio_web_server_multi.py | 3 +- 4 files changed, 63 insertions(+), 43 deletions(-) diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index e6f01ba45..a3e75e7e7 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -28,6 +28,7 @@ invisible_btn, acknowledgment_md, ip_expiration_dict, + get_ip, ) from fastchat.utils import ( build_logger, @@ -67,7 +68,7 @@ def vote_last_response(states, vote_type, model_selectors, request: gr.Request): "type": vote_type, "models": [x for x in model_selectors], "states": [x.dict() for x in states], - "ip": request.client.host, + "ip": get_ip(request), } fout.write(json.dumps(data) + "\n") @@ -90,7 +91,7 @@ def vote_last_response(states, vote_type, model_selectors, request: gr.Request): def leftvote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"leftvote (anony). ip: {request.client.host}") + logger.info(f"leftvote (anony). ip: {get_ip(request)}") for x in vote_last_response( [state0, state1], "leftvote", [model_selector0, model_selector1], request ): @@ -100,7 +101,7 @@ def leftvote_last_response( def rightvote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"rightvote (anony). ip: {request.client.host}") + logger.info(f"rightvote (anony). ip: {get_ip(request)}") for x in vote_last_response( [state0, state1], "rightvote", [model_selector0, model_selector1], request ): @@ -110,7 +111,7 @@ def rightvote_last_response( def tievote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"tievote (anony). ip: {request.client.host}") + logger.info(f"tievote (anony). ip: {get_ip(request)}") for x in vote_last_response( [state0, state1], "tievote", [model_selector0, model_selector1], request ): @@ -120,7 +121,7 @@ def tievote_last_response( def bothbad_vote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"bothbad_vote (anony). ip: {request.client.host}") + logger.info(f"bothbad_vote (anony). ip: {get_ip(request)}") for x in vote_last_response( [state0, state1], "bothbad_vote", [model_selector0, model_selector1], request ): @@ -128,7 +129,7 @@ def bothbad_vote_last_response( def regenerate(state0, state1, request: gr.Request): - logger.info(f"regenerate (anony). ip: {request.client.host}") + logger.info(f"regenerate (anony). ip: {get_ip(request)}") states = [state0, state1] for i in range(num_sides): states[i].conv.update_last_message(None) @@ -136,7 +137,7 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): - logger.info(f"clear_history (anony). ip: {request.client.host}") + logger.info(f"clear_history (anony). 
ip: {get_ip(request)}") return ( [None] * num_sides + [None] * num_sides @@ -148,7 +149,7 @@ def clear_history(request: gr.Request): def share_click(state0, state1, model_selector0, model_selector1, request: gr.Request): - logger.info(f"share (anony). ip: {request.client.host}") + logger.info(f"share (anony). ip: {get_ip(request)}") if state0 is not None and state1 is not None: vote_last_response( [state0, state1], "share", [model_selector0, model_selector1], request @@ -270,7 +271,8 @@ def get_battle_pair(): def add_text( state0, state1, model_selector0, model_selector1, text, request: gr.Request ): - ip = request.client.host + is_cf = "cf-connecting-ip" in request.headers + ip = get_ip(request) logger.info(f"add_text (anony). ip: {ip}. len: {len(text)}") states = [state0, state1] model_selectors = [model_selector0, model_selector1] @@ -298,8 +300,8 @@ def add_text( * 6 ) - if ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive (anony). ip: {request.client.host}. text: {text}") + if not is_cf and ip_expiration_dict[ip] < time.time(): + logger.info(f"inactive (anony). ip: {get_ip(request)}. text: {text}") for i in range(num_sides): states[i].skip_next = True return ( @@ -316,7 +318,7 @@ def add_text( flagged = violates_moderation(text) if flagged: logger.info( - f"violate moderation (anony). ip: {request.client.host}. text: {text}" + f"violate moderation (anony). ip: {get_ip(request)}. text: {text}" ) for i in range(num_sides): states[i].skip_next = True @@ -332,7 +334,7 @@ def add_text( conv = states[0].conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: - logger.info(f"conversation turn limit. ip: {request.client.host}. text: {text}") + logger.info(f"conversation turn limit. ip: {get_ip(request)}. text: {text}") for i in range(num_sides): states[i].skip_next = True return ( @@ -370,7 +372,7 @@ def bot_response_multi( max_new_tokens, request: gr.Request, ): - logger.info(f"bot_response_multi (anony). ip: {request.client.host}") + logger.info(f"bot_response_multi (anony). ip: {get_ip(request)}") if state0 is None or state0.skip_next: # This generate call is skipped due to invalid inputs diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 8693fa791..f4a6839d7 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -28,6 +28,7 @@ acknowledgment_md, get_model_description_md, ip_expiration_dict, + get_ip, ) from fastchat.utils import ( build_logger, @@ -72,7 +73,7 @@ def vote_last_response(states, vote_type, model_selectors, request: gr.Request): "type": vote_type, "models": [x for x in model_selectors], "states": [x.dict() for x in states], - "ip": request.client.host, + "ip": get_ip(request), } fout.write(json.dumps(data) + "\n") @@ -80,7 +81,7 @@ def vote_last_response(states, vote_type, model_selectors, request: gr.Request): def leftvote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"leftvote (named). ip: {request.client.host}") + logger.info(f"leftvote (named). ip: {get_ip(request)}") vote_last_response( [state0, state1], "leftvote", [model_selector0, model_selector1], request ) @@ -90,7 +91,7 @@ def leftvote_last_response( def rightvote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"rightvote (named). ip: {request.client.host}") + logger.info(f"rightvote (named). 
ip: {get_ip(request)}") vote_last_response( [state0, state1], "rightvote", [model_selector0, model_selector1], request ) @@ -100,7 +101,7 @@ def rightvote_last_response( def tievote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"tievote (named). ip: {request.client.host}") + logger.info(f"tievote (named). ip: {get_ip(request)}") vote_last_response( [state0, state1], "tievote", [model_selector0, model_selector1], request ) @@ -110,7 +111,7 @@ def tievote_last_response( def bothbad_vote_last_response( state0, state1, model_selector0, model_selector1, request: gr.Request ): - logger.info(f"bothbad_vote (named). ip: {request.client.host}") + logger.info(f"bothbad_vote (named). ip: {get_ip(request)}") vote_last_response( [state0, state1], "bothbad_vote", [model_selector0, model_selector1], request ) @@ -118,7 +119,7 @@ def bothbad_vote_last_response( def regenerate(state0, state1, request: gr.Request): - logger.info(f"regenerate (named). ip: {request.client.host}") + logger.info(f"regenerate (named). ip: {get_ip(request)}") states = [state0, state1] for i in range(num_sides): states[i].conv.update_last_message(None) @@ -126,7 +127,7 @@ def regenerate(state0, state1, request: gr.Request): def clear_history(request: gr.Request): - logger.info(f"clear_history (named). ip: {request.client.host}") + logger.info(f"clear_history (named). ip: {get_ip(request)}") return ( [None] * num_sides + [None] * num_sides @@ -137,7 +138,7 @@ def clear_history(request: gr.Request): def share_click(state0, state1, model_selector0, model_selector1, request: gr.Request): - logger.info(f"share (named). ip: {request.client.host}") + logger.info(f"share (named). ip: {get_ip(request)}") if state0 is not None and state1 is not None: vote_last_response( [state0, state1], "share", [model_selector0, model_selector1], request @@ -147,7 +148,8 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re def add_text( state0, state1, model_selector0, model_selector1, text, request: gr.Request ): - ip = request.client.host + is_cf = "cf-connecting-ip" in request.headers + ip = get_ip(request) logger.info(f"add_text (named). ip: {ip}. len: {len(text)}") states = [state0, state1] model_selectors = [model_selector0, model_selector1] @@ -170,8 +172,8 @@ def add_text( * 6 ) - if ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive (named). ip: {request.client.host}. text: {text}") + if not is_cf and ip_expiration_dict[ip] < time.time(): + logger.info(f"inactive (named). ip: {ip}. text: {text}") for i in range(num_sides): states[i].skip_next = True return ( @@ -188,7 +190,7 @@ def add_text( flagged = violates_moderation(text) if flagged: logger.info( - f"violate moderation (named). ip: {request.client.host}. text: {text}" + f"violate moderation (named). ip: {ip}. text: {text}" ) for i in range(num_sides): states[i].skip_next = True @@ -204,7 +206,7 @@ def add_text( conv = states[0].conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: - logger.info(f"conversation turn limit. ip: {request.client.host}. text: {text}") + logger.info(f"conversation turn limit. ip: {ip}. text: {text}") for i in range(num_sides): states[i].skip_next = True return ( @@ -242,7 +244,7 @@ def bot_response_multi( max_new_tokens, request: gr.Request, ): - logger.info(f"bot_response_multi (named). ip: {request.client.host}") + logger.info(f"bot_response_multi (named). 
ip: {get_ip(request)}") if state0.skip_next: # This generate call is skipped due to invalid inputs diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index b2b92f861..b441ed574 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -175,7 +175,7 @@ def load_demo_single(models, url_params): def load_demo(url_params, request: gr.Request): global models - ip = request.client.host + ip = get_ip(request) logger.info(f"load_demo. ip: {ip}. params: {url_params}") ip_expiration_dict[ip] = time.time() + SESSION_EXPIRATION_TIME @@ -198,43 +198,57 @@ def vote_last_response(state, vote_type, model_selector, request: gr.Request): "type": vote_type, "model": model_selector, "state": state.dict(), - "ip": request.client.host, + "ip": get_ip(request), } fout.write(json.dumps(data) + "\n") def upvote_last_response(state, model_selector, request: gr.Request): - logger.info(f"upvote. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"upvote. ip: {ip}") vote_last_response(state, "upvote", model_selector, request) return ("",) + (disable_btn,) * 3 def downvote_last_response(state, model_selector, request: gr.Request): - logger.info(f"downvote. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"downvote. ip: {ip}") vote_last_response(state, "downvote", model_selector, request) return ("",) + (disable_btn,) * 3 def flag_last_response(state, model_selector, request: gr.Request): - logger.info(f"flag. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"flag. ip: {ip}") vote_last_response(state, "flag", model_selector, request) return ("",) + (disable_btn,) * 3 def regenerate(state, request: gr.Request): - logger.info(f"regenerate. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"regenerate. ip: {ip}") state.conv.update_last_message(None) return (state, state.to_gradio_chatbot(), "") + (disable_btn,) * 5 def clear_history(request: gr.Request): - logger.info(f"clear_history. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"clear_history. ip: {ip}") state = None return (state, [], "") + (disable_btn,) * 5 +def get_ip(request: gr.Request): + if "cf-connecting-ip" in request.headers: + ip = request.headers['cf-connecting-ip'] + else: + ip = request.client.host + return ip + + def add_text(state, model_selector, text, request: gr.Request): - ip = request.client.host + is_cf = "cf-connecting-ip" in request.headers + ip = get_ip(request) logger.info(f"add_text. ip: {ip}. len: {len(text)}") if state is None: @@ -244,15 +258,15 @@ def add_text(state, model_selector, text, request: gr.Request): state.skip_next = True return (state, state.to_gradio_chatbot(), "") + (no_change_btn,) * 5 - if ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive. ip: {request.client.host}. text: {text}") + if not is_cf and ip_expiration_dict[ip] < time.time(): + logger.info(f"inactive. ip: {ip}. text: {text}") state.skip_next = True return (state, state.to_gradio_chatbot(), INACTIVE_MSG) + (no_change_btn,) * 5 if enable_moderation: flagged = violates_moderation(text) if flagged: - logger.info(f"violate moderation. ip: {request.client.host}. text: {text}") + logger.info(f"violate moderation. ip: {ip}. 
text: {text}") state.skip_next = True return (state, state.to_gradio_chatbot(), MODERATION_MSG) + ( no_change_btn, @@ -260,7 +274,7 @@ def add_text(state, model_selector, text, request: gr.Request): conv = state.conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: - logger.info(f"conversation turn limit. ip: {request.client.host}. text: {text}") + logger.info(f"conversation turn limit. ip: {ip}. text: {text}") state.skip_next = True return (state, state.to_gradio_chatbot(), CONVERSATION_LIMIT_MSG) + ( no_change_btn, @@ -322,7 +336,8 @@ def model_worker_stream_iter( def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request): - logger.info(f"bot_response. ip: {request.client.host}") + ip = get_ip(request) + logger.info(f"bot_response. ip: {ip}") start_tstamp = time.time() temperature = float(temperature) top_p = float(top_p) @@ -474,7 +489,7 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) "start": round(start_tstamp, 4), "finish": round(finish_tstamp, 4), "state": state.dict(), - "ip": request.client.host, + "ip": get_ip(request), } fout.write(json.dumps(data) + "\n") diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 70d62bbd2..943452840 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -29,6 +29,7 @@ get_model_list, load_demo_single, ip_expiration_dict, + get_ip, ) from fastchat.serve.monitor.monitor import build_leaderboard_tab from fastchat.utils import ( @@ -44,7 +45,7 @@ def load_demo(url_params, request: gr.Request): global models - ip = request.client.host + ip = get_ip(request) logger.info(f"load_demo. ip: {ip}. params: {url_params}") ip_expiration_dict[ip] = time.time() + SESSION_EXPIRATION_TIME From 0ac4a9d4d9fc888f6d0db614c9afc4e023ab97b0 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Tue, 31 Oct 2023 19:39:05 +0000 Subject: [PATCH 04/11] update --- fastchat/model/model_registry.py | 12 ++--- fastchat/serve/gradio_block_arena_anony.py | 45 ++++++++++--------- fastchat/serve/gradio_block_arena_named.py | 7 +-- fastchat/serve/gradio_web_server.py | 52 ++++++++++++++++++++-- fastchat/serve/gradio_web_server_multi.py | 14 +++--- fastchat/serve/huggingface_api_worker.py | 5 ++- fastchat/serve/monitor/monitor.py | 17 +++---- fastchat/utils.py | 2 +- 8 files changed, 103 insertions(+), 51 deletions(-) diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 4634b2038..370517c12 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -61,18 +61,18 @@ def get_model_info(name: str) -> ModelInfo: "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023", "PaLM 2 for Chat (chat-bison@001) by Google", ) +register_model_info( + ["zephyr-7b-beta", "zephyr-7b-alpha"], + "Zephyr", + "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", + "a chatbot fine-tuned from Mistral by Hugging Face", +) register_model_info( ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], "Llama 2", "https://ai.meta.com/llama/", "open foundation and fine-tuned chat models by Meta", ) -register_model_info( - ["zephyr-7b-alpha"], - "Zephyr", - "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", - "a chatbot fine-tuned from Mistral by Hugging Face", -) register_model_info( ["qwen-14b-chat"], "Qwen", diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index a3e75e7e7..a39d18753 
100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -158,11 +158,12 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re SAMPLING_WEIGHTS = { # tier 0 - "gpt-4": 2, - "gpt-3.5-turbo": 2, - "claude-2": 2, + "gpt-4": 4, + "gpt-3.5-turbo": 8, + "claude-2": 8, "claude-1": 2, - "claude-instant-1": 2, + "claude-instant-1": 8, + "zephyr-7b-beta": 4, # tire 1 "deluxe-chat-v1.1": 0.1, "palm-2": 1.5, @@ -198,8 +199,9 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "deluxe-chat-v1": 4, } -SAMPLING_BOOST_MODELS = ["zephyr-7b-alpha", "mistral-7b-instruct", "claude-1"] -OUTAGE_MODELS = ["deluxe-chat-v1.1", "falcon-180b-chat"] +SAMPLING_BOOST_MODELS = ["zephyr-7b-beta"] +# SAMPLING_BOOST_MODELS = ["claude-2"] +OUTAGE_MODELS = ["deluxe-chat-v1.1", "claude-2", "claude-instant-1"] def get_sample_weight(model): @@ -217,13 +219,16 @@ def get_battle_pair(): targets = { "gpt-4": {"claude-2"}, + # "gpt-4": {"llama-2-70b-chat"}, "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2"}, - "claude-2": {"gpt-4", "gpt-3.5-turbo"}, + # "gpt-3.5-turbo": {"llama-2-70b-chat"}, + "claude-2": {"gpt-4", "gpt-3.5-turbo", "claude-1"}, "claude-1": {"claude-2", "gpt-4", "gpt-3.5-turbo"}, "claude-instant-1": {"gpt-3.5-turbo", "claude-2"}, "deluxe-chat-v1.1": {"gpt-4"}, "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, + "zephyr-7b-beta": {"mistral-7b-instruct", "llama-2-13b-chat", "llama-2-7b-chat", "wizardlm-13b"}, "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"}, "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"}, @@ -251,8 +256,8 @@ def get_battle_pair(): weight = get_sample_weight(model) if (weight != 0 and chosen_model in targets and model in targets[chosen_model]): - # boost to 66% chance - weight = 2*total_weight / len(targets[chosen_model]) + # boost to 50% chance + weight = total_weight / len(targets[chosen_model]) rival_models.append(model) rival_weights.append(weight) # for p, w in zip(rival_models, rival_weights): @@ -417,17 +422,17 @@ def build_side_by_side_ui_anony(models): # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | -### Rules -- Chat with two anonymous models side-by-side and vote for which one is better! -- You can do multiple turns of conversations before voting. -- The names of the models will be revealed after your vote. Conversations with identity keywords (e.g., ChatGPT, Bard, Vicuna) or any votes after the names are revealed will not count towards the leaderboard. -- Click "Clear history" to start a new round. +## 📜 Rules +- Ask any question to two anonymous models (e.g., ChatGPT, Claude, Llama) and vote for the better one! +- You can continue chatting until you identify a winner. +- Vote won't be counted if model identity is revealed during conversation. -### Leaderboard -See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. 
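A quieter change in this patch halves the rival boost in `get_battle_pair`, dropping the per-target weight from `2 * total_weight / k` to `total_weight / k`. By the same mass argument as before, a preferred matchup now happens roughly half the time instead of two-thirds:

```python
W, k = 10.0, 3  # hypothetical total weight and number of preferred rivals
for per_target in (2 * W / k, W / k):  # patch 02's boost, then this patch's
    print(round(k * per_target / (k * per_target + W), 3))  # 0.667, then 0.5
```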
+## 🏆 Arena Elo [Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) +We use **100K** human votes to compile an Elo-based LLM leaderboard. +Find out who is the 🥇LLM Champion! + +## 👇 Chat now! -### Battle -Please scroll down and start chatting. The models include both closed-source models (e.g., ChatGPT) and open-source models (e.g., Llama). """ states = [gr.State() for _ in range(num_sides)] @@ -466,7 +471,7 @@ def build_side_by_side_ui_anony(models): with gr.Column(scale=20): textbox = gr.Textbox( show_label=False, - placeholder="Enter your prompt here and press ENTER", + placeholder="👉 Enter your prompt and press ENTER", container=False, elem_id="input_box", ) @@ -474,7 +479,7 @@ def build_side_by_side_ui_anony(models): send_btn = gr.Button(value="Send", variant="primary") with gr.Row() as button_row: - clear_btn = gr.Button(value="🗑️ Clear history", interactive=False) + clear_btn = gr.Button(value="🎲 New Round", interactive=False) regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False) share_btn = gr.Button(value="📷 Share") diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index f4a6839d7..93eb8fb32 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -299,16 +299,13 @@ def build_side_by_side_ui_named(models): # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | -### Rules +## Rules - Chat with two models side-by-side and vote for which one is better! - You pick the models you want to chat with. - You can do multiple turns of conversations before voting. - Click "Clear history" to start a new round. -### Leaderboard -See [lmsys/chatbot-arena-leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) or the 4th tab above on this page. 
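The rewritten notice above advertises an Elo-based leaderboard compiled from pairwise human votes. As a rough illustration only (the arena computes its ratings offline in the monitor code, not with this snippet), the classical Elo update for a single battle looks like this:

```python
def elo_update(rating_a, rating_b, score_a, k=32):
    # score_a is 1.0 if model A wins, 0.0 if it loses, and 0.5 for a tie.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    rating_a += k * (score_a - expected_a)
    rating_b += k * ((1 - score_a) - (1 - expected_a))
    return rating_a, rating_b

print(elo_update(1000.0, 1000.0, 1.0))  # evenly matched: winner +16, loser -16
```

Upsets move ratings more than expected wins do, which is what lets noisy individual votes still produce a stable ranking.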
- -### Choose two models to chat with +## Choose two models to compare """ states = [gr.State() for _ in range(num_sides)] diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index b441ed574..c9ab166cd 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -425,8 +425,6 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) try: for i, data in enumerate(stream_iter): if data["error_code"] == 0: - if i % 8 != 0: # reduce gradio's overhead - continue output = data["text"].strip() conv.update_last_message(output + "▌") yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5 @@ -496,7 +494,7 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) block_css = """ #notice_markdown { - font-size: 104% + font-size: 110% } #notice_markdown th { display: none; @@ -515,6 +513,9 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) #leaderboard_dataframe td { line-height: 0.1em; } +#about_markdown { + font-size: 110% +} #input_box textarea { } footer { @@ -532,6 +533,14 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) width: auto; max-width: 20%; } +.image-about img { + margin: 0 30px; + margin-top: 30px; + height: 60px; + max-height: 100%; + width: auto; + float: left; +} """ @@ -557,6 +566,43 @@ def get_model_description_md(models): ct += 1 return model_description_md +def build_about(): + + about_markdown = f""" +# About Us +Chatbot Arena is an open-source research project developed by members from [LMSYS](https://lmsys.org/about/) and UC Berkeley [SkyLab](https://sky.cs.berkeley.edu/). Our mission is to build an open crowdsourced platform to collect human feedback and evaluate LLMs under real-world scenarios. We open-source our code at [GitHub](https://github.com/lm-sys/FastChat) and release chat and human feedback datasets [here](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md). We invite everyone to join us in this journey! + +## Read More +- Chatbot Arena [launch post](https://lmsys.org/blog/2023-05-03-arena/), [data release](https://lmsys.org/blog/2023-07-20-dataset/) +- LMSYS-Chat-1M [report](https://arxiv.org/abs/2309.11998) + +## Core Members +[Lianmin Zheng](https://lmzheng.net/), [Wei-Lin Chiang](https://infwinston.github.io/), [Ying Sheng](https://sites.google.com/view/yingsheng/home) + +## Advisors +[Ion Stoica](http://people.eecs.berkeley.edu/~istoica/), [Joseph E. Gonzalez](https://people.eecs.berkeley.edu/~jegonzal/), [Hao Zhang](https://cseweb.ucsd.edu/~haozhang/) + +## Contact Us +- Follow our [Twitter](https://twitter.com/lmsysorg), [Discord](https://discord.gg/HSWAKCrnFx) or email us at lmsys.org@gmail.com +- File issues on [GitHub](https://github.com/lm-sys/FastChat) +- Download our datasets and models on [HuggingFace](https://huggingface.co/lmsys) + +## Sponsors +We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/), [Anyscale](https://www.anyscale.com/), [HuggingFace](https://huggingface.co/) for their generous sponsorship. +Learn more about partnership [here](https://lmsys.org/donations/). + +
+<div class="image-about">
+    <!-- four sponsor logo images; their src attributes are not recoverable here -->
+    <img alt="Image 1"> <img alt="Image 2"> <img alt="Image 3"> <img alt="Image 4">
+</div>
+""" + + #state = gr.State() + gr.Markdown(about_markdown, elem_id="about_markdown") + + #return [state] def build_single_model_ui(models, add_promotion_links=False): promotion = ( diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 943452840..efb400211 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -26,6 +26,7 @@ set_global_vars, block_css, build_single_model_ui, + build_about, get_model_list, load_demo_single, ip_expiration_dict, @@ -101,26 +102,29 @@ def load_demo(url_params, request: gr.Request): def build_demo(models, elo_results_file, leaderboard_table_file): + text_size = gr.themes.sizes.text_md with gr.Blocks( title="Chat with Open Large Language Models", - theme=gr.themes.Default(), + theme=gr.themes.Default(text_size=text_size), css=block_css, ) as demo: with gr.Tabs() as tabs: - with gr.Tab("Chatbot Arena (battle)", id=0): + with gr.Tab("Arena (battle)", id=0): side_by_side_anony_list = build_side_by_side_ui_anony(models) - with gr.Tab("Chatbot Arena (side-by-side)", id=1): + with gr.Tab("Arena (side-by-side)", id=1): side_by_side_named_list = build_side_by_side_ui_named(models) - with gr.Tab("Single Model", id=2): + with gr.Tab("Direct Chat", id=2): single_model_list = build_single_model_ui( models, add_promotion_links=True ) - if elo_results_file: with gr.Tab("Leaderboard", id=3): build_leaderboard_tab(elo_results_file, leaderboard_table_file) + with gr.Tab("About Us", id=4): + about = build_about() + url_params = gr.JSON(visible=False) diff --git a/fastchat/serve/huggingface_api_worker.py b/fastchat/serve/huggingface_api_worker.py index b16c96147..7eef50e47 100644 --- a/fastchat/serve/huggingface_api_worker.py +++ b/fastchat/serve/huggingface_api_worker.py @@ -143,7 +143,10 @@ def generate_stream_gate(self, params): logger.info(f"gen_kwargs: {gen_kwargs}") try: - url = f"{self.api_base}/{self.model_path}" + if self.model_path == "": + url = f"{self.api_base}" + else: + url = f"{self.api_base}/{self.model_path}" client = InferenceClient(url, token=self.token) res = client.text_generation( prompt, stream=True, details=True, **gen_kwargs diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 569b3f593..0005db9b3 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -210,22 +210,18 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): elem_id="leaderboard_dataframe", ) gr.Markdown( - "If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).", + """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis! + If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model). + """, elem_id="leaderboard_markdown", ) else: pass - gr.Markdown( - f"""## More Statistics for Chatbot Arena\n -We added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}). -Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. 
You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/). -""", - elem_id="leaderboard_markdown", - ) leader_component_values[:] = [md, p1, p2, p3, p4] + """ with gr.Row(): with gr.Column(): gr.Markdown( @@ -248,13 +244,14 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)" ) plot_4 = gr.Plot(p4, show_label=False) + """ from fastchat.serve.gradio_web_server import acknowledgment_md gr.Markdown(acknowledgment_md) - return [md_1, plot_1, plot_2, plot_3, plot_4] - + # return [md_1, plot_1, plot_2, plot_3, plot_4] + return [md_1] def build_demo(elo_results_file, leaderboard_table_file): from fastchat.serve.gradio_web_server import block_css diff --git a/fastchat/utils.py b/fastchat/utils.py index 7c0614e3e..a7377d7ff 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -207,7 +207,7 @@ def pretty_print_semaphore(semaphore): url_params = Object.fromEntries(params); console.log("url_params", url_params); - msg = "Users of this website are required to agree to the following terms:\\nThe service is a research preview. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.\\nThe service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) or a similar license." + msg = "Users of this website are required to agree to the following terms:\\n\\nThe service is a research preview. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes.\\nThe service collects user dialogue data and reserves the right to distribute it under a Creative Commons Attribution (CC-BY) or a similar license." alert(msg); return url_params; From 44fc150c2bd4a312ea773a8a95297be4cbf8557b Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Thu, 2 Nov 2023 04:26:22 +0000 Subject: [PATCH 05/11] add slow warning --- fastchat/constants.py | 1 + fastchat/model/model_registry.py | 46 +++++++++++----------- fastchat/serve/gradio_block_arena_anony.py | 28 +++++++++---- fastchat/serve/gradio_web_server.py | 4 +- fastchat/serve/gradio_web_server_multi.py | 2 +- 5 files changed, 47 insertions(+), 34 deletions(-) diff --git a/fastchat/constants.py b/fastchat/constants.py index c26c5f489..75b20a5a3 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -14,6 +14,7 @@ MODERATION_MSG = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE FIX YOUR INPUT AND TRY AGAIN." CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." +SLOW_MODEL_MSG = '⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds.' 
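The new `SLOW_MODEL_MSG` constant is surfaced by threading one extra string output through the Gradio event wiring: the `clear_history` and `add_text` hunks below grow their return tuples by one element, which is rendered into a `slow_warning` `gr.Markdown` component. A minimal stand-alone sketch of that output-slot pattern; the model check and component layout are illustrative, not the arena's exact wiring:

```python
import gradio as gr

SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."

def add_text(text, model_name):
    # Return the echoed prompt plus a warning string; "" hides the banner.
    slow_model_msg = SLOW_MODEL_MSG if "deluxe" in model_name else ""
    return text, slow_model_msg

with gr.Blocks() as demo:
    model_name = gr.Textbox(value="deluxe-chat-v1.1", label="model")
    textbox = gr.Textbox(label="prompt")
    slow_warning = gr.Markdown("")  # the extra output slot for the warning
    textbox.submit(add_text, [textbox, model_name], [textbox, slow_warning])

# demo.launch()  # uncomment to try the sketch locally
```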
# Maximum input length INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 3072)) # Maximum conversation turns diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index 370517c12..fe4655c25 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -62,10 +62,17 @@ def get_model_info(name: str) -> ModelInfo: "PaLM 2 for Chat (chat-bison@001) by Google", ) register_model_info( - ["zephyr-7b-beta", "zephyr-7b-alpha"], - "Zephyr", - "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", - "a chatbot fine-tuned from Mistral by Hugging Face", + [ + "vicuna-33b", + "vicuna-33b-v1.3", + "vicuna-13b", + "vicuna-13b-v1.3", + "vicuna-7b", + "vicuna-7b-v1.3", + ], + "Vicuna", + "https://lmsys.org/blog/2023-03-30-vicuna/", + "a chat assistant fine-tuned on user-shared conversations by LMSYS", ) register_model_info( ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"], @@ -73,37 +80,30 @@ def get_model_info(name: str) -> ModelInfo: "https://ai.meta.com/llama/", "open foundation and fine-tuned chat models by Meta", ) -register_model_info( - ["qwen-14b-chat"], - "Qwen", - "https://huggingface.co/Qwen/Qwen-14B-Chat", - "a large language model by Alibaba Cloud", -) register_model_info( ["mistral-7b-instruct"], "Mistral", "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1", "a large language model by Mistral AI team", ) +register_model_info( + ["zephyr-7b-beta", "zephyr-7b-alpha"], + "Zephyr", + "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha", + "a chatbot fine-tuned from Mistral by Hugging Face", +) +register_model_info( + ["qwen-14b-chat"], + "Qwen", + "https://huggingface.co/Qwen/Qwen-14B-Chat", + "a large language model by Alibaba Cloud", +) register_model_info( ["codellama-34b-instruct", "codellama-13b-instruct", "codellama-7b-instruct"], "Code Llama", "https://ai.meta.com/blog/code-llama-large-language-model-coding/", "open foundation models for code by Meta", ) -register_model_info( - [ - "vicuna-33b", - "vicuna-33b-v1.3", - "vicuna-13b", - "vicuna-13b-v1.3", - "vicuna-7b", - "vicuna-7b-v1.3", - ], - "Vicuna", - "https://lmsys.org/blog/2023-03-30-vicuna/", - "a chat assistant fine-tuned on user-shared conversations by LMSYS", -) register_model_info( ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"], "WizardLM", diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index a39d18753..ad61be932 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -13,6 +13,7 @@ MODERATION_MSG, CONVERSATION_LIMIT_MSG, INACTIVE_MSG, + SLOW_MODEL_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, ) @@ -145,6 +146,7 @@ def clear_history(request: gr.Request): + [""] + [invisible_btn] * 4 + [disable_btn] * 2 + + [""] ) @@ -163,14 +165,14 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "claude-2": 8, "claude-1": 2, "claude-instant-1": 8, - "zephyr-7b-beta": 4, + "zephyr-7b-beta": 2, # tire 1 - "deluxe-chat-v1.1": 0.1, + "deluxe-chat-v1.1": 2, "palm-2": 1.5, "llama-2-70b-chat": 1.5, "llama-2-13b-chat": 1.5, "codellama-34b-instruct": 1.5, - "vicuna-33b": 1.5, + "vicuna-33b": 8, "vicuna-13b": 1.5, "wizardlm-70b": 1.5, "wizardlm-13b": 1.5, @@ -200,8 +202,7 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re } SAMPLING_BOOST_MODELS = ["zephyr-7b-beta"] -# SAMPLING_BOOST_MODELS = ["claude-2"] -OUTAGE_MODELS = ["deluxe-chat-v1.1", "claude-2", 
"claude-instant-1"] +OUTAGE_MODELS = ["claude-1", "claude-2", "claude-instant-1"] def get_sample_weight(model): @@ -303,6 +304,7 @@ def add_text( no_change_btn, ] * 6 + + [""] ) if not is_cf and ip_expiration_dict[ip] < time.time(): @@ -317,6 +319,7 @@ def add_text( no_change_btn, ] * 6 + + [""] ) if enable_moderation: @@ -335,6 +338,7 @@ def add_text( no_change_btn, ] * 6 + + [""] ) conv = states[0].conv @@ -350,6 +354,7 @@ def add_text( no_change_btn, ] * 6 + + [""] ) text = text[:INPUT_CHAR_LEN_LIMIT] # Hard cut-off @@ -358,6 +363,10 @@ def add_text( states[i].conv.append_message(states[i].conv.roles[1], None) states[i].skip_next = False + slow_model_msg = "" + for i in range(num_sides): + if "deluxe" in states[i].model_name: + slow_model_msg = SLOW_MODEL_MSG return ( states + [x.to_gradio_chatbot() for x in states] @@ -366,6 +375,7 @@ def add_text( disable_btn, ] * 6 + + [slow_model_msg] ) @@ -454,6 +464,8 @@ def build_side_by_side_ui_anony(models): for i in range(num_sides): with gr.Column(): model_selectors[i] = gr.Markdown(anony_names[i]) + with gr.Row(): + slow_warning = gr.Markdown("", elem_id="notice_markdown") with gr.Row(): leftvote_btn = gr.Button( @@ -550,7 +562,7 @@ def build_side_by_side_ui_anony(models): flash_buttons, [], btn_list ) clear_btn.click( - clear_history, None, states + chatbots + model_selectors + [textbox] + btn_list + clear_history, None, states + chatbots + model_selectors + [textbox] + btn_list + [slow_warning] ) share_js = """ @@ -578,13 +590,13 @@ def build_side_by_side_ui_anony(models): textbox.submit( add_text, states + model_selectors + [textbox], - states + chatbots + [textbox] + btn_list, + states + chatbots + [textbox] + btn_list + [slow_warning], ).then( bot_response_multi, states + [temperature, top_p, max_output_tokens], states + chatbots + btn_list, ).then( - flash_buttons, [], btn_list + flash_buttons, [], btn_list, ) send_btn.click( diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index c9ab166cd..92bfa67ae 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -504,7 +504,7 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request) padding-bottom: 6px; } #leaderboard_markdown { - font-size: 104% + font-size: 110% } #leaderboard_markdown td { padding-top: 6px; @@ -577,7 +577,7 @@ def build_about(): - LMSYS-Chat-1M [report](https://arxiv.org/abs/2309.11998) ## Core Members -[Lianmin Zheng](https://lmzheng.net/), [Wei-Lin Chiang](https://infwinston.github.io/), [Ying Sheng](https://sites.google.com/view/yingsheng/home) +[Lianmin Zheng](https://lmzheng.net/), [Wei-Lin Chiang](https://infwinston.github.io/), [Ying Sheng](https://sites.google.com/view/yingsheng/home), [Siyuan Zhuang](https://scholar.google.com/citations?user=KSZmI5EAAAAJ) ## Advisors [Ion Stoica](http://people.eecs.berkeley.edu/~istoica/), [Joseph E. 
Gonzalez](https://people.eecs.berkeley.edu/~jegonzal/), [Hao Zhang](https://cseweb.ucsd.edu/~haozhang/) diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index efb400211..0426fec2c 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -89,7 +89,7 @@ def load_demo(url_params, request: gr.Request): models_anony += ["claude-2", "claude-1", "claude-instant-1"] if args.add_palm: models_anony += ["palm-2"] - # models_anony.append("deluxe-chat-v1.1") + models_anony.append("deluxe-chat-v1.1") side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params) side_by_side_named_updates = load_demo_side_by_side_named(models, url_params) From 2f1bdedb562f2955b2f096faf35e4abaf56dc2fa Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Sat, 4 Nov 2023 18:29:59 +0000 Subject: [PATCH 06/11] add moderation --- fastchat/constants.py | 2 +- fastchat/serve/gradio_block_arena_anony.py | 42 ++++----------------- fastchat/serve/gradio_block_arena_named.py | 40 ++++---------------- fastchat/serve/gradio_web_server.py | 23 ++++------- fastchat/serve/monitor/clean_battle_data.py | 1 + fastchat/utils.py | 31 +++++++++++---- 6 files changed, 49 insertions(+), 90 deletions(-) diff --git a/fastchat/constants.py b/fastchat/constants.py index 75b20a5a3..5dc203c34 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -11,7 +11,7 @@ SERVER_ERROR_MSG = ( "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" ) -MODERATION_MSG = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE FIX YOUR INPUT AND TRY AGAIN." +MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES." CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." SLOW_MODEL_MSG = '⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds.' diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index ad61be932..8810cbd75 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -12,7 +12,6 @@ from fastchat.constants import ( MODERATION_MSG, CONVERSATION_LIMIT_MSG, - INACTIVE_MSG, SLOW_MODEL_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, @@ -33,7 +32,7 @@ ) from fastchat.utils import ( build_logger, - violates_moderation, + moderation_filter, ) logger = build_logger("gradio_web_server_multi", "gradio_web_server_multi.log") @@ -307,39 +306,14 @@ def add_text( + [""] ) - if not is_cf and ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive (anony). ip: {get_ip(request)}. text: {text}") - for i in range(num_sides): - states[i].skip_next = True - return ( - states - + [x.to_gradio_chatbot() for x in states] - + [INACTIVE_MSG] - + [ - no_change_btn, - ] - * 6 - + [""] + model_list = [states[i].model_name for i in range(num_sides)] + flagged = moderation_filter(text, model_list) + if flagged: + logger.info( + f"violate moderation (anony). ip: {ip}. text: {text}" ) - - if enable_moderation: - flagged = violates_moderation(text) - if flagged: - logger.info( - f"violate moderation (anony). ip: {get_ip(request)}. 
text: {text}" - ) - for i in range(num_sides): - states[i].skip_next = True - return ( - states - + [x.to_gradio_chatbot() for x in states] - + [MODERATION_MSG] - + [ - no_change_btn, - ] - * 6 - + [""] - ) + # overwrite the original text + text = MODERATION_MSG conv = states[0].conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 93eb8fb32..7da1793d1 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -12,7 +12,6 @@ from fastchat.constants import ( MODERATION_MSG, CONVERSATION_LIMIT_MSG, - INACTIVE_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, ) @@ -32,7 +31,7 @@ ) from fastchat.utils import ( build_logger, - violates_moderation, + moderation_filter, ) @@ -172,37 +171,14 @@ def add_text( * 6 ) - if not is_cf and ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive (named). ip: {ip}. text: {text}") - for i in range(num_sides): - states[i].skip_next = True - return ( - states - + [x.to_gradio_chatbot() for x in states] - + [INACTIVE_MSG] - + [ - no_change_btn, - ] - * 6 + model_list = [states[i].model_name for i in range(num_sides)] + flagged = moderation_filter(text, model_list) + if flagged: + logger.info( + f"violate moderation (named). ip: {ip}. text: {text}" ) - - if enable_moderation: - flagged = violates_moderation(text) - if flagged: - logger.info( - f"violate moderation (named). ip: {ip}. text: {text}" - ) - for i in range(num_sides): - states[i].skip_next = True - return ( - states - + [x.to_gradio_chatbot() for x in states] - + [MODERATION_MSG] - + [ - no_change_btn, - ] - * 6 - ) + # overwrite the original text + text = MODERATION_MSG conv = states[0].conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index 92bfa67ae..839051b93 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -22,7 +22,6 @@ MODERATION_MSG, CONVERSATION_LIMIT_MSG, SERVER_ERROR_MSG, - INACTIVE_MSG, INPUT_CHAR_LEN_LIMIT, CONVERSATION_TURN_LIMIT, SESSION_EXPIRATION_TIME, @@ -37,7 +36,7 @@ ) from fastchat.utils import ( build_logger, - violates_moderation, + moderation_filter, get_window_url_params_js, get_window_url_params_with_tos_js, parse_gradio_auth_creds, @@ -258,19 +257,13 @@ def add_text(state, model_selector, text, request: gr.Request): state.skip_next = True return (state, state.to_gradio_chatbot(), "") + (no_change_btn,) * 5 - if not is_cf and ip_expiration_dict[ip] < time.time(): - logger.info(f"inactive. ip: {ip}. text: {text}") - state.skip_next = True - return (state, state.to_gradio_chatbot(), INACTIVE_MSG) + (no_change_btn,) * 5 - - if enable_moderation: - flagged = violates_moderation(text) - if flagged: - logger.info(f"violate moderation. ip: {ip}. text: {text}") - state.skip_next = True - return (state, state.to_gradio_chatbot(), MODERATION_MSG) + ( - no_change_btn, - ) * 5 + flagged = moderation_filter(text, [state.model_name]) + if flagged: + logger.info( + f"violate moderation. ip: {ip}. 
text: {text}" + ) + # overwrite the original text + text = MODERATION_MSG conv = state.conv if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_TURN_LIMIT: diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py index 6b6f274dc..d85302a32 100644 --- a/fastchat/serve/monitor/clean_battle_data.py +++ b/fastchat/serve/monitor/clean_battle_data.py @@ -36,6 +36,7 @@ "google", "llama", "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.", + "$MODERATION$", ] for i in range(len(IDENTITY_WORDS)): diff --git a/fastchat/utils.py b/fastchat/utils.py index a7377d7ff..d1972d539 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -143,22 +143,37 @@ def get_gpu_memory(max_gpus=None): return gpu_memory -def violates_moderation(text): +def oai_moderation(text): """ Check whether the text violates OpenAI moderation API. """ import openai + openai.api_base = "https://api.openai.com/v1" + openai.api_key = os.environ["OPENAI_API_KEY"] - try: - flagged = openai.Moderation.create(input=text)["results"][0]["flagged"] - except openai.error.OpenAIError as e: - flagged = False - except (KeyError, IndexError) as e: - flagged = False - + MAX_RETRY = 3 + for i in range(MAX_RETRY): + try: + res = openai.Moderation.create(input=text) + flagged = res["results"][0]["flagged"] + break + except (openai.error.OpenAIError, KeyError, IndexError) as e: + # flag true to be conservative + flagged = True + print(f"MODERATION ERROR: {e}\nInput: {text}") return flagged +def moderation_filter(text, model_list): + MODEL_KEYWORDS = ["claude"] + + for keyword in MODEL_KEYWORDS: + for model in model_list: + if keyword in model and oai_moderation(text): + return True + return False + + def clean_flant5_ckpt(ckpt_path): """ Flan-t5 trained with HF+FSDP saves corrupted weights for shared embeddings, From 23ea9baa706bd35ba89d089c7ac998faf0e6634e Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Fri, 10 Nov 2023 23:11:18 +0000 Subject: [PATCH 07/11] update --- fastchat/model/model_registry.py | 21 +++++++++++++++------ fastchat/serve/api_provider.py | 2 ++ fastchat/serve/gradio_block_arena_anony.py | 15 ++++++++++----- fastchat/serve/gradio_block_arena_named.py | 9 ++++----- fastchat/serve/gradio_web_server.py | 6 +++--- fastchat/serve/gradio_web_server_multi.py | 6 ++++-- 6 files changed, 38 insertions(+), 21 deletions(-) diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index f4b14ec72..cd94c9dcb 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -28,14 +28,23 @@ def get_model_info(name: str) -> ModelInfo: ) -register_model_info( - ["gpt-4"], "ChatGPT-4", "https://openai.com/research/gpt-4", "ChatGPT-4 by OpenAI" -) register_model_info( ["gpt-3.5-turbo"], - "ChatGPT-3.5", + "GPT-3.5", "https://openai.com/blog/chatgpt", - "ChatGPT-3.5 by OpenAI", + "GPT-3.5 by OpenAI", +) +register_model_info( + ["gpt-3.5-turbo-1106"], + "GPT-3.5-Turbo-1106", + "https://platform.openai.com/docs/models/gpt-3-5", + "GPT-3.5-Turbo-1106 by OpenAI", +) +register_model_info( + ["gpt-4"], "GPT-4", "https://openai.com/research/gpt-4", "ChatGPT-4 by OpenAI" +) +register_model_info( + ["gpt-4-turbo"], "GPT-4-Turbo", "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", "GPT-4-Turbo by OpenAI" ) register_model_info( ["claude-2"], @@ -171,7 +180,7 @@ def get_model_info(name: str) -> ModelInfo: "an Open Assistant for everyone by LAION", ) register_model_info( - ["openchat_3.5"], + ["openchat-3.5"], "OpenChat 
3.5", "https://github.com/imoneoi/openchat", "OpenChat 3.5 is a versatile, open-source language model fine-tuned using C-RLFT", diff --git a/fastchat/serve/api_provider.py b/fastchat/serve/api_provider.py index 1bbdec3bc..3dbb8a690 100644 --- a/fastchat/serve/api_provider.py +++ b/fastchat/serve/api_provider.py @@ -24,6 +24,8 @@ def openai_api_stream_iter( openai.api_base = api_base or "https://api.openai.com/v1" openai.api_key = api_key or os.environ["OPENAI_API_KEY"] + if model_name == "gpt-4-turbo": + model_name = "gpt-4-1106-preview" # Make requests gen_params = { diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 8810cbd75..1015e0852 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -160,12 +160,15 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re SAMPLING_WEIGHTS = { # tier 0 "gpt-4": 4, - "gpt-3.5-turbo": 8, + "gpt-4-turbo": 4, + "gpt-3.5-turbo": 2, + "gpt-3.5-turbo-1106": 2, "claude-2": 8, "claude-1": 2, "claude-instant-1": 8, "zephyr-7b-beta": 2, - # tire 1 + "openchat-3.5": 2, + # tier 1 "deluxe-chat-v1.1": 2, "palm-2": 1.5, "llama-2-70b-chat": 1.5, @@ -176,13 +179,13 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "wizardlm-70b": 1.5, "wizardlm-13b": 1.5, "qwen-14b-chat": 1.5, - "zephyr-7b-alpha": 1.5, "mistral-7b-instruct": 1.5, # tier 2 "vicuna-7b": 1.0, "llama-2-7b-chat": 1.0, "chatglm2-6b": 1.0, # deprecated + "zephyr-7b-alpha": 1.5, "codellama-13b-instruct": 1.0, "mpt-30b-chat": 1.5, "guanaco-33b": 1.0, @@ -200,8 +203,8 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "deluxe-chat-v1": 4, } -SAMPLING_BOOST_MODELS = ["zephyr-7b-beta"] -OUTAGE_MODELS = ["claude-1", "claude-2", "claude-instant-1"] +SAMPLING_BOOST_MODELS = ["openchat-3.5", "gpt-4-turbo", "gpt-3.5-turbo-1106"] +OUTAGE_MODELS = ["claude-1", "claude-2", "claude-instant-1", "zephyr-7b-alpha", "wizardlm-70b", "falcon-180b-chat"] def get_sample_weight(model): @@ -219,6 +222,7 @@ def get_battle_pair(): targets = { "gpt-4": {"claude-2"}, + "gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo"}, # "gpt-4": {"llama-2-70b-chat"}, "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2"}, # "gpt-3.5-turbo": {"llama-2-70b-chat"}, @@ -226,6 +230,7 @@ def get_battle_pair(): "claude-1": {"claude-2", "gpt-4", "gpt-3.5-turbo"}, "claude-instant-1": {"gpt-3.5-turbo", "claude-2"}, "deluxe-chat-v1.1": {"gpt-4"}, + "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"}, "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, "zephyr-7b-beta": {"mistral-7b-instruct", "llama-2-13b-chat", "llama-2-7b-chat", "wizardlm-13b"}, diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 7da1793d1..88fc0d0b2 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -275,13 +275,12 @@ def build_side_by_side_ui_named(models): # ⚔️ Chatbot Arena ⚔️ : Benchmarking LLMs in the Wild | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | -## Rules -- Chat with two models side-by-side and vote for which 
one is better!
-- You pick the models you want to chat with.
-- You can do multiple turns of conversations before voting.
+## 📜 Rules
+- Chat with any two models side-by-side and vote!
+- You can continue chatting for multiple rounds.
 - Click "Clear history" to start a new round.
 
-## Choose two models to compare
+## 🤖 Choose two models to compare
 """
 
     states = [gr.State() for _ in range(num_sides)]
diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py
index 839051b93..bd742f9cd 100644
--- a/fastchat/serve/gradio_web_server.py
+++ b/fastchat/serve/gradio_web_server.py
@@ -138,7 +138,7 @@ def get_model_list(
         models += list(openai_compatible_models_info.keys())
 
     if add_chatgpt:
-        models += ["gpt-3.5-turbo", "gpt-4"]
+        models += ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo-1106"]
     if add_claude:
         models += ["claude-2", "claude-instant-1"]
     if add_palm:
@@ -343,7 +343,7 @@ def bot_response(state, temperature, top_p, max_new_tokens, request: gr.Request)
         return
 
     conv, model_name = state.conv, state.model_name
-    if model_name == "gpt-3.5-turbo" or model_name == "gpt-4":
+    if model_name in ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo-1106"]:
         prompt = conv.to_openai_api_messages()
         stream_iter = openai_api_stream_iter(
             model_name, prompt, temperature, top_p, max_new_tokens
@@ -612,7 +612,7 @@ def build_single_model_ui(models, add_promotion_links=False):
 # 🏔️ Chat with Open Large Language Models
 {promotion}
 
-### Choose a model to chat with
+## 👉 Choose any model to chat with
 """
 
     state = gr.State()
diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py
index 0426fec2c..33513a8ca 100644
--- a/fastchat/serve/gradio_web_server_multi.py
+++ b/fastchat/serve/gradio_web_server_multi.py
@@ -84,12 +84,14 @@ def load_demo(url_params, request: gr.Request):
     if args.anony_only_for_proprietary_model:
         # Only enable these models in anony battles.
         if args.add_chatgpt:
-            models_anony += ["gpt-4", "gpt-3.5-turbo"]
+            models_anony += ["gpt-4", "gpt-3.5-turbo", "gpt-4-turbo", "gpt-3.5-turbo-1106"]
         if args.add_claude:
             models_anony += ["claude-2", "claude-1", "claude-instant-1"]
         if args.add_palm:
             models_anony += ["palm-2"]
-        models_anony.append("deluxe-chat-v1.1")
+        models_anony.append("gpt-4-turbo")
+        models_anony.append("deluxe-chat-v1.1")
+        models_anony = list(set(models_anony))
 
     side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params)
     side_by_side_named_updates = load_demo_side_by_side_named(models, url_params)

From f5467eb745cfee3aef41e9c280d001e54f7167cb Mon Sep 17 00:00:00 2001
From: Wei-Lin Chiang
Date: Fri, 10 Nov 2023 23:11:34 +0000
Subject: [PATCH 08/11] update

---
 fastchat/serve/monitor/monitor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py
index 0005db9b3..87eebd44a 100644
--- a/fastchat/serve/monitor/monitor.py
+++ b/fastchat/serve/monitor/monitor.py
@@ -31,10 +31,10 @@ def make_leaderboard_md(elo_results):
     leaderboard_md = f"""
-# Leaderboard
+# 🏆 Chatbot Arena Leaderboard
 | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
 
-🏆 This leaderboard is based on the following three benchmarks.
+This leaderboard is based on the following three benchmarks.
- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 90K+ user votes to compute Elo ratings. - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. From ed51a2714488822a6599494703cfccf63ba97821 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Fri, 10 Nov 2023 23:12:38 +0000 Subject: [PATCH 09/11] increase token limit --- fastchat/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastchat/constants.py b/fastchat/constants.py index 5dc203c34..be152cca0 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -16,7 +16,7 @@ INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." SLOW_MODEL_MSG = '⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds.' # Maximum input length -INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 3072)) +INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000)) # Maximum conversation turns CONVERSATION_TURN_LIMIT = 50 # Session expiration time From 2c1b9a275fa88d996d6154b05b4dc862770a17a1 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Fri, 10 Nov 2023 23:22:09 +0000 Subject: [PATCH 10/11] fix --- fastchat/constants.py | 2 +- fastchat/model/model_registry.py | 5 ++- fastchat/serve/gradio_block_arena_anony.py | 37 ++++++++++++++++------ fastchat/serve/gradio_block_arena_named.py | 4 +-- fastchat/serve/gradio_web_server.py | 13 ++++---- fastchat/serve/gradio_web_server_multi.py | 8 +++-- fastchat/serve/monitor/monitor.py | 4 +-- fastchat/utils.py | 1 + 8 files changed, 48 insertions(+), 26 deletions(-) diff --git a/fastchat/constants.py b/fastchat/constants.py index be152cca0..53ed55c1c 100644 --- a/fastchat/constants.py +++ b/fastchat/constants.py @@ -14,7 +14,7 @@ MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES." CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." -SLOW_MODEL_MSG = '⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds.' +SLOW_MODEL_MSG = "⚠️ Both models will show the responses all at once. Please stay patient as it may take over 30 seconds." 
# Maximum input length INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000)) # Maximum conversation turns diff --git a/fastchat/model/model_registry.py b/fastchat/model/model_registry.py index cd94c9dcb..da08c2e26 100644 --- a/fastchat/model/model_registry.py +++ b/fastchat/model/model_registry.py @@ -44,7 +44,10 @@ def get_model_info(name: str) -> ModelInfo: ["gpt-4"], "GPT-4", "https://openai.com/research/gpt-4", "ChatGPT-4 by OpenAI" ) register_model_info( - ["gpt-4-turbo"], "GPT-4-Turbo", "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", "GPT-4-Turbo by OpenAI" + ["gpt-4-turbo"], + "GPT-4-Turbo", + "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "GPT-4-Turbo by OpenAI", ) register_model_info( ["claude-2"], diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 1015e0852..1467e9d65 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -204,7 +204,14 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re } SAMPLING_BOOST_MODELS = ["openchat-3.5", "gpt-4-turbo", "gpt-3.5-turbo-1106"] -OUTAGE_MODELS = ["claude-1", "claude-2", "claude-instant-1", "zephyr-7b-alpha", "wizardlm-70b", "falcon-180b-chat"] +OUTAGE_MODELS = [ + "claude-1", + "claude-2", + "claude-instant-1", + "zephyr-7b-alpha", + "wizardlm-70b", + "falcon-180b-chat", +] def get_sample_weight(model): @@ -233,11 +240,20 @@ def get_battle_pair(): "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"}, "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, - "zephyr-7b-beta": {"mistral-7b-instruct", "llama-2-13b-chat", "llama-2-7b-chat", "wizardlm-13b"}, + "zephyr-7b-beta": { + "mistral-7b-instruct", + "llama-2-13b-chat", + "llama-2-7b-chat", + "wizardlm-13b", + }, "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"}, "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"}, - "mistral-7b-instruct": {"llama-2-7b-chat", "llama-2-13b-chat", "llama-2-70b-chat"}, + "mistral-7b-instruct": { + "llama-2-7b-chat", + "llama-2-13b-chat", + "llama-2-70b-chat", + }, "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"}, "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, @@ -259,8 +275,7 @@ def get_battle_pair(): if model == chosen_model: continue weight = get_sample_weight(model) - if (weight != 0 and chosen_model in targets and - model in targets[chosen_model]): + if weight != 0 and chosen_model in targets and model in targets[chosen_model]: # boost to 50% chance weight = total_weight / len(targets[chosen_model]) rival_models.append(model) @@ -314,9 +329,7 @@ def add_text( model_list = [states[i].model_name for i in range(num_sides)] flagged = moderation_filter(text, model_list) if flagged: - logger.info( - f"violate moderation (anony). ip: {ip}. text: {text}" - ) + logger.info(f"violate moderation (anony). ip: {ip}. 
text: {text}") # overwrite the original text text = MODERATION_MSG @@ -541,7 +554,9 @@ def build_side_by_side_ui_anony(models): flash_buttons, [], btn_list ) clear_btn.click( - clear_history, None, states + chatbots + model_selectors + [textbox] + btn_list + [slow_warning] + clear_history, + None, + states + chatbots + model_selectors + [textbox] + btn_list + [slow_warning], ) share_js = """ @@ -575,7 +590,9 @@ def build_side_by_side_ui_anony(models): states + [temperature, top_p, max_output_tokens], states + chatbots + btn_list, ).then( - flash_buttons, [], btn_list, + flash_buttons, + [], + btn_list, ) send_btn.click( diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index 88fc0d0b2..af79d3bfb 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -174,9 +174,7 @@ def add_text( model_list = [states[i].model_name for i in range(num_sides)] flagged = moderation_filter(text, model_list) if flagged: - logger.info( - f"violate moderation (named). ip: {ip}. text: {text}" - ) + logger.info(f"violate moderation (named). ip: {ip}. text: {text}") # overwrite the original text text = MODERATION_MSG diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index bd742f9cd..f4671ae84 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -239,7 +239,7 @@ def clear_history(request: gr.Request): def get_ip(request: gr.Request): if "cf-connecting-ip" in request.headers: - ip = request.headers['cf-connecting-ip'] + ip = request.headers["cf-connecting-ip"] else: ip = request.client.host return ip @@ -259,9 +259,7 @@ def add_text(state, model_selector, text, request: gr.Request): flagged = moderation_filter(text, [state.model_name]) if flagged: - logger.info( - f"violate moderation. ip: {ip}. text: {text}" - ) + logger.info(f"violate moderation. ip: {ip}. text: {text}") # overwrite the original text text = MODERATION_MSG @@ -559,8 +557,8 @@ def get_model_description_md(models): ct += 1 return model_description_md -def build_about(): +def build_about(): about_markdown = f""" # About Us Chatbot Arena is an open-source research project developed by members from [LMSYS](https://lmsys.org/about/) and UC Berkeley [SkyLab](https://sky.cs.berkeley.edu/). Our mission is to build an open crowdsourced platform to collect human feedback and evaluate LLMs under real-world scenarios. We open-source our code at [GitHub](https://github.com/lm-sys/FastChat) and release chat and human feedback datasets [here](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md). We invite everyone to join us in this journey! @@ -592,10 +590,11 @@ def build_about(): """ - #state = gr.State() + # state = gr.State() gr.Markdown(about_markdown, elem_id="about_markdown") - #return [state] + # return [state] + def build_single_model_ui(models, add_promotion_links=False): promotion = ( diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index 33513a8ca..f338617ff 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -84,7 +84,12 @@ def load_demo(url_params, request: gr.Request): if args.anony_only_for_proprietary_model: # Only enable these models in anony battles. 
if args.add_chatgpt: - models_anony += ["gpt-4", "gpt-3.5-turbo", "gpt-4-turbo", "gpt-3.5-turbo-1106"] + models_anony += [ + "gpt-4", + "gpt-3.5-turbo", + "gpt-4-turbo", + "gpt-3.5-turbo-1106", + ] if args.add_claude: models_anony += ["claude-2", "claude-1", "claude-instant-1"] if args.add_palm: @@ -127,7 +132,6 @@ def build_demo(models, elo_results_file, leaderboard_table_file): with gr.Tab("About Us", id=4): about = build_about() - url_params = gr.JSON(visible=False) if args.model_list_mode not in ["once", "reload"]: diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 87eebd44a..48273436a 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -39,7 +39,7 @@ def make_leaderboard_md(elo_results): - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks. -💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: October, 2023. +💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: November, 2023. """ return leaderboard_md @@ -218,7 +218,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): else: pass - leader_component_values[:] = [md, p1, p2, p3, p4] """ @@ -253,6 +252,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file): # return [md_1, plot_1, plot_2, plot_3, plot_4] return [md_1] + def build_demo(elo_results_file, leaderboard_table_file): from fastchat.serve.gradio_web_server import block_css diff --git a/fastchat/utils.py b/fastchat/utils.py index d1972d539..b5e3ba543 100644 --- a/fastchat/utils.py +++ b/fastchat/utils.py @@ -148,6 +148,7 @@ def oai_moderation(text): Check whether the text violates OpenAI moderation API. 
""" import openai + openai.api_base = "https://api.openai.com/v1" openai.api_key = os.environ["OPENAI_API_KEY"] From 83315f4eabf79bd5b25a45225b9e75b09d5b4fb1 Mon Sep 17 00:00:00 2001 From: Wei-Lin Chiang Date: Sat, 11 Nov 2023 21:20:49 +0000 Subject: [PATCH 11/11] fix comments --- fastchat/serve/gradio_block_arena_anony.py | 86 ++++++++++----------- fastchat/serve/gradio_block_arena_named.py | 1 - fastchat/serve/gradio_web_server.py | 1 - fastchat/serve/gradio_web_server_multi.py | 2 - fastchat/serve/monitor/clean_battle_data.py | 2 +- fastchat/serve/monitor/monitor.py | 2 +- 6 files changed, 44 insertions(+), 50 deletions(-) diff --git a/fastchat/serve/gradio_block_arena_anony.py b/fastchat/serve/gradio_block_arena_anony.py index 1467e9d65..48e49deef 100644 --- a/fastchat/serve/gradio_block_arena_anony.py +++ b/fastchat/serve/gradio_block_arena_anony.py @@ -203,15 +203,43 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re "deluxe-chat-v1": 4, } +# target model sampling weights will be boosted. +BATTLE_TARGETS = { + "gpt-4": {"claude-2"}, + "gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo"}, + "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2"}, + "claude-2": {"gpt-4", "gpt-3.5-turbo", "claude-1"}, + "claude-1": {"claude-2", "gpt-4", "gpt-3.5-turbo"}, + "claude-instant-1": {"gpt-3.5-turbo", "claude-2"}, + "deluxe-chat-v1.1": {"gpt-4"}, + "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"}, + "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, + "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, + "zephyr-7b-beta": { + "mistral-7b-instruct", + "llama-2-13b-chat", + "llama-2-7b-chat", + "wizardlm-13b", + }, + "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, + "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"}, + "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"}, + "mistral-7b-instruct": { + "llama-2-7b-chat", + "llama-2-13b-chat", + "llama-2-70b-chat", + }, + "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"}, + "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, + "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, + "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, + "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"}, +} + SAMPLING_BOOST_MODELS = ["openchat-3.5", "gpt-4-turbo", "gpt-3.5-turbo-1106"] -OUTAGE_MODELS = [ - "claude-1", - "claude-2", - "claude-instant-1", - "zephyr-7b-alpha", - "wizardlm-70b", - "falcon-180b-chat", -] + +# outage models won't be sampled. 
+OUTAGE_MODELS = [] def get_sample_weight(model): @@ -227,39 +255,6 @@ def get_battle_pair(): if len(models) == 1: return models[0], models[0] - targets = { - "gpt-4": {"claude-2"}, - "gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo"}, - # "gpt-4": {"llama-2-70b-chat"}, - "gpt-3.5-turbo": {"claude-instant-1", "gpt-4", "claude-2"}, - # "gpt-3.5-turbo": {"llama-2-70b-chat"}, - "claude-2": {"gpt-4", "gpt-3.5-turbo", "claude-1"}, - "claude-1": {"claude-2", "gpt-4", "gpt-3.5-turbo"}, - "claude-instant-1": {"gpt-3.5-turbo", "claude-2"}, - "deluxe-chat-v1.1": {"gpt-4"}, - "openchat-3.5": {"gpt-3.5-turbo", "llama-2-70b-chat", "zephyr-7b-beta"}, - "qwen-14b-chat": {"vicuna-13b", "llama-2-13b-chat", "llama-2-70b-chat"}, - "zephyr-7b-alpha": {"mistral-7b-instruct", "llama-2-13b-chat"}, - "zephyr-7b-beta": { - "mistral-7b-instruct", - "llama-2-13b-chat", - "llama-2-7b-chat", - "wizardlm-13b", - }, - "llama-2-70b-chat": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, - "llama-2-13b-chat": {"mistral-7b-instruct", "vicuna-13b", "llama-2-70b-chat"}, - "llama-2-7b-chat": {"mistral-7b-instruct", "vicuna-7b", "llama-2-13b-chat"}, - "mistral-7b-instruct": { - "llama-2-7b-chat", - "llama-2-13b-chat", - "llama-2-70b-chat", - }, - "vicuna-33b": {"llama-2-70b-chat", "gpt-3.5-turbo", "claude-instant-1"}, - "vicuna-13b": {"llama-2-13b-chat", "llama-2-70b-chat"}, - "vicuna-7b": {"llama-2-7b-chat", "mistral-7b-instruct", "llama-2-13b-chat"}, - "wizardlm-70b": {"gpt-3.5-turbo", "vicuna-33b", "claude-instant-1"}, - "palm-2": {"llama-2-13b-chat", "gpt-3.5-turbo"}, - } model_weights = [] for model in models: weight = get_sample_weight(model) @@ -275,9 +270,13 @@ def get_battle_pair(): if model == chosen_model: continue weight = get_sample_weight(model) - if weight != 0 and chosen_model in targets and model in targets[chosen_model]: + if ( + weight != 0 + and chosen_model in BATTLE_TARGETS + and model in BATTLE_TARGETS[chosen_model] + ): # boost to 50% chance - weight = total_weight / len(targets[chosen_model]) + weight = total_weight / len(BATTLE_TARGETS[chosen_model]) rival_models.append(model) rival_weights.append(weight) # for p, w in zip(rival_models, rival_weights): @@ -296,7 +295,6 @@ def get_battle_pair(): def add_text( state0, state1, model_selector0, model_selector1, text, request: gr.Request ): - is_cf = "cf-connecting-ip" in request.headers ip = get_ip(request) logger.info(f"add_text (anony). ip: {ip}. len: {len(text)}") states = [state0, state1] diff --git a/fastchat/serve/gradio_block_arena_named.py b/fastchat/serve/gradio_block_arena_named.py index af79d3bfb..c13283495 100644 --- a/fastchat/serve/gradio_block_arena_named.py +++ b/fastchat/serve/gradio_block_arena_named.py @@ -147,7 +147,6 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re def add_text( state0, state1, model_selector0, model_selector1, text, request: gr.Request ): - is_cf = "cf-connecting-ip" in request.headers ip = get_ip(request) logger.info(f"add_text (named). ip: {ip}. len: {len(text)}") states = [state0, state1] diff --git a/fastchat/serve/gradio_web_server.py b/fastchat/serve/gradio_web_server.py index f4671ae84..f1d043e28 100644 --- a/fastchat/serve/gradio_web_server.py +++ b/fastchat/serve/gradio_web_server.py @@ -246,7 +246,6 @@ def get_ip(request: gr.Request): def add_text(state, model_selector, text, request: gr.Request): - is_cf = "cf-connecting-ip" in request.headers ip = get_ip(request) logger.info(f"add_text. ip: {ip}. 
len: {len(text)}") diff --git a/fastchat/serve/gradio_web_server_multi.py b/fastchat/serve/gradio_web_server_multi.py index f338617ff..b918f9d6b 100644 --- a/fastchat/serve/gradio_web_server_multi.py +++ b/fastchat/serve/gradio_web_server_multi.py @@ -94,8 +94,6 @@ def load_demo(url_params, request: gr.Request): models_anony += ["claude-2", "claude-1", "claude-instant-1"] if args.add_palm: models_anony += ["palm-2"] - models_anony.append("gpt-4-turbo") - models_anony.append("deluxe-chat-v1.1") models_anony = list(set(models_anony)) side_by_side_anony_updates = load_demo_side_by_side_anony(models_anony, url_params) diff --git a/fastchat/serve/monitor/clean_battle_data.py b/fastchat/serve/monitor/clean_battle_data.py index d85302a32..23357d08c 100644 --- a/fastchat/serve/monitor/clean_battle_data.py +++ b/fastchat/serve/monitor/clean_battle_data.py @@ -36,7 +36,7 @@ "google", "llama", "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.", - "$MODERATION$", + "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES.", ] for i in range(len(IDENTITY_WORDS)): diff --git a/fastchat/serve/monitor/monitor.py b/fastchat/serve/monitor/monitor.py index 48273436a..580a2c866 100644 --- a/fastchat/serve/monitor/monitor.py +++ b/fastchat/serve/monitor/monitor.py @@ -35,7 +35,7 @@ def make_leaderboard_md(elo_results): | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | This leaderboard is based on the following three benchmarks. -- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 90K+ user votes to compute Elo ratings. +- [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) - a crowdsourced, randomized battle platform. We use 100K+ user votes to compute Elo ratings. - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses. - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
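
The Elo ratings referenced above are computed offline from the logged battles; the linked notebook remains the authoritative implementation. As a sketch of the mechanics only: each cleaned battle carries the two model names and a winner label, and ratings are updated one vote at a time. The parameter choices below (K = 4, scale 400, base 10, 1000-point initialization) and the exact winner labels are assumptions about the cleaning pipeline's conventions, not something this series pins down.

from collections import defaultdict


def compute_elo_sketch(battles, k=4, scale=400, base=10, init_rating=1000):
    """battles: iterable of (model_a, model_b, winner) tuples, where winner
    is "model_a", "model_b", or a tie label (assumed encoding)."""
    rating = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = rating[model_a], rating[model_b]
        # Expected score of model_a under the Elo logistic model.
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        if winner == "model_a":
            sa = 1.0
        elif winner == "model_b":
            sa = 0.0
        else:  # ties (and "both bad") count as half a win for each side
            sa = 0.5
        # Zero-sum update: what model_a gains, model_b loses.
        rating[model_a] += k * (sa - ea)
        rating[model_b] -= k * (sa - ea)
    return dict(rating)

For example, a single ("gpt-4", "vicuna-13b", "model_a") battle moves gpt-4 from 1000 to 1002 and vicuna-13b to 998, since the expected score between two equally rated models is 0.5.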
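
The anonymous-battle sampler that patches 07 and 11 converge on stacks four rules: a base weight per model from SAMPLING_WEIGHTS; a flat multiplier (5x in this era of the code) for anything in SAMPLING_BOOST_MODELS; a zero weight for anything in OUTAGE_MODELS; and, once the first model is drawn, a re-weighting of its BATTLE_TARGETS entries to total_weight / len(targets), which hands the target set at least half of the opponent probability mass. A condensed, runnable sketch of that logic follows — the model list and weights are toy placeholders, and the final left/right shuffle is an assumption, since the tail of get_battle_pair is not shown in this series:

import numpy as np

# Toy stand-ins for the live configuration, which changes across patches.
SAMPLING_WEIGHTS = {"gpt-4": 4, "gpt-4-turbo": 4, "gpt-3.5-turbo": 2, "vicuna-13b": 1.5}
SAMPLING_BOOST_MODELS = ["gpt-4-turbo"]
OUTAGE_MODELS = []
BATTLE_TARGETS = {"gpt-4-turbo": {"gpt-4", "gpt-3.5-turbo"}}
models = list(SAMPLING_WEIGHTS)


def get_sample_weight(model):
    if model in OUTAGE_MODELS:
        return 0
    weight = SAMPLING_WEIGHTS.get(model, 1.0)
    if model in SAMPLING_BOOST_MODELS:
        weight *= 5
    return weight


def get_battle_pair():
    if len(models) == 1:
        return models[0], models[0]
    model_weights = np.array([get_sample_weight(m) for m in models], dtype=float)
    total_weight = model_weights.sum()
    chosen_idx = np.random.choice(len(models), p=model_weights / total_weight)
    chosen_model = models[chosen_idx]

    rival_models, rival_weights = [], []
    for model in models:
        if model == chosen_model:
            continue
        weight = get_sample_weight(model)
        if (
            weight != 0
            and chosen_model in BATTLE_TARGETS
            and model in BATTLE_TARGETS[chosen_model]
        ):
            # Give each target an equal share of the full probability mass,
            # so the targets collectively get at least a 50% chance.
            weight = total_weight / len(BATTLE_TARGETS[chosen_model])
        rival_models.append(model)
        rival_weights.append(weight)
    rival_weights = np.array(rival_weights) / np.sum(rival_weights)
    rival_model = rival_models[np.random.choice(len(rival_models), p=rival_weights)]

    # Assumed: randomize which side each model appears on.
    if np.random.randint(2) == 0:
        return chosen_model, rival_model
    return rival_model, chosen_model

With these toy values, gpt-4-turbo's boosted weight (4 x 5 = 20) dominates the first draw, and whenever it is chosen, gpt-4 and gpt-3.5-turbo each get opponent weight 27.5 / 2 = 13.75 against vicuna-13b's 1.5.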
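
Moderation follows the same consolidation: an earlier patch in this series replaces the boolean violates_moderation gate with moderation_filter, and patch 11 keeps that shape. The OpenAI moderation endpoint is consulted only when one of the gated model families (currently just "claude") is in the battle; API failures are conservatively treated as flagged; and a flagged prompt is no longer rejected outright but overwritten with MODERATION_MSG, which is why patch 11 also swaps the full message into IDENTITY_WORDS in clean_battle_data.py. A self-contained sketch, assuming the pre-1.0 openai SDK these diffs target (openai.Moderation, openai.error); the apply_moderation wrapper at the end is an illustrative name for logic that lives inline in the add_text handlers:

import os

import openai

openai.api_base = "https://api.openai.com/v1"
openai.api_key = os.environ["OPENAI_API_KEY"]

MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
MODEL_KEYWORDS = ["claude"]  # only these families trigger the API call
MAX_RETRY = 3


def oai_moderation(text):
    """Check whether the text violates the OpenAI moderation API."""
    flagged = True  # conservative default if every attempt errors out
    for _ in range(MAX_RETRY):
        try:
            res = openai.Moderation.create(input=text)
            flagged = res["results"][0]["flagged"]
            break
        except (openai.error.OpenAIError, KeyError, IndexError) as e:
            print(f"MODERATION ERROR: {e}\nInput: {text}")
    return flagged


def moderation_filter(text, model_list):
    for keyword in MODEL_KEYWORDS:
        for model in model_list:
            if keyword in model and oai_moderation(text):
                return True
    return False


def apply_moderation(text, model_list):
    # Flagged input is overwritten rather than blocked, so the battle
    # proceeds and the event stays identifiable in the logs.
    if moderation_filter(text, model_list):
        text = MODERATION_MSG
    return text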
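
Finally, patch 07's registry churn is easier to read with the registration API in view. A minimal reconstruction of the pattern the diffs assume — the ModelInfo fields match the register_model_info call sites above, while the unknown-model fallback is a guess rather than something this series shows:

from collections import namedtuple

ModelInfo = namedtuple("ModelInfo", ["simple_name", "link", "description"])
model_info = {}


def register_model_info(full_names, simple_name, link, description):
    # Several serving names (e.g. all codellama-*-instruct sizes) can share
    # one display entry.
    info = ModelInfo(simple_name, link, description)
    for full_name in full_names:
        model_info[full_name] = info


def get_model_info(name):
    if name in model_info:
        return model_info[name]
    # Assumed fallback: an unregistered model is displayed under its raw
    # serving name with an empty link and description.
    return ModelInfo(name, "", "")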