| """ |
| Utility functions for benchmark scripts. |
| """ |
|
| from dataclasses import dataclass |
| from typing import Any, Callable, Dict, List, Optional |
|
| import numpy as np |
| import sglang as sgl |
|
|
| @dataclass |
| class BenchmarkMetrics: |
| """Container for benchmark performance metrics.""" |
|
| latency: float |
| output_throughput: float |
| accept_length: float |
| accuracy: Optional[float] = None |
| num_questions: int = 0 |
| num_valid_predictions: int = 0 |
    # Optional per-category breakdown of metrics, keyed by category name.
    categorical_performance: Optional[Dict[str, "BenchmarkMetrics"]] = None
|
|
| def compute_metrics( |
| states: List[Any], |
| latency: float, |
| answer_key: str = "answer", |
| additional_answer_keys: Optional[List[str]] = None, |
| ) -> BenchmarkMetrics: |
| """ |
| Compute performance metrics from SGLang states. |
| |
| Args: |
| states: List of SGLang state objects from run_batch |
| latency: Total latency in seconds |
| answer_key: Primary key for answer in state meta info |
| additional_answer_keys: Additional keys to include in token count (e.g., ["answer_1", "answer_2"]) |
| |
| Returns: |
| BenchmarkMetrics object with computed metrics |
| """ |
| |
    keys = [answer_key] + (additional_answer_keys or [])

    num_output_tokens = sum(
        s.get_meta_info(key)["completion_tokens"] for s in states for key in keys
    )

    output_throughput = num_output_tokens / latency if latency > 0 else 0.0

    # Accept length is only meaningful for speculative decoding, where
    # "spec_verify_ct" records the number of verify steps per request.
    has_verify = bool(states) and (
        "spec_verify_ct" in states[0].get_meta_info(answer_key)
    )
    if has_verify:
        num_verify_steps = sum(
            s.get_meta_info(key).get("spec_verify_ct", 0)
            for s in states
            for key in keys
        )
        accept_length = (
            num_output_tokens / num_verify_steps if num_verify_steps > 0 else 1.0
        )
    else:
        accept_length = 1.0

    return BenchmarkMetrics(
        latency=latency,
        output_throughput=output_throughput,
        accept_length=accept_length,
        num_questions=len(states),
    )
|
|
| def print_results( |
| metrics_list: List[BenchmarkMetrics], |
| benchmark_name: str, |
| show_accuracy: bool = False, |
| ): |
| """ |
| Print benchmark results in a formatted way. |
| |
| Args: |
| metrics_list: List of BenchmarkMetrics from multiple runs |
| benchmark_name: Name of the benchmark |
| show_accuracy: Whether to show accuracy metrics |
| """ |
| avg_latency = np.mean([m.latency for m in metrics_list]) |
| avg_throughput = np.mean([m.output_throughput for m in metrics_list]) |
| avg_accept_length = np.mean([m.accept_length for m in metrics_list]) |
|
| print(f"\n{'='*50}") |
| print(f"{benchmark_name} Evaluation Results") |
| print(f"{'='*50}") |
| print(f"Number of questions: {metrics_list[0].num_questions}") |
    if show_accuracy:
        accuracies = [m.accuracy for m in metrics_list if m.accuracy is not None]
        if accuracies:
            avg_accuracy = np.mean(accuracies)
            print(f"Average Accuracy: {avg_accuracy:.4f} ({avg_accuracy*100:.2f}%)")
        else:
            print("Average Accuracy: None")
| print(f"Average Latency: {avg_latency:.3f} s") |
| print(f"Average Output throughput: {avg_throughput:.3f} token/s") |
| print(f"Average Accept length: {avg_accept_length:.3f}") |
| print(f"{'='*50}\n") |
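

# Usage sketch (hypothetical `runs`: a list of (states, latency) pairs
# collected from repeated run_batch calls):
#
#     all_metrics = [compute_metrics(states, latency) for states, latency in runs]
#     print_results(all_metrics, "GSM8K")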
|
|
| def create_simple_sgl_function( |
| function_name: str = "get_answer", |
| answer_key: str = "answer", |
| system_prompt: Optional[str] = None, |
| max_tokens: int = 2048, |
| stop: Optional[List[str]] = None, |
| user_prefix: Optional[str] = None, |
| ) -> Callable: |
| """ |
| Create a simple SGL function for single-turn Q&A. |
| |
| Args: |
| function_name: Name of the function |
| answer_key: Key for storing the answer |
| system_prompt: Optional system prompt |
| max_tokens: Maximum tokens to generate |
| stop: Optional stop sequences |
        user_prefix: Optional text appended to the user message after the
            question (acts as a suffix despite the parameter name)
| |
| Returns: |
| SGL function decorated with @sgl.function |
| """ |
|
| @sgl.function |
| def sgl_func(s, question): |
| if system_prompt: |
| s += sgl.system(system_prompt) |
        user_content = question + user_prefix if user_prefix else question
| s += sgl.user(user_content) |
| gen_kwargs = {"max_tokens": max_tokens} |
| if stop: |
| gen_kwargs["stop"] = stop |
| s += sgl.assistant(sgl.gen(answer_key, **gen_kwargs)) |
|
| sgl_func.__name__ = function_name |
| return sgl_func |
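

# Usage sketch (hypothetical question; assumes a backend was set with
# sgl.set_default_backend):
#
#     qa = create_simple_sgl_function(system_prompt="You are a helpful assistant.")
#     state = qa.run(question="What is the capital of France?")
#     print(state["answer"])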
|
|
| def create_few_shot_sgl_function( |
| few_shot_examples: str, |
| function_name: str = "few_shot_answer", |
| answer_key: str = "answer", |
| max_tokens: int = 512, |
| stop: Optional[List[str]] = None, |
| ) -> Callable: |
| """ |
| Create an SGL function for few-shot learning. |
| |
| Args: |
| few_shot_examples: String containing few-shot examples |
| function_name: Name of the function |
| answer_key: Key for storing the answer |
| max_tokens: Maximum tokens to generate |
| stop: Optional stop sequences |
| |
| Returns: |
| SGL function decorated with @sgl.function |
| """ |
|
| @sgl.function |
| def sgl_func(s, question): |
| s += few_shot_examples + question |
| gen_kwargs = {"max_tokens": max_tokens} |
| if stop: |
| gen_kwargs["stop"] = stop |
| s += sgl.gen(answer_key, **gen_kwargs) |
|
| sgl_func.__name__ = function_name |
| return sgl_func |
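

# Usage sketch (hypothetical FEW_SHOT_EXAMPLES string): the prompt is
# completion-style (examples and question concatenated directly), so a stop
# sequence such as "Question:" usually prevents the model from generating the
# next example.
#
#     few_shot = create_few_shot_sgl_function(FEW_SHOT_EXAMPLES, stop=["Question:"])
#     state = few_shot.run(question="Question: What is 2 + 2?\nAnswer:")
#     print(state["answer"])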
|
|
| def create_multi_turn_sgl_function( |
| function_name: str = "multi_turn_answer", |
| system_prompt: Optional[str] = None, |
| num_turns: int = 2, |
| max_tokens: int = 2048, |
| ) -> Callable: |
| """ |
| Create an SGL function for multi-turn conversations (e.g., MT-Bench with 2 turns). |
| |
| Args: |
| function_name: Name of the function |
| system_prompt: Optional system prompt |
| num_turns: Number of conversation turns (default: 2) |
| max_tokens: Maximum tokens to generate per turn |
| |
| Returns: |
| SGL function decorated with @sgl.function |
| """ |
    if num_turns == 2:
        # Common case (e.g., MT-Bench): explicit parameters, fixed answer keys.
| @sgl.function |
| def sgl_func(s, question_1, question_2): |
| if system_prompt: |
| s += sgl.system(system_prompt) |
| s += sgl.user(question_1) |
| s += sgl.assistant(sgl.gen("answer_1", max_tokens=max_tokens)) |
| s += sgl.user(question_2) |
| s += sgl.assistant(sgl.gen("answer_2", max_tokens=max_tokens)) |
    else:
        # General case: read question_{i+1} from kwargs for each turn and
        # bind the response to answer_{i+1}.
| @sgl.function |
| def sgl_func(s, **kwargs): |
| if system_prompt: |
| s += sgl.system(system_prompt) |
| for i in range(num_turns): |
| question_key = f"question_{i+1}" |
| answer_key = f"answer_{i+1}" |
| if question_key in kwargs: |
| s += sgl.user(kwargs[question_key]) |
| s += sgl.assistant(sgl.gen(answer_key, max_tokens=max_tokens)) |
|
| sgl_func.__name__ = function_name |
| return sgl_func |
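

# Usage sketch (hypothetical questions): with num_turns=2 the function takes
# explicit question_1/question_2 arguments; answers land in "answer_1" and
# "answer_2".
#
#     chat = create_multi_turn_sgl_function(num_turns=2)
#     state = chat.run(question_1="Write a haiku about wind.",
#                      question_2="Now translate it to French.")
#     print(state["answer_1"], state["answer_2"])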
|
|
| def create_image_sgl_function( |
| function_name: str = "get_image_answer", |
| answer_key: str = "answer", |
| max_tokens: int = 2048, |
| ) -> Callable: |
| """ |
| Create an SGL function for image-based Q&A. |
| |
| Args: |
| function_name: Name of the function |
| answer_key: Key for storing the answer |
| max_tokens: Maximum tokens to generate |
| |
| Returns: |
| SGL function decorated with @sgl.function |
| """ |
|
| @sgl.function |
| def sgl_func(s, image_path, question, **kwargs): |
| """ |
| The body of the SGL function: constructs a multimodal conversation flow. |
| |
| - First, it inputs an image + text question as 'user'. |
| - Then, it generates an answer as 'assistant', binding the response to the specified `answer_key`. |
| |
| Note: sgl.image() automatically encodes the image into a format supported by the model for multimodal input. |
| """ |
| |
| s += sgl.user(sgl.image(image_path) + question) |
| s += sgl.assistant(sgl.gen(answer_key, max_tokens=max_tokens)) |
|
| sgl_func.__name__ = function_name |
| return sgl_func |
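

# Usage sketch (hypothetical image path; requires a vision-language backend):
#
#     vqa = create_image_sgl_function()
#     state = vqa.run(image_path="example.jpg", question="What is shown here?")
#     print(state["answer"])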
|
|