# HELP vllm:cache_config_info Information of the LLMEngine CacheConfig
# TYPE vllm:cache_config_info gauge
vllm:cache_config_info{block_size="16",cache_dtype="auto",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.7",is_attention_free="False",num_cpu_blocks="2048",num_gpu_blocks="2515",num_gpu_blocks_override="None",sliding_window="None",swap_space_bytes="4294967296"} 1.0
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.8441214561462402
vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.01",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.02",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.04",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.06",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:time_to_first_token_seconds_bucket{le="0.08",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 4.0
vllm:time_to_first_token_seconds_bucket{le="0.1",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 4.0
vllm:time_to_first_token_seconds_bucket{le="0.25",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 6.0
vllm:time_to_first_token_seconds_bucket{le="0.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="0.75",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="2.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="5.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="7.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="10.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:time_to_first_token_seconds_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE vllm:time_per_output_token_seconds histogram
vllm:time_per_output_token_seconds_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 34.672072649002075
vllm:time_per_output_token_seconds_bucket{le="0.01",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:time_per_output_token_seconds_bucket{le="0.025",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1374.0
vllm:time_per_output_token_seconds_bucket{le="0.05",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.075",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.1",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.15",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.2",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.3",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.4",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="0.75",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="2.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
vllm:time_per_output_token_seconds_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1406.0
# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE vllm:e2e_request_latency_seconds histogram
vllm:e2e_request_latency_seconds_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 35.516194105148315
vllm:e2e_request_latency_seconds_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:e2e_request_latency_seconds_bucket{le="2.5",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:e2e_request_latency_seconds_bucket{le="5.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:e2e_request_latency_seconds_bucket{le="10.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="15.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="20.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="30.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="40.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="50.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="60.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:e2e_request_latency_seconds_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
# TYPE vllm:request_prompt_tokens histogram
vllm:request_prompt_tokens_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3120.0
vllm:request_prompt_tokens_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_prompt_tokens_bucket{le="2.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_prompt_tokens_bucket{le="5.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_prompt_tokens_bucket{le="10.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_prompt_tokens_bucket{le="20.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 2.0
vllm:request_prompt_tokens_bucket{le="50.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 2.0
vllm:request_prompt_tokens_bucket{le="100.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 2.0
vllm:request_prompt_tokens_bucket{le="200.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 4.0
vllm:request_prompt_tokens_bucket{le="500.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 5.0
vllm:request_prompt_tokens_bucket{le="1000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 6.0
vllm:request_prompt_tokens_bucket{le="2000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_prompt_tokens_bucket{le="5000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_prompt_tokens_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_prompt_tokens_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:request_generation_tokens Number of generation tokens processed.
# TYPE vllm:request_generation_tokens histogram
vllm:request_generation_tokens_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1413.0
vllm:request_generation_tokens_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_generation_tokens_bucket{le="2.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_generation_tokens_bucket{le="5.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
vllm:request_generation_tokens_bucket{le="10.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 2.0
vllm:request_generation_tokens_bucket{le="20.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 2.0
vllm:request_generation_tokens_bucket{le="50.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:request_generation_tokens_bucket{le="100.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:request_generation_tokens_bucket{le="200.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3.0
vllm:request_generation_tokens_bucket{le="500.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 6.0
vllm:request_generation_tokens_bucket{le="1000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_generation_tokens_bucket{le="2000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_generation_tokens_bucket{le="5000.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_generation_tokens_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_generation_tokens_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:request_params_n Histogram of the n request parameter.
# TYPE vllm:request_params_n histogram
vllm:request_params_n_sum{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="1.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="2.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="5.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="10.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="20.0",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_bucket{le="+Inf",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
vllm:request_params_n_count{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:num_preemptions_total Cumulative number of preemption from the engine.
# TYPE vllm:num_preemptions_total counter
vllm:num_preemptions_total{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
# TYPE vllm:prompt_tokens_total counter
vllm:prompt_tokens_total{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 3120.0
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 1413.0
# HELP vllm:request_success_total Count of successfully processed requests.
# TYPE vllm:request_success_total counter
vllm:request_success_total{finished_reason="stop",model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 7.0
# HELP vllm:num_requests_running Number of requests currently running on GPU.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
# TYPE vllm:num_requests_swapped gauge
vllm:num_requests_swapped{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:gpu_cache_usage_perc gauge
vllm:gpu_cache_usage_perc{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:cpu_cache_usage_perc gauge
vllm:cpu_cache_usage_perc{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:cpu_prefix_cache_hit_rate CPU prefix cache block hit rate.
# TYPE vllm:cpu_prefix_cache_hit_rate gauge
vllm:cpu_prefix_cache_hit_rate{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} -1.0
# HELP vllm:gpu_prefix_cache_hit_rate GPU prefix cache block hit rate.
# TYPE vllm:gpu_prefix_cache_hit_rate gauge
vllm:gpu_prefix_cache_hit_rate{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} -1.0
# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s.
# TYPE vllm:avg_prompt_throughput_toks_per_s gauge
vllm:avg_prompt_throughput_toks_per_s{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s.
# TYPE vllm:avg_generation_throughput_toks_per_s gauge
vllm:avg_generation_throughput_toks_per_s{model_name="taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"} 0.0
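Each histogram above exposes a `_sum` and a `_count` series, so the mean is simply sum divided by count: from this dump, mean time to first token is 0.8441 s / 7 requests ≈ 0.121 s, and mean time per output token is 34.672 s / 1406 tokens ≈ 0.0247 s, i.e. roughly 40 tokens/s during decode. Below is a minimal sketch of computing these means programmatically with the `prometheus_client` text parser; it assumes the server exposes this output at http://localhost:8000/metrics (the host and port are assumptions, not taken from the dump).

```python
# Sketch: scrape a vLLM /metrics endpoint and report mean = _sum / _count
# for every histogram. The URL below is an assumption for illustration.
import requests
from prometheus_client.parser import text_string_to_metric_families

text = requests.get("http://localhost:8000/metrics", timeout=5).text

sums, counts = {}, {}
for family in text_string_to_metric_families(text):
    for sample in family.samples:
        # Histogram families yield *_bucket, *_sum, and *_count samples;
        # only the sum and count are needed for the mean.
        if sample.name.endswith("_sum"):
            sums[sample.name[: -len("_sum")]] = sample.value
        elif sample.name.endswith("_count"):
            counts[sample.name[: -len("_count")]] = sample.value

for name, total in sorted(sums.items()):
    if counts.get(name):  # skip histograms with zero observations
        print(f"{name}: mean = {total / counts[name]:.4f}")
```

On the dump above this would print, among others, `vllm:time_to_first_token_seconds: mean = 0.1206` and `vllm:time_per_output_token_seconds: mean = 0.0247`; in a Prometheus server the same quantities are usually derived with `rate(..._sum[5m]) / rate(..._count[5m])` so that they reflect a recent window rather than the lifetime average.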