def main():
    """Benchmark raw decode throughput of an LLM engine.

    Seeds the RNG for reproducibility, builds `num_seqs` random prompts with
    random lengths, runs one warm-up generation, then times a batched
    `llm.generate` call and reports tokens/second.

    NOTE(review): relies on `os`, `time`, `seed`, `randint`, `LLM` and
    `SamplingParams` being imported at the top of the file (not visible in
    this chunk); a commented `# from vllm import LLM, SamplingParams` hints
    at the vLLM-compatible variant.
    """
    seed(0)  # fixed seed so prompt/output lengths are reproducible across runs
    num_seqs = 256
    max_input_len = 1024
    max_output_len = 1024  # fixed typo: was `max_ouput_len`

    path = os.path.expanduser("~/huggingface/Qwen3-0.6B/")
    llm = LLM(path, enforce_eager=False, max_model_len=4096)

    # Random token IDs as prompts; each prompt has a random length in
    # [100, max_input_len] and each request a random output budget.
    prompt_token_ids = [[randint(0, 10000) for _ in range(randint(100, max_input_len))] for _ in range(num_seqs)]
    sampling_params = [SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=randint(100, max_output_len)) for _ in range(num_seqs)]
    # uncomment the following line for vllm
    # prompt_token_ids = [dict(prompt_token_ids=p) for p in prompt_token_ids]

    # Warm-up call so model/graph compilation is excluded from the timing.
    llm.generate(["Benchmark: "], SamplingParams())
    t = time.time()
    llm.generate(prompt_token_ids, sampling_params)
    t = (time.time() - t)
    # ignore_eos=True means every request generates exactly max_tokens,
    # so the sum of budgets is the exact number of generated tokens.
    total_tokens = sum(sp.max_tokens for sp in sampling_params)
    throughput = total_tokens / t
    print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")


if __name__ == "__main__":
    main()
0 commit comments