{ "avg_time_ms": 62.308485079556704, "throughput_tokens_per_sec": 65737.43519474348, "memory_allocated_gb": 1.329831600189209, "memory_cached_gb": 1.8359375, "memory_increase_gb": 0.3795137405395508, "device": "cuda", "dtype": "torch.bfloat16", "tokens": 4096, "warmup_iters": 10, "timing_iters": 50 }