Benchmark: GPU

Benchmark: GPU

Single-precision CPU vs GPU benchmark for multilayer cross sections.

Computes spectra for a 3-layer sphere at a large number of wavelengths and compares runtime on CPU and CUDA (if available).

-> GPU memory transfer overhead dominates for small and moderate numbers of parallel calculations. The GPU becomes advantageous only for very large numbers of batched calculations.

author: P. Wiecha, 03/2026

Benchmark CPU vs GPU, single precision, 3 layers, 100000 wavelengths
CPU  :   761.93 ms / call
CUDA not available on this system. GPU benchmark skipped.

import time
import torch
import pymiediff as pmd



def _benchmark_cross_sections(device, n_runs=5, n_warmup=2, N_wl=256):
    """Time ``pmd.multishell.cross_sections`` for a 3-layer sphere on *device*.

    Builds a three-layer core-shell geometry with weakly dispersive
    permittivities, evaluates the cross sections at ``N_wl`` wavelengths in
    parallel, and returns the mean wall-clock time per call.

    Parameters
    ----------
    device : torch.device
        Device on which all input tensors are allocated ("cpu" or "cuda").
    n_runs : int
        Number of timed calls; the reported time is the average over these.
    n_warmup : int
        Untimed warm-up calls executed before timing starts.
    N_wl : int
        Number of wavelengths evaluated per call.

    Returns
    -------
    tuple[float, dict]
        Average seconds per call, and the result dict of the last call.
    """
    # Single precision throughout, consistent with precision="single" below.
    # (Bug fix: leftover debug lines previously overrode these with
    # float64/complex128, silently turning this into a double-precision
    # input benchmark.)
    dtype_r = torch.float32
    dtype_c = torch.complex64

    wl = torch.linspace(450.0, 950.0, N_wl, dtype=dtype_r, device=device)
    k0 = 2.0 * torch.pi / wl

    # 3-layer geometry (nm)
    r_layers = torch.tensor([55.0, 85.0, 120.0], dtype=dtype_r, device=device)

    # Weakly dispersive layer permittivities for benchmarking
    w = wl / 1000.0
    eps_l1 = (2.30 + 0.08 / w**2) ** 2 + 1j * (0.006 + 0.002 * w)
    eps_l2 = (1.85 + 0.05 / w**2) ** 2 + 1j * (0.004 + 0.001 * w)
    eps_l3 = (1.55 + 0.03 / w**2) ** 2 + 1j * (0.002 + 0.001 * w)
    eps_layers = torch.stack((eps_l1, eps_l2, eps_l3), dim=0).to(
        dtype=dtype_c, device=device
    )
    # Water-like embedding medium (n = 1.33)
    eps_env = torch.tensor((1.33**2) + 0j, dtype=dtype_c, device=device)

    # CUDA kernels launch asynchronously; synchronize so warm-up and timing
    # boundaries are accurate.
    if device.type == "cuda":
        torch.cuda.synchronize(device)

    # warmup (kernel compilation, caching) -- excluded from timing
    for _ in range(n_warmup):
        _ = pmd.multishell.cross_sections(
            k0=k0,
            r_layers=r_layers,
            eps_layers=eps_layers,
            eps_env=eps_env,
            precision="single",
            n_max=None,
        )

    if device.type == "cuda":
        torch.cuda.synchronize(device)

    t0 = time.perf_counter()
    res = None
    for _ in range(n_runs):
        res = pmd.multishell.cross_sections(
            k0=k0,
            r_layers=r_layers,
            eps_layers=eps_layers,
            eps_env=eps_env,
            precision="single",
            n_max=None,
        )
    if device.type == "cuda":
        torch.cuda.synchronize(device)
    dt = time.perf_counter() - t0

    return dt / n_runs, res


if __name__ == "__main__":
    N_wl = 100000
    # Bug fix: message previously read "CPU vs GP:," (typo).
    print(f"Benchmark CPU vs GPU, single precision, 3 layers, {N_wl} wavelengths")

    # --- CPU reference run ---
    cpu = torch.device("cpu")
    t_cpu, res_cpu = _benchmark_cross_sections(cpu, N_wl=N_wl)
    print(f"CPU  : {t_cpu*1e3:8.2f} ms / call")

    # --- GPU run (only if CUDA is present) ---
    if torch.cuda.is_available():
        gpu = torch.device("cuda")
        t_gpu, res_gpu = _benchmark_cross_sections(gpu, N_wl=N_wl)
        speedup = t_cpu / t_gpu
        print(f"GPU  : {t_gpu*1e3:8.2f} ms / call")
        print(f"Speedup (CPU/GPU): {speedup:.2f}x")

        # quick numerical consistency check: relative deviation of the
        # extinction efficiency, guarding against division by ~0
        qext_cpu = res_cpu["q_ext"].detach().cpu()
        qext_gpu = res_gpu["q_ext"].detach().cpu()
        rel = (qext_cpu - qext_gpu).abs() / qext_cpu.abs().clamp_min(1e-6)
        print(f"Max relative difference in q_ext: {rel.max().item():.3e}")
    else:
        print("CUDA not available on this system. GPU benchmark skipped.")

Total running time of the script: (0 minutes 5.935 seconds)

Estimated memory usage: 566 MB

Gallery generated by Sphinx-Gallery