NxでもCORDICで三角関数を実装してみました.
CORDICのループを全て展開しています.
defmodule NxCordic do
@moduledoc File.read!("README.md")
|> String.split("<!-- MODULEDOC -->")
|> Enum.fetch!(1)
import Nx.Defn
@bit_size 32
@max Bitwise.bsl(1, @bit_size - 1)
@k_value 0..31
|> Enum.map(&(2 ** (-2 * &1)))
|> Enum.map(&:math.sqrt(1 + &1))
|> Enum.map(&(1.0 / &1))
|> Enum.reduce(fn x, acc -> x * acc end)
|> Kernel.*(@max)
|> floor()
@angles 0..31
|> Enum.map(&(2 ** -&1))
|> Enum.map(&:math.atan/1)
|> Enum.map(&(&1 / :math.pi()))
|> Enum.map(&(&1 * @max))
|> Enum.map(&floor(&1))
|> Enum.reject(&(&1 == 0))
|> List.to_tuple()
@pi Nx.Constants.pi()
@double_pi Nx.multiply(2.0, @pi)
@factor_angle_to_fixed_point Nx.divide(@max, @pi)
@half_pi_fixed_point Bitwise.bsr(@max, 1)
defnp regularize(theta) do
theta - Nx.floor(theta / @double_pi) * @double_pi
end
defnp angle_to_fixed_point(theta) do
Nx.as_type(regularize(theta) * @factor_angle_to_fixed_point, {:s, 32})
end
defn cos_sin(theta) do
theta_i = angle_to_fixed_point(theta)
c1 = Nx.less(theta_i, -@half_pi_fixed_point)
c3 = Nx.greater(theta_i, @half_pi_fixed_point)
nc2 = Nx.logical_or(c1, c3)
c2 = Nx.logical_not(nc2)
s = (Nx.as_type(nc2, {:s, 32}) |> Nx.negate()) + Nx.as_type(c2, {:s, 32})
theta_i = (theta_i + @max) * c1 + theta_i * c2 + (theta_i - @max) * c3
{vcos, vsin} = cordic_cos_sin(theta_i)
{s * vcos, s * vsin}
end
defnp cordic_cos_sin(theta_i) do
{vcos, vsin} = {Nx.broadcast(@k_value, theta_i), Nx.broadcast(0, theta_i)}
c_neg = Nx.less(theta_i, 0)
c_non_neg = Nx.greater_equal(theta_i, 0)
j = 0
{vcos, vsin} =
{
vcos + Nx.right_shift(vsin * c_neg - vsin * c_non_neg, j),
vsin + Nx.right_shift(vcos * c_non_neg - vcos * c_neg, j)
}
theta_i = theta_i + elem(@angles, j) * c_neg - elem(@angles, j) * c_non_neg
c_neg = Nx.less(theta_i, 0)
c_non_neg = Nx.greater_equal(theta_i, 0)
j = 1
{vcos, vsin} =
{
vcos + Nx.right_shift(vsin * c_neg - vsin * c_non_neg, j),
vsin + Nx.right_shift(vcos * c_non_neg - vcos * c_neg, j)
}
...
theta_i = theta_i + elem(@angles, j) * c_neg - elem(@angles, j) * c_non_neg
c_neg = Nx.less(theta_i, 0)
c_non_neg = Nx.greater_equal(theta_i, 0)
j = 30
{vcos, vsin} =
{
vcos + Nx.right_shift(vsin * c_neg - vsin * c_non_neg, j),
vsin + Nx.right_shift(vcos * c_non_neg - vcos * c_neg, j)
}
{Nx.as_type(vcos, {:f, 32}) / @max, Nx.as_type(vsin, {:f, 32}) / @max}
end
end
ベンチマーク結果
ベンチマークを走らせてみました.
Linux Ryzen threadripper 2990WX NVIDIA TITAN RTX
XLA_TARGET=cuda118 EXLA_TARGET=cuda mix run -r bench/nx_cordic_bench.exs
21:40:19.347 [info] XLA service 0x7f5dcc2459f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
21:40:19.351 [info] StreamExecutor device (0): NVIDIA TITAN RTX, Compute Capability 7.5
21:40:19.351 [info] Using BFC allocator.
21:40:19.351 [info] XLA backend allocating 22523078246 bytes on device 0 for BFCAllocator.
Operating System: Linux
CPU Information: AMD Ryzen Threadripper 2990WX 32-Core Processor
Number of Available Cores: 64
Available memory: 62.76 GB
Elixir 1.15.2
Erlang 26.0.2
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: size = 1024, size = 16, size = 256, size = 64
Estimated total run time: 4.20 min
Benchmarking EXLA CPU CORDIC with input size = 1024 ...
Benchmarking EXLA CPU CORDIC with input size = 16 ...
Benchmarking EXLA CPU CORDIC with input size = 256 ...
Benchmarking EXLA CPU CORDIC with input size = 64 ...
Benchmarking EXLA CPU cos_sin with input size = 1024 ...
Benchmarking EXLA CPU cos_sin with input size = 16 ...
Benchmarking EXLA CPU cos_sin with input size = 256 ...
Benchmarking EXLA CPU cos_sin with input size = 64 ...
Benchmarking EXLA CPU sin with input size = 1024 ...
Benchmarking EXLA CPU sin with input size = 16 ...
Benchmarking EXLA CPU sin with input size = 256 ...
Benchmarking EXLA CPU sin with input size = 64 ...
Benchmarking EXLA GPU CORDIC with input size = 1024 ...
Benchmarking EXLA GPU CORDIC with input size = 16 ...
Benchmarking EXLA GPU CORDIC with input size = 256 ...
Benchmarking EXLA GPU CORDIC with input size = 64 ...
Benchmarking EXLA GPU cos_sin with input size = 1024 ...
Benchmarking EXLA GPU cos_sin with input size = 16 ...
Benchmarking EXLA GPU cos_sin with input size = 256 ...
Benchmarking EXLA GPU cos_sin with input size = 64 ...
Benchmarking EXLA GPU sin with input size = 1024 ...
Benchmarking EXLA GPU sin with input size = 16 ...
Benchmarking EXLA GPU sin with input size = 256 ...
Benchmarking EXLA GPU sin with input size = 64 ...
Benchmarking Nx CORDIC with input size = 1024 ...
Benchmarking Nx CORDIC with input size = 16 ...
Benchmarking Nx CORDIC with input size = 256 ...
Benchmarking Nx CORDIC with input size = 64 ...
Benchmarking Nx cos_sin with input size = 1024 ...
Benchmarking Nx cos_sin with input size = 16 ...
Benchmarking Nx cos_sin with input size = 256 ...
Benchmarking Nx cos_sin with input size = 64 ...
Benchmarking Nx sin with input size = 1024 ...
Benchmarking Nx sin with input size = 16 ...
Benchmarking Nx sin with input size = 256 ...
Benchmarking Nx sin with input size = 64 ...
##### With input size = 1024 #####
Name ips average deviation median 99th %
EXLA GPU sin 17.91 K 55.85 μs ±18.94% 53.78 μs 105.77 μs
EXLA GPU cos_sin 15.46 K 64.68 μs ±21.48% 61.56 μs 156.03 μs
EXLA GPU CORDIC 14.38 K 69.54 μs ±21.04% 66.67 μs 159.16 μs
Nx sin 10.26 K 97.51 μs ±8.89% 96.73 μs 103.16 μs
EXLA CPU sin 7.22 K 138.41 μs ±20.53% 131.87 μs 220.48 μs
EXLA CPU cos_sin 6.49 K 153.98 μs ±22.93% 145.24 μs 261.56 μs
EXLA CPU CORDIC 6.17 K 162.19 μs ±22.92% 151.86 μs 274.27 μs
Nx cos_sin 4.94 K 202.32 μs ±6.37% 199.78 μs 266.11 μs
Nx CORDIC 0.0153 K 65475.97 μs ±0.52% 65489.55 μs 66411.41 μs
Comparison:
EXLA GPU sin 17.91 K
EXLA GPU cos_sin 15.46 K - 1.16x slower +8.83 μs
EXLA GPU CORDIC 14.38 K - 1.25x slower +13.69 μs
Nx sin 10.26 K - 1.75x slower +41.66 μs
EXLA CPU sin 7.22 K - 2.48x slower +82.56 μs
EXLA CPU cos_sin 6.49 K - 2.76x slower +98.14 μs
EXLA CPU CORDIC 6.17 K - 2.90x slower +106.35 μs
Nx cos_sin 4.94 K - 3.62x slower +146.47 μs
Nx CORDIC 0.0153 K - 1172.41x slower +65420.12 μs
##### With input size = 16 #####
Name ips average deviation median 99th %
Nx sin 423.54 K 2.36 μs ±1057.67% 2.12 μs 3.91 μs
Nx cos_sin 110.59 K 9.04 μs ±233.88% 8.04 μs 16.47 μs
EXLA GPU sin 17.58 K 56.87 μs ±18.50% 55.56 μs 107.03 μs
EXLA GPU cos_sin 15.58 K 64.17 μs ±21.24% 61.13 μs 155.61 μs
EXLA GPU CORDIC 15.26 K 65.52 μs ±20.36% 62.53 μs 157.21 μs
EXLA CPU sin 7.03 K 142.28 μs ±16.31% 136.81 μs 236.75 μs
EXLA CPU cos_sin 6.56 K 152.44 μs ±19.06% 143.73 μs 260.11 μs
EXLA CPU CORDIC 6.46 K 154.79 μs ±21.79% 146.66 μs 253.26 μs
Nx CORDIC 0.33 K 2993.65 μs ±9.21% 2841.30 μs 3595.74 μs
Comparison:
Nx sin 423.54 K
Nx cos_sin 110.59 K - 3.83x slower +6.68 μs
EXLA GPU sin 17.58 K - 24.09x slower +54.51 μs
EXLA GPU cos_sin 15.58 K - 27.18x slower +61.81 μs
EXLA GPU CORDIC 15.26 K - 27.75x slower +63.16 μs
EXLA CPU sin 7.03 K - 60.26x slower +139.91 μs
EXLA CPU cos_sin 6.56 K - 64.56x slower +150.08 μs
EXLA CPU CORDIC 6.46 K - 65.56x slower +152.43 μs
Nx CORDIC 0.33 K - 1267.93x slower +2991.29 μs
##### With input size = 256 #####
Name ips average deviation median 99th %
Nx sin 39.86 K 25.09 μs ±25.74% 24.75 μs 28.04 μs
Nx cos_sin 18.33 K 54.55 μs ±4.91% 53.83 μs 59.17 μs
EXLA GPU sin 18.16 K 55.07 μs ±17.04% 53.09 μs 105.93 μs
EXLA GPU CORDIC 15.28 K 65.45 μs ±19.70% 62.43 μs 157.22 μs
EXLA GPU cos_sin 15.25 K 65.55 μs ±20.40% 61.97 μs 155.72 μs
EXLA CPU sin 7.88 K 126.85 μs ±17.53% 123.92 μs 219.38 μs
EXLA CPU CORDIC 6.58 K 152.02 μs ±20.39% 143.92 μs 249.05 μs
EXLA CPU cos_sin 6.50 K 153.92 μs ±20.60% 145.23 μs 262.33 μs
Nx CORDIC 0.0568 K 17613.88 μs ±1.33% 17605.80 μs 18274.56 μs
Comparison:
Nx sin 39.86 K
Nx cos_sin 18.33 K - 2.17x slower +29.46 μs
EXLA GPU sin 18.16 K - 2.19x slower +29.98 μs
EXLA GPU CORDIC 15.28 K - 2.61x slower +40.36 μs
EXLA GPU cos_sin 15.25 K - 2.61x slower +40.47 μs
EXLA CPU sin 7.88 K - 5.06x slower +101.76 μs
EXLA CPU CORDIC 6.58 K - 6.06x slower +126.93 μs
EXLA CPU cos_sin 6.50 K - 6.14x slower +128.83 μs
Nx CORDIC 0.0568 K - 702.08x slower +17588.79 μs
##### With input size = 64 #####
Name ips average deviation median 99th %
Nx sin 137.35 K 7.28 μs ±171.49% 6.90 μs 10.76 μs
Nx cos_sin 54.75 K 18.26 μs ±73.30% 17.60 μs 28.84 μs
EXLA GPU sin 18.25 K 54.80 μs ±17.05% 52.87 μs 106.44 μs
EXLA GPU cos_sin 15.49 K 64.57 μs ±22.65% 61.33 μs 156.85 μs
EXLA GPU CORDIC 15.26 K 65.55 μs ±20.18% 62.52 μs 156.81 μs
EXLA CPU sin 8.29 K 120.65 μs ±17.27% 119.64 μs 207.11 μs
EXLA CPU CORDIC 6.50 K 153.92 μs ±21.24% 144.10 μs 261.88 μs
EXLA CPU cos_sin 6.44 K 155.36 μs ±27.09% 147.34 μs 279.19 μs
Nx CORDIC 0.165 K 6065.87 μs ±2.69% 5985.60 μs 6464.46 μs
Comparison:
Nx sin 137.35 K
Nx cos_sin 54.75 K - 2.51x slower +10.98 μs
EXLA GPU sin 18.25 K - 7.53x slower +47.52 μs
EXLA GPU cos_sin 15.49 K - 8.87x slower +57.29 μs
EXLA GPU CORDIC 15.26 K - 9.00x slower +58.27 μs
EXLA CPU sin 8.29 K - 16.57x slower +113.37 μs
EXLA CPU CORDIC 6.50 K - 21.14x slower +146.64 μs
EXLA CPU cos_sin 6.44 K - 21.34x slower +148.08 μs
Nx CORDIC 0.165 K - 833.12x slower +6058.59 μs
macOS M2 Max
mix run -r bench/nx_cordic_bench.exs
Operating System: macOS
CPU Information: Apple M2 Max
Number of Available Cores: 12
Available memory: 96 GB
Elixir 1.15.2
Erlang 25.0.4
Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: size = 1024, size = 16, size = 256, size = 64
Estimated total run time: 2.80 min
Benchmarking EXLA CORDIC with input size = 1024 ...
23:29:15.269 [info] TfrtCpuClient created.
Benchmarking EXLA CORDIC with input size = 16 ...
Benchmarking EXLA CORDIC with input size = 256 ...
Benchmarking EXLA CORDIC with input size = 64 ...
Benchmarking EXLA cos_sin with input size = 1024 ...
Benchmarking EXLA cos_sin with input size = 16 ...
Benchmarking EXLA cos_sin with input size = 256 ...
Benchmarking EXLA cos_sin with input size = 64 ...
Benchmarking EXLA sin with input size = 1024 ...
Benchmarking EXLA sin with input size = 16 ...
Benchmarking EXLA sin with input size = 256 ...
Benchmarking EXLA sin with input size = 64 ...
Benchmarking Nx CORDIC with input size = 1024 ...
Benchmarking Nx CORDIC with input size = 16 ...
Benchmarking Nx CORDIC with input size = 256 ...
Benchmarking Nx CORDIC with input size = 64 ...
Benchmarking Nx cos_sin with input size = 1024 ...
Benchmarking Nx cos_sin with input size = 16 ...
Benchmarking Nx cos_sin with input size = 256 ...
Benchmarking Nx cos_sin with input size = 64 ...
Benchmarking Nx sin with input size = 1024 ...
Benchmarking Nx sin with input size = 16 ...
Benchmarking Nx sin with input size = 256 ...
Benchmarking Nx sin with input size = 64 ...
##### With input size = 1024 #####
Name ips average deviation median 99th %
EXLA sin 99.98 K 10.00 μs ±54.38% 9.29 μs 21.29 μs
EXLA cos_sin 74.94 K 13.34 μs ±64.87% 12.13 μs 37.67 μs
Nx sin 19.54 K 51.17 μs ±4.26% 50.71 μs 58.25 μs
EXLA CORDIC 13.38 K 74.75 μs ±19.39% 73.83 μs 119.38 μs
Nx cos_sin 9.47 K 105.54 μs ±5.13% 104.50 μs 126.55 μs
Nx CORDIC 0.0310 K 32243.11 μs ±0.26% 32222.15 μs 32495.26 μs
Comparison:
EXLA sin 99.98 K
EXLA cos_sin 74.94 K - 1.33x slower +3.34 μs
Nx sin 19.54 K - 5.12x slower +41.17 μs
EXLA CORDIC 13.38 K - 7.47x slower +64.75 μs
Nx cos_sin 9.47 K - 10.55x slower +95.54 μs
Nx CORDIC 0.0310 K - 3223.55x slower +32233.10 μs
##### With input size = 16 #####
Name ips average deviation median 99th %
Nx sin 880.78 K 1.14 μs ±1093.13% 1.04 μs 1.46 μs
Nx cos_sin 248.16 K 4.03 μs ±189.52% 3.75 μs 6.54 μs
EXLA sin 128.77 K 7.77 μs ±87.81% 7.08 μs 18 μs
EXLA cos_sin 114.02 K 8.77 μs ±99.02% 7.71 μs 23.04 μs
EXLA CORDIC 101.74 K 9.83 μs ±94.62% 8.71 μs 25.91 μs
Nx CORDIC 0.60 K 1678.78 μs ±2.54% 1691.87 μs 1756.46 μs
Comparison:
Nx sin 880.78 K
Nx cos_sin 248.16 K - 3.55x slower +2.89 μs
EXLA sin 128.77 K - 6.84x slower +6.63 μs
EXLA cos_sin 114.02 K - 7.72x slower +7.64 μs
EXLA CORDIC 101.74 K - 8.66x slower +8.69 μs
Nx CORDIC 0.60 K - 1478.63x slower +1677.64 μs
##### With input size = 256 #####
Name ips average deviation median 99th %
EXLA sin 119.46 K 8.37 μs ±72.31% 7.67 μs 19.29 μs
EXLA cos_sin 99.50 K 10.05 μs ±95.40% 8.92 μs 30.96 μs
Nx sin 76.25 K 13.12 μs ±42.67% 12.92 μs 15.88 μs
EXLA CORDIC 43.51 K 22.98 μs ±33.90% 21.67 μs 69.71 μs
Nx cos_sin 35.49 K 28.17 μs ±5.93% 27.67 μs 33.17 μs
Nx CORDIC 0.112 K 8957.25 μs ±1.33% 9000.25 μs 9142.45 μs
Comparison:
EXLA sin 119.46 K
EXLA cos_sin 99.50 K - 1.20x slower +1.68 μs
Nx sin 76.25 K - 1.57x slower +4.74 μs
EXLA CORDIC 43.51 K - 2.75x slower +14.61 μs
Nx cos_sin 35.49 K - 3.37x slower +19.80 μs
Nx CORDIC 0.112 K - 1070.07x slower +8948.88 μs
##### With input size = 64 #####
Name ips average deviation median 99th %
Nx sin 268.66 K 3.72 μs ±215.07% 3.54 μs 5.08 μs
EXLA sin 125.96 K 7.94 μs ±85.14% 7.25 μs 18.17 μs
EXLA cos_sin 109.42 K 9.14 μs ±104.60% 8 μs 23.25 μs
Nx cos_sin 106.96 K 9.35 μs ±61.99% 8.83 μs 16.21 μs
EXLA CORDIC 79.64 K 12.56 μs ±71.16% 11.33 μs 39.33 μs
Nx CORDIC 0.30 K 3299.32 μs ±0.41% 3298.07 μs 3338.27 μs
Comparison:
Nx sin 268.66 K
EXLA sin 125.96 K - 2.13x slower +4.22 μs
EXLA cos_sin 109.42 K - 2.46x slower +5.42 μs
Nx cos_sin 106.96 K - 2.51x slower +5.63 μs
EXLA CORDIC 79.64 K - 3.37x slower +8.83 μs
Nx CORDIC 0.30 K - 886.41x slower +3295.60 μs