cupy · asi1024 · Jun 14, 2018 · Jun 14, 2018 · Jun 14, 2018 · Jun 14, 2018
diff --git a/benchmarks/cupy/examples/__init__.py b/benchmarks/cupy/examples/__init__.py
diff --git a/benchmarks/cupy/examples/cg/__init__.py b/benchmarks/cupy/examples/cg/__init__.py
diff --git a/benchmarks/cupy/examples/cg/bench_cg.py b/benchmarks/cupy/examples/cg/bench_cg.py
@@ -0,0 +1,25 @@
+from benchmarks import BenchmarkBase
+from benchmarks.utils import sync
+from benchmarks.utils.helper import parameterize
+
+from .cg import fit
+
+import cupy
+
+
+@sync
+@parameterize([('N', [1, 50, 2000]),
+               ('max_iter', [5000])])
+class CG(BenchmarkBase):
+    def setup(self, N, max_iter):
+        low, high = -50, 50
+        dtype = cupy.float64
+
+        self.A = cupy.random.randint(low, high, size=(N, N))
+        self.A = (self.A + self.A.T).astype(dtype)
+        x_ans = cupy.random.randint(low, high, size=N).astype(dtype)
+        self.b = cupy.dot(self.A, x_ans)
+        self.tol = 0.1
+
+    def time_fit(self, N, max_iter):
+        fit(self.A, self.b, self.tol, max_iter)
diff --git a/benchmarks/cupy/examples/cg/cg.py b/benchmarks/cupy/examples/cg/cg.py
@@ -0,0 +1,85 @@
+import argparse
+import contextlib
+import time
+
+import numpy as np
+import six
+
+import cupy
+
+
+@contextlib.contextmanager
+def timer(message):
+    cupy.cuda.Stream.null.synchronize()
+    start = time.time()
+    yield
+    cupy.cuda.Stream.null.synchronize()
+    end = time.time()
+    print('%s:  %f sec' % (message, end - start))
+
+
+def fit(A, b, tol, max_iter):
+    # Note that this function works even tensors 'A' and 'b' are NumPy or CuPy
+    # arrays.
+    xp = cupy.get_array_module(A)
+    x = xp.zeros_like(b, dtype=np.float64)
+    r0 = b - xp.dot(A, x)
+    p = r0
+    for i in six.moves.range(max_iter):
+        a = xp.inner(r0, r0) / xp.inner(p, xp.dot(A, p))
+        x += a * p
+        r1 = r0 - a * xp.dot(A, p)
+        if xp.linalg.norm(r1) < tol:
+            return x
+        b = xp.inner(r1, r1) / xp.inner(r0, r0)
+        p = r1 + b * p
+        r0 = r1
+    print('Failed to converge. Increase max-iter or tol.')
+    return x
+
+
+def run(gpu_id, tol, max_iter):
+    """CuPy Conjugate gradient example
+
+    Solve simultaneous linear equations, Ax = b.
+    'A' and 'x' are created randomly and 'b' is computed by 'Ax' at first.
+    Then, 'x' is computed from 'A' and 'b' in two ways, namely with CPU and
+    GPU. To evaluate the accuracy of computation, the Euclidean distances
+    between the answer 'x' and the reconstructed 'x' are computed.
+
+    """
+    for repeat in range(3):
+        print('Trial: %d' % repeat)
+        # Create the large symmetric matrix 'A'.
+        N = 2000
+        A = np.random.randint(-50, 50, size=(N, N))
+        A = (A + A.T).astype(np.float64)
+        x_ans = np.random.randint(-50, 50, size=N).astype(np.float64)
+        b = np.dot(A, x_ans)
+
+        print('Running CPU...')
+        with timer(' CPU '):
+            x_cpu = fit(A, b, tol, max_iter)
+        print(np.linalg.norm(x_cpu - x_ans))
+
+        with cupy.cuda.Device(gpu_id):
+            A_gpu = cupy.asarray(A)
+            b_gpu = cupy.asarray(b)
+            print('Running GPU...')
+            with timer(' GPU '):
+                x_gpu = fit(A_gpu, b_gpu, tol, max_iter)
+            print(np.linalg.norm(cupy.asnumpy(x_gpu) - x_ans))
+
+        print()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gpu-id', '-g', default=0, type=int,
+                        help='ID of GPU.')
+    parser.add_argument('--tol', '-t', default=0.1, type=float,
+                        help='tolerance to stop iteration')
+    parser.add_argument('--max-iter', '-m', default=5000, type=int,
+                        help='number of iterations')
+    args = parser.parse_args()
+    run(args.gpu_id, args.tol, args.max_iter)
diff --git a/benchmarks/cupy/examples/finance/__init__.py b/benchmarks/cupy/examples/finance/__init__.py
diff --git a/benchmarks/cupy/examples/finance/bench_finance.py b/benchmarks/cupy/examples/finance/bench_finance.py
@@ -0,0 +1,99 @@
+from benchmarks import BenchmarkBase
+from benchmarks.utils import sync
+from benchmarks.utils.helper import parameterize
+
+import cupy
+
+from .black_scholes import black_scholes, black_scholes_kernel
+from .monte_carlo import compute_option_prices, monte_carlo_kernel
+
+
+@sync
+@parameterize([('n_options', [1, 10000000])])
+class BlackScholes(BenchmarkBase):
+    def setup(self, n_options):
+        def rand_range(m, M):
+            samples = cupy.random.rand(n_options)
+            return (m + (M - m) * samples).astype(cupy.float64)
+        self.stock_price = rand_range(5, 30)
+        self.option_strike = rand_range(1, 100)
+        self.option_years = rand_range(0.25, 10)
+        self.risk_free = 0.02
+        self.volatility = 0.3
+
+    def time_black_scholes(self, n_options):
+        black_scholes(
+            cupy, self.stock_price, self.option_strike,
+            self.option_years, self.risk_free, self.volatility)
+
+    def time_black_scholes_kernel(self, n_options):
+        black_scholes_kernel(
+            self.stock_price, self.option_strike,
+            self.option_years, self.risk_free, self.volatility)
+
+
+@sync
+@parameterize([('n_options', [1, 1000]),
+               ('n_samples_per_thread', [1, 1000]),
+               ('n_threads_per_option', [1, 100000])])
+class MonteCarlo(BenchmarkBase):
+    def setup(
+            self, n_options, n_samples_per_thread, n_threads_per_option):
+        def rand_range(m, M):
+            samples = cupy.random.rand(n_options)
+            return (m + (M - m) * samples).astype(cupy.float64)
+        self.stock_price = rand_range(5, 30)
+        self.option_strike = rand_range(1, 100)
+        self.option_years = rand_range(0.25, 10)
+        self.risk_free = 0.02
+        self.volatility = 0.3
+
+    def time_monte_carlo(
+            self, n_options, n_samples_per_thread, n_threads_per_option):
+        compute_option_prices(
+            self.stock_price, self.option_strike,
+            self.option_years, self.risk_free, self.volatility,
+            n_threads_per_option, n_samples_per_thread)
+
+
+@sync
+@parameterize([('gpus', [[0]]),
+               ('n_options', [1, 1000]),
+               ('n_samples_per_thread', [1, 1000]),
+               ('n_threads_per_option', [1, 10000])])
+class MonteCarloMultiGPU(BenchmarkBase):
+    def setup(
+            self, gpus, n_options, n_samples_per_thread, n_threads_per_option):
+        def rand_range(m, M):
+            samples = cupy.random.rand(n_options)
+            return (m + (M - m) * samples).astype(cupy.float64)
+
+        stock_price = rand_range(5, 30)
+        option_strike = rand_range(1, 100)
+        option_years = rand_range(0.25, 10)
+        self.risk_free = 0.02
+        self.volatility = 0.3
+
+        self.stock_price_gpus = []
+        self.option_strike_gpus = []
+        self.option_years_gpus = []
+        self.call_prices_gpus = []
+
+        for gpu_id in gpus:
+            with cupy.cuda.Device(gpu_id):
+                self.stock_price_gpus.append(cupy.array(stock_price))
+                self.option_strike_gpus.append(cupy.array(option_strike))
+                self.option_years_gpus.append(cupy.array(option_years))
+                self.call_prices_gpus.append(cupy.empty(
+                    (n_options, n_threads_per_option), dtype=cupy.float64))
+
+    def time_monte_carlo_multigpu(
+            self, gpus, n_options, n_samples_per_thread, n_threads_per_option):
+        for i, gpu_id in enumerate(gpus):
+            with cupy.cuda.Device(gpu_id):
+                monte_carlo_kernel(
+                    self.stock_price_gpus[i][:, None],
+                    self.option_strike_gpus[i][:, None],
+                    self.option_years_gpus[i][:, None],
+                    self.risk_free, self.volatility, n_samples_per_thread, i,
+                    self.call_prices_gpus[i])
diff --git a/benchmarks/cupy/examples/finance/black_scholes.py b/benchmarks/cupy/examples/finance/black_scholes.py
@@ -0,0 +1,144 @@
+import argparse
+import contextlib
+import time
+
+import cupy
+import numpy
+
+# This sample computes call and put prices for Europian options with
+# Black-Scholes equation. It was based on a sample of the financial package
+# in CUDA toolkit. For details, please see the corresponding whitepaper.
+#
+# The following code shows that CuPy enables us to write algorithms for GPUs
+# without significantly modifying the existing NumPy's code.
+# It also briefly describes how to create your own kernel with CuPy.
+# If you want to speed up the existing code, please define the kernel
+# with cupy.ElementwiseKernel.
+
+
+# Naive implementation of the pricing of options with NumPy and CuPy.
+def black_scholes(xp, s, x, t, r, v):
+    sqrt_t = xp.sqrt(t)
+    d1 = (xp.log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t)
+    d2 = d1 - v * sqrt_t
+
+    def get_cumulative_normal_distribution(x):
+        A1 = 0.31938153
+        A2 = -0.356563782
+        A3 = 1.781477937
+        A4 = -1.821255978
+        A5 = 1.330274429
+        RSQRT2PI = 0.39894228040143267793994605993438
+        W = 0.2316419
+
+        k = 1 / (1 + W * xp.abs(x))
+        cnd = RSQRT2PI * xp.exp(-x * x / 2) * \
+            (k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5)))))
+        cnd = xp.where(x > 0, 1 - cnd, cnd)
+        return cnd
+
+    cnd_d1 = get_cumulative_normal_distribution(d1)
+    cnd_d2 = get_cumulative_normal_distribution(d2)
+
+    exp_rt = xp.exp(- r * t)
+    call = s * cnd_d1 - x * exp_rt * cnd_d2
+    put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1)
+    return call, put
+
+
+# An example of calling the kernel via cupy.ElementwiseKernel.
+# When executing __call__ method of the instance, it automatically compiles
+# the code depending on the types of the given arrays, and calls the kernel.
+# Other functions used inside the kernel can be defined by 'preamble' option.
+black_scholes_kernel = cupy.ElementwiseKernel(
+    'T s, T x, T t, T r, T v',  # Inputs
+    'T call, T put',  # Outputs
+    '''
+    const T sqrt_t = sqrt(t);
+    const T d1 = (log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t);
+    const T d2 = d1 - v * sqrt_t;
+
+    const T cnd_d1 = get_cumulative_normal_distribution(d1);
+    const T cnd_d2 = get_cumulative_normal_distribution(d2);
+
+    const T exp_rt = exp(- r * t);
+    call = s * cnd_d1 - x * exp_rt * cnd_d2;
+    put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1);
+    ''',
+    'black_scholes_kernel',
+    preamble='''
+    __device__
+    inline T get_cumulative_normal_distribution(T x) {
+        const T A1 = 0.31938153;
+        const T A2 = -0.356563782;
+        const T A3 = 1.781477937;
+        const T A4 = -1.821255978;
+        const T A5 = 1.330274429;
+        const T RSQRT2PI = 0.39894228040143267793994605993438;
+        const T W = 0.2316419;
+
+        const T k = 1 / (1 + W * abs(x));
+        T cnd = RSQRT2PI * exp(- x * x / 2) *
+            (k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5)))));
+        if (x > 0) {
+            cnd = 1 - cnd;
+        }
+        return cnd;
+    }
+    ''',
+)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gpu-id', '-g', default=0, type=int, help='GPU ID')
+    parser.add_argument('--n-options', '-n', default=10000000, type=int)
+    args = parser.parse_args()
+
+    cupy.cuda.Device(args.gpu_id).use()
+
+    def rand_range(m, M):
+        samples = cupy.random.rand(args.n_options)
+        return (m + (M - m) * samples).astype(numpy.float64)
+
+    print('initializing...')
+    stock_price_gpu = rand_range(5, 30)
+    option_strike_gpu = rand_range(1, 100)
+    option_years_gpu = rand_range(0.25, 10)
+
+    stock_price_cpu = stock_price_gpu.get()
+    option_strike_cpu = option_strike_gpu.get()
+    option_years_cpu = option_years_gpu.get()
+
+    @contextlib.contextmanager
+    def timer(message):
+        cupy.cuda.Stream.null.synchronize()
+        start = time.time()
+        yield
+        cupy.cuda.Stream.null.synchronize()
+        end = time.time()
+        print('%s:\t%f sec' % (message, end - start))
+
+    print('start computation')
+    risk_free = 0.02
+    volatility = 0.3
+    with timer(' CPU (NumPy, Naive implementation)'):
+        call_cpu, put_cpu = black_scholes(
+            numpy, stock_price_cpu, option_strike_cpu, option_years_cpu,
+            risk_free, volatility)
+
+    with timer(' GPU (CuPy, Naive implementation)'):
+        call_gpu1, put_gpu1 = black_scholes(
+            cupy, stock_price_gpu, option_strike_gpu, option_years_gpu,
+            risk_free, volatility)
+
+    with timer(' GPU (CuPy, Elementwise kernel)'):
+        call_gpu2, put_gpu2 = black_scholes_kernel(
+            stock_price_gpu, option_strike_gpu, option_years_gpu,
+            risk_free, volatility)
+
+    # Check whether all elements in gpu arrays are equal to those of cpus
+    cupy.testing.assert_allclose(call_cpu, call_gpu1)
+    cupy.testing.assert_allclose(call_cpu, call_gpu2)
+    cupy.testing.assert_allclose(put_cpu, put_gpu1)
+    cupy.testing.assert_allclose(put_cpu, put_gpu2)