Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
Empty file.
25 changes: 25 additions & 0 deletions benchmarks/cupy/examples/cg/bench_cg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from benchmarks import BenchmarkBase
from benchmarks.utils import sync
from benchmarks.utils.helper import parameterize

from .cg import fit

import cupy


@sync
@parameterize([('N', [1, 50, 2000]),
('max_iter', [5000])])
class CG(BenchmarkBase):
def setup(self, N, max_iter):
low, high = -50, 50
dtype = cupy.float64

self.A = cupy.random.randint(low, high, size=(N, N))
self.A = (self.A + self.A.T).astype(dtype)
x_ans = cupy.random.randint(low, high, size=N).astype(dtype)
self.b = cupy.dot(self.A, x_ans)
self.tol = 0.1

def time_fit(self, N, max_iter):
fit(self.A, self.b, self.tol, max_iter)
85 changes: 85 additions & 0 deletions benchmarks/cupy/examples/cg/cg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import argparse
import contextlib
import time

import numpy as np
import six

import cupy


@contextlib.contextmanager
def timer(message):
cupy.cuda.Stream.null.synchronize()
start = time.time()
yield
cupy.cuda.Stream.null.synchronize()
end = time.time()
print('%s: %f sec' % (message, end - start))


def fit(A, b, tol, max_iter):
# Note that this function works even tensors 'A' and 'b' are NumPy or CuPy
# arrays.
xp = cupy.get_array_module(A)
x = xp.zeros_like(b, dtype=np.float64)
r0 = b - xp.dot(A, x)
p = r0
for i in six.moves.range(max_iter):
a = xp.inner(r0, r0) / xp.inner(p, xp.dot(A, p))
x += a * p
r1 = r0 - a * xp.dot(A, p)
if xp.linalg.norm(r1) < tol:
return x
b = xp.inner(r1, r1) / xp.inner(r0, r0)
p = r1 + b * p
r0 = r1
print('Failed to converge. Increase max-iter or tol.')
return x


def run(gpu_id, tol, max_iter):
"""CuPy Conjugate gradient example

Solve simultaneous linear equations, Ax = b.
'A' and 'x' are created randomly and 'b' is computed by 'Ax' at first.
Then, 'x' is computed from 'A' and 'b' in two ways, namely with CPU and
GPU. To evaluate the accuracy of computation, the Euclidean distances
between the answer 'x' and the reconstructed 'x' are computed.

"""
for repeat in range(3):
print('Trial: %d' % repeat)
# Create the large symmetric matrix 'A'.
N = 2000
A = np.random.randint(-50, 50, size=(N, N))
A = (A + A.T).astype(np.float64)
x_ans = np.random.randint(-50, 50, size=N).astype(np.float64)
b = np.dot(A, x_ans)

print('Running CPU...')
with timer(' CPU '):
x_cpu = fit(A, b, tol, max_iter)
print(np.linalg.norm(x_cpu - x_ans))

with cupy.cuda.Device(gpu_id):
A_gpu = cupy.asarray(A)
b_gpu = cupy.asarray(b)
print('Running GPU...')
with timer(' GPU '):
x_gpu = fit(A_gpu, b_gpu, tol, max_iter)
print(np.linalg.norm(cupy.asnumpy(x_gpu) - x_ans))

print()


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu-id', '-g', default=0, type=int,
help='ID of GPU.')
parser.add_argument('--tol', '-t', default=0.1, type=float,
help='tolerance to stop iteration')
parser.add_argument('--max-iter', '-m', default=5000, type=int,
help='number of iterations')
args = parser.parse_args()
run(args.gpu_id, args.tol, args.max_iter)
Empty file.
99 changes: 99 additions & 0 deletions benchmarks/cupy/examples/finance/bench_finance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from benchmarks import BenchmarkBase
from benchmarks.utils import sync
from benchmarks.utils.helper import parameterize

import cupy

from .black_scholes import black_scholes, black_scholes_kernel
from .monte_carlo import compute_option_prices, monte_carlo_kernel


@sync
@parameterize([('n_options', [1, 10000000])])
class BlackScholes(BenchmarkBase):
def setup(self, n_options):
def rand_range(m, M):
samples = cupy.random.rand(n_options)
return (m + (M - m) * samples).astype(cupy.float64)
self.stock_price = rand_range(5, 30)
self.option_strike = rand_range(1, 100)
self.option_years = rand_range(0.25, 10)
self.risk_free = 0.02
self.volatility = 0.3

def time_black_scholes(self, n_options):
black_scholes(
cupy, self.stock_price, self.option_strike,
self.option_years, self.risk_free, self.volatility)

def time_black_scholes_kernel(self, n_options):
black_scholes_kernel(
self.stock_price, self.option_strike,
self.option_years, self.risk_free, self.volatility)


@sync
@parameterize([('n_options', [1, 1000]),
('n_samples_per_thread', [1, 1000]),
('n_threads_per_option', [1, 100000])])
class MonteCarlo(BenchmarkBase):
def setup(
self, n_options, n_samples_per_thread, n_threads_per_option):
def rand_range(m, M):
samples = cupy.random.rand(n_options)
return (m + (M - m) * samples).astype(cupy.float64)
self.stock_price = rand_range(5, 30)
self.option_strike = rand_range(1, 100)
self.option_years = rand_range(0.25, 10)
self.risk_free = 0.02
self.volatility = 0.3

def time_monte_carlo(
self, n_options, n_samples_per_thread, n_threads_per_option):
compute_option_prices(
self.stock_price, self.option_strike,
self.option_years, self.risk_free, self.volatility,
n_threads_per_option, n_samples_per_thread)


@sync
@parameterize([('gpus', [[0]]),
('n_options', [1, 1000]),
('n_samples_per_thread', [1, 1000]),
('n_threads_per_option', [1, 10000])])
class MonteCarloMultiGPU(BenchmarkBase):
def setup(
self, gpus, n_options, n_samples_per_thread, n_threads_per_option):
def rand_range(m, M):
samples = cupy.random.rand(n_options)
return (m + (M - m) * samples).astype(cupy.float64)

stock_price = rand_range(5, 30)
option_strike = rand_range(1, 100)
option_years = rand_range(0.25, 10)
self.risk_free = 0.02
self.volatility = 0.3

self.stock_price_gpus = []
self.option_strike_gpus = []
self.option_years_gpus = []
self.call_prices_gpus = []

for gpu_id in gpus:
with cupy.cuda.Device(gpu_id):
self.stock_price_gpus.append(cupy.array(stock_price))
self.option_strike_gpus.append(cupy.array(option_strike))
self.option_years_gpus.append(cupy.array(option_years))
self.call_prices_gpus.append(cupy.empty(
(n_options, n_threads_per_option), dtype=cupy.float64))

def time_monte_carlo_multigpu(
self, gpus, n_options, n_samples_per_thread, n_threads_per_option):
for i, gpu_id in enumerate(gpus):
with cupy.cuda.Device(gpu_id):
monte_carlo_kernel(
self.stock_price_gpus[i][:, None],
self.option_strike_gpus[i][:, None],
self.option_years_gpus[i][:, None],
self.risk_free, self.volatility, n_samples_per_thread, i,
self.call_prices_gpus[i])
144 changes: 144 additions & 0 deletions benchmarks/cupy/examples/finance/black_scholes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
import argparse
import contextlib
import time

import cupy
import numpy

# This sample computes call and put prices for Europian options with
# Black-Scholes equation. It was based on a sample of the financial package
# in CUDA toolkit. For details, please see the corresponding whitepaper.
#
# The following code shows that CuPy enables us to write algorithms for GPUs
# without significantly modifying the existing NumPy's code.
# It also briefly describes how to create your own kernel with CuPy.
# If you want to speed up the existing code, please define the kernel
# with cupy.ElementwiseKernel.


# Naive implementation of the pricing of options with NumPy and CuPy.
def black_scholes(xp, s, x, t, r, v):
sqrt_t = xp.sqrt(t)
d1 = (xp.log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t)
d2 = d1 - v * sqrt_t

def get_cumulative_normal_distribution(x):
A1 = 0.31938153
A2 = -0.356563782
A3 = 1.781477937
A4 = -1.821255978
A5 = 1.330274429
RSQRT2PI = 0.39894228040143267793994605993438
W = 0.2316419

k = 1 / (1 + W * xp.abs(x))
cnd = RSQRT2PI * xp.exp(-x * x / 2) * \
(k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5)))))
cnd = xp.where(x > 0, 1 - cnd, cnd)
return cnd

cnd_d1 = get_cumulative_normal_distribution(d1)
cnd_d2 = get_cumulative_normal_distribution(d2)

exp_rt = xp.exp(- r * t)
call = s * cnd_d1 - x * exp_rt * cnd_d2
put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1)
return call, put


# An example of calling the kernel via cupy.ElementwiseKernel.
# When executing __call__ method of the instance, it automatically compiles
# the code depending on the types of the given arrays, and calls the kernel.
# Other functions used inside the kernel can be defined by 'preamble' option.
black_scholes_kernel = cupy.ElementwiseKernel(
'T s, T x, T t, T r, T v', # Inputs
'T call, T put', # Outputs
'''
const T sqrt_t = sqrt(t);
const T d1 = (log(s / x) + (r + v * v / 2) * t) / (v * sqrt_t);
const T d2 = d1 - v * sqrt_t;

const T cnd_d1 = get_cumulative_normal_distribution(d1);
const T cnd_d2 = get_cumulative_normal_distribution(d2);

const T exp_rt = exp(- r * t);
call = s * cnd_d1 - x * exp_rt * cnd_d2;
put = x * exp_rt * (1 - cnd_d2) - s * (1 - cnd_d1);
''',
'black_scholes_kernel',
preamble='''
__device__
inline T get_cumulative_normal_distribution(T x) {
const T A1 = 0.31938153;
const T A2 = -0.356563782;
const T A3 = 1.781477937;
const T A4 = -1.821255978;
const T A5 = 1.330274429;
const T RSQRT2PI = 0.39894228040143267793994605993438;
const T W = 0.2316419;

const T k = 1 / (1 + W * abs(x));
T cnd = RSQRT2PI * exp(- x * x / 2) *
(k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5)))));
if (x > 0) {
cnd = 1 - cnd;
}
return cnd;
}
''',
)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--gpu-id', '-g', default=0, type=int, help='GPU ID')
parser.add_argument('--n-options', '-n', default=10000000, type=int)
args = parser.parse_args()

cupy.cuda.Device(args.gpu_id).use()

def rand_range(m, M):
samples = cupy.random.rand(args.n_options)
return (m + (M - m) * samples).astype(numpy.float64)

print('initializing...')
stock_price_gpu = rand_range(5, 30)
option_strike_gpu = rand_range(1, 100)
option_years_gpu = rand_range(0.25, 10)

stock_price_cpu = stock_price_gpu.get()
option_strike_cpu = option_strike_gpu.get()
option_years_cpu = option_years_gpu.get()

@contextlib.contextmanager
def timer(message):
cupy.cuda.Stream.null.synchronize()
start = time.time()
yield
cupy.cuda.Stream.null.synchronize()
end = time.time()
print('%s:\t%f sec' % (message, end - start))

print('start computation')
risk_free = 0.02
volatility = 0.3
with timer(' CPU (NumPy, Naive implementation)'):
call_cpu, put_cpu = black_scholes(
numpy, stock_price_cpu, option_strike_cpu, option_years_cpu,
risk_free, volatility)

with timer(' GPU (CuPy, Naive implementation)'):
call_gpu1, put_gpu1 = black_scholes(
cupy, stock_price_gpu, option_strike_gpu, option_years_gpu,
risk_free, volatility)

with timer(' GPU (CuPy, Elementwise kernel)'):
call_gpu2, put_gpu2 = black_scholes_kernel(
stock_price_gpu, option_strike_gpu, option_years_gpu,
risk_free, volatility)

# Check whether all elements in gpu arrays are equal to those of cpus
cupy.testing.assert_allclose(call_cpu, call_gpu1)
cupy.testing.assert_allclose(call_cpu, call_gpu2)
cupy.testing.assert_allclose(put_cpu, put_gpu1)
cupy.testing.assert_allclose(put_cpu, put_gpu2)
Loading