@@ -63,9 +63,10 @@ int main(int argc, char * argv[])
6363
6464 int iterations;
6565 size_t order;
66+ int use_ngpu = 1 ;
6667 try {
6768 if (argc < 3 ) {
68- throw " Usage: <# iterations> <matrix order>" ;
69+ throw " Usage: <# iterations> <matrix order> [<use_ngpu>] " ;
6970 }
7071
7172 iterations = std::atoi (argv[1 ]);
@@ -79,6 +80,15 @@ int main(int argc, char * argv[])
7980 } else if (order > prk::get_max_matrix_size ()) {
8081 throw " ERROR: matrix dimension too large - overflow risk" ;
8182 }
83+
84+ if (argc > 3 ) {
85+ use_ngpu = std::atoi (argv[3 ]);
86+ }
87+
88+ if (order % use_ngpu) {
89+ std::cerr << " order = " << order << " , device count = " << use_ngpu << std::endl;
90+ throw " ERROR: matrix order should be divisible by device count!" ;
91+ }
8292 }
8393 catch (const char * e) {
8494 std::cout << e << std::endl;
@@ -87,34 +97,66 @@ int main(int argc, char * argv[])
8797
8898 std::cout << " Number of iterations = " << iterations << std::endl;
8999 std::cout << " Matrix order = " << order << std::endl;
100+ std::cout << " Number of GPUs to use = " << use_ngpu << std::endl;
101+
102+ std::vector<sycl::queue> qs;
103+
104+ auto platforms = sycl::platform::get_platforms ();
105+ for (auto & p : platforms) {
106+ auto pname = p.get_info <sycl::info::platform::name>();
107+ std::cout << " *Platform: " << pname << std::endl;
108+ if ( pname.find (" Level-Zero" ) != std::string::npos) {
109+ std::cout << " *Level Zero GPU skipped" << std::endl;
110+ break ;
111+ }
112+ if ( pname.find (" Intel" ) == std::string::npos) {
113+ std::cout << " *non-Intel skipped" << std::endl;
114+ break ;
115+ }
116+ auto devices = p.get_devices ();
117+ for (auto & d : devices ) {
118+ std::cout << " **Device: " << d.get_info <sycl::info::device::name>() << std::endl;
119+ if ( d.is_gpu () || d.is_cpu () ) {
120+ std::cout << " **Device is CPU or GPU - adding to vector of queues" << std::endl;
121+ qs.push_back (sycl::queue (d));
122+ }
123+ }
124+ }
125+
126+ int haz_ngpu = qs.size ();
127+ std::cout << " Number of CPUs and GPUs found = " << haz_ngpu << std::endl;
90128
91- sycl::queue q (sycl::default_selector{});
92- prk::SYCL::print_device_platform (q);
129+ if (use_ngpu > haz_ngpu) {
130+ std::cout << " You cannot use more GPUs (" << use_ngpu << " ) than you have (" << haz_ngpu << " )" << std::endl;
131+ }
132+
133+ int ngpus = use_ngpu;
93134
94135 // ////////////////////////////////////////////////////////////////////
95136 // Allocate space for the input and transpose matrix
96137 // ////////////////////////////////////////////////////////////////////
97138
98- const size_t nelems = ( size_t )order * ( size_t )order ;
99- const size_t bytes = nelems * sizeof ( double );
100- double * h_a = syclx::malloc_host <double >( nelems, q );
101- double * h_b = syclx::malloc_host <double >( nelems, q );
139+ double trans_time ( 0 ) ;
140+
141+ auto h_a = prk::vector <double >(order * order );
142+ auto h_b = prk::vector <double >(order * order );
102143
103144 // fill A with the sequence 0 to order^2-1
104- for (int j=0 ; j<order; j++) {
105- for (int i=0 ; i<order; i++) {
145+ for (size_t j=0 ; j<order; j++) {
146+ for (size_t i=0 ; i<order; i++) {
106147 h_a[j*order+i] = static_cast <double >(order*j+i);
107148 h_b[j*order+i] = static_cast <double >(0 );
108149 }
109150 }
110151
111- // copy input from host to device
112- double * A = syclx::malloc_device<double >( nelems, q);
113- double * B = syclx::malloc_device<double >( nelems, q);
114- q.memcpy (A, &(h_a[0 ]), bytes).wait ();
115- q.memcpy (B, &(h_b[0 ]), bytes).wait ();
152+ const size_t bytes = order * order * sizeof (double );
116153
117- auto trans_time = 0.0 ;
154+ // copy input from host to device
155+ double * A = syclx::malloc_device<double >(order * order, q);
156+ double * B = syclx::malloc_device<double >(order * order, q);
157+ q.memcpy (A, &(h_a[0 ]), bytes);
158+ q.memcpy (B, &(h_b[0 ]), bytes);
159+ q.wait ();
118160
119161 for (int iter = 0 ; iter<=iterations; iter++) {
120162
0 commit comments