//============================================================== // Vector Add is the equivalent of a Hello, World! sample for data parallel // programs. Building and running the sample verifies that your development // environment is setup correctly and demonstrates the use of the core features // of SYCL. This sample runs on both CPU and GPU (or FPGA). When run, it // computes on both the CPU and offload device, then compares results. If the // code executes on both CPU and offload device, the device name and a success // message are displayed. And, your development environment is setup correctly! // // For comprehensive instructions regarding SYCL Programming, go to // https://software.intel.com/en-us/oneapi-programming-guide and search based on // relevant terms noted in the comments. // // SYCL material used in the code sample: // • A one dimensional array of data. // • A device queue, buffer, accessor, and kernel. //============================================================== // Copyright © Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= #include #include #include #include #if FPGA_HARDWARE || FPGA_EMULATOR || FPGA_SIMULATOR #include #endif using namespace sycl; // num_repetitions: How many times to repeat the kernel invocation size_t num_repetitions = 1; // Vector type and data size for this example. size_t vector_size = 10000; typedef std::vector IntVector; // Create an exception handler for asynchronous SYCL exceptions static auto exception_handler = [](sycl::exception_list e_list) { for (std::exception_ptr const &e : e_list) { try { std::rethrow_exception(e); } catch (std::exception const &e) { #if _DEBUG std::cout << "Failure" << std::endl; #endif std::terminate(); } } }; //************************************ // Vector add in SYCL on device: returns sum in 4th parameter "sum_parallel". //************************************ void VectorAdd(queue &q, const IntVector &a_vector, const IntVector &b_vector, IntVector &sum_parallel) { // Create the range object for the vectors managed by the buffer. range<1> num_items{a_vector.size()}; // Create buffers that hold the data shared between the host and the devices. // The buffer destructor is responsible to copy the data back to host when it // goes out of scope. buffer a_buf(a_vector); buffer b_buf(b_vector); buffer sum_buf(sum_parallel.data(), num_items); for (size_t i = 0; i < num_repetitions; i++ ) { // Submit a command group to the queue by a lambda function that contains the // data access permission and device computation (kernel). q.submit([&](handler &h) { // Create an accessor for each buffer with access permission: read, write or // read/write. The accessor is a mean to access the memory in the buffer. accessor a(a_buf, h, read_only); accessor b(b_buf, h, read_only); // The sum_accessor is used to store (with write permission) the sum data. accessor sum(sum_buf, h, write_only, no_init); // Use parallel_for to run vector addition in parallel on device. This // executes the kernel. // 1st parameter is the number of work items. // 2nd parameter is the kernel, a lambda that specifies what to do per // work item. The parameter of the lambda is the work item id. // SYCL supports unnamed lambda kernel by default. h.parallel_for(num_items, [=](auto i) { sum[i] = a[i] + b[i]; }); }); }; // Wait until compute tasks on GPU done q.wait(); } //************************************ // Initialize the vector from 0 to vector_size - 1 //************************************ void InitializeVector(IntVector &a) { for (size_t i = 0; i < a.size(); i++) a.at(i) = i; } //************************************ // Demonstrate vector add both in sequential on CPU and in parallel on device. //************************************ int main(int argc, char* argv[]) { // Change num_repetitions if it was passed as argument if (argc > 2) num_repetitions = std::stoi(argv[2]); // Change vector_size if it was passed as argument if (argc > 1) vector_size = std::stoi(argv[1]); // Create device selector for the device of your interest. #if FPGA_EMULATOR // Intel extension: FPGA emulator selector on systems without FPGA card. auto selector = sycl::ext::intel::fpga_emulator_selector_v; #elif FPGA_SIMULATOR // Intel extension: FPGA simulator selector on systems without FPGA card. auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE // Intel extension: FPGA selector on systems with FPGA card. auto selector = sycl::ext::intel::fpga_selector_v; #else // The default device selector will select the most performant device. auto selector = default_selector_v; #endif // Create vector objects with "vector_size" to store the input and output data. IntVector a, b, sum_sequential, sum_parallel; a.resize(vector_size); b.resize(vector_size); sum_sequential.resize(vector_size); sum_parallel.resize(vector_size); // Initialize input vectors with values from 0 to vector_size - 1 InitializeVector(a); InitializeVector(b); try { queue q(selector, exception_handler); // Print out the device information used for the kernel code. std::cout << "Running on device: " << q.get_device().get_info() << "\n"; std::cout << "Vector size: " << a.size() << "\n"; // Vector addition in SYCL VectorAdd(q, a, b, sum_parallel); } catch (exception const &e) { std::cout << "An exception is caught for vector add.\n"; std::terminate(); } // Compute the sum of two vectors in sequential for validation. for (size_t i = 0; i < sum_sequential.size(); i++) sum_sequential.at(i) = a.at(i) + b.at(i); // Verify that the two vectors are equal. for (size_t i = 0; i < sum_sequential.size(); i++) { if (sum_parallel.at(i) != sum_sequential.at(i)) { std::cout << "Vector add failed on device.\n"; return -1; } } int indices[]{0, 1, 2, (static_cast(a.size()) - 1)}; constexpr size_t indices_size = sizeof(indices) / sizeof(int); // Print out the result of vector add. for (int i = 0; i < indices_size; i++) { int j = indices[i]; if (i == indices_size - 1) std::cout << "...\n"; std::cout << "[" << j << "]: " << a[j] << " + " << b[j] << " = " << sum_parallel[j] << "\n"; } a.clear(); b.clear(); sum_sequential.clear(); sum_parallel.clear(); std::cout << "Vector add successfully completed on device.\n"; return 0; }