% Journal article record (publisher export, cleaned up).
% The exported `doi` field held only the resolver prefix "https://doi.org/"
% with no identifier, so it is omitted here; add the bare DOI when it is known.
@Article{JICS-10-124,
  author   = {Memarzia, Puya and Khunjush, Farshad},
  title    = {An In-depth Study on the Performance Impact of {CUDA}, {OpenCL}, and {PTX} Code},
  journal  = {Journal of Information and Computing Science},
  year     = {2024},
  volume   = {10},
  number   = {2},
  pages    = {124--136},
  abstract = {In recent years, the rise of GPGPU as a viable solution for high performance computing has been accompanied by fresh challenges for developers. Chief among these challenges is efficiently harnessing the formidable power of the GPU and finding performance bottlenecks. Many factors play a role in a GPU application's performance. This creates the need for studies, performance comparisons, and ways to analyze programs from a fundamental level. With that in mind, our goal is to present an in-depth performance comparison of the CUDA and OpenCL platforms, and study how PTX code can affect performance. In order to achieve this goal, we explore the subject from three different angles: kernel execution times, data transfers that occur between the host and device, and the PTX code that is generated by each platform's compiler. We carry out our experiments using ten real-world GPU kernels from the digital image processing domain, a selection of variable input data sizes, and a pair of GPUs based on the Nvidia Fermi and Kepler architectures. We show how PTX statistics and analysis can be used to provide further insight on performance discrepancies and bottlenecks. Our results indicate that, in an unbiased comparison such as this one, the OpenCL and CUDA platforms are essentially similar in terms of performance.},
  issn     = {1746-7659},
  url      = {http://global-sci.org/intro/article_detail/jics/22555.html},
}