@comment{
  JICS-10-124: journal article, Journal of Information and Computing Science,
  vol. 10, no. 2, pp. 124--136.
  No doi field is recorded: the exported record carried only the bare resolver
  prefix with no identifier, so the publisher url is the sole locator.
  Acronyms in the title are brace-protected so sentence-casing styles keep them.
}
@Article{JICS-10-124,
  author   = {Memarzia, Puya and Khunjush, Farshad},
  title    = {An In-depth Study on the Performance Impact of {CUDA}, {OpenCL}, and {PTX} Code},
  journal  = {Journal of Information and Computing Science},
  year     = {2024},
  volume   = {10},
  number   = {2},
  pages    = {124--136},
  abstract = {In recent years, the rise of GPGPU as a viable solution for high performance computing has been
              accompanied by fresh challenges for developers. Chief among these challenges is efficiently
              harnessing the formidable power of the GPU and finding performance bottlenecks. Many factors
              play a role in a GPU application's performance. This creates the need for studies, performance
              comparisons, and ways to analyze programs from a fundamental level. With that in mind, our goal
              is to present an in-depth performance comparison of the CUDA and OpenCL platforms, and study how
              PTX code can affect performance. In order to achieve this goal, we explore the subject from
              three different angles: kernel execution times, data transfers that occur between the host and
              device, and the PTX code that is generated by each platform's compiler. We carry out our
              experiments using ten real-world GPU kernels from the digital image processing domain, a
              selection of variable input data sizes, and a pair of GPUs based on the Nvidia Fermi and Kepler
              architectures. We show how PTX statistics and analysis can be used to provide further insight on
              performance discrepancies and bottlenecks. Our results indicate that, in an unbiased comparison
              such as this one, the OpenCL and CUDA platforms are essentially similar in terms of performance.},
  issn     = {1746-7659},
  url      = {http://global-sci.org/intro/article_detail/jics/22555.html}
}