Compiler reports that parallelization is prevented when compiling C++ code with OpenACC

103 Views Asked by At

When I compiled the code with OpenACC, I got the following warnings:

$  pgc++ -o buffer1.out -acc -gpu=managed -Minfo=accel buffer.cpp
main:
    137, Generating enter data copyin(mat1)
         Generating NVIDIA GPU code
        143, #pragma acc loop seq collapse(2)
        145,   collapsed */
    137, Generating default present(mat1)
    143, Complex loop carried dependence of  prevents parallelization
    145, Complex loop carried dependence of  prevents parallelization
    148, Reference argument passing prevents parallelization: j
         Reference argument passing prevents parallelization: i
    150, Accelerator restriction: induction variable live-out from loop: j
    153, Accelerator restriction: induction variable live-out from loop: i
    161, Generating update self(mat1)
         Generating exit data delete(mat1)
Array2D<int>::operator ()(const int &, const int &):
     23, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
   
#include<iostream>
#include <string.h>
#include<queue>
#include<openacc.h>
using namespace std;

/// Heap-allocated 2D array: `arows` separately allocated rows of `acols`
/// elements each, reached through the row-pointer table `matImg`.
///
/// The object owns its storage. Copying is disabled because a member-wise copy
/// would leave two objects deleting the same buffers; ownership can be moved.
template<class T1>
class Array2D{

public:
        int arows=0;              ///< number of rows
        int acols=0;              ///< number of elements per row
        T1**  __restrict matImg;  ///< row-pointer table; matImg[r] is row r

/// Allocates a rows x cols array and sets every element to defaultVal.
Array2D(int rows,int cols,T1 defaultVal):arows(rows),acols(cols){
    matImg=new T1*[rows];
    for(int i=0;i<rows;i++){
        matImg[i]=new T1[cols];
        // Assign element by element. memset() fills bytes, so a default of 7
        // for int would have produced 0x07070707 instead of 7.
        for(int j=0;j<cols;j++){
            matImg[i][j]=defaultVal;
        }
    }
}

Array2D(const Array2D&) = delete;
Array2D& operator=(const Array2D&) = delete;

/// Takes over other's buffers and leaves other empty (0 x 0, null table).
Array2D(Array2D&& other) noexcept
    :arows(other.arows),acols(other.acols),matImg(other.matImg){
    other.arows=0;
    other.acols=0;
    other.matImg=nullptr;
}

/// Frees the current buffers, then takes over other's and leaves other empty.
Array2D& operator=(Array2D&& other) noexcept {
    if(this!=&other){
        release();
        arows=other.arows;
        acols=other.acols;
        matImg=other.matImg;
        other.arows=0;
        other.acols=0;
        other.matImg=nullptr;
    }
    return *this;
}

/// Returns a reference to element (m, n). No bounds checking.
/// The indices are taken by const reference here.
T1 &operator()(const int& m, const int& n){
    return matImg[m][n];
}

/// Returns a pointer to the first element of row k. No bounds checking.
T1 * __restrict operator()(const int&k){
    return matImg[k];
}

~Array2D(){
    release();
}

private:
/// Frees every row and then the row-pointer table.
/// A moved-from object has arows == 0 and a null table, so this is a no-op.
void release() noexcept {
    for(int i=0;i<arows;i++){
        delete[] matImg[i];
    }
    delete [] matImg;
}

};
// Builds a 6 x 10 Array2D<int> of zeros, adds 2 to every element inside an
// OpenACC parallel region, then prints each element on the host.
int main(){
    const int rows=6;
    const int cols=10;
    Array2D<int> mat1(rows,cols,0);

    // Data directives exactly as posted in the question: mat1 is named as if
    // it were a rows x cols array.
#pragma acc enter data copyin(mat1[0:rows][0:cols])
#pragma acc parallel loop collapse(2) default(present)
    for(int i=0;i<rows;++i){
        for(int j=0;j<cols;++j){
            mat1(i,j)+=2;
        }
    }

#pragma acc update self(mat1)
#pragma acc exit data delete(mat1)

    cout<<"=============================="<<endl;

    // Print every element, one per line, in row-major order.
    for(int row=0;row<mat1.arows;++row){
        for(int col=0;col<mat1.acols;++col){
            cout<<"mat1("<<row<<","<<col<<") :"<<mat1(row,col)<<endl;
        }
    }
    return 0;
}
     

Array2D is a class that creates a 2D array object by allocating memory on the heap.

It says the array prevents parallelization in the GPU code. Image is a Mat data type from OpenCV. May I know why the warning says there is a dependency?

Could somebody provide any suggestions?

1

There is 1 best solution below

2
Mat Colgrove On

Pass the index variables by value instead of by reference. By passing them by reference, the compiler must assume that the address is taken by a global pointer thus creating a dependency.

To fix:

T1 &operator()(int m, int n){
 return matImg[m][n];
}

While the loop will now be successfully parallelized, you'll get a runtime error, since "mat1" isn't a 2D array but rather a class with a 2D array data member. Instead, you need to do a deep copy of "mat1" to the device.

Full fix:

% cat test.cpp
#include<iostream>
#include <string.h>
#include<queue>
#include<openacc.h>
using namespace std;

/// Heap-allocated 2D array: `arows` separately allocated rows of `acols`
/// elements each, reached through the row-pointer table `matImg`.
///
/// The object owns its storage. Copying is disabled because a member-wise copy
/// would leave two objects deleting the same buffers; ownership can be moved.
template<class T1>
class Array2D{

public:
        int arows=0;              ///< number of rows
        int acols=0;              ///< number of elements per row
        T1**  __restrict matImg;  ///< row-pointer table; matImg[r] is row r

/// Allocates a rows x cols array and sets every element to defaultVal.
Array2D(int rows,int cols,T1 defaultVal):arows(rows),acols(cols){
    matImg=new T1*[rows];
    for(int i=0;i<rows;i++){
        matImg[i]=new T1[cols];
        // Assign element by element. memset() fills bytes, so a default of 7
        // for int would have produced 0x07070707 instead of 7.
        for(int j=0;j<cols;j++){
            matImg[i][j]=defaultVal;
        }
    }
}

Array2D(const Array2D&) = delete;
Array2D& operator=(const Array2D&) = delete;

/// Takes over other's buffers and leaves other empty (0 x 0, null table).
Array2D(Array2D&& other) noexcept
    :arows(other.arows),acols(other.acols),matImg(other.matImg){
    other.arows=0;
    other.acols=0;
    other.matImg=nullptr;
}

/// Frees the current buffers, then takes over other's and leaves other empty.
Array2D& operator=(Array2D&& other) noexcept {
    if(this!=&other){
        release();
        arows=other.arows;
        acols=other.acols;
        matImg=other.matImg;
        other.arows=0;
        other.acols=0;
        other.matImg=nullptr;
    }
    return *this;
}

/// Returns a reference to element (m, n). No bounds checking.
/// Indices are passed by value.
T1 &operator()(int m, int n){
    return matImg[m][n];
}

/// Returns a pointer to the first element of row k. No bounds checking.
/// The index is passed by value, like the two-index overload.
T1 * __restrict operator()(int k){
    return matImg[k];
}

~Array2D(){
    release();
}

private:
/// Frees every row and then the row-pointer table.
/// A moved-from object has arows == 0 and a null table, so this is a no-op.
void release() noexcept {
    for(int i=0;i<arows;i++){
        delete[] matImg[i];
    }
    delete [] matImg;
}

};
// Builds a 6 x 10 Array2D<int> of zeros, adds 2 to every element inside an
// OpenACC parallel region, copies the data back and prints each element.
int main(){
    const int rows=6;
    const int cols=10;
    Array2D<int> mat1(rows,cols,0);

    // The copyin lists the object itself first, then the storage behind its
    // matImg member.
#pragma acc enter data copyin(mat1, mat1.matImg[0:rows][0:cols])
#pragma acc parallel loop collapse(2) default(present)
    for(int row=0;row<rows;++row){
        for(int col=0;col<cols;++col){
            mat1(row,col)+=2;
        }
    }
    // Refresh the host copy of the elements, then end the data region.
#pragma acc update self(mat1.matImg[0:rows][0:cols])
#pragma acc exit data delete(mat1.matImg,mat1)

    cout<<"=============================="<<endl;

    // Print every element, one per line, in row-major order.
    for(int row=0;row<mat1.arows;++row){
        for(int col=0;col<mat1.acols;++col){
            cout<<"mat1("<<row<<","<<col<<") :"<<mat1(row,col)<<endl;
        }
    }
    return 0;
}

% nvc++ test.cpp -acc -Minfo=accel -O2 -V22.11 ; a.out
main:
     50, Generating enter data copyin(mat1,mat1.matImg[:rows][:cols])
         Generating NVIDIA GPU code
         55, #pragma acc loop gang, vector(64) collapse(2) /* blockIdx.x threadIdx.x */
         56,   /* blockIdx.x threadIdx.x collapsed */
     50, Generating default present(mat1.matImg[:6],mat1)
     66, Generating update self(mat1.matImg[:rows][:cols])
         Generating exit data delete(mat1.matImg[:1][:1],mat1)
Array2D<int>::operator ()(int, int):
     24, Generating implicit acc routine seq
         Generating acc routine seq
         Generating NVIDIA GPU code
==============================
mat1(0,0) :2
mat1(0,1) :2
mat1(0,2) :2
mat1(0,3) :2
mat1(0,4) :2
mat1(0,5) :2
mat1(0,6) :2
mat1(0,7) :2
... cut ...
mat1(5,7) :2
mat1(5,8) :2
mat1(5,9) :2