none
OpenMP pragmas prevent STL optimizations in Release Mode? RRS feed

  • Question

  • Hi, I have just started using the OpenMP functionality of Visual C++ 2005, and I've run into an odd problem when using STL containers with OpenMP pragmas in release mode with full optimizations and STL debugging manually disabled.

     

    I wrote a short, slightly non-trivial test program that checks whether 10000 circles are overlapping and prints out the number of overlapping pairs. When using arrays and OpenMP, I see a slight performance increase, as expected. However, when using std::vector and OpenMP, the test takes more than a factor of 10 longer to execute, while the sequential version using vector sees no noticeable performance loss. Does anyone know why this could be? Here are my compiler and linker options, followed by the results:

     

    Compiler options:

    /O2 /Ob2 /Ot /GL /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /FD /EHsc /MD /GS- /fp:fast /GR- /openmp /Fo"Release\\" /Fd"Release\vc80.pdb" /W3 /nologo /c /Wp64 /Zi /TP /errorReport:prompt

     

    Linker options:

    /OUT:"OpenMPTest.exe" /INCREMENTAL:NO /NOLOGO /MANIFEST /MANIFESTFILE:"Release\OpenMPTest.exe.intermediate.manifest" /SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /LTCG /MACHINE:X86 /ERRORREPORT:PROMPT kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib

     

    Code Snippet

    Array version: Normal for loop: Milliseconds = 236, Answer = 431889

    Array version: Parellized for loop: Milliseconds = 194, Answer = 431889

     

    Array version: Normal for loop: Milliseconds = 230, Answer = 432384

    Array version: Parellized for loop: Milliseconds = 196, Answer = 432384

     

    Vector version: Normal for loop: Milliseconds = 225, Answer = 49995000

    Vector version: Parellized for loop: Milliseconds = 3387, Answer = 49995000

     

    Vector version: Normal for loop: Milliseconds = 224, Answer = 49995000
    Vector version: Parellized for loop: Milliseconds = 3420, Answer = 49995000

     

    Press any key to continue . . .

     

    Answer just means the number of pairs of circles that were overlapping. I print it out to verify that the parallel version computes the same result as the sequential version.

     

     My test system is an Intel Core 2 Duo running Windows Vista. Programming environment is Visual C++ 2005 with SP1. I was not expecting much of a perf increase because one cpu is probably busy with OS related tasks and background processes, but I have no idea why the STL version with OpenMP pragmas is so much slower.

     

    Here is the full source to the program.

     

    Code Snippet

    #define _SECURE_SCL 0

    #define _HAS_ITERATOR_DEBUGGING 0

    #include <omp.h> //for Multi-core processors and possibly Hyper-threaded processors

    #include <cstdio>
    #include <cstdlib> //rand, srand, RAND_MAX
    #include <ctime>
    #include <iostream>
    #include <vector>

    using namespace std;

    //for testing

    // Test datum: a circle whose centre and radius are drawn from rand().
    // Each default-constructed Circle consumes three rand() values, in the
    // order x, y, r.
    struct Circle
    {
      float x;  // centre x, in [0, 100]
      float y;  // centre y, in [0, 100]
      float r;  // radius,   in [0, 5]

      Circle()
      {
        float const rand_max = static_cast<float>(RAND_MAX);

        // rand()/rand_max lies in [0, 1]; scale it to the wanted range.
        x = 100.0f * (rand() / rand_max);
        y = 100.0f * (rand() / rand_max);
        r = 5.0f * (rand() / rand_max);
      }
    };

    //checks if two circles are overlapping

    // Returns true when the distance between the two centres is strictly less
    // than the sum of the radii (circles that merely touch do not count).
    // Squared quantities are compared, so no square root is needed.
    bool CircleCircleOverlap( Circle const & c1, Circle const & c2 )
    {
      float const delta_x = c1.x - c2.x;
      float const delta_y = c1.y - c2.y;
      float const reach   = c1.r + c2.r;

      float const dist_sq = delta_x*delta_x + delta_y*delta_y;
      return dist_sq < (reach*reach);
    }

    int main()

    {

      int const n_tests = 2;

      int const n_circles = 10000;

      srand(13); //for consistent output purposes

      //----------------------------------------------------------- 

      // Array Version

      //-----------------------------------------------------------

      for( int test=0; test < n_tests; ++test )

      {

        //create 'n' circles

        Circle circles[n_circles];

        //Normal for loop

        {

          int n_overlaps = 0;

          int end, start = clock();

          for( int i=0; i < n_circles; ++i )

          {

            for( int j=i+1; j < n_circles; ++j )

            {

              if( CircleCircleOverlap(circles[i], circles[j]) )

                ++n_overlaps;

            }

          }

          end = clock();

          cout << "Array version: Normal for loop: Milliseconds = "

               << (end - start) << ", Answer = " << n_overlaps << endl;

        }// end //Normal for loop

       

        //Parallel for loop

        {

          int n_overlaps = 0;

          int end, start = clock();

          #pragma omp parallel

          {

            #pragma omp for nowait

            for( int i=0; i < n_circles; ++i )

            {

              for( int j=i+1; j < n_circles; ++j )

              {

                if( CircleCircleOverlap(circles[i], circles[j]) )

                  #pragma omp atomic

                  ++n_overlaps;

              }

           }

         }// end #pragma omp parallel

         end = clock();

         cout << "Array version: Parellized for loop: Milliseconds = "

              << (end - start) << ", Answer = " << n_overlaps << endl;

       }// end //Parallel for loop

       cout << endl;

      }// end for( int test=0; test < n_tests; ++test )

      //--------------------------------------------------------------

      // vector version

      //--------------------------------------------------------------

      for( int test=0; test < 2; ++test )

      {

        //create 'n' circles

        vector< Circle > circles(n_circles);

        //Normal for loop

        {

          int n_overlaps = 0;

          int end, start = clock();

          //#pragma omp parallel for

          for( int i=0; i < n_circles; ++i )

          {

            for( int j=i+1; j < n_circles; ++j )

            {

              if( CircleCircleOverlap(circles[i], circles[j]) )

                ++n_overlaps;

            }

          }

          end = clock();

          cout << "Vector version: Normal for loop: Milliseconds = "

               << (end - start) << ", Answer = " << n_overlaps << endl;

        }// end //Normal for loop

        

        //Parallel for loop

        {

          int n_overlaps = 0;

          int end, start = clock();

          #pragma omp parallel

          {

            #pragma omp for nowait

            for( int i=0; i < n_circles; ++i )

            {

              for( int j=i+1; j < n_circles; ++j )

              {

                if( CircleCircleOverlap(circles[i], circles[j]) )

                  #pragma omp atomic

                  ++n_overlaps;

              }

            }

          }// end #pragma omp parallel

          end = clock();

          cout << "Vector version: Parellized for loop: Milliseconds = "

               << (end - start) << ", Answer = " << n_overlaps << endl;

        }// end //Parallel for loop

        cout << endl;

      }// end for( int test=0; test < 1; ++test )

      return 0;

    }

     

    If anyone has any ideas as to what could be slowing down the program, I would greatly appreciate knowing.

     

    Friday, May 4, 2007 5:55 AM