Apfelmännchen in OpenCL

Voraussetzungen

OpenCL-fähige Grafikkarte (hier: ATI Mobility Radeon HD 4570; CUDA-fähige nVidia-Karten wurden nicht getestet),
OpenCL-fähiger Treiber (AMD Version 11.1 oder höher, seit Dezember 2010 verfügbar),
ATI-Stream-Library (in AMD APP Library umbenannt) installiert,
Rechner im Netzbetrieb, nicht im Akkubetrieb, da im Akkubetrieb eine stromsparende GPU benutzt wird, die nicht OpenCL-fähig ist.

Die Tauglichkeit der Konfiguration kann mit dem GPU Caps Viewer einfach überprüft werden.

Programm

Das serielle bzw. OpenMP-Programm erfordert einige Umbauten. Dem Kernel wird ein Feld vorberechneter x-Positionen übergeben, da der double- und der complex-Datentyp in der Kernel-Sprache nicht verfügbar sind. Der Quellcode zur Ansteuerung der OpenCL-Bibliothek wurde in einer wiederverwendbaren Klasse gekapselt. Jeder Kernel-Aufruf berechnet eine Bildzeile parallel.

//: mandelbrot.cpp : Fraktalbild mit OpenCL - R.Richter 2011-03-06
//////////////////////////////////////////////////////////////////
 
#include <cmath>
#include <complex>
#include <ctime>
#include <iostream>
#include <vector>
#include "image.h"
#include "OpenCL.h"
 
// Shorthand for double-precision complex numbers; used in this file for the
// corner points of the rendered region of the complex plane.
typedef std::complex<double> complex;
 
// OpenCL C source of the kernel "MandelbrotLine", kept as a string so it can
// be handed to the OpenCL wrapper at run time.
//
// Kernel arguments, in order:
//   0: width         - number of valid entries in input/output
//   1: input         - x coordinate (real part of c) for each column
//   2: output        - receives the iteration count for each column
//   3: maxIterations - upper bound on the iteration count
//   4: y             - imaginary part of c, shared by the whole line
//
// Each work item with global id i < width iterates z = z*z + c, starting at
// z = 0 with c = (input[i], y), while |z|^2 <= 4 and the count is below
// maxIterations, then stores the count in output[i]. All arithmetic is done
// in single precision (float).
char const *kernelSource = 
"__kernel void MandelbrotLine(                                          \n"
"    unsigned const int width,                                          \n"
"    __global float* input,                                             \n"
"    __global int* output,                                              \n"
"    int maxIterations,                                                 \n"
"    float y)                                                           \n"
"{                                                                      \n"
"    int i = get_global_id(0);                                          \n"
"    if (i < width)                                                     \n"
"    {                                                                  \n"
"        float x = input[i];                                            \n"
"        int count = 0;                                                 \n"
"        float zr = 0, zi = 0;                                          \n"
"        while (zr*zr + zi*zi <= 4.0f && count < maxIterations)         \n"
"        {                                                              \n"
"            // z = z * z + c;                                          \n"
"            float re = zr, im = zi;                                    \n"
"            zr = re*re - im*im + x;                                    \n"
"            zi = 2*re*im + y;                                          \n"
"            ++ count;                                                  \n"
"        }                                                              \n" 
"        output[i] = count;                                             \n"
"    }                                                                  \n"
"}                                                                      \n"
"\n";
 
// Maps an iteration count to a pixel color.
//
//   height - iteration count reached for the pixel
//   max    - iteration limit; any count >= max yields Color::BLACK
//
// For counts below the limit the brightness h grows logarithmically with the
// count, from 0 (count 1) towards 255 (count approaching max), and is spread
// over the three channels with the weights 0.9 / 0.8 / 0.6.
inline 
Color color(int height, int max)
{
  // color scheme from: 
  // http://shreyassiravara.wordpress.com/2010/08/14/the-mandelbrot-set/
  if (height >= max) return Color::BLACK;

  // log(0) is -infinity and the logarithm of a negative number is NaN;
  // neither is a meaningful channel value, so counts below 1 are treated
  // like a count of 1 (darkest shade).
  int const steps = height < 1 ? 1 : height;

  // Here max > steps >= 1, hence log(max) > 0 and 0 <= h < 255.
  double const h = 255 * std::log(double(steps)) / std::log(double(max));
  return Color(0.9 * h, 0.8 * h, 0.6 * h);
}
 
// Linearly maps a sample index onto the interval [low, high].
//
//   pos    - sample index, 0 .. length-1
//   length - number of samples
//   low    - value returned for pos == 0
//   high   - value returned for pos == length-1
//
// With a single sample there is no step width: the general formula would
// divide by zero and yield NaN for pos == 0, so low is returned instead.
inline
double scale(int pos, int length, double low, double high)
{
  if (length == 1) return low;
  return low + pos * (high-low) / (length-1);
}
 
// Renders the Mandelbrot set for the rectangle of the complex plane spanned
// by left_bottom and right_top into an image of width x height pixels.
//
//   width, height - image size in pixels
//   maxIterations - iteration limit passed to the kernel and to color()
//   left_bottom   - complex coordinate mapped to pixel column 0 / row 0
//   right_top     - complex coordinate mapped to the last column / last row
//
// The real coordinates of all columns are computed once on the host. The
// "MandelbrotLine" kernel is then launched once per image row through the
// OpenCL wrapper, and the iteration counts it returns are turned into pixel
// colors.
Image mandelbrot(int width, int height, 
                 int maxIterations,
                 complex left_bottom, complex right_top)
{
  Image image(width, height);

  std::vector<float> realAxis  (width);   // kernel input: x per column
  std::vector<int>   iterations(width);   // kernel output: count per column
  float imagPart = 0;                     // y of the row being computed

  // The x positions are the same for every row, so fill them only once.
  for (int col = 0; col < width; ++col)
  {
    realAxis[col] = scale(col, width, left_bottom.real(), right_top.real());
  }

  // === Delegate work to CPU/GPU/ALL ===

  OpenCL device(OpenCL::ALL);
  log("load kernel source");
  device.program(kernelSource);
  log("compile kernel");
  device.kernel("MandelbrotLine");
  log("bind arguments");
  // The bind order follows the parameter order of the kernel signature.
  device.bind("MandelbrotLine", width);
  device.bind("MandelbrotLine", &realAxis[0], sizeof(float) * width, OpenCL::READ);
  device.bind("MandelbrotLine", &iterations[0], sizeof(int) * width, OpenCL::WRITE);
  device.bind("MandelbrotLine", maxIterations);
  device.bind("MandelbrotLine", imagPart); // argNo = 4

  for (int row = 0; row < height; ++row)
  {
    // Only the y argument (slot 4) changes from row to row.
    imagPart = scale(row, height, left_bottom.imag(), right_top.imag());
    device.bind("MandelbrotLine", imagPart, 4); // argNo = 4

    // Transfer input, run one work item per column, wait, fetch the results.
    device.toDevice("MandelbrotLine");
    device.call("MandelbrotLine", width);
    device.wait();
    device.fromDevice("MandelbrotLine");

    for (int col = 0; col < width; ++col)
    {
      image.pixel(col, row) = color(iterations[col], maxIterations);
    }
  }
  return image;
}
 
int main()
{
  int width  = 1000;
  int height = 1000;
  int maxIterations = 100000;
  complex left_bottom(-2.0, -2.0);
  complex right_top  ( 2.0,  2.0);
 
  saveBMP("mandel.bmp", 
          mandelbrot(width, height, maxIterations, left_bottom, right_top));
 
  std::cout << clock() / double(CLOCKS_PER_SEC) << " seconds CPU time\n";       
  return 0;
}

Das Programm wurde mit GCC 4.6 (64bit) aus dem Boost Science Pack von Gordon Taft unter Windows 7 mit dem Befehl

g++ -lopencl -static-libgcc -static-libstdc++ -O3 MandelBrot_OpenCL.cpp

übersetzt.

Zeitmessung

Das Programm wurde mit denselben Bildparametern 1000x1000 Pixel, (-2,-2) … (+2,+2) wie die OpenMP-Version gestartet. Zum Einsatz kamen 2 CPU-Kerne bzw. 2 GPU-Kerne. Bei max. 10000 Iterationen ist die drastische Laufzeitverkürzung selbst im CPU-Modus erkennbar. Die meiste Zeit beansprucht der Datentransport zwischen Hauptspeicher und OpenCL-Gerät. Ein geringer Teil der Rechenzeit ist auf die Vorberechnung der x-Positionen, die Bildspeicherung und die Übersetzung des CL-Kernels zur Laufzeit zurückzuführen. Um Unterschiede zwischen CPU- und GPU-Laufzeit auszumachen, wurden die Messungen nochmals für max. 100000 Iterationen wiederholt. Laufzeiten und CPU-Auslastung sind der folgenden Tabelle zu entnehmen:

Iterationen	CPU (Laufzeit)	CPU (CPU-Last)	GPU (Laufzeit)	GPU (CPU-Last)	ALL (Laufzeit)	ALL (CPU-Last)
10000	6 s	99%	4,7 s	2..5%	4,4 s	2..7%
100000	54,2 s	99%	18,82 s	2..5%	19,22 s	2..7%

Der Geschwindigkeitsvorteil wird durch die fehleranfällige Kernel-Entwicklung erkauft. Je nach Grafikkarte ist nur eine eingeschränkte Auswahl von Datentypen verfügbar. Syntaktische Fehler im Kernel-Code führen nicht immer zu hilfreichen Fehlermeldungen. Die im Quellcode verbliebenen Log-Ausgaben sind Überbleibsel der Fehlersuche.